In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import os

# import fitz
# from PyPDF2 import PdfReader

import sys
from pyresparser import ResumeParser

import torch as tr
from transformers import DistilBertModel, DistilBertTokenizer

from sklearn.metrics.pairwise import cosine_similarity
# sys.executable

In [3]:
# Dataset locations, all relative to the notebook's working directory.
_DATA_ROOT = "./Resume-Data"

resume_data = _DATA_ROOT + "/data/"             # directory tree listed to find resume files
skills_data = _DATA_ROOT + "/skills.csv"        # CSV whose column labels form the skill list
job_data = _DATA_ROOT + "/training_data.csv"    # CSV holding the job descriptions

In [4]:
# Cache of loaded (tokenizer, model) pairs keyed by checkpoint name, so the
# weights are read once instead of on every embedding_text() call.
_EMBEDDING_BACKEND = {}


def _load_embedding_backend(name='distilbert-base-cased'):
    """Return the cached ``(tokenizer, model)`` pair for ``name``, loading it on first use."""
    if name not in _EMBEDDING_BACKEND:
        tokenizer = DistilBertTokenizer.from_pretrained(name)
        model = DistilBertModel.from_pretrained(name)
        model.eval()  # inference only: make sure dropout is disabled
        _EMBEDDING_BACKEND[name] = (tokenizer, model)
    return _EMBEDDING_BACKEND[name]


def embedding_text(text):
    """Run ``text`` through DistilBERT ('distilbert-base-cased') and return the model output.

    The text is tokenized with special tokens added, then truncated or padded
    to exactly 64 tokens, and fed to the model together with its attention
    mask.

    Parameters
    ----------
    text : str
        The text to embed.

    Returns
    -------
    The object returned by ``DistilBertModel``'s forward call. Callers in this
    notebook read element ``[0]`` of it and convert that tensor to NumPy.

    Notes
    -----
    The tokenizer and model are loaded once and reused across calls. The
    forward pass runs under ``no_grad`` because the result is only used for
    inference, so the returned tensors carry no autograd history.
    """
    tokenizer, model = _load_embedding_backend()
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=64,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    with tr.no_grad():
        output = model(encoded_dict['input_ids'], encoded_dict['attention_mask'])
    return output

In [None]:
# Collect every file stored one level below `resume_data`
# (layout assumed: <resume_data>/<folder>/<resume file> -- TODO confirm).
# Listings are sorted so the row order is reproducible across platforms, and
# non-directory entries in `resume_data` are skipped so a stray file there
# cannot make os.listdir() fail.
path = [os.path.join(resume_data, i) for i in sorted(os.listdir(resume_data))]
path = [folder for folder in path if os.path.isdir(folder)]
path_list = [os.path.join(items, x).replace("\\", "/") for items in path for x in sorted(os.listdir(items))]

# One row per resume: [file stem, parsed name, space-joined skills, path].
resume_rows = []
for resume_path in path_list:
    data = ResumeParser(resume_path).get_extracted_data()
    # Guard against a missing/empty skills value so the join cannot fail.
    skills = data['skills'] or []
    resume_rows.append([
        resume_path.split('/')[-1].split('.')[0],
        # NOTE(review): the parser's 'name' field is stored under the
        # 'Position' column -- presumably intentional for this dataset; confirm.
        data['name'],
        " ".join(skills).strip(","),
        resume_path,
    ])

res_data = pd.DataFrame(resume_rows, columns=['Res_id', 'Position', 'Skills', 'Path'])
res_data
                        

In [17]:
# Row count of the parsed-resume table.
res_data.shape[0]

10

In [6]:
# Load the job postings and preview the raw description column.
jd = pd.read_csv(filepath_or_buffer=job_data)
jd.loc[:, 'job_description']

0      minimum qualifications\nbachelors degree or eq...
1      description\nas an asc you will be highly infl...
2      its an amazing time to be joining netflix as w...
3      description\n\nweb designers looking to expand...
4      at trackfive weve got big goals were on a miss...
                             ...                        
848    job description\n\nparttime\n\nmake big money ...
849    responsibilities\nparkers internship program w...
850     the borgen project is an innovative national ...
851    put the world on vacation\n\nat wyndham destin...
852    this job handles customer inquiries by telepho...
Name: job_description, Length: 853, dtype: object

In [7]:
# jd['job_description'].sample(10)

In [8]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource this cell relies on. The stopwords corpus has to
# be downloaded as well, otherwise stopwords.words() raises LookupError on a
# machine where it has never been installed.
nltk.download('wordnet')
nltk.download('stopwords')

wordnet_lemmatizer = WordNetLemmatizer()

# English stop words as a set for fast membership tests during cleaning.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nilesh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# Only the column labels of the skills CSV are kept: they are the skill names.
skills_list = pd.read_csv(skills_data).columns
skills_list

Index(['technical skills', 'ajenti', 'django-suit', 'django-xadmin',
       'flask-admin', 'flower', 'grappelli', 'wooey', 'algorithms',
       'pypattyrn',
       ...
       'virtualized networks', 'network automation', 'cloud management', 'ai',
       'salesforce', 'mango db', 'math', 'calculus', 'product launch', 'mvp'],
      dtype='object', length=1249)

In [10]:
# Reduce each job description to the skill keywords it mentions; the result is
# stored in jd['clean_job_description'] and the raw column is left untouched.
#
# The patterns below are regular expressions, so they are written as raw
# strings and `regex=True` is passed explicitly: pandas >= 2.0 treats the
# pattern as a literal string by default, which would silently skip both steps.

# Lower case (split/join also collapses newlines and repeated whitespace)
jd['clean_job_description'] = jd['job_description'].apply(lambda text: " ".join(word.lower() for word in text.split()))
# remove tabulation and punctuation
jd['clean_job_description'] = jd['clean_job_description'].str.replace(r'[^\w\s]', ' ', regex=True)
## digits
jd['clean_job_description'] = jd['clean_job_description'].str.replace(r'\d+', '', regex=True)

#remove stop words
jd['clean_job_description'] = jd['clean_job_description'].apply(lambda text: " ".join(word for word in text.split() if word not in stop_words))

## lemmatization
# jd['job_description'] = jd['job_description'].apply(lambda x: " ".join([wordnet_lemmatizer.lemmatize(word) for word in x.split()]))

# keep only the tokens that appear in the skill list
# NOTE(review): matching is per single token, so multi-word or hyphenated
# entries of skills_list cannot match here -- confirm this is intended.
jd['clean_job_description'] = jd['clean_job_description'].apply(lambda text: " ".join([word for word in text.split() if word in skills_list]))

In [11]:
# ## Delete more stop words
# other_stop_words = ['junior', 'senior','experience','etc','job','work','company','technique',
#                     'candidate','skill','skills','language','menu','inc','new','plus','years',
#                    'technology','organization','ceo','cto','account','manager','data','scientist','mobile',
#                     'developer','product','revenue','strong', 'description', 'qualification', 'minimum', 'maximum', 'joining', 'amazing']

# jd['job_description'] = jd['job_description'].apply(lambda x: " ".join(x for x in x.split() if x not in other_stop_words))

In [12]:
# Preview the first five rows of the job table.
jd.head(n=5)

Unnamed: 0,company_name,job_description,position_title,description_length,model_response,clean_job_description
0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo...",saas sales partnerships plan communication clo...
1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ...",visual merchandising communication sales
2,Netflix,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,3205,"{\n ""Core Responsibilities"": ""Help drive bus...",tv content partnerships licensing content cont...
3,Robert Half,description\n\nweb designers looking to expand...,Web Designer,2489,"{\n ""Core Responsibilities"": ""Designing webs...",marketing process brand mobile interactive tes...
4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo...",recruiting blaze startup process lifecycle cod...


In [19]:
# Score every (job description, resume) pair by embedding similarity.
N_JOBS = 15  # number of leading job descriptions to score


def _token_matrix(text):
    """Embed ``text`` and return element 0 of the model output as a squeezed NumPy array."""
    return np.squeeze(embedding_text(text)[0].detach().numpy())


# A resume's embedding does not depend on the job it is compared with, so each
# one is computed once here instead of once per job inside the loop.
resume_embeddings = [_token_matrix(res_data['Skills'].iloc[i]) for i in range(len(res_data))]

recommended_data = []
# Never index past the end of `jd` when it has fewer than N_JOBS rows.
for job_idx in range(min(N_JOBS, len(jd))):
    em1 = _token_matrix(jd['clean_job_description'].iloc[job_idx])
    for i, em2 in enumerate(resume_embeddings):
        # NOTE(review): cosine_similarity compares every row of em1 with every
        # row of em2 and the score is the mean of that whole matrix; since
        # inputs are padded to a fixed length, padding positions contribute
        # too -- confirm this is the intended metric.
        similarity_score = cosine_similarity(em1, em2)
        score = similarity_score.mean()
        recommended_data.append([jd['job_description'].iloc[job_idx], res_data['Path'].iloc[i], score])
    

In [20]:
# Tabulate the scored (job description, resume path, score) triples.
_score_columns = ['Job_Description', 'Resume_Path', 'Score']
res_df = pd.DataFrame(data=recommended_data, columns=_score_columns)
res_df

Unnamed: 0,Job_Description,Resume_Path,Score
0,minimum qualifications\nbachelors degree or eq...,./Resume-Data/data/ACCOUNTANT/10554236.pdf,0.663024
1,minimum qualifications\nbachelors degree or eq...,./Resume-Data/data/ACCOUNTANT/10674770.pdf,0.656870
2,minimum qualifications\nbachelors degree or eq...,./Resume-Data/data/ACCOUNTANT/11163645.pdf,0.682992
3,minimum qualifications\nbachelors degree or eq...,./Resume-Data/data/ACCOUNTANT/11759079.pdf,0.674160
4,minimum qualifications\nbachelors degree or eq...,./Resume-Data/data/ACCOUNTANT/12065211.pdf,0.688330
...,...,...,...
95,type of requisition regular\n\nclearance level...,./Resume-Data/data/ACCOUNTANT/12202337.pdf,0.637351
96,type of requisition regular\n\nclearance level...,./Resume-Data/data/ACCOUNTANT/12338274.pdf,0.622540
97,type of requisition regular\n\nclearance level...,./Resume-Data/data/ACCOUNTANT/12442909.pdf,0.612729
98,type of requisition regular\n\nclearance level...,./Resume-Data/data/ACCOUNTANT/12780508.pdf,0.635038


In [31]:
# res_df.groupby('Job_Description').apply(lambda x: x.sort_values(["Score"], ascending = False)).reset_index(drop=True)

Unnamed: 0,Job_Description,Resume_Path,Score
0,about the position\n\nthe web designer is resp...,./Resume-Data/data/ACCOUNTANT/10554236.pdf,0.627723
1,about the position\n\nthe web designer is resp...,./Resume-Data/data/ACCOUNTANT/12065211.pdf,0.627545
2,about the position\n\nthe web designer is resp...,./Resume-Data/data/ACCOUNTANT/12202337.pdf,0.625108
3,about the position\n\nthe web designer is resp...,./Resume-Data/data/ACCOUNTANT/12802330.pdf,0.623151
4,about the position\n\nthe web designer is resp...,./Resume-Data/data/ACCOUNTANT/12780508.pdf,0.621724
...,...,...,...
95,type of requisition regular\n\nclearance level...,./Resume-Data/data/ACCOUNTANT/11759079.pdf,0.629078
96,type of requisition regular\n\nclearance level...,./Resume-Data/data/ACCOUNTANT/11163645.pdf,0.627468
97,type of requisition regular\n\nclearance level...,./Resume-Data/data/ACCOUNTANT/10674770.pdf,0.625511
98,type of requisition regular\n\nclearance level...,./Resume-Data/data/ACCOUNTANT/12338274.pdf,0.622540


In [34]:
# Top 5 resumes per job description: order rows by (Job_Description, Score)
# descending, then keep the first five rows of each job's group.
final_df = (
    res_df
    .sort_values(by=['Job_Description', 'Score'], ascending=False)
    .groupby('Job_Description')
    .head(5)
)
final_df

Unnamed: 0,Job_Description,Resume_Path,Score
94,type of requisition regular\n\nclearance level...,./Resume-Data/data/ACCOUNTANT/12065211.pdf,0.641517
95,type of requisition regular\n\nclearance level...,./Resume-Data/data/ACCOUNTANT/12202337.pdf,0.637351
90,type of requisition regular\n\nclearance level...,./Resume-Data/data/ACCOUNTANT/10554236.pdf,0.637283
99,type of requisition regular\n\nclearance level...,./Resume-Data/data/ACCOUNTANT/12802330.pdf,0.636252
98,type of requisition regular\n\nclearance level...,./Resume-Data/data/ACCOUNTANT/12780508.pdf,0.635038
84,tuff is a growth marketing team working with c...,./Resume-Data/data/ACCOUNTANT/12065211.pdf,0.586827
88,tuff is a growth marketing team working with c...,./Resume-Data/data/ACCOUNTANT/12780508.pdf,0.586509
80,tuff is a growth marketing team working with c...,./Resume-Data/data/ACCOUNTANT/10554236.pdf,0.586314
85,tuff is a growth marketing team working with c...,./Resume-Data/data/ACCOUNTANT/12202337.pdf,0.585919
89,tuff is a growth marketing team working with c...,./Resume-Data/data/ACCOUNTANT/12802330.pdf,0.585192


In [5]:
# with open(path_list[0],'rb') as f:
#     reader = PdfReader(f)
# #     info = reader.getDocumentInfo()
# #     print(info)
#     for page in reader.pages:
#         txt = page.extract_text()
        
# doc = fitz.open(path_list[1])
# text = ""
# for page in doc:
#    text+=page.get_text()
