## Resume Recommendation

#### Importing libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import os

# import fitz
# from PyPDF2 import PdfReader

import sys
from pyresparser import ResumeParser

import torch as tr
from transformers import DistilBertModel, DistilBertTokenizer

import re
import nltk
# nltk.download('wordnet')
# from nltk.stem import WordNetLemmatizer
# wordnet_lemmatizer = WordNetLemmatizer()

from nltk.corpus import stopwords

# English stop words used by the text-cleaning cell. The corpus is a separate
# download from the nltk package itself, so on a fresh environment the first
# access raises LookupError; fetch it once and retry instead of failing.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

from sklearn.metrics.pairwise import cosine_similarity
# sys.executable

#### Getting Data Path

In [3]:
# Input locations, all relative to the notebook's working directory:
#   resume_data - root folder that is walked for resume files
#   skills_data - CSV whose column headers form the skill vocabulary
#   job_data    - CSV holding the job descriptions
resume_data, skills_data, job_data = (
    "./Resume-Data/data/",
    "./Resume-Data/skills.csv",
    "./Resume-Data/training_data.csv",
)

#### Using pyresparser to extract skills from PDF

In [4]:
# Upper bound on how many resumes are parsed (parsing every PDF is slow).
# Set to None to parse every file that was found.
MAX_RESUMES = 500

# One sub-folder per category under `resume_data`. Sorted so the selected
# subset is identical on every OS (os.listdir order is arbitrary); entries
# that are not directories (e.g. .DS_Store) are skipped so the inner listdir
# below cannot fail on them.
path = [
    os.path.join(resume_data, entry)
    for entry in sorted(os.listdir(resume_data))
    if os.path.isdir(os.path.join(resume_data, entry))
]
# Every file inside every category folder, with forward slashes on all platforms.
path_list = [
    os.path.join(folder, file_name).replace("\\", "/")
    for folder in path
    for file_name in sorted(os.listdir(folder))
]

records = []
# Slicing (rather than range(500)) cannot raise IndexError when fewer files exist.
for resume_path in path_list[:MAX_RESUMES]:
    data = ResumeParser(resume_path).get_extracted_data()
    # File name up to its first dot, e.g. ".../10554236.pdf" -> "10554236".
    res_id = resume_path.split('/')[-1].split('.')[0]
    # The parser may return no skills for a resume; treat that as an empty list
    # instead of letting " ".join(None) raise.
    skills = data.get('skills') or []
    # 'Position' is filled from the parser's 'name' field.
    records.append([res_id, data.get('name'), " ".join(skills).strip(","), resume_path])

res_data = pd.DataFrame(records, columns=['Res_id', 'Position', 'Skills', 'Path'])
res_data
                        

Unnamed: 0,Res_id,Position,Skills,Path
0,10554236,ACCOUNTANT Summary,Transportation Documentation Erp Safety Metric...,./Resume-Data/data/ACCOUNTANT/10554236.pdf
1,10674770,STAFF ACCOUNTANT,Inventory Documentation Deposits Customer serv...,./Resume-Data/data/ACCOUNTANT/10674770.pdf
2,11163645,ACCOUNTANT Professional,Mortgage Inventory Customer service Transactio...,./Resume-Data/data/ACCOUNTANT/11163645.pdf
3,11759079,SENIOR ACCOUNTANT,Mortgage Inventory Controls Process Improvemen...,./Resume-Data/data/ACCOUNTANT/11759079.pdf
4,12065211,SENIOR ACCOUNTANT,Inventory Documentation Controls Cms Transacti...,./Resume-Data/data/ACCOUNTANT/12065211.pdf
...,...,...,...,...
495,93576192,Â Accomplishments,Design Testing Video Data analysis English Tea...,./Resume-Data/data/ARTS/93576192.pdf
496,94230796,LIBRARY AIDE,Documentation Safety Transactions Communicatio...,./Resume-Data/data/ARTS/94230796.pdf
497,99033098,CASHIER Receptionist,Data collection Documentation Deposits Custome...,./Resume-Data/data/ARTS/99033098.pdf
498,99561379,ASSISTANT DIRECTOR,Documentation Chemistry Safety Controls Staffi...,./Resume-Data/data/ARTS/99561379.pdf


In [5]:
# Number of parsed resumes (rows in the frame).
res_data.shape[0]

500

#### Reading Job_Description

In [6]:
# Load the job postings and preview the raw description column.
jd = pd.read_csv(filepath_or_buffer=job_data)
jd.loc[:, 'job_description']

0      minimum qualifications\nbachelors degree or eq...
1      description\nas an asc you will be highly infl...
2      its an amazing time to be joining netflix as w...
3      description\n\nweb designers looking to expand...
4      at trackfive weve got big goals were on a miss...
                             ...                        
848    job description\n\nparttime\n\nmake big money ...
849    responsibilities\nparkers internship program w...
850     the borgen project is an innovative national ...
851    put the world on vacation\n\nat wyndham destin...
852    this job handles customer inquiries by telepho...
Name: job_description, Length: 853, dtype: object

In [7]:
# jd['job_description'].sample(10)

#### Getting the skills list used to extract skills from the job descriptions

In [8]:
# The skill vocabulary is taken from the CSV's column headers, so keep only
# the column index of the parsed file.
skills_list = pd.read_csv(skills_data).columns
skills_list

Index(['technical skills', 'ajenti', 'django-suit', 'django-xadmin',
       'flask-admin', 'flower', 'grappelli', 'wooey', 'algorithms',
       'pypattyrn',
       ...
       'virtualized networks', 'network automation', 'cloud management', 'ai',
       'salesforce', 'mango db', 'math', 'calculus', 'product launch', 'mvp'],
      dtype='object', length=1249)

#### Text preprocessing

In [9]:
# converting to lower case
jd['clean_job_description'] = jd['job_description'].apply(lambda x: " ".join(x.lower()for x in x.split()))
# removing tabulation and punctuation
jd['clean_job_description'] = jd['clean_job_description'].str.replace('[^\w\s]',' ')
# removing digits
jd['clean_job_description'] = jd['clean_job_description'].str.replace('\d+', '')
# removing stop words
jd['clean_job_description'] = jd['clean_job_description'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))
# lemmatization
# jd['clean_job_description'] = jd['clean_job_description'].apply(lambda x: " ".join([wordnet_lemmatizer.lemmatize(word) for word in x.split()]))

jd['clean_job_description'] = jd['clean_job_description'].apply(lambda x: " ".join([word for word in x.split() if word in skills_list]))

In [10]:
# Deleting more stop words
# other_stop_words = ['junior', 'senior','experience','etc','job','work','company','technique',
#                     'candidate','skill','skills','language','menu','inc','new','plus','years',
#                    'technology','organization','ceo','cto','account','manager','data','scientist','mobile',
#                     'developer','product','revenue','strong', 'description', 'qualification', 'minimum', 'maximum', 'joining', 'amazing']

# jd['clean_job_description'] = jd['clean_job_description'].apply(lambda x: " ".join(x for x in x.split() if x not in other_stop_words))

In [11]:
jd.head()

Unnamed: 0,company_name,job_description,position_title,description_length,model_response,clean_job_description
0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo...",saas sales partnerships plan communication clo...
1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ...",visual merchandising communication sales
2,Netflix,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,3205,"{\n ""Core Responsibilities"": ""Help drive bus...",tv content partnerships licensing content cont...
3,Robert Half,description\n\nweb designers looking to expand...,Web Designer,2489,"{\n ""Core Responsibilities"": ""Designing webs...",marketing process brand mobile interactive tes...
4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo...",recruiting blaze startup process lifecycle cod...


#### Using DistilBERT to calculate text embeddings/vectors

In [12]:
# Tokenizer/model pairs already loaded in this kernel, keyed by checkpoint name.
_DISTILBERT_CACHE = {}


def _load_distilbert(model_name):
    """Return the (tokenizer, model) pair for `model_name`, loading it only once."""
    if model_name not in _DISTILBERT_CACHE:
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        model = DistilBertModel.from_pretrained(model_name)
        model.eval()  # inference only: make sure dropout is disabled
        _DISTILBERT_CACHE[model_name] = (tokenizer, model)
    return _DISTILBERT_CACHE[model_name]


def embedding_text(text, model_name='distilbert-base-cased', max_length=64):
    """Run `text` through DistilBERT and return the raw model output.

    The tokenizer and model are loaded on the first call only. Re-loading them
    on every call (as the notebook previously did) repeats a Hugging Face Hub
    request each time, which is slow and fails on network time-outs.

    Parameters
    ----------
    text : str
        Text to embed; it is truncated or padded to exactly `max_length` tokens.
    model_name : str, optional
        Checkpoint to load. Defaults to 'distilbert-base-cased'.
    max_length : int, optional
        Fixed token length of the encoded input. Defaults to 64.

    Returns
    -------
    The DistilBertModel output object; element ``[0]`` holds the per-token
    hidden states (callers in this notebook squeeze it to a 2-D matrix).
    """
    tokenizer, model = _load_distilbert(model_name)
    encoded_dict = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # No gradients are needed for feature extraction; skipping them saves memory.
    with tr.no_grad():
        output = model(encoded_dict['input_ids'], attention_mask=encoded_dict['attention_mask'])
    return output

#### Calculating the similarity matrix

In [13]:
# Number of job descriptions to score; capped below so it never exceeds the data.
N_JOBS = 15

# A resume's embedding does not depend on the job it is compared with, so each
# one is computed once up front instead of once per (job, resume) pair.
# Memory cost: one (max_length x hidden_size) float matrix per resume.
resume_embeddings = [
    np.squeeze(embedding_text(skills)[0].detach().numpy())
    for skills in res_data['Skills']
]

# One [job description, resume path, score] row per (job, resume) pair.
recommended_data = []
for job_idx in range(min(N_JOBS, len(jd))):
    em1 = np.squeeze(embedding_text(jd['clean_job_description'].iloc[job_idx])[0].detach().numpy())
    for res_idx, em2 in enumerate(resume_embeddings):
        # Pairwise cosine similarity between every job token vector and every
        # resume token vector, averaged into a single score.
        # NOTE(review): the inputs are padded to a fixed length, so padding
        # positions take part in this mean — consider mask-aware pooling.
        similarity_score = cosine_similarity(em1, em2)
        score = similarity_score.mean()
        recommended_data.append([jd['job_description'].iloc[job_idx], res_data['Path'].iloc[res_idx], score])
    

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: b111124d-e97b-4223-872c-535a599b2432)')' thrown while requesting HEAD https://huggingface.co/distilbert-base-cased/resolve/main/config.json


In [14]:
# Tabulate the [job description, resume path, score] rows gathered by the
# similarity loop.
score_columns = ['Job_Description', 'Resume_Path', 'Score']
res_df = pd.DataFrame(data=recommended_data, columns=score_columns)

In [15]:
# res_df.groupby('Job_Description').apply(lambda x: x.sort_values(["Score"], ascending = False)).reset_index(drop=True)

#### Final result: paths to the top 5 resumes for each job description

In [16]:
# Order rows by job description and, within each job, by descending score;
# the first five rows of every group are then that job's best-matching resumes.
ranked = res_df.sort_values(by=['Job_Description', 'Score'], ascending=[False, False])
final_df = ranked.groupby(by='Job_Description').head(n=5)
final_df

Unnamed: 0,Job_Description,Resume_Path,Score
5257,web developer\n\njob id \n\n positions \n\nloc...,./Resume-Data/data/AGRICULTURE/19851252.pdf,0.738522
5429,web developer\n\njob id \n\n positions \n\nloc...,./Resume-Data/data/ARTS/20488267.pdf,0.737607
5420,web developer\n\njob id \n\n positions \n\nloc...,./Resume-Data/data/ARTS/18319061.pdf,0.737114
5445,web developer\n\njob id \n\n positions \n\nloc...,./Resume-Data/data/ARTS/25561640.pdf,0.734299
5251,web developer\n\njob id \n\n positions \n\nloc...,./Resume-Data/data/AGRICULTURE/17312146.pdf,0.733815
...,...,...,...
6757,design develop and test high quality software...,./Resume-Data/data/AGRICULTURE/19851252.pdf,0.739425
6929,design develop and test high quality software...,./Resume-Data/data/ARTS/20488267.pdf,0.722486
6743,design develop and test high quality software...,./Resume-Data/data/AGRICULTURE/15053703.pdf,0.717453
6939,design develop and test high quality software...,./Resume-Data/data/ARTS/23752500.pdf,0.714248


In [17]:
# with open(path_list[0],'rb') as f:
#     reader = PdfReader(f)
# #     info = reader.getDocumentInfo()
# #     print(info)
#     for page in reader.pages:
#         txt = page.extract_text()
        
# doc = fitz.open(path_list[1])
# text = ""
# for page in doc:
#    text+=page.get_text()
