In [23]:
!pip install PyPDF2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score   


from PyPDF2 import PdfReader

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.collocations import *

from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
# Step 1: 
# get data (single resume intake + jobs dataset) into dataframes with NLP PreProcessings: stop words removal, tokenization, and lemmetizing 

# Import Dataset 
job_Descriptions = pd.read_csv('jobs_data.csv')
print(job_Descriptions.head())

reader = PdfReader("RandomResume2.pdf") #More specific Resume for data science (more relevant jobs)
#reader = PdfReader("RandomResume.pdf") #Less specific Resume for CS Developer (less relevant jobs)
number_of_pages = len(reader.pages)
page = reader.pages[0]
resume = page.extract_text()
resume

                   Job Title              Salary Estimate  \
0             Data Scientist   $53K-$91K (Glassdoor est.)   
1  Healthcare Data Scientist  $63K-$112K (Glassdoor est.)   
2             Data Scientist   $80K-$90K (Glassdoor est.)   
3             Data Scientist   $56K-$97K (Glassdoor est.)   
4             Data Scientist  $86K-$143K (Glassdoor est.)   

                                     Job Description  Rating  \
0  Data Scientist\nLocation: Albuquerque, NM\nEdu...     3.8   
1  What You Will Do:\n\nI. General Summary\n\nThe...     3.4   
2  KnowBe4, Inc. is a high growth information sec...     4.8   
3  *Organization and Job ID**\nJob ID: 310709\n\n...     3.8   
4  Data Scientist\nAffinity Solutions / Marketing...     2.9   

                                 Company Name         Location  \
0                      Tecolote Research\n3.8  Albuquerque, NM   
1  University of Maryland Medical System\n3.4    Linthicum, MD   
2                                KnowBe4\n4.8   Cl

"Malik Rabb\nSeattle, WA | (123) 456-7891 | mrabb@email.com\nSummary\nData Scientist with strong math background and 3+ years of experience using predictive\nmodeling, data processing, and data mining algorithms to solve challenging business problems.\nInvolved in Python open source community and passionate about deep reinforcement learning.\nEducation\nCoral Springs University\nCurrent - Current\nBachelor of Science in Mathematics\nExperience\nRiver Tech, Data Scientist\nJul '19 - Current\n●\nBuilt fuzzy matching algorithm using k-nearest neighbors to identify non-exact matching\nduplicates\n●\nDesigned and developed real time recommendation engine to rank sales leads for upsell\nopportunities\n●\nRefined personalization algorithms for 1M+ customers on web and mobile\n●\nTransformed raw data into MySQL with custom-made ETL application to prepare unruly\ndata for machine learning\nRetail Ocean, Data Scientist\nAug '15 - Jul '19\n●\nLeveraged 200M+ tweets to develop sentiment analysis m

In [26]:
# Part of user query + Resume
num_jobs = 20
location = "New York, NY"

In [27]:
def preProcessing(resume):
    resume1 = resume.lower().replace("\n"," ") #rid of all nextLines
    resume2 = resume1.replace("●", "")  #rid of all bullet points
    return resume2

In [28]:
def tokenize(resume):
    tokens = word_tokenize(resume) 
    stop_words = set(stopwords.words('english') + list(punctuation))
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

In [29]:
def lemmatize(tokenized_resume):
    lemmatizer = WordNetLemmatizer()
    processed_resume = [lemmatizer.lemmatize(token) for token in word_tokenize(" ".join(tokenized_resume))]
    return processed_resume

In [30]:
processed_resume = preProcessing(resume)
processed_resume = tokenize(processed_resume)
processed_resume = lemmatize(processed_resume)
processed_resume

['malik',
 'rabb',
 'seattle',
 'wa',
 '123',
 '456-7891',
 'mrabb',
 'email.com',
 'summary',
 'data',
 'scientist',
 'strong',
 'math',
 'background',
 '3+',
 'year',
 'experience',
 'using',
 'predictive',
 'modeling',
 'data',
 'processing',
 'data',
 'mining',
 'algorithm',
 'solve',
 'challenging',
 'business',
 'problem',
 'involved',
 'python',
 'open',
 'source',
 'community',
 'passionate',
 'deep',
 'reinforcement',
 'learning',
 'education',
 'coral',
 'spring',
 'university',
 'current',
 'current',
 'bachelor',
 'science',
 'mathematics',
 'experience',
 'river',
 'tech',
 'data',
 'scientist',
 'jul',
 "'19",
 'current',
 'built',
 'fuzzy',
 'matching',
 'algorithm',
 'using',
 'k-nearest',
 'neighbor',
 'identify',
 'non-exact',
 'matching',
 'duplicate',
 'designed',
 'developed',
 'real',
 'time',
 'recommendation',
 'engine',
 'rank',
 'sale',
 'lead',
 'upsell',
 'opportunity',
 'refined',
 'personalization',
 'algorithm',
 '1m+',
 'customer',
 'web',
 'mobile',
 't

In [31]:
resume_frame = pd.DataFrame()
#only comparing job description + change to 'text'
resume_frame['job description'] = [""]
resume_frame['ResumeText'] = " ".join(processed_resume)

In [32]:
# Step 2: 
# perform functions of KNN and Cos Sim using tfidf vectorization 
# fit and transform

# Vectorize + Training the data using fit() on both job descriptions and resume
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_jobDesc = tfidf_vectorizer.fit_transform(job_Descriptions['Job Description'].drop_duplicates())
tfidf_Resume = tfidf_vectorizer.transform(resume_frame['ResumeText'])

In [33]:
# Compare the data with Cosine Similarity 
def cos_sim_compare(i):
    return cosine_similarity(tfidf_Resume, i)

cos_sim = list(map(cos_sim_compare, tfidf_jobDesc))

In [34]:
#threshold at 10%
cos_sim_thres = []
thres = .10
for i in range(len(cos_sim)):
  if(cos_sim[i] > thres):
    cos_sim_thres.append(cos_sim[i])

cos_sim_thres

[array([[0.11046938]]),
 array([[0.11595241]]),
 array([[0.12641864]]),
 array([[0.11536095]]),
 array([[0.11288281]]),
 array([[0.14013615]]),
 array([[0.1353946]]),
 array([[0.12002797]]),
 array([[0.16144298]]),
 array([[0.11915952]]),
 array([[0.16863188]]),
 array([[0.10349261]]),
 array([[0.10833322]]),
 array([[0.12768316]]),
 array([[0.12947301]]),
 array([[0.11454843]]),
 array([[0.11503973]]),
 array([[0.12033914]]),
 array([[0.14426142]]),
 array([[0.1049492]]),
 array([[0.10951006]]),
 array([[0.11260404]]),
 array([[0.15156449]]),
 array([[0.12043419]]),
 array([[0.1818972]]),
 array([[0.1621772]]),
 array([[0.16521567]]),
 array([[0.10751626]]),
 array([[0.15672911]]),
 array([[0.17231314]]),
 array([[0.16142429]]),
 array([[0.10319318]]),
 array([[0.10675393]]),
 array([[0.11859341]]),
 array([[0.1086336]]),
 array([[0.12643612]]),
 array([[0.11670927]]),
 array([[0.11478684]]),
 array([[0.14069174]]),
 array([[0.10216605]]),
 array([[0.10849908]]),
 array([[0.10706968]]

In [35]:
#Fit KNN model on the job descriptions and find NNs of Resume
knn_model = NearestNeighbors(n_neighbors = num_jobs) # of 463 unique values using eulcidean distance 

# Train KNN Model
knn_model.fit(tfidf_jobDesc)

# Predict (Find Scores) KNN Model / Note: stores reference for job desc, and the score 
nns = knn_model.kneighbors(tfidf_Resume)

In [36]:
# Step 3: 
# print results of top jobs 
# Change the data here to return top 100 jobs and that they meet the 10% similarity threshold 

# Rank based on cos sim score and knn score in ascending order

#Jobs
def cos_sim_key(i):
    return cos_sim[i]
highest_CosSim = sorted(range(len(cos_sim)), key=cos_sim_key, reverse=True)[:num_jobs] #change amount of reccomendations 
#Scores
scores = [cos_sim[i][0][0] for i in highest_CosSim]


top_jobs_cosSim = pd.DataFrame()
sc = 0
# Get Data from original csv
for i in highest_CosSim:
    top_jobs_cosSim.at[sc, 'Company Name'] = job_Descriptions['Company Name'][i].replace("\n", "")
    top_jobs_cosSim.at[sc, 'Job Title'] = job_Descriptions['Job Title'][i]
    top_jobs_cosSim.at[sc, 'Score'] =  scores[sc]
    top_jobs_cosSim.at[sc, 'Job Description'] = job_Descriptions['Job Description'][i]
    top_jobs_cosSim.at[sc, 'Salary Estimate'] = job_Descriptions['Salary Estimate'][i]
    sc += 1

top_jobs_cosSim

Unnamed: 0,Company Name,Job Title,Score,Job Description,Salary Estimate
0,Centauri4.7,Spectral Scientist/Engineer,0.181897,Thank you for your interest in joining the Cen...,$56K-$117K (Glassdoor est.)
1,Applied Research Laboratories3.7,Geospatial Software Developer and Data Scientist,0.17317,Job Posting Title:\n\nGeospatial Software Deve...,$82K-$129K(Employer est.)
2,OneMagnify4.3,Analytics Manager,0.173085,Responsible for both high level and granular m...,$59K-$116K (Glassdoor est.)
3,HP Inc.4.0,R&D Data Analysis Scientist,0.172313,Applies developed level of subject matter know...,$65K-$130K (Glassdoor est.)
4,Porch3.2,Data Scientist,0.168632,About Our Team\n\nWe understand that the first...,$81K-$130K (Glassdoor est.)
5,The David J. Joseph Company4.4,Senior Data Scientist,0.165216,Overview\n\n\nEveryone wants to work with peop...,$82K-$132K (Glassdoor est.)
6,Sartorius3.5,Data Scientist,0.162177,"Sartorius Stedim Data Analytics, a market lead...",$56K-$95K (Glassdoor est.)
7,Advanced BioScience Laboratories2.7,Staff Scientist- Upstream PD,0.161729,As a Staff Scientist in Upstream Process Devel...,$49K-$113K (Glassdoor est.)
8,Takeda Pharmaceuticals3.7,Data Scientist,0.161443,Job Description\n\n\nTakeda is looking for a D...,$83K-$144K (Glassdoor est.)
9,Grainger3.6,"Director, Data Science",0.161424,"Reporting to the Sr. Director, Analytics & Dat...",$139K-$220K (Glassdoor est.)


In [37]:
#Jobs
closest_NN = nns[1][0][0:] # 3rd
#Scores
scores = nns[0][0][0:]

top_jobs_knn = pd.DataFrame()
sc = 0
# Get Corresponding Data from original csv
for i in closest_NN:
    top_jobs_knn.at[sc, 'Company Name'] = job_Descriptions['Company Name'][i].replace("\n", "")
    top_jobs_knn.at[sc, 'Job Title'] = job_Descriptions['Job Title'][i]
    top_jobs_knn.at[sc, 'Score'] =  scores[sc]
    top_jobs_knn.at[sc, 'Job Description'] = job_Descriptions['Job Description'][i]
    top_jobs_knn.at[sc, 'Salary Estimate'] = job_Descriptions['Salary Estimate'][i]
    sc += 1

top_jobs_knn

Unnamed: 0,Company Name,Job Title,Score,Job Description,Salary Estimate
0,Centauri4.7,Spectral Scientist/Engineer,1.279143,Thank you for your interest in joining the Cen...,$56K-$117K (Glassdoor est.)
1,Applied Research Laboratories3.7,Geospatial Software Developer and Data Scientist,1.285947,Job Posting Title:\n\nGeospatial Software Deve...,$82K-$129K(Employer est.)
2,OneMagnify4.3,Analytics Manager,1.286013,Responsible for both high level and granular m...,$59K-$116K (Glassdoor est.)
3,HP Inc.4.0,R&D Data Analysis Scientist,1.286613,Applies developed level of subject matter know...,$65K-$130K (Glassdoor est.)
4,Porch3.2,Data Scientist,1.289471,About Our Team\n\nWe understand that the first...,$81K-$130K (Glassdoor est.)
5,The David J. Joseph Company4.4,Senior Data Scientist,1.292118,Overview\n\n\nEveryone wants to work with peop...,$82K-$132K (Glassdoor est.)
6,Sartorius3.5,Data Scientist,1.294467,"Sartorius Stedim Data Analytics, a market lead...",$56K-$95K (Glassdoor est.)
7,Advanced BioScience Laboratories2.7,Staff Scientist- Upstream PD,1.294814,As a Staff Scientist in Upstream Process Devel...,$49K-$113K (Glassdoor est.)
8,Takeda Pharmaceuticals3.7,Data Scientist,1.295034,Job Description\n\n\nTakeda is looking for a D...,$83K-$144K (Glassdoor est.)
9,Grainger3.6,"Director, Data Science",1.295049,"Reporting to the Sr. Director, Analytics & Dat...",$139K-$220K (Glassdoor est.)


Job Reccomender System:

Use Kaggle Dataset found: https://www.kaggle.com/code/deetisood/dataset-for-jobs/data?select=salary_data_cleaned.csv
Download and unzip dataset for salary_data_cleaned.csv

What this Reccomender System will do: -Compares Job Descriptions and Resumes -The user query is the resume, the descriptions are the data pool

use TFIDF and KNN (SkLearn) algorithms to determine similarity for top jobs


Process: 


1) Get Data from CSV 
2) Apply stop word removal, lemmetization, tokenization 
3) Use cleaned set to load job descriptions into the vectors for: 
KNN, TFIDF, and Cos Sim
4) Send user query (resume) to through the model 
5) Return top ten Results based on top similarity coeffecients 


In [38]:
# Step 1: 
# get data (single resume intake + jobs dataset) into dataframes with NLP PreProcessings: stop words removal, tokenization, and lemmetizing 

In [39]:
# Step 2: 
# perform functions of KNN and Cos Sim using tfidf vectorization 
# fit and tranform

In [40]:
# Step 3: 
# print results of top jobs in data frames

In [41]:
# Future:
# Implement user query by title and Location (currently only does by resume and number of jobs desired)