# Import Libraries

In [1]:
# ! pip install pypdf

In [22]:
# calculations
import numpy as np
import pandas as pd

# visuals
import matplotlib.pyplot as plt

# parsing
from pypdf import PdfReader

# tokenizing
import nltk
from nltk.corpus import stopwords
import string
from nltk.stem.porter import PorterStemmer

# vectorizing
from sklearn.feature_extraction.text import TfidfVectorizer

# similarity score
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from pandas.plotting import scatter_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import re

# Dataset

In [3]:
df = pd.read_csv('job_data.csv')[['Job Description']]
df = df.sample(1000, ignore_index=True, random_state=22)
df.shape

(1000, 1)

In [4]:
df.head()

Unnamed: 0,Job Description
0,"<div id=""jobDescriptionText"" class=""jobsearch-..."
1,"<div id=""jobDescriptionText"" class=""jobsearch-..."
2,"<div id=""jobDescriptionText"" class=""jobsearch-..."
3,"<div id=""jobDescriptionText"" class=""jobsearch-..."
4,"<div id=""jobDescriptionText"" class=""jobsearch-..."


# Parse Job Listings

In [5]:
#function to process job listings
def clean_text(resumeText):
    resumeText = re.sub(r'<[^>]+>', '', resumeText)     # remove html tags
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText) 
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    resumeText = re.sub(r'(\w)(?<![A-Z])([A-Z])(?![A-Z])', r'\1 \2', resumeText)
    resumeText = resumeText.lower() #remove capital letters
    words = resumeText.split(' ')
    words = [word for word in words if len(word)>1]
    resumeText = ' '.join(words)
    return resumeText

In [6]:
# clean job description and add each to dataframe
df['Clean Job Description'] = df['Job Description'].apply(lambda x: clean_text(x))
df['Clean Job Description'][0]

'ount executive job number 50955730 description spread your wings we are the duck we inspire and are inspired listen and respond empower our people give back to our community and most importantly celebrate every su ess along the way we do it all the aflac way aflac fortune 500 company is an industry leader in voluntary insurance products that pay cash directly to policyholders and one of america best known brands aflac has been recognized by fortune magazine as one of the 100 best companies to work for in america for 20 consecutive years one of the best workplaces for millennials in 2015 the inaugural year of the award and one of america most admired companies for 18 years our business is about being there for people in need so ask yourself are you the duck if so there home and flourishing career for you at aflac the company aflac the location atlanta columbus ga the division communicorp the opportunity ount executive principal duties amp responsibilities provides ongoing sales and ser

# Parse Resume

In [7]:
# convert resume PDF into string
reader = PdfReader("resume.pdf")
number_of_pages = len(reader.pages)
page = reader.pages[0]
pdf_text = page.extract_text()

In [8]:
pdf_text

'Justin Carville\nData Scientist\nAbout Me\nProfessional Experience"Every day is a new adventure." This philosophy brought\nme to Japan in 2017, where I have since leveraged my\nlove of languages to make a living. My experience\nworking in marketing and business operations got me\nexcited about coding and data, so I changed gears and\nam now on a mission to become fluent in this new field.\nTechnical Skills\nPython\nScikit-Learn\nMachine\nLearning\nNLP\nDeep Learning\nLanguages\nEnglish : native\nJapanese : business\nSpanish : conversational\nEducation\nLe Wagon - Tokyo (2023)\n#1 ranked bootcamp worldwide\n9-week intensive data science\nbootcamp\nKICL - Kyoto (2017-2019)\nJapanese language school\nPassed JLPT N2\nUniversity of Rhode Island (2010-2013)\nBachelor of Arts in Spanish, Journalism\n Graduated Magna Cum LaudeContact Info\njccarville@gmail.comTokyo, Japan\nwww.linkedin.com/in/jccarville/\nhttps://github.com/just1nt1me\nLink Academy (2019-2023)\nFreelance Writer, Editor, Trans

In [9]:
# clean resume text
resume = clean_text(pdf_text)
resume

'justin carville data scientist about me professional experience every day is new adventure this philosophy brought me to japan in 2017 where have since leveraged my love of languages to make living my experience working in marketing and business operations got me excited about coding and data so changed gears and am now on mission to become fluent in this new field technical skills python scikit learn machine learning nlp deep learning languages english native japanese business spanish conversational education le wagon tokyo 2023 ranked bootcamp worldwide week intensive data science bootcamp kicl kyoto 2017 2019 japanese language school passed jlpt n2 university of rhode island 2010 2013 bachelor of arts in spanish journalism graduated magna cum laude contact info arville japan www linkedin com in arville link academy 2019 2023 freelance writer editor translator 2018 2023 vipkid esl teacher 2017 2019 we love osaka link to articles sns video content creation for you tube instagram kpi 

# Tokenizing Texts

In [10]:
# funciton to tokenize text
stemmer = PorterStemmer()
def tokenize(df, column):
    for i in range (0, df.shape[0]):
        res = df[column][i]
        res = res.split()
        res = [stemmer.stem(word) for word in res if word not in stopwords.words('english') and word not in string.punctuation]
        df[column][i] = ' '.join(res)
    return df

In [12]:
tokenize(df, 'Clean Job Description')

Unnamed: 0,Job Description,Clean Job Description
0,"<div id=""jobDescriptionText"" class=""jobsearch-...",ount execut job number 50955730 descript sprea...
1,"<div id=""jobDescriptionText"" class=""jobsearch-...",primari purpos work part product manag team en...
2,"<div id=""jobDescriptionText"" class=""jobsearch-...",descript hire enterpris sale develop repres re...
3,"<div id=""jobDescriptionText"" class=""jobsearch-...",vice presid ad sale market nbc olymp respons w...
4,"<div id=""jobDescriptionText"" class=""jobsearch-...",gener summari senior compens analyst key partn...
...,...,...
995,"<div id=""jobDescriptionText"" class=""jobsearch-...",want help peopl feel better want work top rate...
996,"<div id=""jobDescriptionText"" class=""jobsearch-...",job open id 00315276 logist manag open job tit...
997,"<div id=""jobDescriptionText"" class=""jobsearch-...",morningstar busi develop team seek highli moti...
998,"<div id=""jobDescriptionText"" class=""jobsearch-...",schult compani seek task forc director sale jo...


In [13]:
#tokenize resume
res = resume
res = res.split()
res = [stemmer.stem(word) for word in res if word not in stopwords.words('english') and word not in string.punctuation]
tokenized_resume = ' '.join(res)
df['Resume'] = tokenized_resume

In [14]:
df.head()

Unnamed: 0,Job Description,Clean Job Description,Resume
0,"<div id=""jobDescriptionText"" class=""jobsearch-...",ount execut job number 50955730 descript sprea...,justin carvil data scientist profession experi...
1,"<div id=""jobDescriptionText"" class=""jobsearch-...",primari purpos work part product manag team en...,justin carvil data scientist profession experi...
2,"<div id=""jobDescriptionText"" class=""jobsearch-...",descript hire enterpris sale develop repres re...,justin carvil data scientist profession experi...
3,"<div id=""jobDescriptionText"" class=""jobsearch-...",vice presid ad sale market nbc olymp respons w...,justin carvil data scientist profession experi...
4,"<div id=""jobDescriptionText"" class=""jobsearch-...",gener summari senior compens analyst key partn...,justin carvil data scientist profession experi...


# Feature Engineering

## Number of Words in Job Description / Resume

In [15]:
df['JD_num_words'] = df['Clean Job Description'].apply(lambda x: len(x.split(' ')))
df['Resume_num_words'] = df['Resume'].apply(lambda x: len(x.split(' ')))
df.head()

Unnamed: 0,Job Description,Clean Job Description,Resume,JD_num_words,Resume_num_words
0,"<div id=""jobDescriptionText"" class=""jobsearch-...",ount execut job number 50955730 descript sprea...,justin carvil data scientist profession experi...,458,235
1,"<div id=""jobDescriptionText"" class=""jobsearch-...",primari purpos work part product manag team en...,justin carvil data scientist profession experi...,590,235
2,"<div id=""jobDescriptionText"" class=""jobsearch-...",descript hire enterpris sale develop repres re...,justin carvil data scientist profession experi...,498,235
3,"<div id=""jobDescriptionText"" class=""jobsearch-...",vice presid ad sale market nbc olymp respons w...,justin carvil data scientist profession experi...,430,235
4,"<div id=""jobDescriptionText"" class=""jobsearch-...",gener summari senior compens analyst key partn...,justin carvil data scientist profession experi...,351,235


## Number of Words in Common

In [16]:
def normalized_words_common(row):
    jd = set(map(lambda word: word.lower().strip(),row['Clean Job Description'].split(' ')))
    rez = set(map(lambda word: word.lower().strip(),row['Resume'].split(' ')))
    return 1.0 * len(jd & rez)
df['word_common'] = df.apply(normalized_words_common,axis = 1)
df.head()

Unnamed: 0,Job Description,Clean Job Description,Resume,JD_num_words,Resume_num_words,word_common
0,"<div id=""jobDescriptionText"" class=""jobsearch-...",ount execut job number 50955730 descript sprea...,justin carvil data scientist profession experi...,458,235,19.0
1,"<div id=""jobDescriptionText"" class=""jobsearch-...",primari purpos work part product manag team en...,justin carvil data scientist profession experi...,590,235,25.0
2,"<div id=""jobDescriptionText"" class=""jobsearch-...",descript hire enterpris sale develop repres re...,justin carvil data scientist profession experi...,498,235,33.0
3,"<div id=""jobDescriptionText"" class=""jobsearch-...",vice presid ad sale market nbc olymp respons w...,justin carvil data scientist profession experi...,430,235,14.0
4,"<div id=""jobDescriptionText"" class=""jobsearch-...",gener summari senior compens analyst key partn...,justin carvil data scientist profession experi...,351,235,19.0


## Number of Words in Total

In [17]:
def normalized_words_total(row):
    jd = set(map(lambda word: word.lower().strip(),row['Clean Job Description'].split(' ')))
    rez = set(map(lambda word: word.lower().strip(),row['Resume'].split(' ')))
    return 1.0 * (len(jd) + len(rez))
df['word_total'] = df.apply(normalized_words_total,axis = 1)
df.head()

Unnamed: 0,Job Description,Clean Job Description,Resume,JD_num_words,Resume_num_words,word_common,word_total
0,"<div id=""jobDescriptionText"" class=""jobsearch-...",ount execut job number 50955730 descript sprea...,justin carvil data scientist profession experi...,458,235,19.0,446.0
1,"<div id=""jobDescriptionText"" class=""jobsearch-...",primari purpos work part product manag team en...,justin carvil data scientist profession experi...,590,235,25.0,537.0
2,"<div id=""jobDescriptionText"" class=""jobsearch-...",descript hire enterpris sale develop repres re...,justin carvil data scientist profession experi...,498,235,33.0,469.0
3,"<div id=""jobDescriptionText"" class=""jobsearch-...",vice presid ad sale market nbc olymp respons w...,justin carvil data scientist profession experi...,430,235,14.0,437.0
4,"<div id=""jobDescriptionText"" class=""jobsearch-...",gener summari senior compens analyst key partn...,justin carvil data scientist profession experi...,351,235,19.0,388.0


## Percentage of Shared Words

In [19]:
def normalized_words_share(row):
    jd = set(map(lambda word: word.lower().strip(),row['Clean Job Description'].split(' ')))
    rez = set(map(lambda word: word.lower().strip(),row['Resume'].split(' ')))
    return 1.0 * len(jd & rez)/(len(jd) + len(rez))
df['word_share'] = df.apply(normalized_words_share,axis = 1)
df.head()

Unnamed: 0,Job Description,Clean Job Description,Resume,JD_num_words,Resume_num_words,word_common,word_total,word_share
0,"<div id=""jobDescriptionText"" class=""jobsearch-...",ount execut job number 50955730 descript sprea...,justin carvil data scientist profession experi...,458,235,19.0,446.0,0.042601
1,"<div id=""jobDescriptionText"" class=""jobsearch-...",primari purpos work part product manag team en...,justin carvil data scientist profession experi...,590,235,25.0,537.0,0.046555
2,"<div id=""jobDescriptionText"" class=""jobsearch-...",descript hire enterpris sale develop repres re...,justin carvil data scientist profession experi...,498,235,33.0,469.0,0.070362
3,"<div id=""jobDescriptionText"" class=""jobsearch-...",vice presid ad sale market nbc olymp respons w...,justin carvil data scientist profession experi...,430,235,14.0,437.0,0.032037
4,"<div id=""jobDescriptionText"" class=""jobsearch-...",gener summari senior compens analyst key partn...,justin carvil data scientist profession experi...,351,235,19.0,388.0,0.048969


## Fuzzy Wuzzy ??

# Vectorization

In [21]:
# Initialize vectorizer
vectorizer1 = TfidfVectorizer()
vectorizer2 = TfidfVectorizer()

jd_text = list(list(df['Clean Job Description']))
resume_text = list(df['Resume'])

# Fit the vectorizer
print('Fitting')
vectorizer1.fit(jd_text)
vectorizer2.fit(resume_text)

print('Vectorizing jd')
jd_vecs_tfidf = vectorizer1.transform(df['Clean Job Description'].values)

print('Vectorizing resume')
resume_vecs_tfidf = vectorizer2.transform(df['Resume'].values)

Fitting
Vectorizing jd
Vectorizing resume


In [25]:
jd_vecs_tfidf.shape, resume_vecs_tfidf.shape

((1000, 9809), (1000, 175))

# USE embeddings to calculate similarity

In [27]:
!pip install tensorflow tensorflow-hub

Collecting tensorflow-hub
  Downloading tensorflow_hub-0.13.0-py2.py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.6/100.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: tensorflow-hub
Successfully installed tensorflow-hub-0.13.0


In [28]:
import tensorflow_hub as hub
import tensorflow as tf

2023-06-16 15:34:41.916582: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-16 15:34:42.565187: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-16 15:34:42.717907: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-06-16 15:34:42.717940: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

In [29]:
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

KeyboardInterrupt: 