# Compare your CV to a job post
Determine if your CV has the correct keywords needed to get the job that you want.  This project was inspired by https://towardsdatascience.com/ai-is-working-against-your-job-application-bec65d496d22 .  It is possible that your CV will not even be looked at by a real person during the first selection process; the resumes could be chosen by a computer program that looks for key text related to the available position.  In this case, it is important both to have the correct keywords for the computer algorithm and to have your CV look aesthetically pleasing to a human.  ^_^'

This notebook reads in a .pdf or .txt CV and a job post.  It then compares their words, giving a similarity score for all the words and/or the most frequent words used in each document.

<img src="neutral_CV.png" alt="Drawing" style="width: 200px;"/>

In [1]:
import sys
import pathlib   # Needed for reading in both your CV and jobpost
from pathlib import Path

import nltk
#nltk.download('stopwords') # You only have to do this once
#nltk.download('punkt')       # only run one time
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

import numpy as np      # linear algebra numpy library

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from collections import namedtuple
def size(vec):
    """Return the dimensions of a (possibly ragged) nested iterable.

    Args:
        vec: An iterable whose items are themselves iterable (the "rows").

    Returns:
        A ``result`` namedtuple with ``row00`` (the number of rows) and
        ``col00`` (the length of the longest row).  An empty ``vec`` gives
        ``result(row00=0, col00=0)``.
    """
    # Count by iterating instead of calling len() so that rows without a
    # __len__ (e.g. generators) are still accepted.
    row_lengths = [sum(1 for _ in row) for row in vec]

    result = namedtuple('result', 'row00 col00')
    # default=0 keeps max() from raising ValueError when there are no rows.
    return result(row00=len(row_lengths), col00=max(row_lengths, default=0))

In [27]:
def remove_stopwords(wordtokens):
    """Drop unwanted, short, and English stop-word tokens.

    A token is kept only when all of the following hold:
      * it is not in the custom removal list below (exact string match),
      * it is longer than 4 characters,
      * it is not in NLTK's English stop-word list.

    Args:
        wordtokens: Iterable of token strings.

    Returns:
        A list of the surviving tokens, in their original order.

    Note:
        Needs the NLTK ``stopwords`` corpus (``nltk.download('stopwords')``).
    """
    # Custom tokens to discard.  Only entries of 5 or more characters matter
    # here (a name, a location, ...): anything shorter is already removed by
    # the length filter.
    list_to_remove = {"gmail", "gmail.com", "https"}

    # NLTK's English stop words; a set gives O(1) membership tests.
    stop_words = set(stopwords.words('english'))

    return [
        token
        for token in wordtokens
        # len(token) > 4 removes tokens that are 4 characters or shorter.
        if len(token) > 4
        and token not in list_to_remove
        and token not in stop_words
    ]

In [28]:
def preprocessing(text):
    """Turn raw document text into a list of stemmed word tokens.

    The pieces of ``text`` are lower-cased, stripped of apostrophes and
    joined into one string, which is tokenised with NLTK.  The tokens are
    filtered by ``remove_stopwords`` and reduced to Porter stems.  The number
    of raw tokens is printed along the way.

    Args:
        text: Iterable of strings, e.g. the lines returned by ``readlines()``.

    Returns:
        A list of stemmed tokens.
    """
    # 1. Normalise every piece and glue them into a single string
    content = "".join(piece.lower().replace("'", '') for piece in text)

    # 2. Split the string into individual tokens
    tok = nltk.word_tokenize(content)
    print('length of tok: ' + str(len(tok)))

    # 3. Discard unwanted tokens (custom list, short tokens, stop words)
    kept_tokens = remove_stopwords(tok)

    # 4. Collapse word variants onto their common stem
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in kept_tokens]

In [32]:
def get_word_count_uniquewords(word_tokens):
    """Count the tokens and rank the unique words by how often they occur.

    Words in the removal list below and words of 4 characters or fewer are
    left out of the ranking.  A summary line and the ranked table are
    printed.

    Args:
        word_tokens: List of token strings.  Each string is handed to
            ``CountVectorizer`` as one document.

    Returns:
        A tuple ``(wc, keywords, mat_sort)``:
          * ``wc`` - list of word counts, highest count first,
          * ``keywords`` - list of the words, in the same order as ``wc``,
          * ``mat_sort`` - two-column array ``[count, word]`` of those lists.
    """
    vectorizer = CountVectorizer()

    # -------------------------------------
    # 1. Count how many times each word appears.
    # fit_transform has to run first: it builds the vocabulary that the
    # feature-name lookup below reads.
    X = vectorizer.fit_transform(word_tokens)
    word_count0 = np.ravel(np.sum(X, axis=0))  # column sums = count per word

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back to the old name on older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    unique_words0 = np.ravel(np.asarray(feature_names, dtype=str))

    # -------------------------------------
    # 2. Remove undesirable words AGAIN (stemming can create new short or
    # unwanted forms) and keep the counts aligned with the words.
    list_to_remove = {"sallyjon", "zendesk", "rosevil", "sacramento"}

    unique_words = []
    word_count = []
    for word, count in zip(unique_words0, word_count0):
        if word not in list_to_remove and len(word) > 4:
            unique_words.append(word)
            word_count.append(count)

    print('There are ' + str(len(word_tokens)) + ' word tokens, but ' + str(len(unique_words)) + ' words are unique.')

    # -------------------------------------
    # 3. Order the words from most to least frequent.
    # argsort gives ascending order, so reverse it.
    descending = np.argsort(word_count)[::-1]

    keywords = [unique_words[idx] for idx in descending]
    wc = [word_count[idx] for idx in descending]

    # Matrix of word counts and the words they belong to, most frequent first
    top_words = len(keywords)
    mat_sort = np.concatenate(
        [
            np.reshape(np.ravel(wc), (top_words, 1)),
            np.reshape(np.ravel(keywords), (top_words, 1)),
        ],
        axis=1,
    )
    print(mat_sort)
    # -------------------------------------

    return wc, keywords, mat_sort

In [30]:
# -------------------------------------
# Single-word analysis of both documents
# -------------------------------------

# -------------------------------------
# CV document
# -------------------------------------
# Location of the CV text file
file_loc = "C:\\Users\\HP EliteBook\\Documents\\Sourceforge_PROJECTS\\Resume_compare_python\\CV.txt"
print('Reading CV file : ' + file_loc)

with open(file_loc, encoding='utf8', errors="surrogateescape") as cvinfo:
    cvtext = list(cvinfo)   # one string per line, each keeping its trailing \n

# -------------------------------------
# Tokenise, filter and stem the CV text
# -------------------------------------
word_tokensCV = preprocessing(cvtext)

# Rank the CV words by frequency (the ranking is printed as well)
wc_CV, keywords_CV, mat_sort_CV = get_word_count_uniquewords(word_tokensCV)

# -------------------------------------
# Job post document
# -------------------------------------
# Location of the job description text file
file_loc = "C:\\Users\\HP EliteBook\\Documents\\Sourceforge_PROJECTS\\Resume_compare_python\\job_post.txt"
print('Reading job file : ' + file_loc)

with open(file_loc, encoding='utf8', errors="surrogateescape") as job:
    jobtext = list(job)   # one string per line, each keeping its trailing \n

# -------------------------------------
# Tokenise, filter and stem the job post text
# -------------------------------------
word_tokensJOB = preprocessing(jobtext)

# Rank the job post words by frequency (the ranking is printed as well)
wc_JOB, keywords_JOB, mat_sort_JOB = get_word_count_uniquewords(word_tokensJOB)

Reading CV file : C:\Users\HP EliteBook\Documents\Sourceforge_PROJECTS\Resume_compare_python\CV.txt
length of tok: 368
There are 193 word tokens, but 98 words are unique.
[['19' 'custom']
 ['11' 'servic']
 ['5' 'experi']
 ['3' 'account']
 ['3' 'manag']
 ['3' 'employe']
 ['3' 'softwar']
 ['2' 'month']
 ['2' 'conduct']
 ['2' 'product']
 ['2' 'direct']
 ['2' 'complaint']
 ['2' 'inform']
 ['2' 'posit']
 ['2' 'return']
 ['2' 'satisfact']
 ['2' 'septemb']
 ['2' 'maintain']
 ['2' 'skill']
 ['2' 'excel']
 ['2' 'specialist']
 ['2' 'kayako']
 ['2' 'averag']
 ['2' 'friendli']
 ['1' 'futur']
 ['1' 'higher']
 ['1' 'desir']
 ['1' 'handl']
 ['1' 'daili']
 ['1' 'field']
 ['1' 'highli']
 ['1' 'disposit']
 ['1' 'employ']
 ['1' 'discount']
 ['1' 'compani']
 ['1' 'comput']
 ['1' 'comprehens']
 ['1' 'addit']
 ['1' 'administr']
 ['1' 'almost']
 ['1' 'america']
 ['1' 'answer']
 ['1' 'assist']
 ['1' 'attain']
 ['1' 'attitud']
 ['1' 'award']
 ['1' 'bachelor']
 ['1' 'basic']
 ['1' 'bookstor']
 ['1' 'california'

In [31]:
# -------------------------------------
# Compare the word lists of the CV and the job post
# -------------------------------------
# 1) Percentage of the JOB keywords that also appear in the CV.
# How often a word is repeated is ignored here, because words are rarely
# repeated in a CV.  This only measures whether the CV uses the same words
# as the job you want.
cv_vocabulary = set(keywords_CV)

# tot[i] is 1 when the i-th JOB keyword is also a CV keyword, otherwise 0
tot = np.zeros((len(keywords_JOB),1))
for i, job_word in enumerate(keywords_JOB):
    if job_word in cv_vocabulary:
        tot[i] = 1

per = sum(np.ravel(tot))/len(keywords_JOB) * 100
print('Percentage of common words between CV and JOB: ' + str(per) + '%')

Percentage of common words between CV and JOB: 23.076923076923077%


In [34]:
# 2) Similarity of common word usage (similarity)
# For the words that the CV and the JOB post share, compare how often each
# one is used in the two documents.
# The similarity measure shows how closely the CV wording follows the JOB
# post, but it does not tell you how to improve your CV.

# Position of every CV keyword, so each JOB keyword can be looked up directly.
cv_position = {word: j for j, word in enumerate(keywords_CV)}

# Re-order the CV words and counts so that row i lines up with JOB keyword i.
# The lists are sized by the JOB keywords (not by the CV keywords), so a JOB
# post with more keywords than the CV is handled in full instead of being
# truncated or raising an IndexError.
keywords_CV_orderofJOB = []
wc_CV_orderofJOB = []
for job_word in keywords_JOB:
    j = cv_position.get(job_word)
    if j is None:
        # The CV does not contain this JOB word: empty word, count of zero
        keywords_CV_orderofJOB.append('')
        wc_CV_orderofJOB.append(0)
    else:
        keywords_CV_orderofJOB.append(keywords_CV[j])
        wc_CV_orderofJOB.append(wc_CV[j])

minlen = len(keywords_JOB)

a = wc_JOB[0:minlen]
b = wc_CV_orderofJOB[0:minlen]

# Table with columns: JOB word, JOB count, matching CV word, CV count
a0 = np.reshape(np.ravel(keywords_JOB[0:minlen]), (minlen,1))
a1 = np.reshape(np.ravel(a), (minlen,1))
a2 = np.reshape(np.ravel(keywords_CV_orderofJOB[0:minlen]), (minlen,1))
a3 = np.reshape(np.ravel(b), (minlen,1))
mat_sort = np.concatenate([a0, a1, a2, a3], axis=1)
print(mat_sort)

# Cosine similarity of the aligned word-frequency vectors.
# If either vector is all zeros (e.g. no word in common) the cosine is
# undefined; report 0.0 instead of dividing by zero.
norm_product = np.linalg.norm(a)*np.linalg.norm(b)
cos_sim = np.dot(a, b)/norm_product if norm_product else 0.0
print('Of the words that are in common with your CV and the JOB post, cos_sim with respect to 1 tells how similarly common words were frequently used within both documents.')
print('cos_sim : ' + str(cos_sim))

[['custom' '15' 'custom' '19']
 ['servic' '10' 'servic' '11']
 ['problem' '4' '' '0']
 ['inform' '4' 'inform' '2']
 ['phone' '3' '' '0']
 ['peopl' '3' '' '0']
 ['skill' '3' 'skill' '2']
 ['becom' '2' '' '0']
 ['cancel' '2' 'cancel' '1']
 ['think' '2' '' '0']
 ['commun' '2' 'commun' '1']
 ['product' '2' 'product' '2']
 ['provid' '2' 'provid' '1']
 ['listen' '2' '' '0']
 ['quickli' '2' '' '0']
 ['answer' '2' 'answer' '1']
 ['requir' '2' '' '0']
 ['polici' '2' 'polici' '1']
 ['repres' '2' 'repres' '1']
 ['employ' '1' 'employ' '1']
 ['essenti' '1' '' '0']
 ['eventu' '1' '' '0']
 ['youll' '1' '' '0']
 ['evolv' '1' '' '0']
 ['expect' '1' '' '0']
 ['email' '1' '' '0']
 ['fulfil' '1' '' '0']
 ['greet' '1' '' '0']
 ['guarante' '1' '' '0']
 ['identifi' '1' '' '0']
 ['includ' '1' 'includ' '1']
 ['intent' '1' '' '0']
 ['experi' '1' 'experi' '5']
 ['customer' '1' '' '0']
 ['diploma' '1' '' '0']
 ['complain' '1' '' '0']
 ['across' '1' '' '0']
 ['angri' '1' '' '0']
 ['associ' '1' '' '0']
 ['bachelor'

In [26]:
# 3) Non-common frequently used words in Job are how you improve your CV similarity
# MISSING KEYWORDS :
# Words that are used frequently in the JOB post but do not appear in the CV
# are the words you should put on your CV, and the number of times they are
# repeated in JOB is how many times you might want to put them on your CV.

# Decide the number of top words in JOB to consider
topwords = 20

# Never read past the end of either list: a short JOB post can have fewer
# than `topwords` keywords.
n_considered = min(topwords, len(keywords_JOB), len(wc_CV_orderofJOB))

# A count of zero in wc_CV_orderofJOB marks a JOB word that the CV lacks
missing_word = []
missing_word_count = []
for i in range(n_considered):
    if wc_CV_orderofJOB[i] == 0:
        missing_word.append(keywords_JOB[i])
        missing_word_count.append(wc_JOB[i])

r = len(missing_word)

# wc_JOB is ordered most-frequent first, so wc_JOB[0] is the largest count;
# each percentage is relative to it.  Fall back to 1 when there are no JOB
# words at all, so the (empty) table can still be built.
top_count = wc_JOB[0] if len(wc_JOB) > 0 else 1

a4 = np.reshape(np.ravel(missing_word), (r,1))
a5 = np.reshape((np.ravel(missing_word_count)/top_count)*100, (r,1))
miss_sort = np.concatenate([a4, a5], axis=1)
print('You do not have these words on YOUR CV, but they are mentioned in the JOB post frequently :')
print('[word, word count percentage in JOB]')
print(miss_sort)
print('You might want to add these words to your CV and your CV will be more similar to the JOB post!')

You do not have these words on YOUR CV, but they are mentioned in the JOB post frequently :
[word, word count percentage in JOB]
[['problem' '26.666666666666668']
 ['phone' '20.0']
 ['peopl' '20.0']
 ['becom' '13.333333333333334']
 ['think' '13.333333333333334']
 ['listen' '13.333333333333334']
 ['quickli' '13.333333333333334']
 ['requir' '13.333333333333334']]
You might want to add these words to your CV and your CV will be more similar to the JOB post!


So Sally might want to put on her resume that she "addressed PROBLEMS by talking with PEOPLE on the PHONE" or that she is able to "THINK QUICKLY and is a good LISTENER".