In [1]:
import numpy as np
import re
import scipy
from nltk.corpus import stopwords
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

In [2]:
def loadGloveModel(gloveFile):
    """Load GloVe-style word vectors from a UTF-8 text file into a dict.

    Every non-blank line is split on whitespace: the first field is used as
    the word and the remaining fields are parsed as floats.

    Parameters
    ----------
    gloveFile : str or path-like
        Path to the embedding text file.

    Returns
    -------
    dict
        Maps each word (str) to a 1-D ``numpy.ndarray`` of floats. If a word
        appears more than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If a field after the word cannot be parsed as a float.
    """
    print ("Loading Glove Model")
    model = {}
    with open(gloveFile, encoding="utf8" ) as f:
        # Iterate over the file object instead of calling readlines(), so the
        # raw text of the whole file is never held in memory next to the dict.
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. trailing newline at end of file): there is
                # no word to index, so skip it instead of raising IndexError.
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print ("Done.",len(model)," words loaded!")
    return model

In [3]:
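# One-time setup: uncomment and run the two lines below if the NLTK
# stopword corpus has not been downloaded on this machine yet.
# import nltk
# nltk.download('stopwords')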
def preprocess(raw_text):
    """Tokenise a sentence into unique, lower-cased, non-stopword words.

    Steps: replace every character outside ``a-z``/``A-Z`` with a space,
    lower-case, split on whitespace, drop NLTK English stopwords, and remove
    duplicates.

    Parameters
    ----------
    raw_text : str
        The text to clean.

    Returns
    -------
    list of str
        The distinct remaining words, in order of first appearance.
    """
    # keep only words
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # convert to lower case and split
    words = letters_only_text.lower().split()

    # remove stopwords
    stopword_set = set(stopwords.words("english"))

    # De-duplicate with dict.fromkeys rather than set(): a set of strings is
    # iterated in hash order, which differs between interpreter sessions, so
    # the returned word order (and anything labelled or summed in that order)
    # would not be reproducible. dict keys keep insertion order.
    cleaned_words = list(dict.fromkeys(w for w in words if w not in stopword_set))

    return cleaned_words

In [4]:
def calculate_heat_matrix_for_two_sentences(s1,s2):
    """Build a word-by-word similarity table for two sentences.

    Both sentences are run through ``preprocess``; every word of the first is
    then compared with every word of the second using
    ``cosine_distance_between_two_words``.

    Parameters
    ----------
    s1, s2 : str
        The raw sentences.

    Returns
    -------
    pandas.DataFrame
        Rows are labelled with the preprocessed words of ``s1``, columns with
        those of ``s2``; each cell holds the value returned by
        ``cosine_distance_between_two_words`` for that pair.
    """
    s1 = preprocess(s1)
    s2 = preprocess(s2)
    result_list = [[cosine_distance_between_two_words(word1, word2) for word2 in s2] for word1 in s1]
    # Hand the labels to the constructor instead of assigning them afterwards:
    # when s1 yields no words, result_list is [] and the frame has zero
    # columns, so a later `result_df.columns = s2` fails with a length
    # mismatch whenever s2 is non-empty.
    result_df = pd.DataFrame(result_list, index=s1, columns=s2)
    return result_df

In [5]:
def cosine_distance_between_two_words(word1, word2, embeddings=None):
    """Return the cosine similarity between the vectors of two words.

    Despite the function name, the result is ``1 - cosine distance``, i.e. a
    similarity: 1.0 for vectors pointing the same way, 0.0 for orthogonal ones.

    Parameters
    ----------
    word1, word2 : str
        Keys to look up in the embedding mapping.
    embeddings : mapping, optional
        Word -> vector mapping to use. When omitted, the notebook-level
        ``model`` is used, which keeps existing two-argument calls working.

    Returns
    -------
    float
        Cosine similarity of the two word vectors.

    Raises
    ------
    KeyError
        If either word is missing from the mapping.
    """
    # Import the submodule explicitly; a bare `import scipy` only exposes
    # `scipy.spatial` if something else has already loaded it.
    from scipy.spatial.distance import cosine as _cosine_distance

    vectors = model if embeddings is None else embeddings
    return (1 - _cosine_distance(vectors[word1], vectors[word2]))

In [6]:
def cosine_distance_wordembedding_method(s1, s2):
    """Print how similar two sentences are, based on averaged word vectors.

    Each sentence is preprocessed, its words are looked up in the
    notebook-level ``model``, and the word vectors are averaged into a single
    sentence vector. The printed figure is ``(1 - cosine distance) * 100``,
    rounded to two decimals.

    Parameters
    ----------
    s1, s2 : str
        The raw sentences to compare.

    Returns
    -------
    None
        The result is only printed.

    Raises
    ------
    ValueError
        If a sentence has no word left that is present in ``model``.
    """
    # Import the submodule explicitly; a bare `import scipy` only exposes
    # `scipy.spatial` if something else has already loaded it.
    from scipy.spatial import distance

    def _mean_vector(sentence):
        # Average the vectors of the words that are in the vocabulary.
        # Unknown words are skipped instead of aborting with a KeyError.
        vectors = [model[word] for word in preprocess(sentence) if word in model]
        if not vectors:
            # np.mean([]) would yield NaN and fail later with an opaque error.
            raise ValueError(
                "no in-vocabulary words left after preprocessing: %r" % (sentence,)
            )
        return np.mean(vectors, axis=0)

    vector_1 = _mean_vector(s1)
    vector_2 = _mean_vector(s2)
    cosine = distance.cosine(vector_1, vector_2)
    print('Word Embedding method with a cosine distance asses that our two sentences are similar to',round((1-cosine)*100,2),'%')

In [7]:
# Load the word vectors once; the functions in this notebook read them through
# the global name `model`.
model = loadGloveModel(gloveFile='glove.6b/glove.6B.300d.txt')

Loading Glove Model
Done. 400001  words loaded!


In [32]:
# Two sample job titles used to try out the sentence-similarity function.
ss1, ss2 = (
    'Aspiring human resources',
    'Seeking Human Resources and Generalist Positions',
)

In [35]:
# Print the averaged-embedding similarity of the two sample titles.
cosine_distance_wordembedding_method(s1=ss1, s2=ss2)

Word Embedding method with a cosine distance asses that our two sentences are similar to 77.0 %


In [26]:
# Search phrase that every job title is compared against.
keyword = 'Aspiring human resources'

# Read the spreadsheet and pull out its job-title column.
df = pd.read_excel(io="potential-talents-glove.xlsx")
job_titles = df['job_title']

In [31]:
# Embed the keyword once: it is the same for every row, so recomputing it
# inside the loop was wasted work.
vector_1 = np.mean([model[word] for word in preprocess(keyword) if word in model],axis=0)

for job_title in job_titles:
    # Average only the words that are in the vocabulary, so a single unknown
    # token does not abort the whole run with a KeyError.
    title_vectors = [model[word] for word in preprocess(job_title) if word in model]
    if title_vectors:
        vector_2 = np.mean(title_vectors,axis=0)
        # Cosine *distance*: 0 means same direction, larger means less similar.
        cosine = scipy.spatial.distance.cosine(vector_1, vector_2)
    else:
        # No usable words in this title: the distance is undefined, so report
        # NaN in the usual output format rather than failing.
        cosine = float('nan')
    print('Word Embeddings for ',keyword,' and',job_title,' is',cosine)
    

Word Embeddings for  Aspiring human resources  and 2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional  is 0.44848275685501204
Word Embeddings for  Aspiring human resources  and Native English Teacher at (English Program in Korea)  is 0.5721073442641142
Word Embeddings for  Aspiring human resources  and Aspiring Human Resources Professional  is 0.07433727990532446
Word Embeddings for  Aspiring human resources  and People Development Coordinator at Ryan  is 0.5126069067646002
Word Embeddings for  Aspiring human resources  and Advisory Board Member at Celal Bayar University  is 0.7371812079227311
Word Embeddings for  Aspiring human resources  and Aspiring Human Resources Specialist  is 0.06530969126181263
Word Embeddings for  Aspiring human resources  and Student at Humber College and Aspiring Human Resources Generalist  is 0.2660399492756056
Word Embeddings for  Aspiring human resources  and HR Senior Specialist  is 0.7055341427025774