### Import packages and libraries

In [1]:
! pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m84.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 k

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd

import re
import nltk
import gensim
from gensim.models import word2vec

from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import pickle
import matplotlib.pyplot as plt


import os
import warnings
warnings.filterwarnings('ignore')

#os.system('mkdir plot')
#os.system('mkdir model')
#os.system('mkdir data')

  # Data exploration

In [4]:
#dataset
#data = '/content/drive/MyDrive/ACME-HappinessSurvey2020.csv'
data = '/content/drive/MyDrive/Apziva/project3/data/potential-talents.xlsx'
model = '/content/drive/MyDrive/Apziva/project3/model/'
plot = '/content/drive/MyDrive/Apziva/project3/plot/'


In [5]:
df = pd.read_excel(data)
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


#Preprocessing

In [68]:
def clean_sentence(val):
  regex = re.compile('([^\s\w]|_)+')
  sentence = regex.sub('', val).lower()
  return sentence

def clean_dataframe(df):
  for col in ['job_title']:
    df[col] = df[col].apply(clean_sentence)
  return df

df = clean_dataframe(df)
df.head(5)

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 ct bauer college of business graduate mag...,"Houston, Texas",85,
1,2,native english teacher at epik english program...,Kanada,500+,
2,3,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,
3,4,people development coordinator at ryan,"Denton, Texas",500+,
4,5,advisory board member at celal bayar university,"İzmir, Türkiye",500+,


##Bag of word

In [69]:
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(df['job_title'])
keyword=['Aspiring human resources']
key = vectorizer.transform(keyword)

similarity = cosine_similarity(bow,key)
df['fit_bow']=similarity
df.sort_values(by=['fit_bow'],ascending=False)

Unnamed: 0,id,job_title,location,connection,fit,fit_bow
48,49,aspiring human resources specialist,Greater New York City Area,1,,0.866025
57,58,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,,0.866025
16,17,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,,0.866025
20,21,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,,0.866025
23,24,aspiring human resources specialist,Greater New York City Area,1,,0.866025
...,...,...,...,...,...,...
22,23,advisory board member at celal bayar university,"İzmir, Türkiye",500+,,0.000000
21,22,people development coordinator at ryan,"Denton, Texas",500+,,0.000000
19,20,native english teacher at epik english program...,Kanada,500+,,0.000000
47,48,advisory board member at celal bayar university,"İzmir, Türkiye",500+,,0.000000


##TF-IDF

In [70]:
tfidf = TfidfVectorizer(stop_words='english')
title_vector = tfidf.fit_transform(df['job_title'])
keyword=['Aspiring human resources']
#keyword=['Director Of Administration at Excellence Logging']
keyword_vector = tfidf.transform(keyword)

#keyword='Aspiring human resources'
similarity = cosine_similarity(title_vector,keyword_vector)
df['fit_tfidf'] = similarity
df.sort_values(by=['fit_tfidf'],ascending=False)

Unnamed: 0,id,job_title,location,connection,fit,fit_bow,fit_tfidf
45,46,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,,0.866025,0.753591
16,17,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,,0.866025,0.753591
2,3,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,,0.866025,0.753591
57,58,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,,0.866025,0.753591
32,33,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,,0.866025,0.753591
...,...,...,...,...,...,...,...
22,23,advisory board member at celal bayar university,"İzmir, Türkiye",500+,,0.000000,0.000000
21,22,people development coordinator at ryan,"Denton, Texas",500+,,0.000000,0.000000
46,47,people development coordinator at ryan,"Denton, Texas",500+,,0.000000,0.000000
17,18,people development coordinator at ryan,"Denton, Texas",500+,,0.000000,0.000000


#Word2vec

In [123]:
from scipy import spatial
import gensim.downloader as api
w2v_model = api.load('word2vec-google-news-300')

key_set = set(w2v_model.index_to_key)

def avg_feature_vector(sentence, model, num_features, index2word_set):
    words = sentence.split()
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in key_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

keyword='Aspiring human resources'

for i in range(len(df)):
  sentence = df['job_title'][i]
  s1_afv = avg_feature_vector(sentence, model=w2v_model, num_features=300, index2word_set=key_set)
  s2_afv = avg_feature_vector(keyword, model=w2v_model, num_features=300, index2word_set=key_set)
  sim = 1 - spatial.distance.cosine(s1_afv, s2_afv)
  df['fit'][i] = sim

df.sort_values(by='fit',ascending=False).head()

Unnamed: 0,id,job_title,location,connection,fit,fit_bow,fit_tfidf
16,17,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.828534,0.866025,0.753591
2,3,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.828534,0.866025,0.753591
20,21,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.828534,0.866025,0.753591
57,58,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.828534,0.866025,0.753591
96,97,aspiring human resources professional,"Kokomo, Indiana Area",71,0.828534,0.866025,0.753591


#GLOVE

In [120]:
from scipy import spatial
import gensim.downloader as api
glove_model = api.load('glove-twitter-25')

key_set = set(glove_model.index_to_key)

def avg_feature_vector(sentence, model, num_features, index2word_set):
    words = sentence.split()
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in key_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

keyword='Aspiring human resources'

for i in range(len(df)):
  sentence = df['job_title'][i]
  s1_afv = avg_feature_vector(sentence, model=glove_model, num_features=25, index2word_set=key_set)
  s2_afv = avg_feature_vector(keyword, model=glove_model, num_features=25, index2word_set=key_set)
  sim = 1 - spatial.distance.cosine(s1_afv, s2_afv)
  df['fit'][i] = sim

df.sort_values(by='fit',ascending=False).head()

Unnamed: 0,id,job_title,location,connection,fit,fit_bow,fit_tfidf
27,28,seeking human resources opportunities,"Chicago, Illinois",390,0.9748,0.57735,0.287816
29,30,seeking human resources opportunities,"Chicago, Illinois",390,0.9748,0.57735,0.287816
73,74,human resources professional,Greater Boston Area,16,0.974232,0.666667,0.460159
87,88,human resources management major,"Milpitas, California",18,0.970674,0.57735,0.257531
98,99,seeking human resources position,"Las Vegas, Nevada Area",48,0.964387,0.57735,0.279124


#BERT

In [6]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')


Downloading (…)821d1/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/README.md:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading (…)d1/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)01e821d1/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)821d1/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)1e821d1/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [7]:
#Encoding:
sen_embeddings = model.encode(df['job_title'])
keyword_embeddings = model.encode(['Aspiring human resources'])
#keyword_embeddings = model.encode([df['job_title'][11]])
similarity = cosine_similarity(sen_embeddings,keyword_embeddings)
df['fit_bert'] = similarity
df.sort_values(by=['fit_bert'],ascending=False)

Unnamed: 0,id,job_title,location,connection,fit,fit_bert
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.942610
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.942610
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.942610
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.942610
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.942610
...,...,...,...,...,...,...
11,12,"SVP, CHRO, Marketing & Communications, CSR Off...","Houston, Texas Area",500+,,0.299499
84,85,RRP Brand Portfolio Executive at JTI (Japan To...,Greater Philadelphia Area,500+,,0.259422
95,96,Student at Indiana University Kokomo - Busines...,"Lafayette, Indiana",19,,0.252835
92,93,Admissions Representative at Community medical...,"Long Beach, California",9,,0.164934


In [23]:
#Reranking
#print(sen_embeddings)
#print(keyword_embeddings)
keyword = 'Aspiring human resources'
keyword_embeddings = model.encode([keyword])

#combine keyword with starred candidate job_title then embedding
rerank_key = keyword  + ' ' + df['job_title'][23]
rerank_keyword_embeddings = model.encode([rerank_key])
similarity1 = cosine_similarity(sen_embeddings,rerank_keyword_embeddings)
df['fit_bert_adding_key'] = similarity1

#adding keyword embedding vector to starred_title embedding vector
starred_title_embeddings = model.encode([df['job_title'][23]])
similarity2 = cosine_similarity(sen_embeddings,keyword_embeddings+starred_title_embeddings)
df['fit_bert_adding_embeddings'] = similarity2

df.sort_values(by=['fit_bert_adding_embeddings'],ascending=False).head(100)

#adding two embedding vectors gave higher similarity

Unnamed: 0,id,job_title,location,connection,fit,fit_bert,fit_bert_adding_key,fit_bert_adding_embeddings
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.942610,0.977292,0.985478
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.942610,0.977292,0.985478
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.942610,0.977292,0.985478
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.942610,0.977292,0.985478
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.942610,0.977292,0.985478
...,...,...,...,...,...,...,...,...
31,32,Native English Teacher at EPIK (English Progra...,Kanada,500+,,0.372279,0.351779,0.384040
63,64,"SVP, CHRO, Marketing & Communications, CSR Off...","Houston, Texas Area",500+,,0.299499,0.291124,0.307407
11,12,"SVP, CHRO, Marketing & Communications, CSR Off...","Houston, Texas Area",500+,,0.299499,0.291124,0.307407
41,42,"SVP, CHRO, Marketing & Communications, CSR Off...","Houston, Texas Area",500+,,0.299499,0.291124,0.307407
