# A NLP Content-Based Recommender Systems Using Word2Vec

In [1]:
import warnings
warnings.filterwarnings("ignore")
import datetime
import pandas as pd
import numpy as np
pd.set_option('max_columns', 1000)
pd.set_option('max_row', 300)
import re
import string
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
import gensim

# 1. Check the data

In [2]:
df_job_Combined_Jobs_Final = pd.read_csv("data files/Job_Recommendation_dataset/Combined_Jobs_Final.csv")
df_job_Combined_Jobs_Final.head(2)

Unnamed: 0,Job.ID,Provider,Status,Slug,Title,Position,Company,City,State.Name,State.Code,Address,Latitude,Longitude,Industry,Job.Description,Requirements,Salary,Listing.Start,Listing.End,Employment.Type,Education.Required,Created.At,Updated.At
0,111,1,open,palo-alto-ca-tacolicious-server,Server @ Tacolicious,Server,Tacolicious,Palo Alto,California,CA,,37.443346,-122.16117,Food and Beverages,Tacolicious' first Palo Alto store just opened...,,8.0,,,Part-Time,,2013-03-12 02:08:28 UTC,2014-08-16 15:35:36 UTC
1,113,1,open,san-francisco-ca-claude-lane-kitchen-staff-chef,Kitchen Staff/Chef @ Claude Lane,Kitchen Staff/Chef,Claude Lane,San Francisco,California,CA,,37.78983,-122.404268,Food and Beverages,\r\n\r\nNew French Brasserie in S.F. Financia...,,0.0,,,Part-Time,,2013-04-12 08:36:36 UTC,2014-08-16 15:35:36 UTC


In [3]:
df_user_Experience = pd.read_csv("data files/Job_Recommendation_dataset/Experience.csv")
df_user_Experience.head(2)

Unnamed: 0,Applicant.ID,Position.Name,Employer.Name,City,State.Name,State.Code,Start.Date,End.Date,Job.Description,Salary,Can.Contact.Employer,Created.At,Updated.At
0,10001,Account Manager / Sales Administration / Quali...,Barcode Resourcing,Bellingham,Washington,WA,2012-10-15,,,,,2014-12-12 20:10:02 UTC,2014-12-12 20:10:02 UTC
1,10001,Electronics Technician / Item Master Controller,Ryzex Group,Bellingham,Washington,WA,2001-12-01,2012-04-01,,,,2014-12-12 20:10:02 UTC,2014-12-12 20:10:02 UTC


In [4]:
df_user_Job_Views = pd.read_csv("data files/Job_Recommendation_dataset/Job_Views.csv")
df_user_Job_Views.head(2)

Unnamed: 0,Applicant.ID,Job.ID,Title,Position,Company,City,State.Name,State.Code,Industry,View.Start,View.End,View.Duration,Created.At,Updated.At
0,10000,73666,Cashiers & Valets Needed! @ WallyPark,Cashiers & Valets Needed!,WallyPark,Newark,New Jersey,NJ,,2014-12-12 20:12:35 UTC,2014-12-12 20:31:24 UTC,1129.0,2014-12-12 20:12:35 UTC,2014-12-12 20:12:35 UTC
1,10000,96655,Macy's Seasonal Retail Fragrance Cashier - Ga...,Macy's Seasonal Retail Fragrance Cashier - Ga...,Macy's,Garden City,New York,NY,,2014-12-12 20:08:50 UTC,2014-12-12 20:10:15 UTC,84.0,2014-12-12 20:08:50 UTC,2014-12-12 20:08:50 UTC


In [5]:
df_user_Positions_Of_Interest = pd.read_csv("data files/Job_Recommendation_dataset/Positions_Of_Interest.csv")
df_user_Positions_Of_Interest.head(2)

Unnamed: 0,Applicant.ID,Position.Of.Interest,Created.At,Updated.At
0,10003,security officer,2014-12-12 21:20:54 UTC,2014-12-12 21:20:54 UTC
1,10007,Server,2014-08-14 15:56:42 UTC,2015-02-26 20:35:12 UTC


# 2. Rebuild a combined user table and a combined job table

In [6]:
# make the user text data table

# df_user_Job_Views add df_job.job.description
# inner join df_user_Experience, df_user_Job_Views, df_user_Positions_Of_Interest ---> users_final.csv
df_user = pd.merge(df_user_Job_Views, df_job_Combined_Jobs_Final, how="left", on=["Job.ID"])
df_user = pd.merge(df_user, df_user_Experience, how="inner", on=["Applicant.ID"])
df_user = pd.merge(df_user, df_user_Positions_Of_Interest, how="inner", on=["Applicant.ID"])

# get combined user final table after merge df1, df2, df4, df5
df_user = df_user[['Applicant.ID','Position.Name','Position.Of.Interest',
               'Title_x','Position_x','Slug','Job.Description_x','Job.Description_y']]

# merge all text data into one column
df_user['user_text'] = df_user.apply(lambda x: str(x['Position.Name']) + ' ' + str(x['Position.Of.Interest']) + ' ' + 
                                               str(x['Title_x']) + ' ' + str(x['Position_x']) + ' ' + str(x['Slug']) + ' ' + 
                                               str(x['Job.Description_x']) + ' ' + str(x['Job.Description_y']), axis=1)
df_user = df_user[['Applicant.ID','user_text']]

# 2. groupby Applicant.ID 
df_user = df_user.groupby(['Applicant.ID'])['user_text'].apply(lambda x: ','.join(x)).reset_index()
df_user.head(5)

Unnamed: 0,Applicant.ID,user_text
0,96,Cashier Server Kitchen Staff @ Izakaya Yuzuki ...
1,153,Photographer Server Valic Financial Advisor In...
2,1877,Registration Coordinator Receptionist Sales As...
3,2075,Manager Server Valet Attendant @ Standard Park...
4,2478,Listing Manager Server Part Time Teller - Corp...


In [7]:
# make the job text data table
df_job = df_job_Combined_Jobs_Final[['Job.ID','Title','Position','Slug','Job.Description']]

df_job['job_text'] = df_job.apply(lambda x: str(x['Title']) + ' ' + str(x['Position']) + ' ' + 
                                            str(x['Slug']) + ' ' + str(x['Job.Description']), axis=1)
df_job = df_job[['Job.ID','job_text']]
df_job.head(5)

Unnamed: 0,Job.ID,job_text
0,111,Server @ Tacolicious Server palo-alto-ca-tacol...
1,113,Kitchen Staff/Chef @ Claude Lane Kitchen Staff...
2,117,Bartender @ Machka Restaurants Corp. Bartender...
3,121,Server @ Teriyaki House Server brisbane-ca-ter...
4,127,Kitchen Staff/Chef @ Rosa Mexicano - Sunset Ki...


# 3. Use the job description text data to train the word2vec model

In [8]:
job_text = df_job_Combined_Jobs_Final['Job.Description']
user_text = df_user_Experience['Job.Description']
text_data = pd.concat([job_text, user_text], ignore_index=True)
text_data = text_data.dropna(how='any')
text_data

0        Tacolicious' first Palo Alto store just opened...
1         \r\n\r\nNew French Brasserie in S.F. Financia...
2        We are a popular Mediterranean wine bar and re...
3         ● Serve food/drinks to customers in a profess...
4        Located at the heart of Hollywood, we are one ...
                               ...                        
92738    I am a cocktail waitress. Its a high volume bl...
92739    It was a popular restaurant in the west villag...
92740    I was a dining room server, a cocktail server,...
92741    This was an establishment that majority of the...
92742    I was a server. This was a new american italia...
Name: Job.Description, Length: 89726, dtype: object

In [9]:
# cleaning process
stop = stopwords.words('english')
stop_words_ = set(stopwords.words('english'))
wn = WordNetLemmatizer()

def black_txt(token):
    return  token not in stop_words_ and token not in list(string.punctuation)  and len(token)>2   

def clean_txt(text):
    clean_text = []
    clean_text2 = []
    text = re.sub("'", "",text)
    text=re.sub("(\\d|\\W)+"," ",text) 
    text = text.replace("nbsp", "")
    clean_text = [ wn.lemmatize(word, pos="v") for word in word_tokenize(text.lower()) if black_txt(word)]
    clean_text = [word for word in clean_text if black_txt(word)]
    return " ".join(clean_text)

text_data = text_data.apply(clean_txt)
text_data = text_data.apply(lambda x: x.split())

In [10]:
# training word2vec model and get the vectors of words
w2v_model = gensim.models.Word2Vec(min_count=10,window=10,size=300, 
                                   alpha=0.05, min_alpha=0.0007,                                    
                                   iter = 20,negative=20,workers=3)

w2v_model.build_vocab(text_data, progress_per=2000)
w2v_model.train(text_data,total_examples=w2v_model.corpus_count, epochs=50)

(547604494, 588161150)

In [11]:
w2v_model[w2v_model.wv.vocab].shape

(13470, 300)

# 4. Use the matrix from word2vec to transfer the text in user table and job table

In [12]:
def user_or_job2vec(x):
    s_vec = w_vec = 0
    for word in x:
        if word in w2v_model:
            w_vec = w2v_model[word]
            s_vec = s_vec + w_vec
    return s_vec

In [13]:
# text of users to vector
df_user['text_clean'] = df_user['user_text'].apply(clean_txt)
df_user['text_token'] = df_user['text_clean'].apply(lambda x: x.split())
df_user['user_vector'] = df_user['text_token'].apply(user_or_job2vec)

In [14]:
# text of jobs to vector
df_job = df_job.dropna(how='any')
df_job['text_clean'] = df_job['job_text'].apply(clean_txt)
df_job['text_token'] = df_job['text_clean'].apply(lambda x: x.split())
df_job['job_vector'] = df_job['text_token'].apply(user_or_job2vec)

# 5. Recommend jobs for users

In [15]:
# recommend jobs for this lucky user!
# the original text of user himself/herself
df_user['user_text'][101]

'Esthetician Customer Service Rep Customer Service-Product Distribution Associates, part-time, afternoon/evening shift @ Heartland Blood Centers Customer Service-Product Distribution Associates, part-time, afternoon/evening shift nan nan Skin care specialist. Performed variety of facials, waxing, chemical peels, laser hair removal,Esthetician Call Center Customer Service-Product Distribution Associates, part-time, afternoon/evening shift @ Heartland Blood Centers Customer Service-Product Distribution Associates, part-time, afternoon/evening shift nan nan Skin care specialist. Performed variety of facials, waxing, chemical peels, laser hair removal'

In [16]:
# use this users' vector to calculate all jobs' similarity value and get the largest 10 positions!
user_info = df_user['user_vector'][101] 
                                          
df_job['similarity'] = df_job['job_vector'].apply(lambda x: float(cosine_similarity([user_info], [x])))
# df_job.sort_values("similarity",inplace=True)
df_job.sort_values(by="similarity", ascending=False).head(10)
# df_job.tail(10)

Unnamed: 0,Job.ID,job_text,text_clean,text_token,job_vector,similarity
76059,309824,"RN Radiology, Part-time Evenings @ Larchmont I...",radiology part time even larchmont image assoc...,"[radiology, part, time, even, larchmont, image...","[341.08942, 39.01031, -194.6336, 206.6044, 194...",0.631274
17073,246600,Patient Service Representative- Pediatrics (Pa...,patient service representative pediatrics part...,"[patient, service, representative, pediatrics,...","[51.198566, -50.42614, -309.40994, 63.280857, ...",0.523313
26463,255994,Patient Service Representative/ Pediatrics (Pa...,patient service representative pediatrics part...,"[patient, service, representative, pediatrics,...","[-32.996983, -36.994934, -415.71277, 69.99393,...",0.510839
17042,246569,Patient Service Representative/ Pediatrics (Pa...,patient service representative pediatrics part...,"[patient, service, representative, pediatrics,...","[-32.996983, -36.994934, -415.71277, 69.99393,...",0.510839
75932,309697,Patient Service Representative/ Pediatrics (Pa...,patient service representative pediatrics part...,"[patient, service, representative, pediatrics,...","[-32.996983, -36.994934, -415.71277, 69.99393,...",0.510839
26492,256023,Patient Service Representative- Pediatrics (Pa...,patient service representative pediatrics part...,"[patient, service, representative, pediatrics,...","[30.83912, -41.30272, -409.48816, 129.6821, 45...",0.506264
11622,176433,EVENING PART TIME Customer Service Representat...,even part time customer service representative...,"[even, part, time, customer, service, represen...","[-116.95253, 350.17932, -157.33224, 297.5123, ...",0.499942
64329,295054,Administrative Assistant- Part Time Evenings @...,administrative assistant part time even spheri...,"[administrative, assistant, part, time, even, ...","[36.476295, 424.3153, -123.64921, 313.54807, 2...",0.493263
44707,274249,Key Holders / Shift Leaders / Sales Associates...,key holders shift leaders sales associate stor...,"[key, holders, shift, leaders, sales, associat...","[486.01764, -155.84268, 500.32687, 731.2984, 4...",0.49211
27994,257524,Dishwasher Part time for Days/ and Evening shi...,dishwasher part time days even shift dishwashe...,"[dishwasher, part, time, days, even, shift, di...","[138.8513, -286.61194, -202.11832, 243.82964, ...",0.488569


In [17]:
# check one of these top 10 jobs 
df_job['job_text'][76059]

'RN Radiology, Part-time Evenings @ Larchmont Imaging Associates RN Radiology, Part-time Evenings mount-laurel-nj-larchmont-imaging-associates-rn-radiology-part-time-evenings Larchmont Imaging Associates has offered a complete range of radiology services throughout Burlington County for over 30 years, combining state of the art technology with some of the best professionals in the area.&nbsp; We are a growing organization which recognizes that our people are our biggest asset.&nbsp; \r\n\r\n\r\nLarchmont Imaging currently has part-time evening shifts available for an experienced RN in our Mt. Laurel&nbsp; &amp; Moorestown Offices.&nbsp; Candidates must possess a minimum of 3 years RN experience (preferably in a fast paced and/or a high volume environment) as well as exceptional patient service skills.&nbsp;&nbsp;\r\n\r\nShifts are Tue &amp; Fri&nbsp;evenings:&nbsp; 4:30pm-8:30pm, additional evening shifts &amp; Saturday AM shifts may also be available.'