# **Import Needed Libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import string
import nltk
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_fscore_support, ConfusionMatrixDisplay, confusion_matrix, classification_report,f1_score
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import gensim
from gensim.models import Phrases
import wordcloud
import pickle
import json
from sklearn.manifold import TSNE

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# **Load saved files**

## **Load Labeled Linkedin Jobs Datasets**

In [3]:
df = pd.read_csv("/content/labeled_linkedin_jobs.csv")
df

Unnamed: 0,job_id,job_title,job_desc,experience_level,work_type,location,industry,job_posting_url,texts,texts_preprocessed,cluster_labels_lda1,cluster_labels_tfidf2
0,3757935025,Veterinarian,We are looking to hire an Associate Veterinari...,Senior,Full-time,Michigan,Healthcare,https://www.linkedin.com/jobs/view/3757935025/...,Veterinarian We are looking to hire an Associa...,veterinarian associate veterinarian doctor vet...,1,11
1,3757934327,Veterinarian,We are looking to hire an Associate Veterinari...,Senior,Full-time,Colorado,Healthcare,https://www.linkedin.com/jobs/view/3757934327/...,Veterinarian We are looking to hire an Associa...,veterinarian associate veterinarian doctor vet...,1,11
2,3757933458,Veterinarian,We are looking to hire an Associate Veterinari...,Senior,Full-time,California,Healthcare,https://www.linkedin.com/jobs/view/3757933458/...,Veterinarian We are looking to hire an Associa...,veterinarian associate veterinarian doctor vet...,1,11
3,3757932826,Veterinarian,We are looking to hire an Associate Veterinari...,Senior,Full-time,Pennsylvania,Healthcare,https://www.linkedin.com/jobs/view/3757932826/...,Veterinarian We are looking to hire an Associa...,veterinarian associate veterinarian doctor vet...,1,11
4,3757931806,Veterinarian,We are looking to hire an Associate Veterinari...,Senior,Full-time,New York,Healthcare,https://www.linkedin.com/jobs/view/3757931806/...,Veterinarian We are looking to hire an Associa...,veterinarian associate veterinarian doctor vet...,1,11
...,...,...,...,...,...,...,...,...,...,...,...,...
2233,3685418185,Software Engineer,"Who we are:Founded in 2017, Gatik is the leade...",Senior,Full-time,California,IT,https://www.linkedin.com/jobs/view/3685418185/...,"Software Engineer Who we are:Founded in 2017, ...",software engineer arefounded gatik leader auto...,2,17
2234,3684441939,Accountant,Titus Talent Strategies has partnered with SAR...,Beginner,Full-time,Wisconsin,Finance,https://www.linkedin.com/jobs/view/3684441939/...,Accountant Titus Talent Strategies has partner...,accountant titus talent strategy partnered sar...,0,14
2235,3682818140,Developer,Carefully read the JD before applying!Job Desc...,Senior,Contract,Texas,IT,https://www.linkedin.com/jobs/view/3682818140/...,Developer Carefully read the JD before applyin...,developer carefully read jd applyingjob descri...,2,0
2236,3663878663,Accountant,Bring YOUR energy to Alliant Energy!\n\nAt All...,Beginner,Full-time,Wisconsin,Finance,https://www.linkedin.com/jobs/view/3663878663/...,Accountant Bring YOUR energy to Alliant Energy...,accountant bring energy alliant energy alliant...,0,8


## **Load pickle files - LDA vectorizer and dictionary (champion model)**

In [4]:
with open("/content/lda_vect.pickle", 'rb') as file:
  lda_vect = pickle.load(file)

with open("/content/dictionary.pickle", 'rb') as file:
  dictionary = pickle.load(file)

with open("/content/champ_model.pickle", 'rb') as file:
  champ_model = pickle.load(file)

# **Vectorize the preprocessed text data using the pickled LDA vectorizer and add the cluster labels to it**

In [5]:
docs= list(df['texts_preprocessed'].apply(lambda x: nltk.word_tokenize(x)))

# Compute bigrams.

# Add bigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [6]:
# Initialize an empty list to store the dense vectors representing topic distributions for each document.
df_lda=[]

# Iterate over each document in the tokenized document list.
for i in range(len(docs)):
  # Get the topic distribution for the ith document
  lda_vector= lda_vect[dictionary.doc2bow(docs[i])]

  # Convert the topic distribution to a dense vector
  dense_vector = gensim.matutils.sparse2full(lda_vector,lda_vect.num_topics)

  # Append the dense vector representation of the topic distribution to the data_lda list.
  df_lda.append(dense_vector)

# Convert the list of dense vectors to a pandas DataFrame.
df_lda= pd.DataFrame(df_lda)
df_lda

Unnamed: 0,0,1,2
0,0.000000,0.802033,0.197120
1,0.000000,0.801369,0.197783
2,0.000000,0.806489,0.192675
3,0.000000,0.798025,0.201128
4,0.000000,0.797583,0.201570
...,...,...,...
2233,0.999596,0.000000,0.000000
2234,0.000000,0.000000,0.999680
2235,0.999628,0.000000,0.000000
2236,0.000000,0.000000,0.999551


In [7]:
df_lda["cluster_labels_lda1"] = df["cluster_labels_lda1"]

# **Text preprocessing function (will be needed for user input)**

In [8]:
stop_words = set(nltk.corpus.stopwords.words('english'))

# Common words in job postings that should be removed
words_to_remove = ['characteristic','protected','essential','function','policy','procedure','eligible','also', 'employer', 'meet','ensure','well','monthly','strong','relationship','individual','communication','must','preferred','current','center','may','within','qualification','help','career','community','member','based','offer','professional','application','degree','pay','apply','one','shift','provide','package','employer','every', 'day', 'ideal', 'candidate', 'genetic', 'information','client','industry','team','looking', 'hire', 'life', 'insurance','skill','proficiency','experience', 'ability','job', 'year', 'opportunity', 'including', 'benefit', 'company', 'required', 'requirement', 'need', 'new', 'position' ,'employee' ,'role', 'race', 'color', 'sexual', 'sex', 'identity', 'salary', 'range', 'expression', 'orientation', 'gender', 'religion', 'marital', 'veteran', 'status', 'regard', 'equal', 'employment', 'national', 'origin', 'qualified', 'applicant', 'paid', 'time', 'hour', 'per', 'week', 'join', 'related','field', 'reasonable', 'accommodation', 'best', 'practice', 'work', 'environment', 'united', 'state', 'age', 'sick', 'leave', 'consideration', 'duty', 'responsibility', 'receive', 'dental', 'vision', 'local', 'law', 'disability', 'working', 'using', 'knowledge', 'monthly']

lemmatizer = nltk.WordNetLemmatizer()

def preprocess_doc(doc):

  # Lowercasing
  doc = doc.lower()
  # Removal of punctuation, numbers, and any special characters
  doc = re.sub(r'[^a-z\s]','', doc)
  # Tokenization
  tokens= nltk.word_tokenize(doc)
  # Stopwords removal
  tokens_nostop= [word for word in tokens if word not in stop_words]
  # Lemmatization
  tokens_lemmatized= [lemmatizer.lemmatize(word) for word in tokens_nostop]
  # Remove unnecessary words
  tokens_lemmatized= [word for word in tokens_lemmatized if (word not in words_to_remove)&(len(word)>1)]
  # Join the processed tokens back into a single string
  processed_doc = ' '.join(tokens_lemmatized)

  return processed_doc

# **Recommender System**

## **Creating similarity matrix for recommendater system**

### **Pairwise job similarity matrix using cosine similarity**

In [9]:
similarity_mat = cosine_similarity(df_lda.drop(columns = ["cluster_labels_lda1"]))
similarity_df = pd.DataFrame(similarity_mat)
similarity_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2228,2229,2230,2231,2232,2233,2234,2235,2236,2237
0,1.0,1.0,0.999979,0.999983,0.999979,0.994627,0.954004,0.9711,0.536029,0.976589,...,0.238673,0.238673,0.238673,0.0,0.236557,0.0,0.238673,0.0,0.238673,0.238673
1,1.0,1.0,0.999972,0.999988,0.999985,0.994727,0.953776,0.970868,0.536828,0.976486,...,0.239617,0.239617,0.239617,0.0,0.237492,0.0,0.239617,0.0,0.239617,0.239617
2,0.999979,0.999972,1.0,0.999923,0.999915,0.993935,0.955505,0.972628,0.530681,0.977253,...,0.232367,0.232367,0.232367,0.0,0.230307,0.0,0.232367,0.0,0.232367,0.232367
3,0.999983,0.999988,0.999923,1.0,1.0,0.99522,0.952606,0.969677,0.540865,0.97595,...,0.244389,0.244389,0.244389,0.0,0.242223,0.0,0.244389,0.0,0.244389,0.244389
4,0.999979,0.999985,0.999915,1.0,1.0,0.995283,0.952449,0.969518,0.541399,0.975877,...,0.245022,0.245022,0.245022,0.0,0.24285,0.0,0.245022,0.0,0.245022,0.245022


### **Pairwise job similarity matrix using euclidean distance**

In [10]:
eu_mat = euclidean_distances(df_lda.drop(columns = ["cluster_labels_lda1"]))
eu_df = pd.DataFrame(eu_mat)
eu_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2228,2229,2230,2231,2232,2233,2234,2235,2236,2237
0,0.0,0.000938,0.006294,0.005667,0.006293,0.094424,0.256525,0.278627,0.741095,0.180578,...,1.134142,1.134633,1.134584,1.296788,1.061032,1.296651,1.134618,1.296676,1.134527,1.134209
1,0.000938,0.0,0.007232,0.00473,0.005355,0.093487,0.257133,0.279565,0.74018,0.180756,...,1.133204,1.133695,1.133646,1.296479,1.060103,1.296342,1.13368,1.296367,1.133589,1.133272
2,0.006294,0.007232,0.0,0.011962,0.012587,0.100719,0.252496,0.272333,0.747241,0.179511,...,1.140436,1.140927,1.140878,1.298882,1.067269,1.298745,1.140912,1.29877,1.140821,1.140504
3,0.005667,0.00473,0.011962,0.0,0.000625,0.088757,0.260232,0.284294,0.735565,0.181724,...,1.128474,1.128966,1.128916,1.29493,1.055418,1.294793,1.128951,1.294818,1.128859,1.128542
4,0.006293,0.005355,0.012587,0.000625,0.0,0.088132,0.260645,0.284919,0.734955,0.18186,...,1.127849,1.128341,1.128291,1.294727,1.054799,1.294589,1.128325,1.294614,1.128234,1.127917


In [11]:
industries = df['industry'].values
industries, industries.shape

(array(['Healthcare', 'Healthcare', 'Healthcare', ..., 'IT', 'Finance',
        'Finance'], dtype=object),
 (2238,))

## **Testing Out Similarity Matrices**

### **Cosine Similarity**

In [12]:
# IT example
job_id_it = np.where(industries == "IT")[0][3]
print("ID:",job_id_it)
print("\nJob Title:",df.loc[job_id_it,"job_title"])

# Get similar jobs
job_similarities = similarity_df.iloc[job_id_it]
similar_job_idxs = pd.DataFrame(job_similarities.rename("similarities")).sort_values(by = "similarities", ascending = False)
top_10 = similar_job_idxs[:10]
print('\nTop 10 Similar Jobs:')
df.iloc[top_10.index]

ID: 83

Job Title: Developer

Top 10 Similar Jobs:


Unnamed: 0,job_id,job_title,job_desc,experience_level,work_type,location,industry,job_posting_url,texts,texts_preprocessed,cluster_labels_lda1,cluster_labels_tfidf2
1417,3701311779,Developer,Please find the details of position below: Tit...,Senior,Contract,Texas,IT,https://www.linkedin.com/jobs/view/3701311779/...,Developer Please find the details of position ...,developer please find detail title sr aem deve...,2,15
2108,3693052484,Developer,This position will be responsible for the deve...,Intermediate,Full-time,Georgia,IT,https://www.linkedin.com/jobs/view/3693052484/...,Developer This position will be responsible fo...,developer responsible development testing opti...,2,15
1574,3699097574,Data Engineer,"Title: Sr. Data EngineerLocation: Austin, TxDu...",Senior,Contract,Texas,IT,https://www.linkedin.com/jobs/view/3699097574/...,Data Engineer Title: Sr. Data EngineerLocation...,data engineer title sr data engineerlocation a...,2,0
535,3757455342,UI/UX Designer,Title: UI/UX DesignerDuration: 2.5 months – Pr...,Beginner,Contract,California,IT,https://www.linkedin.com/jobs/view/3757455342/...,UI/UX Designer Title: UI/UX DesignerDuration: ...,uiux designer title uiux designerduration mont...,2,17
998,3754761827,Software Engineer,"About this RoleAs a Software Engineer, you wil...",Senior,Full-time,California,IT,https://www.linkedin.com/jobs/view/3754761827/...,Software Engineer About this RoleAs a Software...,software engineer roleas software engineer par...,2,17
1471,3701199598,Developer,Job Description:As an AWS Java Application Dev...,Senior,Contract,New Jersey,IT,https://www.linkedin.com/jobs/view/3701199598/...,Developer Job Description:As an AWS Java Appli...,developer descriptionas aws java development m...,2,15
264,3757702568,Developer,Job title: SAP HANA DeveloperLocation: Hybrid ...,Senior,Contract,California,IT,https://www.linkedin.com/jobs/view/3757702568/...,Developer Job title: SAP HANA DeveloperLocatio...,developer title sap hana developerlocation hyb...,2,0
262,3757704399,Database Management,Must have a minimum of five (5) years of exper...,Senior,Contract,California,IT,https://www.linkedin.com/jobs/view/3757704399/...,Database Management Must have a minimum of fiv...,database management minimum five applying anal...,2,9
2106,3693052552,Developer,"We are Photon, one of the world's largest Digi...",Intermediate,Contract,Nevada,IT,https://www.linkedin.com/jobs/view/3693052552/...,"Developer We are Photon, one of the world's la...",developer photon world largest digital platfor...,2,17
259,3757705216,Developer,"Job Description:Java:Proficient in Java/J2EE, ...",Senior,Contract,Delaware,IT,https://www.linkedin.com/jobs/view/3757705216/...,Developer Job Description:Java:Proficient in J...,developer descriptionjavaproficient javajee go...,2,15


In [13]:
# Healthcare example
job_id_it = np.where(industries == "Healthcare")[0][200]
print("ID:",job_id_it)
print("\nJob Title:",df.loc[job_id_it,"job_title"])

# Get similar jobs
job_similarities = similarity_df.iloc[job_id_it]
similar_job_idxs = pd.DataFrame(job_similarities.rename("similarities")).sort_values(by = "similarities", ascending = False)
top_10 = similar_job_idxs[:10]
print('\nTop 10 Similar Jobs:')
df.iloc[top_10.index]

ID: 373

Job Title: Dental Hygienist

Top 10 Similar Jobs:


Unnamed: 0,job_id,job_title,job_desc,experience_level,work_type,location,industry,job_posting_url,texts,texts_preprocessed,cluster_labels_lda1,cluster_labels_tfidf2
373,3757491682,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Full-time,Georgia,Healthcare,https://www.linkedin.com/jobs/view/3757491682/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
311,3757496305,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Full-time,New York,Healthcare,https://www.linkedin.com/jobs/view/3757496305/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
365,3757492550,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Full-time,New York,Healthcare,https://www.linkedin.com/jobs/view/3757492550/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
282,3757498061,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Full-time,New Hampshire,Healthcare,https://www.linkedin.com/jobs/view/3757498061/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
346,3757493562,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Full-time,New Hampshire,Healthcare,https://www.linkedin.com/jobs/view/3757493562/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
364,3757492552,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Part-time,Missouri,Healthcare,https://www.linkedin.com/jobs/view/3757492552/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
301,3757497133,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Full-time,South Carolina,Healthcare,https://www.linkedin.com/jobs/view/3757497133/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
341,3757493625,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Full-time,North Carolina,Healthcare,https://www.linkedin.com/jobs/view/3757493625/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
343,3757493569,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Full-time,Maryland,Healthcare,https://www.linkedin.com/jobs/view/3757493569/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
358,3757492656,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Part-time,Washington,Healthcare,https://www.linkedin.com/jobs/view/3757492656/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7


In [14]:
# Finance example
job_id_it = np.where(industries == "Finance")[0][0]
print("ID:",job_id_it)
print("\nJob Title:",df.loc[job_id_it,"job_title"])

# Get similar jobs
job_similarities = similarity_df.iloc[job_id_it]
similar_job_idxs = pd.DataFrame(job_similarities.rename("similarities")).sort_values(by = "similarities", ascending = False)
top_10 = similar_job_idxs[:10]
print('\nTop 10 Similar Jobs:')
df.iloc[top_10.index]

ID: 8

Job Title: Banker

Top 10 Similar Jobs:


Unnamed: 0,job_id,job_title,job_desc,experience_level,work_type,location,industry,job_posting_url,texts,texts_preprocessed,cluster_labels_lda1,cluster_labels_tfidf2
8,3757921025,Banker,"What You Will Do\n\nAs a Teller/Banker, you wi...",Beginner,Full-time,Texas,Finance,https://www.linkedin.com/jobs/view/3757921025/...,"Banker What You Will Do\n\nAs a Teller/Banker,...",banker tellerbanker often deliver first impres...,0,2
13,3757916578,Banker,Job Description\n\nIt’s about interactions mor...,Beginner,Full-time,Texas,Finance,https://www.linkedin.com/jobs/view/3757916578/...,Banker Job Description\n\nIt’s about interacti...,banker description interaction transaction lov...,0,2
200,3757726367,Banker,"Ameris Bank is a purpose-driven company, dedic...",Intermediate,Full-time,Georgia,Finance,https://www.linkedin.com/jobs/view/3757726367/...,Banker Ameris Bank is a purpose-driven company...,banker ameris bank purposedriven dedicated bri...,0,2
878,3755587827,Accountant,Use your skills to transform your local commun...,Intermediate,Full-time,North Carolina,Finance,https://www.linkedin.com/jobs/view/3755587827/...,Accountant Use your skills to transform your l...,accountant use transform goodwill northwest no...,0,3
893,3755585644,Accountant,We are searching for a dynamic Accountant to j...,Intermediate,Full-time,Virginia,Finance,https://www.linkedin.com/jobs/view/3755585644/...,Accountant We are searching for a dynamic Acco...,accountant searching dynamic accountant suppor...,0,14
947,3755575997,Tax Professional,My client is a rapidly growing Boutique CPA fi...,Senior,Full-time,Colorado,Finance,https://www.linkedin.com/jobs/view/3755575997/...,Tax Professional My client is a rapidly growin...,tax rapidly growing boutique cpa firm boulder ...,0,17
1068,3749385938,Tax Professional,"Sony Corporation of America, located in New Yo...",Beginner,Full-time,New York,Finance,https://www.linkedin.com/jobs/view/3749385938/...,"Tax Professional Sony Corporation of America, ...",tax sony corporation america located york ny h...,0,8
919,3755580830,Account Executive,Job Description\n\n Position Title: Sales Acc...,Senior,Full-time,Washington,Finance,https://www.linkedin.com/jobs/view/3755580830/...,Account Executive Job Description\n\n Position...,account executive description title sale accou...,0,18
707,3756131699,Account Executive,Job Description\n\n Position Title: Sales Acc...,Senior,Full-time,Washington,Finance,https://www.linkedin.com/jobs/view/3756131699/...,Account Executive Job Description\n\n Position...,account executive description title sale accou...,0,18
1602,3699084213,Tax Professional,Status Category:\nFull-Time\nExempt/Non-Exempt...,Senior,Full-time,Massachusetts,Finance,https://www.linkedin.com/jobs/view/3699084213/...,Tax Professional Status Category:\nFull-Time\n...,tax category fulltime exemptnonexempt exempt s...,0,14


### **Euclidean Similarity**

In [15]:
# IT example
job_id_it = np.where(industries == "IT")[0][3]
print("ID:",job_id_it)
print("\nJob Title:",df.loc[job_id_it,"job_title"])

# Get similar jobs
job_similarities = eu_df.iloc[job_id_it]
similar_job_idxs = pd.DataFrame(job_similarities.rename("similarities")).sort_values(by = "similarities", ascending = True)
top_10 = similar_job_idxs[:10]
print('\nTop 10 Similar Jobs:')
df.iloc[top_10.index]

ID: 83

Job Title: Developer

Top 10 Similar Jobs:


Unnamed: 0,job_id,job_title,job_desc,experience_level,work_type,location,industry,job_posting_url,texts,texts_preprocessed,cluster_labels_lda1,cluster_labels_tfidf2
83,3757770826,Developer,ECS is seeking a Python Developer for our USP...,Senior,Full-time,Tennessee,IT,https://www.linkedin.com/jobs/view/3757770826/...,Developer ECS is seeking a Python Developer f...,developer ec seeking python developer usps cus...,2,15
2139,3693049197,Developer,"Power BI Developer | HybridOptomi, in partners...",Senior,Full-time,Arizona,IT,https://www.linkedin.com/jobs/view/3693049197/...,"Developer Power BI Developer | HybridOptomi, i...",developer power bi developer hybridoptomi part...,2,0
2101,3693053381,Developer,"Primary Location: Atlanta, Georgia\nV-Soft Con...",Senior,Contract,Georgia,IT,https://www.linkedin.com/jobs/view/3693053381/...,"Developer Primary Location: Atlanta, Georgia\n...",developer primary location atlanta georgia vso...,2,15
794,3756105731,Software Engineer,Minimum qualifications:\n\nBachelor's degree o...,Senior,Full-time,New York,IT,https://www.linkedin.com/jobs/view/3756105731/...,Software Engineer Minimum qualifications:\n\nB...,software engineer minimum bachelor equivalent ...,2,17
1620,3699080459,Software Engineer,Note: By applying to this position you will ha...,Senior,Full-time,Massachusetts,IT,https://www.linkedin.com/jobs/view/3699080459/...,Software Engineer Note: By applying to this po...,software engineer note applying share location...,2,17
1621,3699080440,Software Engineer,Note: By applying to this position you will ha...,Senior,Full-time,California,IT,https://www.linkedin.com/jobs/view/3699080440/...,Software Engineer Note: By applying to this po...,software engineer note applying share location...,2,17
1054,3749901223,Software Engineer,Date Posted:\n\n2023-10-31\n\nCountry:\n\nUnit...,Beginner,Full-time,Virginia,IT,https://www.linkedin.com/jobs/view/3749901223/...,Software Engineer Date Posted:\n\n2023-10-31\n...,software engineer date posted country america ...,2,17
2010,3693067631,Developer,EXPERT LEVEL DEVELOPER AND ARCHITECT EXPERIENC...,Senior,Contract,Indiana,IT,https://www.linkedin.com/jobs/view/3693067631/...,Developer EXPERT LEVEL DEVELOPER AND ARCHITECT...,developer expert level developer architect mic...,2,15
1854,3694105120,Developer,Akkodis is seeking a Senior Informatica Develo...,Senior,Contract,Georgia,IT,https://www.linkedin.com/jobs/view/3694105120/...,Developer Akkodis is seeking a Senior Informat...,developer akkodis seeking senior informatica d...,2,0
2175,3693046152,Data Engineer,Combine your technical expertise and problem-s...,Senior,Full-time,Florida,IT,https://www.linkedin.com/jobs/view/3693046152/...,Data Engineer Combine your technical expertise...,data engineer combine technical expertise prob...,2,0


In [16]:
# Healthcare example
job_id_it = np.where(industries == "Healthcare")[0][200]
print("ID:",job_id_it)
print("\nJob Title:",df.loc[job_id_it,"job_title"])

# Get similar jobs
job_similarities = eu_df.iloc[job_id_it]
similar_job_idxs = pd.DataFrame(job_similarities.rename("similarities")).sort_values(by = "similarities", ascending = True)
top_10 = similar_job_idxs[:10]
print('\nTop 10 Similar Jobs:')
df.iloc[top_10.index]

ID: 373

Job Title: Dental Hygienist

Top 10 Similar Jobs:


Unnamed: 0,job_id,job_title,job_desc,experience_level,work_type,location,industry,job_posting_url,texts,texts_preprocessed,cluster_labels_lda1,cluster_labels_tfidf2
373,3757491682,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Full-time,Georgia,Healthcare,https://www.linkedin.com/jobs/view/3757491682/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
311,3757496305,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Full-time,New York,Healthcare,https://www.linkedin.com/jobs/view/3757496305/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
365,3757492550,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Full-time,New York,Healthcare,https://www.linkedin.com/jobs/view/3757492550/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
346,3757493562,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Full-time,New Hampshire,Healthcare,https://www.linkedin.com/jobs/view/3757493562/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
282,3757498061,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Full-time,New Hampshire,Healthcare,https://www.linkedin.com/jobs/view/3757498061/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
364,3757492552,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Part-time,Missouri,Healthcare,https://www.linkedin.com/jobs/view/3757492552/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
301,3757497133,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Full-time,South Carolina,Healthcare,https://www.linkedin.com/jobs/view/3757497133/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
341,3757493625,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Full-time,North Carolina,Healthcare,https://www.linkedin.com/jobs/view/3757493625/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
343,3757493569,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Full-time,Maryland,Healthcare,https://www.linkedin.com/jobs/view/3757493569/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7
358,3757492656,Dental Hygienist,"At Aspen Dental, we put You First with world-c...",Beginner,Part-time,Washington,Healthcare,https://www.linkedin.com/jobs/view/3757492656/...,"Dental Hygienist At Aspen Dental, we put You F...",hygienist aspen put first worldclass developme...,0,7


In [17]:
# Finance example
job_id_it = np.where(industries == "Finance")[0][0]
print("ID:",job_id_it)
print("\nJob Title:",df.loc[job_id_it,"job_title"])

# Get similar jobs
job_similarities = eu_df.iloc[job_id_it]
similar_job_idxs = pd.DataFrame(job_similarities.rename("similarities")).sort_values(by = "similarities", ascending = True)
top_10 = similar_job_idxs[:10]
print('\nTop 10 Similar Jobs:')
df.iloc[top_10.index]

ID: 8

Job Title: Banker

Top 10 Similar Jobs:


Unnamed: 0,job_id,job_title,job_desc,experience_level,work_type,location,industry,job_posting_url,texts,texts_preprocessed,cluster_labels_lda1,cluster_labels_tfidf2
8,3757921025,Banker,"What You Will Do\n\nAs a Teller/Banker, you wi...",Beginner,Full-time,Texas,Finance,https://www.linkedin.com/jobs/view/3757921025/...,"Banker What You Will Do\n\nAs a Teller/Banker,...",banker tellerbanker often deliver first impres...,0,2
13,3757916578,Banker,Job Description\n\nIt’s about interactions mor...,Beginner,Full-time,Texas,Finance,https://www.linkedin.com/jobs/view/3757916578/...,Banker Job Description\n\nIt’s about interacti...,banker description interaction transaction lov...,0,2
878,3755587827,Accountant,Use your skills to transform your local commun...,Intermediate,Full-time,North Carolina,Finance,https://www.linkedin.com/jobs/view/3755587827/...,Accountant Use your skills to transform your l...,accountant use transform goodwill northwest no...,0,3
200,3757726367,Banker,"Ameris Bank is a purpose-driven company, dedic...",Intermediate,Full-time,Georgia,Finance,https://www.linkedin.com/jobs/view/3757726367/...,Banker Ameris Bank is a purpose-driven company...,banker ameris bank purposedriven dedicated bri...,0,2
893,3755585644,Accountant,We are searching for a dynamic Accountant to j...,Intermediate,Full-time,Virginia,Finance,https://www.linkedin.com/jobs/view/3755585644/...,Accountant We are searching for a dynamic Acco...,accountant searching dynamic accountant suppor...,0,14
947,3755575997,Tax Professional,My client is a rapidly growing Boutique CPA fi...,Senior,Full-time,Colorado,Finance,https://www.linkedin.com/jobs/view/3755575997/...,Tax Professional My client is a rapidly growin...,tax rapidly growing boutique cpa firm boulder ...,0,17
1068,3749385938,Tax Professional,"Sony Corporation of America, located in New Yo...",Beginner,Full-time,New York,Finance,https://www.linkedin.com/jobs/view/3749385938/...,"Tax Professional Sony Corporation of America, ...",tax sony corporation america located york ny h...,0,8
1602,3699084213,Tax Professional,Status Category:\nFull-Time\nExempt/Non-Exempt...,Senior,Full-time,Massachusetts,Finance,https://www.linkedin.com/jobs/view/3699084213/...,Tax Professional Status Category:\nFull-Time\n...,tax category fulltime exemptnonexempt exempt s...,0,14
919,3755580830,Account Executive,Job Description\n\n Position Title: Sales Acc...,Senior,Full-time,Washington,Finance,https://www.linkedin.com/jobs/view/3755580830/...,Account Executive Job Description\n\n Position...,account executive description title sale accou...,0,18
707,3756131699,Account Executive,Job Description\n\n Position Title: Sales Acc...,Senior,Full-time,Washington,Finance,https://www.linkedin.com/jobs/view/3756131699/...,Account Executive Job Description\n\n Position...,account executive description title sale accou...,0,18


## **First Recommendation Approach: Find user cluster using champion classification model and evaluate COSINE similarity score between user data and jobs within the same predicted cluster to find the top 10 job recommendations**

In [18]:
# Define an artificial user test data
cols = ["job_title","experience_level","work_type","location","industry"]
test_user1 = df.loc[101,cols]
user_json = test_user1.to_json()
user = json.loads(user_json)

# take skills and education as the "job_desc"
skills = "patient care"
education = "diploma"
user["description"] = skills + ' ' + education

user

{'job_title': 'Nursing Assistant',
 'experience_level': 'Beginner',
 'work_type': 'Part-time',
 'location': 'California',
 'industry': 'Healthcare',
 'description': 'patient care diploma'}

In [19]:
# concatenation of user input
user["texts"] = user["job_title"] + " "+ user["experience_level"] + " " + user["work_type"] + " " + user["location"] +" "+ user["industry"] +" "+ user["description"]
user["texts"]

'Nursing Assistant Beginner Part-time California Healthcare patient care diploma'

In [20]:
# preprocess user input
user["texts_preprocessed"] = preprocess_doc(user["texts"])
user["texts_preprocessed"]

'nursing assistant beginner parttime california healthcare patient care diploma'

In [21]:
# perform LDA vectorization
user_tokens = list(nltk.word_tokenize(user["texts_preprocessed"]))
user_corpus = lda_vect[dictionary.doc2bow(user_tokens)]
dense_vector = gensim.matutils.sparse2full(user_corpus,lda_vect.num_topics)
dense_vector

array([0.       , 0.9862543, 0.       ], dtype=float32)

In [22]:
# convert to data frame
user_lda_df = pd.DataFrame(dense_vector).T
user_lda_df

Unnamed: 0,0,1,2
0,0.0,0.986254,0.0


In [23]:
# get user cluster using champion model
user_cluster = champ_model.predict(user_lda_df)
user_cluster[0]

1

In [24]:
# filter out jobs within the same user cluster and calculate similarity scores
cluster_jobs = df_lda[df_lda["cluster_labels_lda1"] == user_cluster[0]]
similarity_scores =  cosine_similarity(dense_vector.reshape(1,-1), cluster_jobs.drop(columns=['cluster_labels_lda1']))
similarity_df = pd.DataFrame(similarity_scores.reshape(-1,1),index = cluster_jobs.index)
# find top 10 similar jobs to recommend
similarity_sorted = similarity_df.sort_values(by = [0],ascending = False)
top_10 = similarity_sorted[:10]
top_10_jobs = df.iloc[top_10.index,:]
print('Top 10 jobs to be recommended to user:')
top_10_jobs

Top 10 jobs to be recommended to user:


Unnamed: 0,job_id,job_title,job_desc,experience_level,work_type,location,industry,job_posting_url,texts,texts_preprocessed,cluster_labels_lda1,cluster_labels_tfidf2
1027,3753090321,Nurse,Description\n\nIntroduction\n\nAre you looking...,Senior,Full-time,Kansas,Healthcare,https://www.linkedin.com/jobs/view/3753090321/...,Nurse Description\n\nIntroduction\n\nAre you l...,nurse description introduction diversity inclu...,1,16
1118,3749372738,Nurse,Anders Group is seeking a travel nurse RN Card...,Senior,Part-time,California,Healthcare,https://www.linkedin.com/jobs/view/3749372738/...,Nurse Anders Group is seeking a travel nurse R...,nurse anders group seeking travel nurse rn car...,1,5
1116,3749372744,Nurse,Coast Medical Service is seeking a travel nurs...,Beginner,Part-time,New Jersey,Healthcare,https://www.linkedin.com/jobs/view/3749372744/...,Nurse Coast Medical Service is seeking a trave...,nurse coast medical service seeking travel nur...,1,1
1113,3749373698,Nurse,Host Healthcare is seeking a travel nurse RN T...,Beginner,Part-time,Illinois,Healthcare,https://www.linkedin.com/jobs/view/3749373698/...,Nurse Host Healthcare is seeking a travel nurs...,nurse host healthcare seeking travel nurse rn ...,1,1
1112,3749373703,Nurse,Premier Medical Staffing is seeking a travel n...,Senior,Full-time,Ohio,Healthcare,https://www.linkedin.com/jobs/view/3749373703/...,Nurse Premier Medical Staffing is seeking a tr...,nurse premier medical staffing seeking travel ...,1,1
1111,3749373704,Nurse,Coast Medical Service is seeking a travel nurs...,Senior,Part-time,Massachusetts,Healthcare,https://www.linkedin.com/jobs/view/3749373704/...,Nurse Coast Medical Service is seeking a trave...,nurse coast medical service seeking travel nur...,1,1
1110,3749373708,Nurse,Host Healthcare is seeking a travel nurse RN M...,Senior,Part-time,Ohio,Healthcare,https://www.linkedin.com/jobs/view/3749373708/...,Nurse Host Healthcare is seeking a travel nurs...,nurse host healthcare seeking travel nurse rn ...,1,1
1109,3749373719,Nurse,Stability Healthcare is seeking a travel nurse...,Beginner,Full-time,California,Healthcare,https://www.linkedin.com/jobs/view/3749373719/...,Nurse Stability Healthcare is seeking a travel...,nurse stability healthcare seeking travel nurs...,1,1
1108,3749373720,Nurse,Stability Healthcare is seeking a travel nurse...,Beginner,Full-time,California,Healthcare,https://www.linkedin.com/jobs/view/3749373720/...,Nurse Stability Healthcare is seeking a travel...,nurse stability healthcare seeking travel nurs...,1,1
1107,3749373729,Nurse,Host Healthcare is seeking a travel nurse RN I...,Senior,Part-time,Alabama,Healthcare,https://www.linkedin.com/jobs/view/3749373729/...,Nurse Host Healthcare is seeking a travel nurs...,nurse host healthcare seeking travel nurse rn ...,1,1


## **Second Recommendation Approach: Find user cluster using champion classification model and evaluate EUCLIDEAN similarity score between user data and jobs within the same predicted cluster to find the top similar job and then recommend the top 10 most similar jobs to this job**

In [25]:
# filter out jobs within the same user cluster and calculate similarity scores
get_closest_job = euclidean_distances(dense_vector.reshape(1,-1),cluster_jobs.drop(columns = "cluster_labels_lda1"))
get_closest_job_df = pd.DataFrame(get_closest_job.reshape(-1,1),index = cluster_jobs.index)
# sort jobs from most similar to user profile to least simmilar
get_closest_job_sorted = get_closest_job_df.sort_values(by = [0],ascending = True)
recommended_jobs = df.iloc[get_closest_job_sorted.index,:]
# find the most similar job
closest_job_id = recommended_jobs[recommended_jobs["job_title"] == user["job_title"]].index[0]
print('Most similar job to user profile:')
df[df.index==closest_job_id]

Most similar job to user profile:


Unnamed: 0,job_id,job_title,job_desc,experience_level,work_type,location,industry,job_posting_url,texts,texts_preprocessed,cluster_labels_lda1,cluster_labels_tfidf2
1507,3699431347,Nursing Assistant,Eklego is seeking dedicated and compassionate ...,Beginner,Full-time,Pennsylvania,Healthcare,https://www.linkedin.com/jobs/view/3699431347/...,Nursing Assistant Eklego is seeking dedicated ...,nursing assistant eklego seeking dedicated com...,1,16


## **Hybrid recommedation strategy (main choice)**

- Use first approach to find top similar jobs to user profile in terms of cosine similarity
- Use second approach to find most similar job to user profile and then find the most similar jobs to this job in terms of euclidean similarity
- Recommend the jobs that lie in the intersection of both approaches

### **Assigning values for the user**

In [26]:
# Define an artificial user test data
cols = ["job_title","experience_level","work_type","location","industry"]
test_user1 = df.loc[100,cols]
user_json = test_user1.to_json()
user = json.loads(user_json)

# take skills and education as the "job_desc"
skills = "Machine learning, tensorflow, sql"
education = "Bachelors in computer"
user["description"] = skills + ' ' + education

user

{'job_title': 'Data Scientist',
 'experience_level': 'Beginner',
 'work_type': 'Full-time',
 'location': 'Washington',
 'industry': 'IT',
 'description': 'Machine learning, tensorflow, sql Bachelors in computer'}

In [27]:
# concatenation of user input
user["texts"] = user["job_title"] + " "+ user["experience_level"] + " " + user["work_type"] + " " + user["location"] +" "+ user["industry"] +" "+ user["description"]
user["texts"]

'Data Scientist Beginner Full-time Washington IT Machine learning, tensorflow, sql Bachelors in computer'

In [28]:
# preprocess user input
user["texts_preprocessed"] = preprocess_doc(user["texts"])
user["texts_preprocessed"]

'data scientist beginner fulltime washington machine learning tensorflow sql bachelor computer'

In [29]:
# perform LDA vectorization
user_tokens = list(nltk.word_tokenize(user["texts_preprocessed"]))
user_corpus = lda_vect[dictionary.doc2bow(user_tokens)]
dense_vector = gensim.matutils.sparse2full(user_corpus,lda_vect.num_topics)
dense_vector

array([0.9896581, 0.       , 0.       ], dtype=float32)

In [30]:
# convert to data frame
user_lda_df = pd.DataFrame(dense_vector).T
user_lda_df

Unnamed: 0,0,1,2
0,0.989658,0.0,0.0


In [31]:
# get user cluster using champion model
user_cluster = champ_model.predict(user_lda_df)
user_cluster[0]

2

### **1. Calculating cosine similarity score between user data and all jobs within the same cluster and find top similar jobs**

In [32]:
# filtering out jobs with the same user cluster and calculating similarity scores
cluster_jobs = df_lda[df_lda["cluster_labels_lda1"] == user_cluster[0]]

# calculating user data and jobs data similarity
similarity_scores =  cosine_similarity(dense_vector.reshape(1,-1), cluster_jobs.drop(columns=['cluster_labels_lda1']))
#print(similarity_scores)

# converting to dataframe to get index
similarity_df = pd.DataFrame(similarity_scores.reshape(-1,1),index = cluster_jobs.index)

# sorting similarity score from highest to lowest
similarity_sorted = similarity_df.sort_values(by = [0],ascending = False)

# getting top n jobs
top_n_jobs = df.iloc[similarity_sorted.index,:]

print('Top jobs to be recommended to user:')
top_n_jobs

Top jobs to be recommended to user:


Unnamed: 0,job_id,job_title,job_desc,experience_level,work_type,location,industry,job_posting_url,texts,texts_preprocessed,cluster_labels_lda1,cluster_labels_tfidf2
1245,3749344807,Developer,"Junior Developer, IT HelpdeskLocation: North A...",Senior,Full-time,Texas,IT,https://www.linkedin.com/jobs/view/3749344807/...,"Developer Junior Developer, IT HelpdeskLocatio...",developer junior developer helpdesklocation no...,2,9
1378,3701317947,UI/UX Designer,Role - UI Lead (JavaScript)Location - Columbus...,Senior,Full-time,Ohio,IT,https://www.linkedin.com/jobs/view/3701317947/...,UI/UX Designer Role - UI Lead (JavaScript)Loca...,uiux designer ui lead javascriptlocation colum...,2,17
1432,3701306850,Developer,Role Overview: We are seeking a skilled Salesf...,Senior,Full-time,Washington,IT,https://www.linkedin.com/jobs/view/3701306850/...,Developer Role Overview: We are seeking a skil...,developer overview seeking skilled salesforce ...,2,15
1427,3701307311,Developer,A quick brief of the Job: Java Full stackContr...,Beginner,Contract,California,IT,https://www.linkedin.com/jobs/view/3701307311/...,Developer A quick brief of the Job: Java Full ...,developer quick brief java full stackcontract ...,2,15
1423,3701308803,Data Scientist,"Geomagical Labs is a high-tech AI company, in ...",Intermediate,Contract,California,IT,https://www.linkedin.com/jobs/view/3701308803/...,Data Scientist Geomagical Labs is a high-tech ...,data scientist geomagical lab hightech ai stra...,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
687,3756148205,Developer,Want to learn more about this role and Jobot? ...,Senior,Full-time,North Carolina,IT,https://www.linkedin.com/jobs/view/3756148205/...,Developer Want to learn more about this role a...,developer want learn jobot click jobot logo fo...,2,17
2034,3693065721,UI/UX Designer,"Overview\n Berry Global Berry Global, headqua...",Beginner,Full-time,Indiana,IT,https://www.linkedin.com/jobs/view/3693065721/...,UI/UX Designer Overview\n Berry Global Berry ...,uiux designer overview berry global berry glob...,2,17
1835,3694112271,Account Executive,"Description\nAbout Us\nHiBob helps modern, mid...",Senior,Temporary,New York,Finance,https://www.linkedin.com/jobs/view/3694112271/...,Account Executive Description\nAbout Us\nHiBob...,account executive description hibob modern mid...,2,18
1431,3701306922,Developer,Role: Unisys Cobol Developer\nLocation: Onsite...,Beginner,Full-time,Iowa,IT,https://www.linkedin.com/jobs/view/3701306922/...,Developer Role: Unisys Cobol Developer\nLocati...,developer unisys cobol developer location onsi...,2,15


### **2. Use second approach to find most similar job to user profile and then find the most similar jobs to this job in terms of euclidean similarity**

In [33]:
cosine_matrix = cosine_similarity(df_lda.drop(columns = ['cluster_labels_lda1']))
cosine_df = pd.DataFrame(cosine_matrix)
cosine_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2228,2229,2230,2231,2232,2233,2234,2235,2236,2237
0,1.000000,1.000000,0.999979,0.999983,0.999979,0.994627,0.954004,0.971100,0.536029,0.976589,...,0.238673,0.238673,0.238673,0.0,0.236557,0.0,0.238673,0.0,0.238673,0.238673
1,1.000000,1.000000,0.999972,0.999988,0.999985,0.994727,0.953776,0.970868,0.536828,0.976486,...,0.239617,0.239617,0.239617,0.0,0.237492,0.0,0.239617,0.0,0.239617,0.239617
2,0.999979,0.999972,1.000000,0.999923,0.999915,0.993935,0.955505,0.972628,0.530681,0.977253,...,0.232367,0.232367,0.232367,0.0,0.230307,0.0,0.232367,0.0,0.232367,0.232367
3,0.999983,0.999988,0.999923,1.000000,1.000000,0.995220,0.952606,0.969677,0.540865,0.975950,...,0.244389,0.244389,0.244389,0.0,0.242223,0.0,0.244389,0.0,0.244389,0.244389
4,0.999979,0.999985,0.999915,1.000000,1.000000,0.995283,0.952449,0.969518,0.541399,0.975877,...,0.245022,0.245022,0.245022,0.0,0.242850,0.0,0.245022,0.0,0.245022,0.245022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2233,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.186815,0.000000,0.190242,0.187431,...,0.000000,0.000000,0.000000,1.0,0.132860,1.0,0.000000,1.0,0.000000,0.000000
2234,0.238673,0.239617,0.232367,0.244389,0.245022,0.337920,0.000000,0.000000,0.926650,0.130571,...,1.000000,1.000000,1.000000,0.0,0.991135,0.0,1.000000,0.0,1.000000,1.000000
2235,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.186815,0.000000,0.190242,0.187431,...,0.000000,0.000000,0.000000,1.0,0.132860,1.0,0.000000,1.0,0.000000,0.000000
2236,0.238673,0.239617,0.232367,0.244389,0.245022,0.337920,0.000000,0.000000,0.926650,0.130571,...,1.000000,1.000000,1.000000,0.0,0.991135,0.0,1.000000,0.0,1.000000,1.000000


In [35]:
# filtering jobs based on the user cluster
cluster_jobs = df_lda[df_lda["cluster_labels_lda1"] == user_cluster[0]]

# getting job closest to the user data point based on euclidean distance
close_jobs = euclidean_distances(dense_vector.reshape(1,-1),cluster_jobs.drop(columns = ["cluster_labels_lda1"]))
close_jobs_sim = pd.DataFrame(close_jobs.reshape(-1,1),index = cluster_jobs.index)
close_jobs_df = df.iloc[close_jobs_sim.index,:]
top_job = close_jobs_df[close_jobs_df["job_title"] == user["job_title"]]
top_job = pd.DataFrame(top_job.iloc[[1]])

# comparing the distances of this job (closest to user data point) to other jobs based on the euclidean distances matrix
similar_jobs_scores = cosine_df[[top_job.index[0]]].drop(top_job.index[0])
similar_n_jobs_scores = similar_jobs_scores.nlargest(700,top_job.index)


similar_n_jobs_df = df.iloc[similar_n_jobs_scores.index,:]
similar_n_jobs_df


Unnamed: 0,job_id,job_title,job_desc,experience_level,work_type,location,industry,job_posting_url,texts,texts_preprocessed,cluster_labels_lda1,cluster_labels_tfidf2
708,3756131672,Data Scientist,Milliman’s Medicaid Finance and Policy Practic...,Senior,Full-time,Wisconsin,IT,https://www.linkedin.com/jobs/view/3756131672/...,Data Scientist Milliman’s Medicaid Finance and...,data scientist millimans medicaid finance seek...,1,17
1264,3748844912,Data Scientist,The position of Data Processing Systems Analys...,Intermediate,Full-time,Hawaii,IT,https://www.linkedin.com/jobs/view/3748844912/...,Data Scientist The position of Data Processing...,data scientist data processing system analyst ...,1,3
1774,3697357361,Nurse,Ascend Learning is a national leader in data d...,Senior,Full-time,Kansas,Healthcare,https://www.linkedin.com/jobs/view/3697357361/...,Nurse Ascend Learning is a national leader in ...,nurse ascend learning leader data driven onlin...,1,16
871,3755588935,Nurse,Working with Us\n\nChallenging. Meaningful. Li...,Senior,Full-time,Florida,Healthcare,https://www.linkedin.com/jobs/view/3755588935/...,Nurse Working with Us\n\nChallenging. Meaningf...,nurse challenging meaningful lifechanging aren...,1,16
207,3757724610,Data Scientist,The salary range for this role takes into acco...,Senior,Full-time,Kansas,IT,https://www.linkedin.com/jobs/view/3757724610/...,Data Scientist The salary range for this role ...,data scientist take account wide factor consid...,2,17
...,...,...,...,...,...,...,...,...,...,...,...,...
1433,3701306417,Nurse,Job Title: Staff/Registered NurseLocation: Ric...,Beginner,Contract,Indiana,Healthcare,https://www.linkedin.com/jobs/view/3701306417/...,Nurse Job Title: Staff/Registered NurseLocatio...,nurse title staffregistered nurselocation rich...,1,16
89,3757765713,Banker,Why Wells Fargo\n\nAre you ready for the next ...,Senior,Full-time,Pennsylvania,Finance,https://www.linkedin.com/jobs/view/3757765713/...,Banker Why Wells Fargo\n\nAre you ready for th...,banker fargo ready next step begin known appro...,0,2
985,3755513963,Financial Analyst,We Stand For Something Good\n\nOur secret to l...,Beginner,Full-time,New York,Finance,https://www.linkedin.com/jobs/view/3755513963/...,Financial Analyst We Stand For Something Good\...,financial analyst stand something good secret ...,0,12
1610,3699081709,Banker,Brief Description\nThis position is responsibl...,Senior,Full-time,Pennsylvania,Finance,https://www.linkedin.com/jobs/view/3699081709/...,Banker Brief Description\nThis position is res...,banker brief description responsible performin...,0,2


### **3. Intersection of strategies**

Intersection of jobs (common indexes) from the above two strategies which is the final recommended job list

In [36]:
intersecting_recommendation = similar_n_jobs_df.index.intersection(top_n_jobs.index)
intersecting_recommendation

Index([ 207, 2004, 1431, 1835,  465,  113, 1942, 1044, 1753, 1559,
       ...
       1190,  926, 1315, 2135,  499, 1229,  467, 1595, 1585,  247],
      dtype='int64', length=312)

In [37]:
print('Top jobs to recommend using chatbot:')
df.iloc[intersecting_recommendation].head(20)

Top jobs to recommend using chatbot:


Unnamed: 0,job_id,job_title,job_desc,experience_level,work_type,location,industry,job_posting_url,texts,texts_preprocessed,cluster_labels_lda1,cluster_labels_tfidf2
207,3757724610,Data Scientist,The salary range for this role takes into acco...,Senior,Full-time,Kansas,IT,https://www.linkedin.com/jobs/view/3757724610/...,Data Scientist The salary range for this role ...,data scientist take account wide factor consid...,2,17
2004,3693068277,Software Engineer,The Software Engineering (SE) Analyst II will ...,Beginner,Full-time,Illinois,IT,https://www.linkedin.com/jobs/view/3693068277/...,Software Engineer The Software Engineering (SE...,software engineer software engineering se anal...,2,17
1431,3701306922,Developer,Role: Unisys Cobol Developer\nLocation: Onsite...,Beginner,Full-time,Iowa,IT,https://www.linkedin.com/jobs/view/3701306922/...,Developer Role: Unisys Cobol Developer\nLocati...,developer unisys cobol developer location onsi...,2,15
1835,3694112271,Account Executive,"Description\nAbout Us\nHiBob helps modern, mid...",Senior,Temporary,New York,Finance,https://www.linkedin.com/jobs/view/3694112271/...,Account Executive Description\nAbout Us\nHiBob...,account executive description hibob modern mid...,2,18
465,3757468588,Data Scientist,"Company Description\n\nAt Intuitive, we are un...",Beginner,Full-time,Virginia,IT,https://www.linkedin.com/jobs/view/3757468588/...,Data Scientist Company Description\n\nAt Intui...,data scientist description intuitive behind mi...,2,17
113,3757748126,Data Scientist,The Josef Korbel School of International Studi...,Senior,Internship,Colorado,IT,https://www.linkedin.com/jobs/view/3757748126/...,Data Scientist The Josef Korbel School of Inte...,data scientist josef korbel school internation...,2,0
1942,3693584462,Database Management,If you’re looking to be part of a team that va...,Senior,Full-time,Ohio,IT,https://www.linkedin.com/jobs/view/3693584462/...,Database Management If you’re looking to be pa...,database management youre part value bold thin...,2,9
1044,3749906347,Developer,Senior PHP Developer – 100% Remote\n\nOur Port...,Senior,Full-time,Oregon,IT,https://www.linkedin.com/jobs/view/3749906347/...,Developer Senior PHP Developer – 100% Remote\n...,developer senior php developer remote portland...,2,15
1753,3697361293,Software Engineer,"This internship is in person in Omaha, NE for ...",Beginner,Part-time,Nebraska,IT,https://www.linkedin.com/jobs/view/3697361293/...,Software Engineer This internship is in person...,software engineer internship person omaha ne d...,2,17
1559,3699405298,Data Scientist,About Milliman\n\nIndependent for over 75 year...,Senior,Full-time,California,IT,https://www.linkedin.com/jobs/view/3699405298/...,Data Scientist About Milliman\n\nIndependent f...,data scientist milliman independent milliman d...,2,0
