# Job Recommendation System

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# turn off warnings
warnings.filterwarnings('ignore')

In [35]:
# load data
df = pd.read_csv('../data/job_posting_clean.csv')
df.head()

Unnamed: 0,title,description,views,work_type,location,tokenized_Description,tokenized_desc_join
0,Licensed Insurance Agent,While many industries were hurt by the last fe...,5.0,FULL_TIME,"Chico, CA","['many', 'industries', 'hurt', 'last', 'people...",many industries hurt last people still need in...
1,Sales Manager,Are you a dynamic and creative marketing profe...,0.0,FULL_TIME,"Santa Clarita, CA","['dynamic', 'creative', 'marketing', 'professi...",dynamic creative marketing professional lookin...
2,Model Risk Auditor,Join Us as a Model Risk Auditor – Showcase You...,17.0,CONTRACT,"New York, NY","['join', 'us', 'model', 'risk', 'auditor', 'sh...",join us model risk auditor showcase financial ...
3,Business Manager,Business ManagerFirst Baptist Church ForneyFor...,0.0,FULL_TIME,"Forney, TX","['business', 'managerfirst', 'baptist', 'churc...",business managerfirst baptist church forneyfor...
4,NY Studio Assistant,YOU COULD BE ONE OF THE MAGIC MAKERS\nKen Fulk...,2.0,FULL_TIME,"New York, NY","['could', 'one', 'magic', 'makers', 'ken', 'fu...",could one magic makers ken fulk inc seeking st...


In [36]:
df.shape

(15886, 7)

There are users who has rated only once, even if they have rated it 5, it can't be considered a valuable record for recommendation. So I have considered minimum 200 ratings by the user as threshold value. You can play around changing the threshold value to get better results, but this worked fine

In [37]:
# filter out views less than 5
df = df[df['views'] >= 5]

In [38]:
df.shape

(10430, 7)

In [39]:
job_pivot = df.pivot_table(index='title',columns=df.index,values='views').fillna(0)
job_pivot.head()

Unnamed: 0_level_0,0,2,5,6,7,10,11,12,14,30,...,15872,15873,15874,15875,15876,15877,15878,15882,15884,15885
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"AG - BIM - Opportunistic Hires (Recruitment) 1085 ,",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
React JS + Java Full stack Developer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Account Manager,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Accounting Associate (2196),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Associate Director Marketing Asset Management,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
from scipy.sparse import csr_matrix

job_matrix = csr_matrix(job_pivot.values)

from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(job_matrix)

In [41]:
query_index = np.random.choice(job_pivot.shape[0])
print(query_index)
distances, indices = model_knn.kneighbors(job_pivot.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 6)

5983


In [42]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(job_pivot.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, job_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Senior HVAC Engineer:

1: Project Manager - Traveling, with distance of 1.0:
2: Project Manager / Construction Project Manager, with distance of 1.0:
3: Project Manager - Supply Chain, with distance of 1.0:
4: Project Manager 2, with distance of 1.0:
5: Project Manager -General Contractor- K-12 Construction, with distance of 1.0:


# Furtherwork

In [43]:
import re
def text_cleaning(text):
    text = re.sub(r'&quot;', '', text)
    text = re.sub(r'.hack//', '', text)
    text = re.sub(r'&#039;', '', text)
    text = re.sub(r'A&#039;s', '', text)
    text = re.sub(r'I&#039;', 'I\'', text)
    text = re.sub(r'&amp;', 'and', text)
    
    return text

df['description'] = df['description'].apply(text_cleaning)
df['tokenized_Description'] = df['tokenized_Description'].apply(text_cleaning)

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3),
            stop_words = 'english')

# Filling NaNs with empty string
df['tokenized_Description'] = df['tokenized_Description'].fillna('')
genres_str = df['tokenized_Description'].str.split(',').astype(str)
tfv_matrix = tfv.fit_transform(genres_str)

In [45]:
tfv_matrix.shape

(10430, 396095)

In [46]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Compute the sigmoid kernel
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

In [47]:
len(sig)

10430

In [59]:
indices = pd.Series(df.index, index=df['title']).drop_duplicates()
indices.head()

title
Licensed Insurance Agent    0
Model Risk Auditor          2
Office Associate            5
Education Manager           6
Civil Engineer              7
dtype: int64

In [60]:
indices_frame = pd.DataFrame(indices).reset_index().drop_duplicates(subset = 'title')
# rename the columns
indices_frame.columns = ['title', 'views']
indices_frame['work type'] = df['work_type']
indices_frame['location'] = df['location']
indices_frame.head()

Unnamed: 0,title,views,work type,location
0,Licensed Insurance Agent,0,FULL_TIME,"Chico, CA"
1,Model Risk Auditor,2,,
2,Office Associate,5,CONTRACT,"New York, NY"
3,Education Manager,6,,
4,Civil Engineer,7,,


In [50]:
# select item name Civil Engineer from indices_frame
test = indices_frame[indices_frame['title'] == 'Civil Engineer']
test.index.to_numpy()[0]

4

In [51]:
def give_rec(titlename, sig=sig):
    # Get the index corresponding to original_title
    idx = indices_frame[indices_frame['title'] == titlename]['views']
    idx = idx.index.to_numpy()[0]

    # Get the pairwsie similarity scores 
    sig_scores = list(enumerate(sig[idx]))

    # Sort the movies 
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)

    # Scores of the 10 most similar movies
    sig_scores = sig_scores[1:11]

    # Movie indices
    anime_indices = [i[0] for i in sig_scores]

    # Top 10 most similar movies
    return pd.DataFrame({'Job Title': df['title'].iloc[anime_indices].values,
                                 'View': df['views'].iloc[anime_indices].values})

In [52]:
give_rec('Civil Engineer').sort_values(by='View', ascending=False).head()

Unnamed: 0,Job Title,View
2,Director of Engineering,124.0
8,Product Development Engineer,68.0
7,Interior Designer,44.0
3,Project Consultant,31.0
4,Computer Aided Design Technician,22.0


# Text Matching For Jobs
- We can only use the existing job in the dataset, but job can often be similar but with slight variation
- We will use fuzzy wuzzy matching to find the most similar job with in the dataset that the user input and use the most similar result for the computation

In [53]:
import fuzzywuzzy
from fuzzywuzzy import process

In [54]:
def job_matcher(df, column, string_to_match, min_ratio=85):
    # get a list of unique strings
    strings = df[column].unique()

    # get the top 10 closest matches to our input string
    matches = fuzzywuzzy.process.extract(
        string_to_match,
        strings,
        limit=10,
        scorer=fuzzywuzzy.fuzz.token_sort_ratio,
    )

    # only get matches with a ratio > 88
    close_matches = [
        matches[0] for matches in matches if matches[1] >= min_ratio
    ]

    # get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)
    
    return df.loc[rows_with_matches]['title'].to_numpy()[0]

job_matcher(df, 'title', 'Civil Engineer')


'Civil Engineer'