In [1]:
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/kasra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kasra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kasra/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
df = pd.read_csv('./datasets/job_descriptions.csv')
df = df.sample(frac=0.01)
df.to_csv('./datasets/job_descriptions_sample.csv', index=False)

In [13]:
df = pd.read_csv('./datasets/job_descriptions_sample.csv')
df.dropna(inplace=True)
df.drop(['Job Id', 'latitude', 'longitude', 'Contact Person', 'Contact', 'Company Size', 'Job Posting Date', 'Company Profile', 'Company', 'Job Portal'], axis=1, inplace=True)
df.rename(columns={'skills': 'Skills'}, inplace=True)
df[['Salary Min', 'Salary Max']] = df['Salary Range'].str.split('-', expand=True)
df[['Salary Min', 'Salary Max']] = df[['Salary Min', 'Salary Max']].apply(lambda x: x.str.replace('$', ''))
df[['Salary Min', 'Salary Max']] = df[['Salary Min', 'Salary Max']].apply(lambda x: x.str.replace('K', ''))
df.drop(['Salary Range'], axis=1, inplace=True)
df[['Experience Min', 'Experience Max']] = df['Experience'].str.split('to', expand=True)
df[['Experience Min', 'Experience Max']] = df[['Experience Min', 'Experience Max']].apply(lambda x: x.str.replace('Years', ''))
df.drop(['Experience'], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)
df.head(2)

Unnamed: 0,Qualifications,location,Country,Work Type,Preference,Job Title,Role,Job Description,Benefits,Skills,Responsibilities,Salary Min,Salary Max,Experience Min,Experience Max
0,B.Tech,City of Port Louis,Mauritius,Temporary,Both,Marketing Director,Digital Marketing Director,Digital Marketing Directors lead digital marke...,"{'Life and Disability Insurance, Stock Options...",Digital marketing strategy Marketing analytics...,Develop and oversee digital marketing strategi...,61,80,3,8
1,MBA,Port of Spain,Trinidad and Tobago,Contract,Male,Procurement Manager,Procurement Analyst,"Analyze procurement data, identify cost-saving...","{'Life and Disability Insurance, Stock Options...",Procurement processes Vendor assessment Contra...,Analyze procurement data and trends to support...,55,116,0,12


In [15]:
df.shape

(16118, 15)

In [16]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words]
    
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
    
    preprocessed_text = ' '.join(lemmatized_tokens)
    
    return preprocessed_text


df['Responsibilities'] = df['Responsibilities'].apply(preprocess_text)
df['Job Description'] = df['Job Description'].apply(preprocess_text)
df['Skills'] = df['Skills'].apply(preprocess_text)


In [21]:
vectorizer = TfidfVectorizer()
vectorizer.fit(df['Responsibilities'] + df['Job Description'] + df['Skills'])

responsibilities_vectors = vectorizer.transform(df['Responsibilities'])
job_description_vectors = vectorizer.transform(df['Job Description'])
skills_vectors = vectorizer.transform(df['Skills'])

In [22]:
print(responsibilities_vectors.shape)
print(job_description_vectors.shape)
print(skills_vectors.shape)

(16118, 1542)
(16118, 1542)
(16118, 1542)


In [23]:
combined_vectors = responsibilities_vectors + job_description_vectors + skills_vectors

combined_similarity_matrix = cosine_similarity(combined_vectors)

related_indices = combined_similarity_matrix[0].argsort()[::-1]
closest_indices = related_indices[1:11]

print(f"Closest to index 0: {closest_indices}")

Closest to index 0: [16068  9623 14125 14111 13030  9713 12322 10102  3927  3889]
