In [30]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Fetch the NLTK resources used later in the notebook:
#   - 'punkt' / 'punkt_tab': tokenizer models behind word_tokenize. Newer NLTK
#     releases load 'punkt_tab' instead of the pickled 'punkt' model, so both
#     are requested to keep the notebook runnable across NLTK versions.
#   - 'stopwords': the English stop-word list.
#   - 'wordnet': the dictionary used by WordNetLemmatizer.
# nltk.download() is a no-op when the package is already up to date.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
# Kept as the last expression so the cell still displays its return value.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/kasra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kasra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kasra/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Build the working sample: read the full job-descriptions CSV, keep a random
# 1% of its rows and write that subset to disk for the cells below.
# A fixed seed makes the sample (and therefore every downstream result)
# identical across "Restart & Run All" executions.
SAMPLE_SEED = 42

df = pd.read_csv('./datasets/job_descriptions.csv')
df = df.sample(frac=0.01, random_state=SAMPLE_SEED)
df.to_csv('./datasets/job_descriptions_sample.csv', index=False)

In [48]:
# Load the sampled postings and derive the cleaned frame used by the rest of
# the notebook. Steps, in order:
#   1. drop rows with any missing value (checked across ALL original columns),
#   2. drop identifier / contact / company columns that are not analysed,
#   3. normalise the 'skills' column name to 'Skills',
#   4. drop postings whose four free-text fields are all identical,
#   5. split 'Salary Range' and 'Experience' into numeric min / max columns.
df = (
    pd.read_csv('./datasets/job_descriptions_sample.csv')
    .dropna()
    .drop(columns=[
        'Job Id', 'latitude', 'longitude', 'Contact Person', 'Contact',
        'Company Size', 'Job Posting Date', 'Company Profile', 'Company',
        'Job Portal',
    ])
    .rename(columns={'skills': 'Skills'})
    .drop_duplicates(subset=['Job Description', 'Benefits', 'Skills', 'Responsibilities'])
    .reset_index(drop=True)
)

# regex=False: '$' is a regex end-of-string anchor, so the literal-vs-regex
# default (which differs between pandas versions) must not be relied upon.
# n=1: always yields at most two parts, so the two-column assignment cannot
# fail on a value containing an extra separator.
salary_bounds = (
    df['Salary Range']
    .str.replace('$', '', regex=False)
    .str.replace('K', '', regex=False)
    .str.split('-', n=1, expand=True)
)
df['Salary Min'] = pd.to_numeric(salary_bounds[0], errors='coerce')
df['Salary Max'] = pd.to_numeric(salary_bounds[1], errors='coerce')

# Same treatment for the experience bounds so they are numeric like the salary
# columns; strip() removes whatever padding is left around the separator and
# the removed 'Years' suffix. Unparseable values become NaN.
experience_bounds = (
    df['Experience']
    .str.replace('Years', '', regex=False)
    .str.split('to', n=1, expand=True)
)
df['Experience Min'] = pd.to_numeric(experience_bounds[0].str.strip(), errors='coerce')
df['Experience Max'] = pd.to_numeric(experience_bounds[1].str.strip(), errors='coerce')

# The raw range columns are redundant once the bounds have been extracted.
df = df.drop(columns=['Salary Range', 'Experience'])
df.head(2)

Unnamed: 0,Qualifications,location,Country,Work Type,Preference,Job Title,Role,Job Description,Benefits,Skills,Responsibilities,Salary Min,Salary Max,Experience Min,Experience Max
0,MBA,Montevideo,Uruguay,Full-Time,Female,Network Security Specialist,Security Consultant,A Security Consultant is responsible for asses...,"{'Transportation Benefits, Professional Develo...",Security consulting Risk assessment Security a...,Provide expert advice on security strategies a...,64,120,3,15
1,B.Com,Cockburn Town,Turks and Caicos Islands,Part-Time,Female,Architect,Sustainable Design Specialist,Sustainable Design Specialists incorporate eco...,"{'Tuition Reimbursement, Stock Options or Equi...",Sustainable design principles Energy efficienc...,Focus on sustainable and eco-friendly architec...,60,113,3,12


In [49]:
# Show the (rows, columns) dimensions of the cleaned frame.
df.shape

(3936, 15)

In [39]:
# Shared NLTK helpers, built once and reused by preprocess_text.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Normalise a piece of free text for vectorisation.

    The text is lower-cased and tokenised with ``word_tokenize``; tokens found
    in the English stop-word set are discarded; each remaining token is
    Porter-stemmed and the stem is then passed through the WordNet lemmatizer.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    normalised = []
    for word in word_tokenize(text.lower()):
        if word in stop_words:
            continue
        # Stem first, then lemmatize the stem (same order as before).
        normalised.append(lemmatizer.lemmatize(stemmer.stem(word)))
    return ' '.join(normalised)


# Replace each free-text column with its preprocessed form.
# NOTE: this overwrites the columns, so re-running the cell preprocesses
# already-preprocessed text.
for text_column in ('Responsibilities', 'Job Description', 'Skills'):
    df[text_column] = df[text_column].apply(preprocess_text)


In [40]:
# Fit one TF-IDF vocabulary over all three text fields so that the per-field
# matrices below share the same feature space (same columns) and can be added.
vectorizer = TfidfVectorizer()

# Join the fields with a space: plain `+` concatenation would glue the last
# word of one field to the first word of the next and add that fused string
# to the vocabulary as a bogus term.
fit_corpus = df['Responsibilities'] + ' ' + df['Job Description'] + ' ' + df['Skills']
vectorizer.fit(fit_corpus)

# One sparse TF-IDF matrix per field, one row per posting.
responsibilities_vectors = vectorizer.transform(df['Responsibilities'])
job_description_vectors = vectorizer.transform(df['Job Description'])
skills_vectors = vectorizer.transform(df['Skills'])

In [41]:
# Sanity check: print the shape of each per-field TF-IDF matrix.
for tfidf_matrix in (responsibilities_vectors, job_description_vectors, skills_vectors):
    print(tfidf_matrix.shape)

(3936, 1542)
(3936, 1542)
(3936, 1542)


In [42]:
# Combine the three per-field TF-IDF matrices by element-wise addition and
# compute the pairwise cosine similarity between all postings.
combined_vectors = responsibilities_vectors + job_description_vectors + skills_vectors

combined_similarity_matrix = cosine_similarity(combined_vectors)

query_index = 0   # posting to find neighbours for
top_k = 10        # number of neighbours to report

# Row positions ordered from most to least similar to the query posting.
related_indices = combined_similarity_matrix[query_index].argsort()[::-1]

# Remove the query posting explicitly instead of assuming it is ranked first:
# another posting with an equal similarity score (or floating-point rounding
# of the self-similarity) can push it out of position 0.
closest_indices = related_indices[related_indices != query_index][:top_k]

print(f"Closest to index {query_index}: {closest_indices}")

Closest to index 0: [  13 3103 2898 2298 2812  403 1732 2497  466 3171]


In [29]:
# Find the N postings whose (Salary Min, Salary Max) pair is closest, in
# Euclidean distance, to that of the target posting.
target_row_index = 0   # index label of the reference posting
N = 10                 # number of similar postings to return

target_salary_min = df.loc[target_row_index, 'Salary Min']
target_salary_max = df.loc[target_row_index, 'Salary Max']

# Euclidean distance in the 2-D (min, max) salary space; NaN where either
# salary bound is missing.
distances = np.hypot(df['Salary Min'] - target_salary_min,
                     df['Salary Max'] - target_salary_max)

# Work on positions with NumPy: Series.argsort marks NaN entries with -1,
# which iloc would silently interpret as "last row". np.argsort orders NaN
# last instead, and kind='stable' makes the order of ties deterministic.
distance_values = distances.to_numpy()
sorted_indices = np.argsort(distance_values, kind='stable')

# Translate the target's label to its row position (assumes a unique index).
target_position = df.index.get_loc(target_row_index)

# Drop the target itself explicitly (several postings can share distance 0,
# so it is not guaranteed to be ranked first) and ignore rows without a
# computable distance.
is_candidate = (sorted_indices != target_position) & ~np.isnan(distance_values[sorted_indices])
most_similar_rows = df.iloc[sorted_indices[is_candidate][:N]]
