In [113]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from transformers import BertTokenizer, BertModel

In [114]:
# Load the raw job listings; the path is relative to the notebook's working directory
df = pd.read_csv(
    filepath_or_buffer=r'data/mini-job-rec-dataset - jobs_data.csv',
)

In [115]:
# Display the loaded jobs table
df

Unnamed: 0,job_id,job_title,specialization,job_description,skillsets,year_of_experience
0,1,Junior Accountant,Accounting & Finance,"Assists in financial record-keeping, prepares ...","Accounting principles, Excel, attention to detail",1-2
1,2,Financial Analyst,Accounting & Finance,Analyzes financial data to assist in business ...,"Financial modeling, data analysis, Excel",2-4
2,3,Tax Consultant,Accounting & Finance,Advises clients on tax planning and compliance.,"Tax law, analytical skills, communication",3-5
3,4,Audit Manager,Accounting & Finance,"Oversees auditing processes, ensures complianc...","Auditing, critical thinking, leadership",5+
4,5,Entry-Level Software Developer,Software Developer,,"Programming languages, HTML, CSS",0-2
5,6,Full-Stack Developer,Software Developer,Develops front-end and back-end aspects of web...,"Full-stack development, database management",3-5
6,7,Mobile App Developer,Software Developer,Specializes in creating applications for mobil...,"Mobile development, UI/UX design, Java/Kotlin",2-4
7,8,Software Architect,,Designs the overall structure of software syst...,"System design, programming, leadership",6+
8,9,Receptionist,Admin,"Manages front desk, answers calls, and assists...","Customer service, organizational skills",0-2
9,10,Executive Assistant,Admin,Provides comprehensive support to senior execu...,"Time management, discretion, MS Office",


In [116]:
# Count the missing (NaN) entries in each column of df
df.isnull().sum()

job_id                0
job_title             0
specialization        2
job_description       1
skillsets             1
year_of_experience    1
dtype: int64

In [117]:
# Missing job_description / skillsets entries become empty strings so the
# text columns can be concatenated later without producing NaN
df = df.fillna({"job_description": "", "skillsets": ""})

In [118]:
# NOTE: a commented-out draft derived min_year_of_experience (the lower bound of
# the year_of_experience range); it is not used.

# year_of_experience is not required, so remove it
df = df.drop(columns=["year_of_experience"])

# Text used to train the specialization classifier: title + description + skillsets
df["combined_description"] = df["job_title"].str.cat(
    [df["job_description"], df["skillsets"]],
    sep=" ",
)

# Merge the "IT" and "Software Developer" labels into "IT & Software Developer";
# every other value (including NaN) is left as it is
is_it_or_software = df["specialization"].isin(["IT", "Software Developer"])
df["specialization"] = df["specialization"].mask(is_it_or_software, "IT & Software Developer")

In [119]:
# Display the table after the preprocessing steps
df

Unnamed: 0,job_id,job_title,specialization,job_description,skillsets,combined_description
0,1,Junior Accountant,Accounting & Finance,"Assists in financial record-keeping, prepares ...","Accounting principles, Excel, attention to detail",Junior Accountant Assists in financial record-...
1,2,Financial Analyst,Accounting & Finance,Analyzes financial data to assist in business ...,"Financial modeling, data analysis, Excel",Financial Analyst Analyzes financial data to a...
2,3,Tax Consultant,Accounting & Finance,Advises clients on tax planning and compliance.,"Tax law, analytical skills, communication",Tax Consultant Advises clients on tax planning...
3,4,Audit Manager,Accounting & Finance,"Oversees auditing processes, ensures complianc...","Auditing, critical thinking, leadership","Audit Manager Oversees auditing processes, ens..."
4,5,Entry-Level Software Developer,IT & Software Developer,,"Programming languages, HTML, CSS",Entry-Level Software Developer Programming la...
5,6,Full-Stack Developer,IT & Software Developer,Develops front-end and back-end aspects of web...,"Full-stack development, database management",Full-Stack Developer Develops front-end and ba...
6,7,Mobile App Developer,IT & Software Developer,Specializes in creating applications for mobil...,"Mobile development, UI/UX design, Java/Kotlin",Mobile App Developer Specializes in creating a...
7,8,Software Architect,,Designs the overall structure of software syst...,"System design, programming, leadership",Software Architect Designs the overall structu...
8,9,Receptionist,Admin,"Manages front desk, answers calls, and assists...","Customer service, organizational skills","Receptionist Manages front desk, answers calls..."
9,10,Executive Assistant,Admin,Provides comprehensive support to senior execu...,"Time management, discretion, MS Office",Executive Assistant Provides comprehensive sup...


#### Specialization Classifier

##### Tokenization and Embeddings with BERT

In [120]:
# Run on the GPU when CUDA is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pre-trained BERT tokenizer and model, then move the model to the device
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model = bert_model.to(device)

In [121]:
def get_bert_embeddings(bert_model, bert_tokenizer, text, device):
    """Embed ``text`` as the mean of the model's last-layer token vectors.

    Args:
        bert_model: Callable model whose output exposes ``last_hidden_state``.
        bert_tokenizer: Tokenizer providing ``encode(..., return_tensors="pt")``.
        text: The string to embed.
        device: Torch device the token ids are moved to before the forward pass.

    Returns:
        NumPy array holding the last hidden state averaged over the token axis,
        with the leading batch axis removed.
    """
    with torch.no_grad():
        # truncation=True caps the input at the tokenizer's maximum length;
        # without it, long texts exceed BERT's position limit and the forward
        # pass fails.
        token_ids = bert_tokenizer.encode(
            text,
            add_special_tokens=True,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        outputs = bert_model(token_ids)
        # Average over the token axis (dim=1); squeeze(0) removes only the
        # batch axis, so the hidden axis is never collapsed by accident.
        sentence_embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0)
    return sentence_embedding.detach().cpu().numpy()

In [122]:
# Embed every combined description with BERT; each cell of the new column
# holds the embedding returned for that row's text
df["embedded_combined_description"] = df["combined_description"].map(
    lambda description: get_bert_embeddings(bert_model, tokenizer, description, device)
)

In [123]:
# Set aside the rows with no specialization label. .copy() makes this an
# independent frame, so assigning to its "specialization" column later does not
# write into a slice of df (which triggers SettingWithCopyWarning).
df_nan_specialization = df[df['specialization'].isna()].copy()

# Keep only labelled rows for training. Restricting dropna to the label column
# means a NaN in any other column cannot silently remove a training row.
df = df.dropna(subset=['specialization'])

##### Experimentation with ML Models

In [41]:
# One seed for the split and for every stochastic estimator, so the reported
# accuracies are reproducible across kernel restarts
RANDOM_STATE = 1

# Split data: 20% held out, stratified by specialization
features = np.array(df["embedded_combined_description"].to_list())
labels = df['specialization']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=RANDOM_STATE
)

# Initialize models (tree-based models are seeded; the others keep their defaults)
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
}

# Train and evaluate models using 3-fold cross-validation on the training split
print("Cross-validation results")
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
    print(f'{name} - Cross-Validation Accuracy: {scores.mean():.4f} (±{scores.std():.4f})')

# Test results: fit each model on the full training split, score on the held-out split
print("\nTest set results")
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{name} - Test Accuracy: {accuracy:.4f}')

Cross-validation results
Logistic Regression - Cross-Validation Accuracy: 0.9333 (±0.0471)
Decision Tree - Cross-Validation Accuracy: 0.5333 (±0.0943)
Random Forest - Cross-Validation Accuracy: 0.7667 (±0.0471)
SVM - Cross-Validation Accuracy: 0.7000 (±0.0816)
KNN - Cross-Validation Accuracy: 0.8333 (±0.1247)

Test set results
Logistic Regression - Test Accuracy: 0.8750
Decision Tree - Test Accuracy: 0.5000
Random Forest - Test Accuracy: 0.7500
SVM - Test Accuracy: 0.6250
KNN - Test Accuracy: 0.7500


##### GridSearch

In [43]:
# Tune Logistic Regression with an exhaustive, cross-validated grid search
logistic_regression_model = LogisticRegression()

# Candidate settings: 6 values of C x 3 values of max_iter = 18 combinations
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    max_iter=[100, 200, 300],
)

grid_search = GridSearchCV(
    estimator=logistic_regression_model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# Score the best estimator found by the search on the held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy with Best Model: {accuracy:.4f}')

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best Parameters: {'C': 1, 'max_iter': 100}
Test Accuracy with Best Model: 0.8750


In [None]:
# Save ML model
model_path = Path('models/spec_cls.pkl')
# Create models/ on first run; opening the file for writing would otherwise
# raise FileNotFoundError when the directory does not exist yet
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as file:
    pickle.dump(best_model, file)

#### Matching job using BERT embeddings

In [44]:
# Load ML model
# NOTE(review): pickle.load can execute arbitrary code contained in the file.
# Only load a models/spec_cls.pkl that was written by the save cell of this
# notebook (or another trusted source).
with open('models/spec_cls.pkl', 'rb') as file:
    spec_cls_model = pickle.load(file)


def get_top_n_indices(array, n):
    """Return the positions of the ``n`` largest values in ``array``, largest first.

    ``array`` must provide ``argsort()`` (e.g. a NumPy array). When ``n`` exceeds
    the number of elements, every position is returned.
    """
    descending_order = array.argsort()[::-1]
    return descending_order[:n]

# Predict specialization
def predict_spec(spec_cls_model, bert_embedding, diff_threshold):
    """Return the likeliest specialization, or the top two when they are close.

    Args:
        spec_cls_model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        bert_embedding: Embedding of a single text.
        diff_threshold: Minimum probability gap between the best and second-best
            class for the best class to be returned on its own.

    Returns:
        A one-element list with the best class when the gap is at least
        ``diff_threshold``; otherwise a two-element list, best class first.
    """
    class_probabilities = spec_cls_model.predict_proba([bert_embedding])[0]
    top_two = get_top_n_indices(class_probabilities, 2)
    best_idx = top_two[0]
    runner_up_idx = top_two[1]
    margin = class_probabilities[best_idx] - class_probabilities[runner_up_idx]

    class_labels = spec_cls_model.classes_
    if margin >= diff_threshold:
        # Clear winner: report only the best class
        return [class_labels[best_idx]]
    # Too close to call: report both candidates
    return [class_labels[best_idx], class_labels[runner_up_idx]]

In [45]:
# Set up the device, BERT model and tokenizer for the matching section
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model = bert_model.to(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [46]:
# Sample input: a free-text description of what the user does
user_sample_text = (
    "I am responsible for determining the types of risks that could affect "
    "a company’s financial health, legal compliance or reputation"
)
# Embed the user text with the same function used for the job descriptions
user_embeddings = get_bert_embeddings(
    bert_model=bert_model,
    bert_tokenizer=tokenizer,
    text=user_sample_text,
    device=device,
)

In [47]:
# Keep only jobs in the predicted specialization(s) so that jobs from
# irrelevant specializations are never matched
predicted_spec = predict_spec(
    spec_cls_model=spec_cls_model,
    bert_embedding=user_embeddings,
    diff_threshold=0.15,
)
print(predicted_spec)
spec_filter = df["specialization"].isin(predicted_spec)
df.loc[spec_filter]

['Accounting & Finance']


Unnamed: 0,job_id,job_title,specialization,job_description,skillsets,combined_description,embedded_combined_description
0,1,Junior Accountant,Accounting & Finance,"Assists in financial record-keeping, prepares ...","Accounting principles, Excel, attention to detail",Junior Accountant Assists in financial record-...,"[-0.16596301, 0.23873356, 0.28833672, 0.239333..."
1,2,Financial Analyst,Accounting & Finance,Analyzes financial data to assist in business ...,"Financial modeling, data analysis, Excel",Financial Analyst Analyzes financial data to a...,"[-0.252826, -0.16007352, 0.2223302, 0.25238103..."
2,3,Tax Consultant,Accounting & Finance,Advises clients on tax planning and compliance.,"Tax law, analytical skills, communication",Tax Consultant Advises clients on tax planning...,"[-0.17809252, 0.2923153, 0.16424774, 0.1462709..."
3,4,Audit Manager,Accounting & Finance,"Oversees auditing processes, ensures complianc...","Auditing, critical thinking, leadership","Audit Manager Oversees auditing processes, ens...","[-0.14261793, 0.44612873, 0.17900231, 0.373750..."
20,21,Payroll Specialist,Accounting & Finance,"Manages payroll processing, ensures accuracy a...",,"Payroll Specialist Manages payroll processing,...","[-0.11196892, 0.142278, 0.1764309, 0.32669482,..."
21,22,Credit Analyst,Accounting & Finance,"Assesses credit risk, evaluates financial stat...","Financial analysis, risk assessment","Credit Analyst Assesses credit risk, evaluates...","[0.028640425, 0.26161313, 0.09391271, 0.121830..."
22,23,Investment Banker,Accounting & Finance,Advises on and manages large financial transac...,"Financial modelling, negotiation, valuation",Investment Banker Advises on and manages large...,"[-0.035764243, 0.2793286, 0.104248405, -0.1392..."
23,24,Risk Manager,Accounting & Finance,Identifies financial risks and develops strate...,"Risk management, analytical skills",Risk Manager Identifies financial risks and de...,"[-0.26526693, 0.34649956, 0.05648968, 0.198842..."
38,39,Compliance Officer,Accounting & Finance,Ensures financial operations and transactions ...,"Regulatory knowledge, detail-oriented",Compliance Officer Ensures financial operation...,"[-0.2585497, 0.25350386, 0.32698923, 0.1101090..."
39,40,Cost Accountant,Accounting & Finance,"Analyzes production costs, prepares budget rep...","Cost analysis, budgeting","Cost Accountant Analyzes production costs, pre...","[-0.22156699, 0.27856416, 0.077836, 0.35531604..."


In [48]:
# Candidate jobs: the rows that passed the specialization filter
df_filtered = df.loc[spec_filter]
# Cosine similarity between the user embedding (as a single-row matrix) and
# every candidate job embedding; [0] takes that single row of scores
similarities = cosine_similarity(
    user_embeddings.reshape(1, -1),
    np.array(df_filtered["embedded_combined_description"].tolist()),
)[0]

In [49]:
# Predict top job preferences (top 3 by default)
def predict_job(df_filtered, user_embeddings, top_n=3):
    """Recommend the jobs whose embeddings are most similar to the user's.

    Args:
        df_filtered: DataFrame of candidate jobs with ``job_title``,
            ``job_description`` and ``embedded_combined_description`` columns.
        user_embeddings: Embedding array of the user's text.
        top_n: Number of jobs to recommend; defaults to 3.

    Returns:
        List of ``(job_title, job_description)`` tuples, most similar first.
        Empty when ``df_filtered`` has no rows.
    """
    # cosine_similarity rejects an empty candidate matrix, so short-circuit here
    if df_filtered.empty:
        return []

    job_embeddings = np.array(df_filtered["embedded_combined_description"].to_list())
    # One row of scores: user embedding vs. every candidate job
    similarities = cosine_similarity(user_embeddings.reshape(1, -1), job_embeddings)[0]
    top_indices = get_top_n_indices(similarities, top_n)

    # Indices are positional (from argsort), hence .iloc rather than label lookup
    titles = df_filtered["job_title"]
    descriptions = df_filtered["job_description"]
    return [(titles.iloc[idx], descriptions.iloc[idx]) for idx in top_indices]

In [50]:
# Rank the filtered jobs against the user's embedding
recommended_jobs = predict_job(
    df_filtered=df_filtered,
    user_embeddings=user_embeddings,
)

In [190]:
# Print each recommendation as "<title> - <description>", one per line
for recommendation in recommended_jobs:
    job, description = recommendation
    print(f"{job} - {description}")

Compliance Officer - Ensures financial operations and transactions comply with legal and regulatory requirements.
Credit Analyst - Assesses credit risk, evaluates financial statements and investment opportunities.
Risk Manager - Identifies financial risks and develops strategies to minimize impact.


In [63]:
# Back to the rows whose specialization is NaN (set aside before training);
# they are labelled using the trained ML model
df_nan_specialization

Unnamed: 0,job_id,job_title,specialization,job_description,skillsets,combined_description,embedded_combined_description
7,8,Software Architect,,Designs the overall structure of software syst...,"System design, programming, leadership",Software Architect Designs the overall structu...,"[0.107352175, 0.60535455, 0.09075236, 0.033642..."
31,32,Sales Engineer,,Combines technical knowledge with sales skills...,"Technical expertise, sales skills",Sales Engineer Combines technical knowledge wi...,"[-0.032000657, 0.49894565, 0.503667, 0.1387504..."


In [None]:
# Work on an independent copy so the assignment below never writes into a
# slice of df (which triggers SettingWithCopyWarning)
df_nan_specialization = df_nan_specialization.copy()

# Predict every missing specialization in one batched call instead of once per
# row; skip when there is nothing to fill, because an empty matrix cannot be
# passed to predict
if not df_nan_specialization.empty:
    nan_embeddings = np.array(df_nan_specialization["embedded_combined_description"].to_list())
    df_nan_specialization["specialization"] = spec_cls_model.predict(nan_embeddings)

In [132]:
# Inspect the rows after their specialization has been filled in
df_nan_specialization

Unnamed: 0,job_id,job_title,specialization,job_description,skillsets,combined_description,embedded_combined_description
7,8,Software Architect,IT & Software Developer,Designs the overall structure of software syst...,"System design, programming, leadership",Software Architect Designs the overall structu...,"[0.107352175, 0.60535455, 0.09075236, 0.033642..."
31,32,Sales Engineer,Sales,Combines technical knowledge with sales skills...,"Technical expertise, sales skills",Sales Engineer Combines technical knowledge wi...,"[-0.032000657, 0.49894565, 0.503667, 0.1387504..."


In [138]:
# Append the newly labelled rows to the labelled data and renumber the index
df_cleaned = pd.concat(
    objs=[df, df_nan_specialization],
    ignore_index=True,
)
# NOTE(review): every column is written, including embedded_combined_description,
# whose array values end up in the CSV as text — confirm consumers expect that.
df_cleaned.to_csv(
    path_or_buf='data/mini-job-rec-dataset - jobs_data_clean.csv',
    index=False,
)