In [1]:
import spacy
import json
import re
import copy
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from modules.ml_tools import *



In [2]:
# Run the project's text-processing helper, asking it for the spaCy
# "en_core_web_md" pipeline. It returns three objects that the rest of the
# notebook relies on: `embeddings`, `data` and `oos_data`.
# NOTE(review): presumably `oos_data` holds the records without a level label
# (out-of-sample) -- confirm against modules.ml_tools.process_text.
embeddings, data, oos_data = process_text(
    spacy_lib_str="en_core_web_md",
)

In [3]:
# Pull the raw labels and titles out of the job records; a record that lacks
# the key is simply skipped.
levels = [record["level"] for record in data if "level" in record]
titles = [record["title"] for record in data if "title" in record]

# Partition the embedded records by whether a level label is present:
# labelled ones feed the models, unlabelled ones are kept aside.
model_data = [emb for emb in embeddings if emb["level"] is not None]
missing_level_data = [emb for emb in embeddings if emb["level"] is None]

In [5]:
# Headline counts for the dataset.
total_jobs = len(data)
missing_levels_size = len(missing_level_data)
distinct_levels = set(levels)  # built once, reused for the count and the listing
unique_levels = len(distinct_levels)

# Report the totals.
# NOTE(review): the numerator counts records taken from `embeddings` while the
# denominator counts records in `data` -- confirm both cover the same jobs.
# NOTE(review): the "Missing levels" line prints only the percentage; the
# absolute count (missing_levels_size) looks like it was meant to precede it.
print(f"Total jobs: {total_jobs}")
print(f"Missing levels: ({missing_levels_size / total_jobs * 100:.2f}% of total)")

# List every distinct level under its human-readable name.
print("Unique levels:")
for level in distinct_levels:
    print(f"- {INVERT_LVL_DIC[level]}")

Total jobs: 141
Missing levels: (53.19% of total)
Unique levels:
- Internship
- Entry Level
- Mid Level
- Senior Level


- The total number of jobs in the dataset is 141.
- The missing levels percentage is 53.19% of total. This indicates that approximately half of the jobs in the dataset have missing levels.

In [7]:
# What coverage and accuracy do we get from the heuristic rules alone?
# The rule-based helper is applied to both the out-of-sample records and the
# labelled (in-sample) records; its result exposes a "level_inf" field that is
# None wherever no rule produced a prediction.
infer_data = infer_lvl_from_rules(oos_data)
in_samp_data = infer_lvl_from_rules(model_data)

# Labelled records the rules made a prediction for, and the correctly
# predicted subset of those.
data_heu_pred = [i for i in in_samp_data if i["level_inf"] is not None]
data_heu_correct = [i for i in data_heu_pred if i["level"] == i["level_inf"]]

# In-sample accuracy using only heuristics, measured on covered records only.
# Guarded: if the rules match nothing there is no accuracy to report, so use
# NaN instead of crashing the notebook with ZeroDivisionError.
acc_heu = len(data_heu_correct) / len(data_heu_pred) if data_heu_pred else float("nan")

# Keyword heuristic coverage: share of labelled records that received a
# rule-based prediction (NaN when there are no labelled records at all).
coverage_heu = len(data_heu_pred) / len(in_samp_data) if len(in_samp_data) else float("nan")

print("In-Sample Accuracy (Heuristics Only):", acc_heu)
print("Keyword Heuristic Coverage:", coverage_heu)

In-Sample Accuracy (Heuristics Only): 0.8846153846153846
Keyword Heuristic Coverage: 0.36879432624113473


Using only the heuristic rules in the `modules.ml_tools.heuristic_rules` function, we obtain 88% accuracy on the records the rules cover, with coverage of over 36% of the data. For the rest, we will need to use another model.

In [8]:
# Run the project's XGBoost pipeline helper on the labelled records. It hands
# back a model object plus two accuracy figures, reported below as training
# and testing accuracy.
xg_model, accuracy_tr, accuracy_test = run_xgboost_pipeline(in_samp_data)
# NOTE(review): in_samp_acc is computed here but never printed or used in the
# visible cells -- either report it or drop the call.
in_samp_acc = in_sample_accuracy(in_samp_data, xg_model)

# Both figures are formatted to two decimals.
print(f"XGB Training Accuracy: {accuracy_tr:.2f}")
print(f"XGB Testing Accuracy: {accuracy_test:.2f}")

XGB Training Accuracy: 0.87
XGB Testing Accuracy: 0.38


In [None]:
# Run the project's out-of-sample inference helper with the XGBoost model
# fitted above. This rebinds `infer_data`, replacing the rule-only result
# assigned in the heuristics cell.
# NOTE(review): later cells still expect some records to have
# level_inf == None after this call -- confirm what run_inference_oos fills in.
infer_data = run_inference_oos(oos_data, xg_model)


In [None]:
# Augment the labelled data with pseudo-labelled out-of-sample records: every
# record in `infer_data` that already carries an inferred level is added with
# that inferred level as its training label.
#
# The label is written onto a shallow copy rather than the record itself, so
# the original out-of-sample records keep their own "level" value and the
# distinction between true labels and inferred ones is not lost. The copy is
# shallow on purpose: the (potentially large) "doc_vec" object is shared.
infer_data_w_lab = []
for i in infer_data:
    if i["level_inf"] is not None:
        pseudo_labelled = copy.copy(i)
        pseudo_labelled["level"] = pseudo_labelled["level_inf"]
        infer_data_w_lab.append(pseudo_labelled)

# True-labelled records first, pseudo-labelled records appended after them.
in_samp_data_augmented = in_samp_data + infer_data_w_lab

In [None]:
# Feature matrix: one document vector per augmented record.
data_x = np.array([rec["doc_vec"].vector for rec in in_samp_data_augmented])
# Targets: the "level" stored on each augmented record.
data_y = np.array([rec["level"] for rec in in_samp_data_augmented])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit via the project's run_xgboost helper on the augmented train/test split.
# This rebinds accuracy_tr / accuracy_test from the earlier XGBoost cell.
model, accuracy_tr, accuracy_test = run_xgboost(X_train, X_test, y_train, y_test)
print(accuracy_tr, accuracy_test)
# Author's conclusion: no significant improvements with augmentation.
# NOTE(review): this test split is drawn from the augmented data, so its
# accuracy is presumably not directly comparable with the earlier run's test
# accuracy -- confirm how run_xgboost_pipeline splits its data.

0.8931297709923665 0.48484848484848486




In [None]:
# Fallback step for the out-of-sample records: any record that still has no
# inferred level gets the prediction of the model fitted on the augmented
# data. All such records are scored with one batched predict call instead of
# one call per record.
_pending = [d for d in infer_data if d['level_inf'] is None]
if _pending:  # nothing to score -> skip predict (an empty batch has no feature axis)
    _preds = model.predict(np.array([d["doc_vec"].vector for d in _pending]))
    for d, _pred in zip(_pending, _preds):
        d['level_inf'] = _pred

In [None]:
# Fallback step for the labelled (in-sample) records: any record that has no
# inferred level yet gets the prediction of the model fitted on the augmented
# data. All such records are scored with one batched predict call instead of
# one call per record.
_pending = [d for d in in_samp_data if d['level_inf'] is None]
if _pending:  # nothing to score -> skip predict (an empty batch has no feature axis)
    _preds = model.predict(np.array([d["doc_vec"].vector for d in _pending]))
    for d, _pred in zip(_pending, _preds):
        d['level_inf'] = _pred

In [None]:
# Hierarchical model accuracy on in-sample data: "level_inf" holds the
# rule-based prediction where one exists and the XGBoost prediction otherwise.
# NOTE(review): the XGBoost model was fitted on a split that includes these
# same labelled records, so this figure is optimistic, not a held-out estimate.
data_insamp_correct = [i for i in in_samp_data if i["level"] == i["level_inf"]]
len(data_insamp_correct)/len(in_samp_data) # in-sample accuracy of the combined rules + model predictions

0.8368794326241135