In [1]:
import spacy
import json
import re
import copy
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from modules.ml_tools import *

In [2]:
# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

# Read the raw records. The encoding is pinned to UTF-8 because the descriptions
# contain non-ASCII characters (e.g. typographic apostrophes) and `open` otherwise
# falls back to the platform's locale encoding, which can mis-decode or raise.
with open("data.json", encoding="utf-8") as f:
    data = json.load(f)

In [3]:
# Preview only the first record; echoing all of `data` floods the notebook output.
data[:1]

[{'level': 'Entry Level',
  'description': "OUTFITTERY is Europe’s biggest Personal Shopping Service and we have a clear mission: We build the future of menswear shopping. Our goal is to make men happy!\nAre you interested in building products used by hundreds of thousend of people? Do you like delivering code at a rapid pace? Outfittery is seeking an experienced Frontend Engineer (m/f) that is passionate about building mobile and desktop web applications. This position is full-time and is based in our Berlin office.\n \nYour responsibilities:\n\n\nImplement the features and user interfaces of Outfittery products like the Outfit Recommender\n\n\nArchitect efficient and reusable front-end systems that drive complex web applications\n\n\nCollaborate with Product Designers, Product Managers, and Software Engineers to deliver compelling user-facing products\n\n\nIdentify and resolve performance and scalability issues\n\n\n \nYour profile:\n\n\n3+ years of JavaScript experience, including c

In [4]:
# Basic stats

# Keep only the values of records that actually carry the field.
levels = [record["level"] for record in data if "level" in record]
titles = [record["title"] for record in data if "title" in record]

print(len(data))         # total number of records
print(len(levels))       # records with a level label (141 of 216 in the recorded run: roughly a third are unlabelled)
print(len(set(levels)))  # number of distinct classes to predict (four in the recorded run)
print(len(set(titles)))  # almost all titles are unique; they may still carry useful signal for the prediction
print(set(levels))


216
141
4
212
{'Entry Level', 'Internship', 'Senior Level', 'Mid Level'}


In [5]:
# Build the per-record structures used by the rest of the notebook.
# NOTE(review): `generate_embeddings` is presumably provided by the star import from
# modules.ml_tools; the next cell reads a "level" key from every element it returns —
# confirm the exact output schema against the helper.
embeddings = generate_embeddings(data)

In [6]:
# Split on label availability: records that have a level are used for modelling,
# the remaining ones are set aside.
# NOTE(review): `missing_title_data` holds records whose *level* (not title) is None;
# the name is kept because later cells refer to it.
model_data = [rec for rec in embeddings if rec["level"] is not None]
missing_title_data = [rec for rec in embeddings if rec["level"] is None]

In [7]:
# What is the coverage if we apply heuristic rules?
# The rule-based helper is run on both subsets: the unlabelled records and the
# labelled ones (the latter so the rules can be scored against the known labels).
# NOTE(review): the records shown below carry a "level_inf" key, presumably None when
# no rule matched — confirm against `infer_lvl_from_rules`.
infer_data = infer_lvl_from_rules(missing_title_data)
in_samp_data = infer_lvl_from_rules(model_data)


In [8]:
# Report how many labelled records there are, then preview just the first one.
# A bare `len(...)` that is not the cell's last expression is evaluated but never
# displayed, and echoing the whole list dumps every record with its embedding vector.
print(len(in_samp_data))
in_samp_data[:1]

[{'level_inf': None,
  'level': 1,
  'title': 'Frontend Engineer (m/f)',
  'desc_text': "OUTFITTERY is Europe’s biggest Personal Shopping Service and we have a clear mission: We build the future of menswear shopping. Our goal is to make men happy!\nAre you interested in building products used by hundreds of thousend of people? Do you like delivering code at a rapid pace? Outfittery is seeking an experienced Frontend Engineer (m/f) that is passionate about building mobile and desktop web applications. This position is full-time and is based in our Berlin office.\n \nYour responsibilities:\n\n\nImplement the features and user interfaces of Outfittery products like the Outfit Recommender\n\n\nArchitect efficient and reusable front-end systems that drive complex web applications\n\n\nCollaborate with Product Designers, Product Managers, and Software Engineers to deliver compelling user-facing products\n\n\nIdentify and resolve performance and scalability issues\n\n\n \nYour profile:\n\n\n3

In [9]:

# Labelled records for which the heuristics produced a level.
data_heu_pred = [rec for rec in in_samp_data if rec["level_inf"] is not None]
# Of those, the ones where the inferred level matches the true label.
data_heu_correct = [rec for rec in data_heu_pred if rec["level"] == rec["level_inf"]]

# In-sample accuracy of the heuristics, measured only on the records they cover.
len(data_heu_correct) / len(data_heu_pred)

0.8846153846153846

In [10]:
len(data_heu_pred)/len(in_samp_data) # Coverage: heuristics assign a level to ~37% of the labelled records, with ~88% accuracy on those.

0.36879432624113473

In [22]:
# Inspect the document vector of the first labelled record. These vectors are the
# features collected into the training matrix in the next cell; no model is trained here.
# NOTE(review): this cell's execution count (22) is out of sequence with its neighbours —
# re-run the notebook top to bottom (Restart & Run All) before trusting the outputs.
in_samp_data[0]["doc_vec"]

array([-0.06985028, -0.16447067, -0.2055225 ,  0.04123456,  0.15656285,
        0.35479012,  0.30583754,  0.36857107,  0.21695788, -0.07687789,
       -0.05280695,  0.00536465, -0.3993671 , -0.15417135, -0.08534644,
        0.00372916,  0.03200232,  0.13694769, -0.22489032, -0.11952426,
       -0.14080803,  0.22783725,  0.02982019, -0.10934812,  0.057911  ,
       -0.1342012 ,  0.40479118,  0.22024822,  0.25641373, -0.10261051,
       -0.00469595, -0.17551881,  0.24953668, -0.14290991,  0.18353695,
       -0.10930878,  0.50206923,  0.0706281 , -0.20413771, -0.17580517,
       -0.2986951 ,  0.17606413, -0.10680257,  0.00537238,  0.03855364,
        0.00278362, -0.17625192,  0.13042249,  0.03064273, -0.0527813 ,
       -0.33947045,  0.01088314,  0.05900292, -0.3232903 , -0.26580054,
       -0.01511579, -0.20101538, -0.08309133,  0.03895557, -0.08267199,
       -0.12144693, -0.1813356 ,  0.12978254, -0.17632462,  0.1916408 ,
        0.07090751, -0.01471244, -0.00624061,  0.05760714,  0.09

In [12]:
# Features: the document vector of every labelled record, one row per record.
in_samp_data_x = np.array([rec["doc_vec"] for rec in in_samp_data])
# Targets: the level stored on the same records, in the same order.
in_samp_data_y = np.array([rec["level"] for rec in in_samp_data])

In [13]:

# Hold out 20% of the labelled records for evaluation; the seed is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    in_samp_data_x,
    in_samp_data_y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Gradient-boosted tree classifier over the document vectors.
# NOTE(review): the recorded run scores 0.875 on the training split but ~0.45 on the
# held-out split, so this configuration (very few, very deep trees) looks overfit.
# Default values quoted below should be confirmed against the installed xgboost version.
model = xgb.XGBClassifier(
    n_estimators=2,  # only 2 boosting rounds (far fewer than the commonly cited default of 100)
    max_depth=20,       # very deep trees, well above the library default — TODO confirm default
    learning_rate=0.03, # small shrinkage step per boosting round (a decrease, not an increase)
    subsample=0.7,     # each tree is fit on a 70% sample of the rows (below the 1.0 default)
    colsample_bytree=0.6,  # each tree is given 60% of the features (below the 1.0 default)
)

In [15]:
# Fit the classifier on the training split; the cell output echoes the fitted estimator's parameters.
model.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.03, max_delta_step=0, max_depth=20,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=2, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.7,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [16]:
# Predict on the data the model was trained on.
y_train_pred = model.predict(X_train)

# Training accuracy: an optimistic figure, useful mainly as a reference point for the test accuracy.
accuracy_tr = accuracy_score(y_train, y_train_pred)
accuracy_tr

0.875

In [17]:
# Predict on the held-out split.
y_pred = model.predict(X_test)

# Evaluate the predictions: accuracy on records the model did not see during fitting.
accuracy_test = accuracy_score(y_test, y_pred)
accuracy_test

0.4482758620689655

In [18]:
# True labels of the held-out split, shown for a side-by-side look at the predictions.
y_test

array([0, 2, 3, 3, 1, 1, 1, 2, 0, 3, 2, 2, 2, 2, 1, 1, 1, 1, 2, 3, 0, 1,
       3, 3, 0, 1, 1, 3, 1])

In [19]:
# Predicted labels for the held-out split, in the same order as `y_test`.
y_pred

array([0, 2, 3, 3, 3, 3, 1, 2, 3, 3, 1, 1, 3, 2, 3, 2, 2, 1, 1, 1, 0, 3,
       3, 1, 3, 1, 3, 3, 3])