In [3]:
from GlassBox.ebm.ebm_utils import EBMGridSearch
from interpret.glassbox import ExplainableBoostingClassifier

import pandas as pd
%reload_ext autoreload
%autoreload 2

In [4]:
# Hyperparameter grid handed to the grid search below.
# 3 learning rates * 9 leaf sizes * 6 leaf counts = 162 combinations.
EBM_parameter = {
    "learning_rate": [0.01, 0.02, 0.03],
    "min_samples_leaf": list(range(2, 11)),  # 2..10 inclusive
    "max_leaves": list(range(2, 8)),  # 2..7 inclusive
}

# Grid search - unbalanced

In [5]:
# Grid-search helper for the original scores file (the model fitted from it is
# saved as "ebmclass_unbalanced" in the next cell).
# random_state is pinned — presumably so splits/fits are repeatable; confirm in
# EBMGridSearch.
gs = EBMGridSearch(
    random_state=841,
    task="Classifier",
    path_dataset="../../outputs/scores.csv",
)

In [6]:
# Run the search over every entry of EBM_parameter, then persist the first
# element of the result (presumably the best fitted model — confirm against
# EBMGridSearch.grid_search) under the name "ebmclass_unbalanced".
best_ = gs.grid_search(
    hyperparameters=EBM_parameter,
    EBMModel=ExplainableBoostingClassifier,
)
gs.save_model(
    best_[0],
    name="ebmclass_unbalanced",
)

100%|██████████| 162/162 [01:21<00:00,  2.00it/s, nDCG_15_at=0.762]


# Metric evaluation

In [7]:
# Reload the model that the grid-search cell saved as "ebmclass_unbalanced", so
# the evaluation below runs on the persisted artifact.
best_model = gs.load_model(name="ebmclass_unbalanced")

In [8]:
# nDCG at cut-offs 1, 10 and 15 on all three splits (training, validation and
# test) of the dataset currently held by `gs`, evaluated in that order.
# Each call receives its own fresh cut-off list.
nDCG_train, nDCG_valid, nDCG_test = (
    gs.eval_model(model=best_model, df=split, nDCG_at=[1, 10, 15])
    for split in (gs.train, gs.valid, gs.test)
)

# One row per split, one column per cut-off.
display(
    pd.DataFrame(
        [nDCG_train, nDCG_valid, nDCG_test],
        index=["Training", "Validation", "Test"],
    )
)

Unnamed: 0,nDCG@1,nDCG@10,nDCG@15
Training,0.7909,0.7034,0.7143
Validation,0.7268,0.729,0.7617
Test,0.7411,0.7001,0.7295


# Grid search - balanced

In [9]:
# Rebind `gs` to a grid-search helper for the balanced scores file; task and
# random_state match the unbalanced run so the two results are comparable.
gs = EBMGridSearch(
    random_state=841,
    task="Classifier",
    path_dataset="../../outputs/balanced_scores.csv",
)

In [10]:
# Same search space as the unbalanced run, now on the balanced data. The first
# element of the result (presumably the best fitted model — confirm against
# EBMGridSearch.grid_search) is persisted as "ebmclass_balanced".
best_ = gs.grid_search(
    hyperparameters=EBM_parameter,
    EBMModel=ExplainableBoostingClassifier,
)
gs.save_model(
    best_[0],
    name="ebmclass_balanced",
)

100%|██████████| 162/162 [00:39<00:00,  4.07it/s, nDCG_15_at=0.91]


In [11]:
# Reload the model that the grid-search cell saved as "ebmclass_balanced"; this
# rebinds `best_model`, replacing the unbalanced model loaded earlier.
best_model = gs.load_model(name="ebmclass_balanced")

In [12]:
# nDCG at cut-offs 1, 10 and 15 on all three splits (training, validation and
# test) of the balanced dataset held by `gs`, evaluated in that order.
# Each call receives its own fresh cut-off list.
nDCG_train, nDCG_valid, nDCG_test = (
    gs.eval_model(model=best_model, df=split, nDCG_at=[1, 10, 15])
    for split in (gs.train, gs.valid, gs.test)
)

# One row per split, one column per cut-off.
display(
    pd.DataFrame(
        [nDCG_train, nDCG_valid, nDCG_test],
        index=["Training", "Validation", "Test"],
    )
)

Unnamed: 0,nDCG@1,nDCG@10,nDCG@15
Training,0.8424,0.8281,0.8759
Validation,0.7769,0.8943,0.9101
Test,0.8026,0.8474,0.8985


# Explanation

In [13]:
from interpret import show

In [14]:
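# NOTE(review): disabled cell. `.iloc[0,:]` yields a Series, so the 2-D slice
# `sample.iloc[:,2:13]` below would raise; `.iloc[[0], :]` would keep a one-row
# DataFrame. Fix before re-enabling, or delete this cell.
# sample = gs.test.sort_values("labels").iloc[0,:]
# sample
# show(best_model.explain_local(sample.iloc[:,2:13]))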

In [15]:
# best_model.bins_[0]

In [16]:
# best_model.bag_weights_[0]

## Example of a job offer

In [17]:
# NOTE(review): disabled example. It references `np`, which the visible import
# cell does not import — add `import numpy as np` there before re-enabling.
# qId = 1
# job_curricula = gs.test[gs.test["qId"] == qId]
# 
# y_pred = best_model.predict(np.asarray(job_curricula.iloc[:, 2:13]))
# 
# y_pred = pd.DataFrame(y_pred, index=job_curricula.index, columns=["lambdas"])
# dt_final = pd.merge(job_curricula, y_pred, left_index=True, right_index=True)
# dt_final.sort_values("labels", ascending=False)["labels"].head(15)
# dt_final.sort_values("lambdas",ascending=False)["labels"].head(15)