In [1]:
import pandas as pd
import numpy as np
from GlassBox.figs.figs_utils import FIGSGridSearch
from imodels import FIGSRegressor
%reload_ext autoreload
%autoreload 2

In [2]:
# Hyper-parameter grid handed to gs.grid_search together with FIGSRegressor.
# 5 x 9 x 3 values = 135 combinations in total.
FIGS_parameter = {
    "max_rules": [5, 10, 15, 20, 25],
    "max_trees": list(range(2, 11)),  # 2, 3, ..., 10
    "min_impurity_decrease": [0.1, 0.2, 0.3],
}

# Constructor arguments for FIGSGridSearch -- unbalanced sources.
# nDCG_at is presumably the ranking cut-off used by the search metric;
# confirm in FIGSGridSearch.
gridsearch_parameters = {
    "train": "../../outputs/scores/scores_tr.csv",
    "valid": "../../outputs/scores/scores_vl.csv",
    "test": "../../outputs/scores/scores_ts.csv",
    "task": "Regression",
    "nDCG_at": 15,
}

# Same settings, pointing at the balanced sources.
gridsearch_parameters2 = {
    "train": "../../outputs/bal_scores/scores_tr.csv",
    "valid": "../../outputs/bal_scores/scores_vl.csv",
    "test": "../../outputs/bal_scores/scores_ts.csv",
    "task": "Regression",
    "nDCG_at": 15,
}

# Grid search - unbalanced

In [3]:
# Build the grid-search helper from the unbalanced-source configuration.
# Later cells read gs.train / gs.valid / gs.test, so the helper presumably
# loads the three CSV splits on construction -- confirm in FIGSGridSearch.
gs = FIGSGridSearch(**gridsearch_parameters)

In [4]:
# Search the FIGS_parameter grid using FIGSRegressor as the estimator class.
# Element 0 of the result is what gets persisted (presumably the best-scoring
# fitted model -- confirm in FIGSGridSearch.grid_search).
best_ = gs.grid_search(FIGSRegressor, FIGS_parameter)
# NOTE(review): saved under the bare name "FIGSReg_unbalanced", while the reload
# cell uses "./saved_models/FIGSReg_unbalanced"; presumably save_model prepends
# that directory -- verify the two paths resolve to the same file.
gs.save_model(best_[0], name="FIGSReg_unbalanced")

100%|██████████| 135/135 [01:08<00:00,  1.98it/s, nDCG=0.857]


# Metric evaluation

In [5]:
# Reload the unbalanced-source model from disk; the cells below evaluate this
# loaded object rather than the in-memory grid-search result.
best_model = gs.load_model(name="./saved_models/FIGSReg_unbalanced")

In [6]:
# nDCG at cut-offs 1, 10 and 15 for the reloaded model on every split
# (training, validation and test), collected into one comparison table.
nDCG_train, nDCG_valid, nDCG_test = (
    gs.eval_model(model=best_model, df=split, nDCG_at=[1, 10, 15])
    for split in (gs.train, gs.valid, gs.test)
)

split_scores = pd.DataFrame(
    [nDCG_train, nDCG_valid, nDCG_test],
    index=["Training", "Validation", "Test"],
)
display(split_scores)

Unnamed: 0,nDCG@1,nDCG@10,nDCG@15
Training,0.9075,0.9129,0.897
Validation,0.955,0.8764,0.8568
Test,0.9167,0.88,0.858


# Example of a job offer

In [7]:
# Ground-truth ranking for query qId == 15: test rows ordered by their
# relevance label, highest first (top 15).
offer_rows = gs.test.loc[gs.test["qId"] == 15]
offer_rows[["kId", "labels"]].sort_values(by="labels", ascending=False).head(15)

Unnamed: 0,kId,labels
2839,228,3
171,256,3
4808,117,3
2377,67,2
2712,194,2
2635,34,2
2599,85,2
2561,150,2
4055,167,2
4095,226,2


In [8]:
# Model-induced ranking for the same query (qId == 15): score every test row
# with the loaded model and order the rows by predicted score.
features = gs.test.loc[gs.test["qId"] == 15]

# Positional columns 2..12 are fed to the model -- presumably the feature
# columns in training order; confirm against the CSV schema.
feature_matrix = features.iloc[:, 2:13].values
y_pred = pd.DataFrame(
    data=best_model.predict(feature_matrix),
    index=features.index,
    columns=["lambdas"],
)

# Attach the predictions by row index, then show the top 15 by predicted score.
dt_final = features.merge(y_pred, left_index=True, right_index=True)
dt_final.sort_values(by="lambdas", ascending=False)[["kId", "labels"]].head(15)

Unnamed: 0,kId,labels
171,256,3
2839,228,3
4808,117,3
2307,151,2
2635,34,2
902,265,1
1729,133,1
3816,61,2
1453,230,2
3321,264,1


# Grid search - balanced

In [9]:
# Build the grid-search helper from the balanced-source configuration.
# NOTE: this rebinds `gs`; any earlier cell re-run after this point will see
# the balanced data instead of the unbalanced data.
gs = FIGSGridSearch(**gridsearch_parameters2)

In [10]:
# Same grid as the unbalanced run, now searched on the balanced sources.
# Element 0 of the result is what gets persisted (presumably the best-scoring
# fitted model -- confirm in FIGSGridSearch.grid_search).
best_ = gs.grid_search(FIGSRegressor, FIGS_parameter)
# NOTE(review): saved under the bare name "FIGSReg_balanced", while the reload
# cell uses "./saved_models/FIGSReg_balanced"; presumably save_model prepends
# that directory -- verify the two paths resolve to the same file.
gs.save_model(best_[0], name="FIGSReg_balanced")

100%|██████████| 135/135 [00:44<00:00,  3.02it/s, nDCG=0.877]


In [11]:
# Reload the balanced-source model from disk.
# NOTE: this rebinds `best_model`, replacing the unbalanced-source model
# loaded earlier in the notebook.
best_model = gs.load_model(name="./saved_models/FIGSReg_balanced")

In [12]:
# nDCG at cut-offs 1, 10 and 15 for the balanced-source model on every split
# (training, validation and test), collected into one comparison table.
nDCG_train, nDCG_valid, nDCG_test = (
    gs.eval_model(model=best_model, df=split, nDCG_at=[1, 10, 15])
    for split in (gs.train, gs.valid, gs.test)
)

split_scores = pd.DataFrame(
    [nDCG_train, nDCG_valid, nDCG_test],
    index=["Training", "Validation", "Test"],
)
display(split_scores)

Unnamed: 0,nDCG@1,nDCG@10,nDCG@15
Training,0.9558,0.9389,0.9284
Validation,0.9717,0.8969,0.8773
Test,0.9136,0.9068,0.89
