In [2]:
import os
import sys
import numpy as np

from playlist_recommender.modelling import model_pipeline
from playlist_recommender.modelling import utils
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import compute_class_weight 

In [3]:
# Build the feature matrix and labels, then let the project pipeline helper
# return the train/test partitions (it is given X and y only).
# NOTE(review): the helper appears to persist the fitted pipeline as a side
# effect (the cell prints "pipeline dumped") -- confirm in model_pipeline.
X, y = utils.prep_playlist_df()
(X_train, X_test, y_train, y_test) = (
    model_pipeline.make_best_transformation_pipeline(X, y)
)
# Display the shape of every partition as a quick sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

pipeline dumped


((1571, 862), (846, 862), (1571,), (846,))

In [4]:
# Encode the target labels as integer ids. The encoder is fitted on the
# training labels only and then reused unchanged for the test labels.
# NOTE(review): LabelEncoder.transform raises on labels it did not see during
# fit, so this presumes every test label also occurs in y_train.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [6]:
# Per-class weights computed with the "balanced" heuristic on the training
# labels. compute_class_weight returns one weight per entry of `classes`, in
# the same order, so the lookup table is keyed by the class labels themselves
# rather than by position; the two only coincide while the labels are exactly
# 0..k-1.
classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train,
)
# .tolist() turns the labels into native Python scalars for use as dict keys.
class_weight_dict = dict(zip(classes.tolist(), class_weights))

In [14]:
# XGBoost needs one weight per row rather than one per target class, so expand
# the per-class table into a weight for every sample of each split.
train_dict_weights = [class_weight_dict[label] for label in y_train]
test_dict_weights = [class_weight_dict[label] for label in y_test]

In [15]:
# Every training row must have received exactly one weight.
assert len(train_dict_weights) == y_train.shape[0]

In [17]:
# Every test row must have received exactly one weight.
assert len(test_dict_weights) == y_test.shape[0]

In [25]:
# Baseline hyper-parameters for the classifier. The keys match the keyword
# arguments of XGBClassifier, so the dict is unpacked into the constructor.
config_defaults = {
    "booster": "gbtree",
    "max_depth": 3,
    "learning_rate": 0.1,
    "subsample": 1,
}

# Report multiclass error rate and multiclass log-loss on the evaluation set.
model = XGBClassifier(**config_defaults, eval_metric=["merror", "mlogloss"])

# Fit on the training split with its per-row weights. The test split is passed
# as the evaluation set, together with its own per-row weights, so the two
# metrics above are logged for it during training.
model.fit(
    X_train,
    y_train,
    sample_weight=train_dict_weights,
    eval_set=[(X_test, y_test)],
    sample_weight_eval_set=[test_dict_weights],
)

# Predicted class ids for the test rows.
y_pred = model.predict(X_test)

[0]	validation_0-merror:0.71243	validation_0-mlogloss:3.25596
[1]	validation_0-merror:0.67740	validation_0-mlogloss:3.11137
[2]	validation_0-merror:0.67030	validation_0-mlogloss:3.01127
[3]	validation_0-merror:0.65846	validation_0-mlogloss:2.92878
[4]	validation_0-merror:0.65979	validation_0-mlogloss:2.86800
[5]	validation_0-merror:0.65461	validation_0-mlogloss:2.82204
[6]	validation_0-merror:0.66680	validation_0-mlogloss:2.77730
[7]	validation_0-merror:0.67217	validation_0-mlogloss:2.73832
[8]	validation_0-merror:0.66806	validation_0-mlogloss:2.70191
[9]	validation_0-merror:0.66355	validation_0-mlogloss:2.66614
[10]	validation_0-merror:0.66426	validation_0-mlogloss:2.64030
[11]	validation_0-merror:0.66169	validation_0-mlogloss:2.61363
[12]	validation_0-merror:0.66395	validation_0-mlogloss:2.58709
[13]	validation_0-merror:0.65737	validation_0-mlogloss:2.56537
[14]	validation_0-merror:0.66212	validation_0-mlogloss:2.54362
[15]	validation_0-merror:0.66003	validation_0-mlogloss:2.52679
[1

In [26]:
# Macro averaging takes the unweighted mean of the per-class scores.
# zero_division=0 is now applied to recall as well, so all three
# precision/recall-style metrics handle an undefined per-class score the same
# way: it counts as 0 and no UndefinedMetricWarning is emitted.
macro_kwargs = {"average": "macro", "zero_division": 0}
f1_score = metrics.f1_score(y_test, y_pred, **macro_kwargs)
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, **macro_kwargs)
recall = metrics.recall_score(y_test, y_pred, **macro_kwargs)

In [27]:
# Show the four scores as the cell output, in this order.
(f1_score, accuracy, precision, recall)

(0.2942595228672323,
 0.2907801418439716,
 0.292014687760366,
 0.32174085063768054)