In [None]:
import os
import sys
import numpy as np
import wandb

from playlist_recommender.modelling import model_pipeline
from playlist_recommender.modelling import utils
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import compute_class_weight 

In [None]:
# Load the features/targets, then obtain the train/test splits from the project pipeline.
X, y = utils.prep_playlist_df()
X_train, X_test, y_train, y_test = model_pipeline.make_best_transformation_pipeline(X, y)

# Display the shape of every split as a quick sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# XGBoost requires label-encoded targets: fit the encoder on the training
# targets, then apply that same mapping to both splits.
le = LabelEncoder().fit(y_train)
y_train, y_test = le.transform(y_train), le.transform(y_test)

In [None]:
# Per-class weights from scikit-learn's "balanced" heuristic, computed on the
# training targets only.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)

# Key each weight by its actual class label rather than by position, so the
# lookup stays correct even if the labels are not exactly 0..n_classes-1.
class_weight_dict = dict(zip(classes.tolist(), class_weights))

In [None]:
# XGBoost needs one weight per row rather than one per class, so look up the
# class weight of every sample. Comprehensions keep the loop variable out of
# the notebook namespace (a leaked `train` would shadow the `train()` sweep
# function if this cell were re-run after that function is defined).
train_dict_weights = [class_weight_dict[label] for label in y_train]
test_dict_weights = [class_weight_dict[label] for label in y_test]

In [None]:
# Sanity check: exactly one weight per row in each split.
assert len(train_dict_weights) == len(y_train)
assert len(test_dict_weights) == len(y_test)

# Sweeps

In [None]:
# Sweep definition: the search strategy, the logged metric to optimise, and
# the discrete candidate values for each XGBoost hyperparameter.
sweep_config = dict(
    method="bayes",  # try grid or random
    metric=dict(name="f1_score", goal="maximize"),
    parameters=dict(
        booster=dict(values=["gbtree", "gblinear"]),
        max_depth=dict(values=[3, 6, 9, 12]),
        learning_rate=dict(values=[0.1, 0.05, 0.2]),
        subsample=dict(values=[1, 0.5, 0.3]),
    ),
)

In [None]:
# Register the sweep with W&B; the returned id is what the agent cell consumes.
sweep_id = wandb.sweep(sweep=sweep_config, project="spotify-recommender")

In [None]:
def train():
    """Run a single sweep trial: fit a class-weighted XGBoost model and log test metrics.

    Takes no arguments so it can be passed directly to ``wandb.agent``.
    Hyperparameters are read from ``wandb.config`` (initialised with
    ``config_defaults``); the data and per-row weights come from the notebook
    globals ``X_train``, ``y_train``, ``X_test``, ``y_test``,
    ``train_dict_weights`` and ``test_dict_weights``.

    Logs macro-averaged ``f1_score``, ``precision`` and ``recall`` plus
    ``accuracy`` on the test split to the active W&B run.
    """
    config_defaults = {
        "booster": "gbtree",
        "max_depth": 3,
        "learning_rate": 0.1,
        "subsample": 1,
    }
    with wandb.init(
        project="spotify-recommender",
        tags=["xgboost"],
        name="XGBoost tuning - Class weights",
        config=config_defaults,
    ):
        config = wandb.config

        model_params = {
            "booster": config.booster,
            "learning_rate": config.learning_rate,
        }
        # max_depth and subsample are tree-booster parameters; the linear
        # booster does not use them, so only pass them to tree-based boosters.
        if config.booster != "gblinear":
            model_params["max_depth"] = config.max_depth
            model_params["subsample"] = config.subsample

        # Fit on train; the test split is supplied as a weighted eval set.
        model = XGBClassifier(**model_params)
        model.fit(
            X_train,
            y_train,
            sample_weight=train_dict_weights,
            eval_set=[(X_test, y_test)],
            sample_weight_eval_set=[test_dict_weights],
            verbose=False,
        )

        # Make predictions on test.
        y_pred = model.predict(X_test)

        # Use the same averaging and zero-division handling for every
        # macro-averaged metric so they are computed consistently.
        macro_kwargs = {"average": "macro", "zero_division": 0}
        wandb.log(
            {
                "f1_score": metrics.f1_score(y_test, y_pred, **macro_kwargs),
                "accuracy": metrics.accuracy_score(y_test, y_pred),
                "precision": metrics.precision_score(y_test, y_pred, **macro_kwargs),
                "recall": metrics.recall_score(y_test, y_pred, **macro_kwargs),
            }
        )

In [None]:
# Cap the number of trials: per the W&B docs a "bayes" sweep keeps proposing
# runs until it is stopped, which would make "Run All" never finish.
# Raise or lower this budget as needed.
MAX_SWEEP_RUNS = 30
wandb.agent(sweep_id, function=train, count=MAX_SWEEP_RUNS)