In [16]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier, XGBRFClassifier
from xgboost import plot_tree, plot_importance

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, matthews_corrcoef
from sklearn.preprocessing import label_binarize
import sklearn
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [2]:
"""
Helper functions to encode and decode string labels to integers
"""

def encode(y):
    """Map each distinct label in ``y`` to an integer code.

    Codes are assigned in the order in which ``y.unique()`` yields the
    labels, starting at 0.

    Args:
        y: object exposing ``.unique()`` and iteration over its values
            (the notebook passes a pandas Series).

    Returns:
        A ``(encoded, code)`` pair: ``encoded`` is a list with the integer
        code of every element of ``y``; ``code`` is the ``{label: integer}``
        mapping that was used.
    """
    # build the label -> integer lookup
    code = {}
    for position, label in enumerate(y.unique()):
        code[label] = position

    # translate every entry of y through the lookup
    encoded = [code[item] for item in y]
    return encoded, code


def decode(y, code):
    """Translate integer codes back to their labels.

    Args:
        y: iterable of codes (values of ``code``).
        code: ``{label: integer}`` mapping, as returned by ``encode``.

    Returns:
        A list with the label belonging to every element of ``y``.

    Raises:
        ValueError: if an element of ``y`` is not a value of ``code``.
    """
    # Invert the mapping once instead of scanning the value list for every
    # element. setdefault keeps the first label when a code value is
    # duplicated, which is what a left-to-right list search would return.
    inverse = {}
    for label, value in code.items():
        inverse.setdefault(value, label)

    try:
        return [inverse[item] for item in y]
    except KeyError as err:
        # keep the exception type of a failed list search
        raise ValueError(f"{err.args[0]!r} is not in list") from None


In [3]:
# Read the feature table from disk.
df = pd.read_csv("data/features_var_thres.csv")

# Feature matrix: drop the first two columns (index and name) and the last
# column (presumably the label column -- confirm against the CSV layout).
X = df.iloc[:, 2:-1]

# Targets (10 genres): integer-encoded labels plus the label -> integer map.
y, code = encode(df["label"])

In [4]:
#### NORMALIZE X ####
# Normalize so everything is on the same scale.
# The split happens first and the scaler is fitted on the training rows only,
# so no statistics of the held-out rows leak into the preprocessing step.

cols = X.columns

# train test split (done on the unscaled features)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

std_scaler = sklearn.preprocessing.StandardScaler()
std_scaler.fit(X_train)

# new data frames with the scaled data; each split keeps its row index.
X_train = pd.DataFrame(std_scaler.transform(X_train), columns=cols, index=X_train.index)
X_test = pd.DataFrame(std_scaler.transform(X_test), columns=cols, index=X_test.index)

# scaled version of the full table, kept under the names used before
# (np_scaled / X), now produced with the train-fitted scaler.
np_scaled = std_scaler.transform(X)
X = pd.DataFrame(np_scaled, columns=cols)

In [20]:
def model_assess(model, X_train, X_test, y_train, y_test, title="Default"):
    """Fit ``model`` on the training split and score it on the test split.

    The model is fitted in place, so the object passed in is fitted when the
    function returns. A short report is printed under ``title``.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: training features.
        X_test: test features.
        y_train: training labels.
        y_test: test labels.
        title: heading used in the printed report.

    Returns:
        Tuple ``(acc, f1, roc, matt_cor)``: accuracy, weighted F1, weighted
        ROC AUC and Matthews correlation coefficient on the test split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    # binarize labels for multi class roc score
    # The class list comes from the labels handed to this function, not from
    # a notebook-level variable, so the result only depends on the arguments.
    labels = sorted(set(y_train).union(y_test))
    y_test_bin = label_binarize(y_test, classes=labels)
    y_pred_bin = label_binarize(y_pred, classes=labels)
    # NOTE: the score is computed from binarized hard predictions, not from
    # class probabilities / decision scores.
    roc = roc_auc_score(y_test_bin, y_pred_bin, average="weighted", multi_class="ovo")
    matt_cor = matthews_corrcoef(y_test, y_pred)

    print(
        f"{title}:\n  Acc: {round(acc, 2)}\n  F1: {round(f1, 2)}\n  ROC score: {round(roc, 2)}\n  Matth. corr. coeff.: {round(matt_cor, 2)}"
    )
    return acc, f1, roc, matt_cor


In [21]:
from sklearn.linear_model import LogisticRegression

# model_assess() fits the estimator itself, so it is handed over unfitted;
# fitting it here as well would only repeat the same training run.
clf = LogisticRegression(random_state=0, max_iter=500)
model_assess(clf, X_train, X_test, y_train, y_test, title="Log. Regression")

Log. Regression:
  Acc: 0.54
  F1: 0.53
  ROC score: 0.74
  Matth. corr. coeff.: 0.49


(0.5366666666666666, 0.531945408790232, 0.7425931296330234)

In [22]:
from sklearn.svm import SVC

# Support vector classifier; it is fitted and scored inside model_assess().
svc_clf = SVC(gamma="auto")
model_assess(
    model=svc_clf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    title="SVC",
)


SVC:
  Acc: 0.63
  F1: 0.63
  ROC score: 0.79
  Matth. corr. coeff.: 0.59


(0.63, 0.6287606668348482, 0.7949669162093735)

In [23]:
# Gradient-boosted trees: 1000 estimators, learning rate 0.05,
# multi-class log loss as evaluation metric.
xgb = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    eval_metric="mlogloss",
)
model_assess(
    model=xgb,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    title="Cross Gradient Booster",
)




Cross Gradient Booster:
  Acc: 0.58
  F1: 0.59
  ROC score: 0.77
  Matth. corr. coeff.: 0.54


(0.5833333333333334, 0.5866160524081848, 0.7695172316832881)

In [11]:
# Random-forest flavour of the XGBoost classifier, evaluated the same way
# as the other models.
xgbrf = XGBRFClassifier(
    objective="multi:softmax",
    eval_metric="mlogloss",
)
model_assess(
    model=xgbrf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    title="Cross Gradient Booster (Random Forest)",
)




Cross Gradient Booster (Random Forest):
  Acc: 0.53
  F1: 0.52
  ROC score: 0.74


(0.5266666666666666, 0.5209247330955603, 0.7370149571376233)