In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier, XGBRFClassifier

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, matthews_corrcoef
from sklearn.preprocessing import label_binarize
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from lstm import LSTMNet, LSTMNetParams, prepare_data as prepare_data_lstm
from mlp_pytorch import (
    Net as TorchMLP,
    NetParams as TorchMLPParams,
    prepare_data as torch_mlp_prepare,
)

from encoding import encode


Load data

In [27]:
df = pd.read_csv("data/features.csv")
X = df.iloc[:, 2:-1]  # skip index and name

y = df["label"]  # 10 genres
y, code = encode(y) # encode labels to integers

Prepare data

In [28]:
#### NORMALIZE X ####
# Normalize so everything is on the same scale.

cols = X.columns
std_scaler = sklearn.preprocessing.StandardScaler()
np_scaled = std_scaler.fit_transform(X)

# new data frame with the new scaled data. 
X = pd.DataFrame(np_scaled, columns = cols)

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [29]:
def model_assess(model, X_train, X_test, y_train, y_test, title="Default"):
    """
    Fit given model and assess its performance regarding accuracy, F1 score, AUC score and
    matthews correlation coefficient
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    # binarize labels for multi class roc score
    labels = list(set(y))
    y_test_bin = label_binarize(y_test, classes=labels)
    y_pred_bin = label_binarize(y_pred, classes=labels)
    roc = roc_auc_score(y_test_bin, y_pred_bin, average="weighted", multi_class="ovo")
    matt_cor = matthews_corrcoef(y_test, y_pred)

    print(
        f"{title}:\n  Acc: {round(acc, 2)}\n  F1: {round(f1, 2)}\n  AUC score: {round(roc, 2)}\n  Matth. corr. coeff.: {round(matt_cor, 2)}"
    )
    return acc, f1, roc, matt_cor


### Classifier experiments

In [30]:
# save experiment results
df_res = pd.DataFrame(columns=["classifier", "accuracy", "F1", "AUC", "MCC"])

Logistic Regression

In [31]:
log_reg = LogisticRegression(random_state=42, max_iter=500).fit(X_train, y_train)
log_reg_acc, log_reg_f1, log_reg_auc, log_reg_mcc = model_assess(
    log_reg, X_train, X_test, y_train, y_test, title="Log. Regression"
)
df_res.loc[len(df_res)] = ["Logistic regression", log_reg_acc, log_reg_f1, log_reg_auc, log_reg_mcc]


Log. Regression:
  Acc: 0.59
  F1: 0.6
  AUC score: 0.77
  Matth. corr. coeff.: 0.55


Gaussian Naive bayes

In [32]:
gnb = GaussianNB()
gnb_acc, gnb_f1, gnb_auc, gnb_mcc = model_assess(
    gnb, X_train, X_test, y_train, y_test, title="Gaussian naive bayes"
)
df_res.loc[len(df_res)] = ["Gaussian naive bayes", gnb_acc, gnb_f1, gnb_auc, gnb_mcc]


Gaussian naive bayes:
  Acc: 0.45
  F1: 0.41
  AUC score: 0.7
  Matth. corr. coeff.: 0.4


Support vector classifier

In [33]:
svc_clf = SVC(gamma="scale", C=3, kernel="rbf", random_state=42)
svc_acc, svc_f1, svc_auc, svc_mcc = model_assess(
    svc_clf, X_train, X_test, y_train, y_test, title="SVC"
)
df_res.loc[len(df_res)] = ["SVC", svc_acc, svc_f1, svc_auc, svc_mcc]



SVC:
  Acc: 0.73
  F1: 0.74
  AUC score: 0.85
  Matth. corr. coeff.: 0.7


Random forests

In [34]:
forest_clf = RandomForestClassifier(random_state=42)
forest_acc, forest_f1, forest_auc, forest_mcc = model_assess(
    forest_clf, X_train, X_test, y_train, y_test, title="Random forest"
)
df_res.loc[len(df_res)] = ["Random forest", forest_acc, forest_f1, forest_auc, forest_mcc]

Random forest:
  Acc: 0.67
  F1: 0.66
  AUC score: 0.81
  Matth. corr. coeff.: 0.63


XGBoost

In [35]:
xgb = XGBClassifier(
    n_estimators=1000,
    booster="gbtree",
    learning_rate=0.04,
    eval_metric="mlogloss",
    random_state=42,
)
xgb_acc, xgb_f1, xgb_auc, xgb_mcc = model_assess(
    xgb, X_train, X_test, y_train, y_test, title="XGBoost"
)
df_res.loc[len(df_res)] = ["XGBoost", xgb_acc, xgb_f1, xgb_auc, xgb_mcc]




XGBoost:
  Acc: 0.7
  F1: 0.7
  AUC score: 0.83
  Matth. corr. coeff.: 0.67


In [36]:
xgbrf = XGBRFClassifier(
    n_estimators=1000,
    booster="gbtree",
    learning_rate=0.04,
    objective="multi:softmax",
    eval_metric="mlogloss",
    random_state=42,
)
xgbrf_acc, xgbrf_f1, xgbrf_auc, xgbrf_mcc = model_assess(
    xgbrf, X_train, X_test, y_train, y_test, title="XGBoost random forests"
)
df_res.loc[len(df_res)] = ["XGBoost random forests", xgbrf_acc, xgbrf_f1, xgbrf_auc, xgbrf_mcc]




XGBoost random forests:
  Acc: 0.68
  F1: 0.67
  AUC score: 0.82
  Matth. corr. coeff.: 0.64


MLP Sklearn

In [37]:
mlp = MLPClassifier(
    random_state=42,
    max_iter=1000,
    activation="tanh",
    solver="adam",
    alpha=0.0001,
    learning_rate="adaptive",
    learning_rate_init=0.01,
)
mlp_acc, mlp_f1, mlp_auc, mlp_mcc = model_assess(
    mlp, X_train, X_test, y_train, y_test, title="MLP Sklearn"
)
df_res.loc[len(df_res)] = ["MLP Sklearn", mlp_acc, mlp_f1, mlp_auc, mlp_mcc]


MLP Sklearn:
  Acc: 0.73
  F1: 0.73
  AUC score: 0.85
  Matth. corr. coeff.: 0.7


MLP Pytorch

In [38]:
X_train_t, y_train_t, X_test_t, y_test_t = torch_mlp_prepare(
    X_train.to_numpy(), y_train, X_test.to_numpy(), y_test
)

params = TorchMLPParams(
    input_features=29, hidden_size=100, num_classes=10, epochs=1000, learning_rate=0.01
)
mlp_torch = TorchMLP(params)
mlp_torch_acc, mlp_torch_f1, mlp_torch_auc, mlp_torch_mcc = model_assess(
    mlp_torch, X_train_t, X_test_t, y_train_t, y_test_t, title="MLP PyTorch"
)
df_res.loc[len(df_res)] = [
    "MLP PyTorch",
    mlp_torch_acc,
    mlp_torch_f1,
    mlp_torch_auc,
    mlp_torch_mcc,
]


MLP PyTorch:
  Acc: 0.66
  F1: 0.66
  AUC score: 0.81
  Matth. corr. coeff.: 0.62


LSTM Pytorch

In [39]:
X_train_t, X_test_t, y_train_t, y_test_t = prepare_data_lstm(
    X_train, X_test, y_train, y_test
)
lstm_params = LSTMNetParams(
    num_epochs=2000,
    learning_rate=0.01,
    dropout=0.3,
    input_size=29,
    hidden_size=20,
    hidden_layer=50,
    num_layers=1,
    num_classes=10,
    seq_length=X_train_t.shape[1],
    tensorboard=False
)
lstm = LSTMNet(lstm_params)
lstm_acc, lstm_f1, lstm_auc, lstm_mcc = model_assess(
    lstm, X_train_t, X_test_t, y_train_t, y_test_t, title="LSTM"
)
df_res.loc[len(df_res)] = ["LSTM", lstm_acc, lstm_f1, lstm_auc, lstm_mcc]


LSTM:
  Acc: 0.67
  F1: 0.66
  AUC score: 0.82
  Matth. corr. coeff.: 0.63


Save results

In [40]:
df_res.to_csv("data/classifier_results.csv")

In [44]:
df_res.sort_values(by="MCC", ascending=False).round(decimals=2)

Unnamed: 0,classifier,accuracy,F1,AUC,MCC
2,SVC,0.73,0.74,0.85,0.7
6,MLP Sklearn,0.73,0.73,0.85,0.7
4,XGBoost,0.7,0.7,0.83,0.67
5,XGBoost random forests,0.68,0.67,0.82,0.64
8,LSTM,0.67,0.66,0.82,0.63
3,Random forest,0.67,0.66,0.81,0.63
7,MLP PyTorch,0.66,0.66,0.81,0.62
0,Logistic regression,0.59,0.6,0.77,0.55
1,Gaussian naive bayes,0.45,0.41,0.7,0.4
