In [24]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier, XGBRFClassifier

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, matthews_corrcoef
from sklearn.preprocessing import label_binarize
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from lstm import LSTMNet, LSTMNetParams, prepare_data as prepare_data_lstm

import matplotlib.pyplot as plt
from util.encoding import encode

In [4]:
df = pd.read_csv("data/features.csv")
X = df.iloc[:, 2:-1]  # skip index and name

y = df["label"]  # 10 genres
y, code = encode(y)

In [11]:
#### NORMALIZE X ####
# Normalize so everything is on the same scale.

cols = X.columns
std_scaler = sklearn.preprocessing.StandardScaler()
np_scaled = std_scaler.fit_transform(X)

# new data frame with the new scaled data. 
X = pd.DataFrame(np_scaled, columns = cols)

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
def model_assess(model, X_train, X_test, y_train, y_test, title="Default"):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    # binarize labels for multi class roc score
    labels = list(set(y))
    y_test_bin = label_binarize(y_test, classes=labels)
    y_pred_bin = label_binarize(y_pred, classes=labels)
    roc = roc_auc_score(y_test_bin, y_pred_bin, average="weighted", multi_class="ovo")
    matt_cor = matthews_corrcoef(y_test, y_pred)

    print(
        f"{title}:\n  Acc: {round(acc, 2)}\n  F1: {round(f1, 2)}\n  AUC score: {round(roc, 2)}\n  Matth. corr. coeff.: {round(matt_cor, 2)}"
    )
    return acc, f1, roc, matt_cor


### Classifier experiments

In [47]:
# save experiment results
df_res = pd.DataFrame(columns=["classifier", "accuracy", "F1", "AUC", "MCC"])

Logistic Regression

In [48]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(random_state=0, max_iter=500).fit(X_train, y_train)
log_reg_acc, log_reg_f1, log_reg_auc, log_reg_mcc = model_assess(
    log_reg, X_train, X_test, y_train, y_test, title="Log. Regression"
)
df_res.loc[len(df_res)] = ["Logistic regression", log_reg_acc, log_reg_f1, log_reg_auc, log_reg_mcc]


Log. Regression:
  Acc: 0.59
  F1: 0.6
  ROC score: 0.77
  Matth. corr. coeff.: 0.55


Gaussian Naive bayes

In [50]:
gnb = GaussianNB()
gnb_acc, gnb_f1, gnb_auc, gnb_mcc = model_assess(
    gnb, X_train, X_test, y_train, y_test, title="Gaussian naive bayes"
)
df_res.loc[len(df_res)] = ["Gaussian naive bayes", gnb_acc, gnb_f1, gnb_auc, gnb_mcc]


Gaussian naive bayes:
  Acc: 0.45
  F1: 0.41
  ROC score: 0.7
  Matth. corr. coeff.: 0.4
Log. Regression:
  Acc: 0.45
  F1: 0.41
  ROC score: 0.7
  Matth. corr. coeff.: 0.4


Support vector classifier

In [51]:
from sklearn.svm import SVC

svc_clf = SVC(gamma="auto")
svc_acc, svc_f1, svc_auc, svc_mcc = model_assess(
    svc_clf, X_train, X_test, y_train, y_test, title="SVC"
)
df_res.loc[len(df_res)] = ["SVC", svc_acc, svc_f1, svc_auc, svc_mcc]



Gaussian naive bayes:
  Acc: 0.69
  F1: 0.69
  ROC score: 0.83
  Matth. corr. coeff.: 0.66


Random forests

In [52]:
forest_clf = RandomForestClassifier()
forest_acc, forest_f1, forest_auc, forest_mcc = model_assess(
    forest_clf, X_train, X_test, y_train, y_test, title="Random forest"
)
df_res.loc[len(df_res)] = ["Random forest", forest_acc, forest_f1, forest_auc, forest_mcc]

Random forest:
  Acc: 0.69
  F1: 0.69
  ROC score: 0.83
  Matth. corr. coeff.: 0.66


XGBoost

In [69]:
xgb = XGBClassifier(
    n_estimators=1000,
    booster="gbtree",
    learning_rate=0.04,
    eval_metric="mlogloss",
    random_state=42,
)
model_assess(xgb, X_train, X_test, y_train, y_test, title="Cross Gradient Booster")




Cross Gradient Booster:
  Acc: 0.7
  F1: 0.7
  ROC score: 0.83
  Matth. corr. coeff.: 0.67


(0.7, 0.6987124151824312, 0.8340382415816421, 0.6698132500371287)

In [68]:
xgbrf = XGBRFClassifier(
    n_estimators=1000,
    booster="gbtree",
    learning_rate=0.04,
    objective="multi:softmax",
    eval_metric="mlogloss",
    random_state=42,
)
model_assess(
    xgbrf, X_train, X_test, y_train, y_test, "Cross Gradient Booster Random Forest"
)




Cross Gradient Booster Random Forest:
  Acc: 0.68
  F1: 0.67
  ROC score: 0.82
  Matth. corr. coeff.: 0.64


(0.6766666666666666,
 0.6720489895367148,
 0.8202430278423257,
 0.6423984369768605)

MLP Sklearn

In [57]:
mlp = MLPClassifier(
    random_state=42,
    max_iter=1000,
    activation="tanh",
    solver="adam",
    alpha=0.0001,
    learning_rate="adaptive",
    learning_rate_init=0.01,
)
mlp_acc, mlp_f1, mlp_auc, mlp_mcc = model_assess(
    mlp, X_train, X_test, y_train, y_test, title="MLP"
)
df_res.loc[len(df_res)] = ["MLP", mlp_acc, mlp_f1, mlp_auc, mlp_mcc]


MLP:
  Acc: 0.73
  F1: 0.73
  ROC score: 0.85
  Matth. corr. coeff.: 0.7


LSTM Pytorch

In [59]:
X_train_t, X_test_t, y_train_t, y_test_t = prepare_data_lstm(
    X_train, X_test, y_train, y_test
)
lstm_params = LSTMNetParams(
    num_epochs=2000,
    learning_rate=0.01,
    dropout=0.3,
    input_size=29,
    hidden_size=20,
    hidden_layer=50,
    num_layers=1,
    num_classes=10,
    seq_length=X_train_t.shape[1],
)
lstm = LSTMNet(lstm_params)
lstm_acc, lstm_f1, lstm_auc, lstm_mcc = model_assess(
    lstm, X_train, X_test, y_train, y_test, title="LSTM"
)
df_res.loc[len(df_res)] = ["LSTM", lstm_acc, lstm_f1, lstm_auc, lstm_mcc]


TypeError: 'numpy.int64' object is not callable

CNN Resnet

TODO

Save results

In [60]:
df_res.to_csv("data/classifier_results.csv")