# Lyrics Feature Training

## Imports

In [None]:
import pandas as pd
import numpy as np

import os
import sys
from pathlib import Path
from dotenv import load_dotenv
from typing import List

import matplotlib.pyplot as plt
# import seaborn as sns TODO requirements.txt

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail early with a clear message: Path(None) would otherwise raise an
# opaque TypeError when DATA_PATH is missing.
_data_path_env = os.getenv("DATA_PATH")
if _data_path_env is None:
    raise EnvironmentError(
        "DATA_PATH is not set; define it in the environment or in a .env file"
    )
DATA_PATH = Path(_data_path_env)

# only for .ipynb because relative imports don't work:
# switch the working directory to the parent of DATA_PATH before importing `src`.
# NOTE(review): presumably that parent is the project root containing `src` — confirm
root_path = DATA_PATH.parent
os.chdir(str(root_path))
 
import src.training.plotting as p
import src.training.postprocessing as pp
import src.training.pre_training as t

from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# import models
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Preprocessing

## Load Data

In [None]:
# Load the lyrics dataset through the project helper. The following cells read
# `df.values` and the "popularity" column from the result.
df = t.get_lyric_df()

## Split data: features X and target variable y

In [None]:
# Target: the "popularity" column passed through t.multiclass_popularity.
y = df["popularity"].apply(t.multiclass_popularity)

# Features: the first three columns of the frame, selected by position.
# NOTE(review): assumes these three columns are the lyric features and that
# "popularity" is not among them — confirm against t.get_lyric_df().
X = df.values[:, 0:3]

## Over-/Undersampling

In [None]:
# Under-sample X and the encoded popularity target with a fixed seed so the
# resampled data is the same on every run.
under_sampler = RandomUnderSampler(random_state=42)
X, y = under_sampler.fit_resample(X, y)

## Train/Test-Split
No PCA or feature selection is applied, because there are only 3 features.

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split stable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Show the size of the training and the test feature matrix, one per line.
print(X_train.shape, X_test.shape, sep="\n")

# Classification

In [None]:
# store classifiers for later plotting
# (each classifier cell below appends its fitted model; the list is later
# passed to p.generate_model_plots)
clf_list = []

## Gaussian Naive Bayes

In [None]:
# Gaussian Naive Bayes with default settings; fit() returns the estimator
# itself, so construction and training are chained.
gaussian_clf = GaussianNB().fit(X_train, y_train)

# keep the fitted model for the combined plots
clf_list.append(gaussian_clf)

# report metrics on the held-out split
pp.print_metrics(gaussian_clf, X_test, y_test)

## SVM

In [None]:
# Support vector classifier with default settings, trained on the training split.
svc_clf = SVC().fit(X_train, y_train)

# remember the fitted model for the combined plots
clf_list.append(svc_clf)

# report metrics on the held-out split
pp.print_metrics(svc_clf, X_test, y_test)

## Neural Network

In [None]:
# Multi-layer perceptron with default architecture. The seed is fixed because
# weight initialisation and batch shuffling are random; without it the metrics
# change on every run, unlike the seeded sampling and split above.
nn_clf = MLPClassifier(random_state=42)

# fit the model
nn_clf.fit(X_train, y_train)
clf_list.append(nn_clf)

# report metrics on the held-out split
pp.print_metrics(nn_clf, X_test, y_test)

## K-Neighbours Classifier

In [None]:
# k-nearest-neighbours classifier with default settings, trained on the training split.
knn_clf = KNeighborsClassifier().fit(X_train, y_train)

# remember the fitted model for the combined plots
clf_list.append(knn_clf)

# report metrics on the held-out split
pp.print_metrics(knn_clf, X_test, y_test)

## Decision Trees

In [None]:
# Decision tree with default settings. The seed is fixed because the tree
# builder permutes features at each split, so ties can otherwise be broken
# differently from run to run.
dt_clf = DecisionTreeClassifier(random_state=42)

# fit the model
dt_clf.fit(X_train, y_train)
clf_list.append(dt_clf)

# report metrics on the held-out split
pp.print_metrics(dt_clf, X_test, y_test)

## Random Forest

In [None]:
# use different number of trees in forest
forest_size = [10, 50, 100, 250, 500, 1000]

# Seed NumPy's global RNG: the forests below are created without a
# random_state and therefore draw from it, so fixing the seed gives
# comparable results in every execution.
np.random.seed(500)

# `rf` is overwritten on every iteration, so every fitted forest is
# additionally kept in this list.
rf_clfs = []

for trees in forest_size:
    # set forest size
    print(f"Predicting with forest size {trees}")
    rf = RandomForestClassifier(n_estimators=trees)

    # fit the model
    rf.fit(X_train, y_train)
    rf_clfs.append(rf)  # previously never filled although declared for this purpose
    clf_list.append(rf)

    # report metrics on the held-out split
    pp.print_metrics(rf, X_test, y_test)
    print("--------\n")

## Ensemble Classifier (Bagging)

In [None]:
from sklearn.ensemble import VotingClassifier, BaggingClassifier

# Alternative ensemble (currently disabled): majority vote over fitted models.
# ens_clf = VotingClassifier(estimators=[
    # ('gauss', gaussian_clf), ('knn', knn_clf), ('rf', rf)
# ])

# Bagging ensemble of 250 Gaussian Naive Bayes learners with a fixed seed.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the
# first positional parameter is accepted by both old and new versions.
ens_clf = BaggingClassifier(GaussianNB(), n_estimators=250, random_state=42)

# fit the model
ens_clf.fit(X_train, y_train)

# report metrics on the held-out split
pp.print_metrics(ens_clf, X_test, y_test)

# Model Evaluation

## Store model

In [None]:
# pp.store_model_to_file(gaussian_clf, "gauss_prec=31", "lyrics")

## Metrics + Confusion Matrices

In [None]:
 # generate list of plots for each clf: metrics, cf_matrix, cf_matrix_norm
 # (inputs are the held-out split and every classifier collected in clf_list)
 p_list = p.generate_model_plots(X_test, y_test, clf_list)

## Save/display plots

In [None]:
# Output settings for the plot document
document_folder = "all"  # lyrics, model, artist, all
# NOTE(review): the title mentions 2000 trees, but the forest sizes used in
# this notebook stop at 1000 and p_list covers all classifiers — confirm the title.
document_title = "Random Forest up to 2000 trees"
n_cols = 3
save_plots = True

# save/display plots as jpg
p.plots_from_list(
    document_title,
    p_list,
    document_folder,
    cols=n_cols,
    save=save_plots,
)

## Confusion Matrix for Single Classifier

In [None]:
# Classifier to inspect and the normalisation mode for its confusion matrix
cf_clf = gaussian_clf
normalized = None  # "true", "all" or None

# Dedicated axes so the matrix is drawn at a larger size
fig, cax = plt.subplots(figsize=(5, 5))
cax.set_title(str(cf_clf), fontsize=15)

# Display options, collected once and forwarded to the plotting helper
cm_options = dict(
    cmap=plt.cm.Blues,
    normalize=normalized,
    values_format=".2f",
    ax=cax,
)
# NOTE(review): plot_confusion_matrix is no longer available in recent
# scikit-learn releases — confirm the pinned version before upgrading.
plot_confusion_matrix(cf_clf, X_test, y_test, **cm_options)

plt.show()

In [None]:
# Scaled conf matrix: row-normalised confusion matrix drawn on a fixed 0..1
# colour scale.
from sklearn.metrics import confusion_matrix
import itertools

# `rf` is the forest left over from the last iteration of the random forest loop.
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred, normalize='true')

# classes = ["0", "1"]
# NOTE(review): tick labels are hard-coded; they presumably correspond to the
# values produced by t.multiclass_popularity — confirm count and order.
classes = ["1", "2", "3", "4", "5"]
fig, ax = plt.subplots(1, 1)

# vmin/vmax pin the colour scale to the full 0..1 range regardless of the data
im = ax.imshow(cm, vmin=0, vmax=1, cmap="Blues")
ax.set_title("Random forest on lyrical dataset")
tick_marks = np.arange(len(classes))
ax.set_xticks(tick_marks)
ax.set_xticklabels(classes)
ax.set_yticks(tick_marks)
ax.set_yticklabels(classes)

# Write every cell value into the matrix; dark cells get white text for contrast
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    text_color = "white" if cm[i, j] > 0.8 else "black"
    ax.text(j, i, f"{cm[i, j]:.2f}",
            horizontalalignment="center",
            color=text_color)

ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')

# Attach the colour bar to the image and its axes explicitly. The image already
# carries the Blues colormap and the 0..1 range, and a mappable that is not
# bound to any axes cannot tell matplotlib where to place the bar.
fig.colorbar(im, ax=ax)

plt.show()