## Imports

In [None]:
import pandas as pd
import numpy as np

import os
import sys
from pathlib import Path
from dotenv import load_dotenv
from typing import List
import matplotlib.pyplot as plt

# Pull variables from a local .env file (if present) into the environment.
load_dotenv()

# Fail with an actionable message instead of the opaque TypeError that
# Path(None) raises when DATA_PATH is missing.
_data_path_value = os.getenv("DATA_PATH")
if _data_path_value is None:
    raise EnvironmentError(
        "DATA_PATH is not set; define it in the environment or in a .env file."
    )
DATA_PATH = Path(_data_path_value)

# Notebook-only workaround: switch the working directory to the parent of
# DATA_PATH so the `src.*` imports that follow can be resolved.
# NOTE(review): assumes DATA_PATH's parent is the project root -- confirm.
root_path = DATA_PATH.parent
os.chdir(root_path)
 
import src.training.plotting as p
import src.training.postprocessing as pp
import src.training.pre_training as t

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, plot_confusion_matrix
from sklearn.model_selection import train_test_split

# import models
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Load Data

In [None]:
# Load the lyrics dataset through the project's pre-training helper; later
# cells use the result as a DataFrame (`df.values`, `df["popularity"]`).
df = t.get_lyric_df()

In [None]:
def _popularity_bucket(score):
    """Map a raw popularity score to its tens bucket (score / 10, truncated)."""
    return int(score / 10)


# Features: the first three columns of the frame, selected by position.
# NOTE(review): this depends on column order -- confirm these are the intended
# feature columns and that the target column is not among them.
X = df.values[:, :3]

# Target: popularity collapsed into coarse buckets used as class labels.
y = df["popularity"].apply(_popularity_bucket)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Classification

In [None]:
# Collects every fitted classifier, in training order; the list is handed to
# the plotting helper in the "Result Plotting" section.
clf_list = []

## Gaussian Naive Bayes

In [None]:
print("Gaussian Naive Bayes")

# Train with default hyper-parameters; `fit` hands back the estimator itself,
# so construction and training can be chained.
gaussian_clf = GaussianNB().fit(X_train, y_train)

# Register the fitted model for the comparison plots, then report its
# metrics on the held-out split.
clf_list.append(gaussian_clf)
pp.print_metrics(gaussian_clf, X_test, y_test)

## SVM

In [None]:
# NOTE(review): the SVC experiment below is fully commented out, so no SVC
# model is trained or added to clf_list. The reason is not recorded here --
# confirm whether it should be restored or removed.
# print("SVC")
# svc_clf = SVC()

# # fit the model
# svc_clf.fit(X_train, y_train)
# clf_list.append(svc_clf)

# pp.print_metrics(svc_clf, X_test, y_test)

## Neural Network

In [None]:
print("Neural Network")

# Pin the seed: an MLP's weight initialisation and batch shuffling are random,
# so without it the reported metrics change on every execution. 42 matches
# the seed already used for the train/test split.
nn_clf = MLPClassifier(random_state=42)

# Fit on the training split and register the model for the comparison plots.
nn_clf.fit(X_train, y_train)
clf_list.append(nn_clf)

# Report metrics on the held-out split.
pp.print_metrics(nn_clf, X_test, y_test)

## K-Neighbours Classifier

In [None]:
print("K-Neighbours Classifier")

# Default-configured k-nearest-neighbours model; `fit` returns the estimator,
# which allows construction and training in one expression.
knn_clf = KNeighborsClassifier().fit(X_train, y_train)

# Keep the fitted model for the comparison plots and print its test metrics.
clf_list.append(knn_clf)
pp.print_metrics(knn_clf, X_test, y_test)

## Decision Trees

In [None]:
print("Decision Trees")

# Pin the seed: the tree permutes candidate features at each split, so an
# unseeded tree can differ between executions. 42 matches the seed already
# used for the train/test split.
dt_clf = DecisionTreeClassifier(random_state=42)

# Fit on the training split and register the model for the comparison plots.
dt_clf.fit(X_train, y_train)
clf_list.append(dt_clf)

# Report metrics on the held-out split.
pp.print_metrics(dt_clf, X_test, y_test)

## Random Forest

In [None]:
# use different number of trees in forest (comparing different hyperparameters)
forest_size = [10,20,50,100,200,250,300,400,500,1000,2000]

# set seed for random state to get compareable results in every execution (forest randomness)
np.random.seed(500)

for trees in forest_size:
    # set forest size
    print("Predicting with forest size " + str(trees))
    rf = RandomForestClassifier(n_estimators=trees)

    # fit the model
    rf.fit(X_train, y_train)
    clf_list.append(rf)

    pp.print_metrics(rf, X_test, y_test)
    print("--------\n")

# Result Plotting

In [None]:
 p_list = p.generate_model_plots(X_test, y_test, clf_list)
 print(len(p_list))
 print(len(clf_list))
 p.plots_from_list("Random Forest up to 2000", p_list,"Random Forest up to 2000","lyrics", cols=3, save=True)

In [None]:


# Overview of the raw data: word count against unbinned popularity, one
# point per row of df.
# NOTE(review): this rebinds the global `y` (until now the bucketed training
# target) to the raw popularity column.
x = df["word_count"]
y = df["popularity"]

plt.title("Dataset Lyrics V1 + unpredicted popularity")
plt.xlabel("word count")
plt.ylabel("popularity")

# Small, semi-transparent markers keep dense regions readable.
plt.scatter(x, y, s=5, alpha=0.5)
plt.show()

In [None]:
# Distribution of the true (bucketed) popularity labels in the test split.
#
# np.unique returns the sorted distinct labels together with their counts,
# so every bar's position and height are guaranteed to belong together;
# nothing relies on set iteration order lining up with a separately
# computed list of counts.
test_labels, test_counts = np.unique(y_test, return_counts=True)

plt.title("Dataset Lyrics V1 + unpredicted popularity")
plt.xlabel("popularity")
plt.ylabel("song count")

plt.bar(test_labels, test_counts)
plt.show()

In [None]:
# Confusion matrix
fig, cax = plt.subplots(figsize=(16, 16)) # subplot for larger size
cax.set_title("Random Forest (size 100) Accuracy", fontsize=15)
plot_confusion_matrix(estimator=rf, X=X_test, y_true=y_test, cmap=plt.cm.Blues,normalize="true",values_format=".2f",ax=cax)

plt.show()

In [None]:
# Distribution of the popularity buckets predicted by the Gaussian Naive
# Bayes model on the test split.
y_predict = gaussian_clf.predict(X_test)

# np.unique returns the sorted distinct predictions together with their
# counts, so every bar's position and height are guaranteed to belong
# together; nothing relies on set iteration order lining up with a
# separately computed list of counts.
predicted_labels, predicted_counts = np.unique(y_predict, return_counts=True)

plt.title("Dataset Lyrics V1 + predicted popularity")
plt.xlabel("popularity")
plt.ylabel("song count")

plt.bar(predicted_labels, predicted_counts)
plt.show()