In [None]:
import pandas as pd
import numpy as np

import os
import sys
from pathlib import Path
from dotenv import load_dotenv
from typing import List

import matplotlib.pyplot as plt

# Load settings from a local .env file into the process environment.
load_dotenv()

_data_path_env = os.getenv("DATA_PATH")
if _data_path_env is None:
    # Path(None) would raise an opaque TypeError; fail early with an actionable message.
    raise RuntimeError(
        "Environment variable DATA_PATH is not set; define it in .env or in the shell."
    )
DATA_PATH = Path(_data_path_env)

# Notebook-only workaround: switch the working directory to the parent of DATA_PATH
# before the `src.*` imports that follow.
# NOTE(review): presumably that parent is the project root containing `src/` - confirm.
root_path = DATA_PATH.parent
os.chdir(str(root_path))
 
import src.database.db_interface as db
import src.training.plotting as p
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# import models
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Data methods

In [None]:
def load_data():
    """Loads the musical features from the database into a dataframe.

    Connects via ``db.connect_to_db("spotify_ds")`` and selects the feature
    columns of ``tracks`` followed by ``popularity`` (last column). Only tracks
    released in 2000 or later whose ``track_status`` row has ``song_valid == 1``
    and ``lyrics_stored == 1`` are returned.

    Returns:
        dataframe: the pd dataframe of the musical features
    """
    cnx, cursor = db.connect_to_db("spotify_ds")

    query = """
        SELECT t.duration_ms, t.explict, t.release_year, t.danceability, t.energy, t.key, t.loadness, t.mode, t.speechiness, t.acousticness, t.instrumentalness, t.liveness, t.valence, t.tempo, t.time_signature, t.popularity
        FROM tracks AS t
        INNER JOIN track_status AS ts
        ON t.id == ts.song_id
        WHERE ts.song_valid == 1
        AND ts.lyrics_stored == 1
        AND t.release_year >= 2000;
    """

    try:
        # read_sql_query materialises the full result, so the handles can be
        # released as soon as it returns (or raises).
        return pd.read_sql_query(query, cnx)
    finally:
        # NOTE(review): assumes connect_to_db hands back a fresh DB-API style
        # connection/cursor pair owned by the caller - confirm it is not shared.
        cursor.close()
        cnx.close()

In [None]:
def create_classes(popularities: List[int]) -> List[int]:
    """Bucket popularity scores into classes by dividing by ten.

    Each score is divided by 10 and truncated to an integer, so scores in
    [0, 100] end up as classes in [0, 10] (only a score of 100 maps to 10).

    Args:
        popularities (List[int]): Popularity scores, expected in [0, 100]

    Returns:
        List[int]: One class label per input score, in the same order
    """
    classes = []
    for score in popularities:
        # int() truncates, so e.g. 59 -> 5 and 60 -> 6
        classes.append(int(score / 10))
    return classes

In [None]:
def calculate_metrics(clf, X_test, y_test):
    """Print classification metrics for a fitted classifier and return its predictions.

    Accuracy plus weighted F1, recall and precision are printed (rounded to four
    decimals), followed by the label sets found in the predictions and in the
    test labels, and the test labels that were never predicted.

    Args:
        clf: fitted estimator exposing ``predict``
        X_test: feature rows passed to ``clf.predict``
        y_test: true labels for ``X_test``

    Returns:
        The predictions returned by ``clf.predict(X_test)``
    """
    # predict on test set
    y_predict = clf.predict(X_test)

    # print metrics
    print("Accuracy: " + str(round(accuracy_score(y_test, y_predict), 4)))
    print("F1: " + str(round(f1_score(y_test, y_predict, average="weighted"), 4)))
    print("Recall: " + str(round(recall_score(y_test, y_predict, average="weighted"), 4)))
    print("Precision: " + str(round(precision_score(y_test, y_predict, average="weighted"), 4)))
    print("\n")

    # check which labels do not appear in prediction
    predicted_labels = set(y_predict)
    expected_labels = set(y_test)
    print(f"Contained predictions: {predicted_labels}")
    print(f"Contained tests: {expected_labels}")
    # Report the difference explicitly; inside a function a bare expression is
    # never displayed, so it has to be printed to be visible.
    print(f"Labels never predicted: {expected_labels - predicted_labels}")

    return y_predict


## Training

In [None]:
df = load_data()

# Select features and target by column name instead of by position, so the split
# does not silently break if the SELECT column order in load_data() changes.
TARGET_COLUMN = "popularity"
X = df.drop(columns=[TARGET_COLUMN]).values
# Preview only; the full frame is too large to be useful as cell output.
print(df.head())
y = create_classes(df[TARGET_COLUMN])
print(set(y))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(X_train.shape)
print(X_test.shape)
print(X_test[:5])

## Gaussian Naive Bayes

In [None]:
print("Gaussian Naive Bayes")

# Build and fit in one step; fit() returns the estimator itself.
gaussian_clf = GaussianNB().fit(X_train, y_train)

# Keep the predictions around in `result`.
result = calculate_metrics(gaussian_clf, X_test, y_test)

## SVM

In [None]:
print("SVC")
# SVC's default RBF kernel is distance based, so a feature on a large scale
# (e.g. duration_ms) would dominate features in [0, 1]. Standardise inside a
# pipeline so the scaler is fitted on the training split only.
svc_clf = make_pipeline(StandardScaler(), SVC())

# fit the model
svc_clf.fit(X_train, y_train)

calculate_metrics(svc_clf, X_test, y_test)

## Neural Network

In [None]:
print("Neural Network")
# Gradient-based training is sensitive to feature scale, so standardise first.
# random_state pins weight initialisation and shuffling so reruns give the same
# numbers (42 matches the seed used for the train/test split).
nn_clf = make_pipeline(StandardScaler(), MLPClassifier(random_state=42))

# fit the model
nn_clf.fit(X_train, y_train)

calculate_metrics(nn_clf, X_test, y_test)

## K-Neighbours Classifier

In [None]:
print("K-Neighbours Classifier")
# Nearest neighbours are found by distance, so unscaled features on a large scale
# (e.g. duration_ms) would decide the neighbourhood alone. Standardise inside a
# pipeline; the pipeline still exposes fit/predict like the bare classifier.
knn_clf = make_pipeline(StandardScaler(), KNeighborsClassifier())

# fit the model
knn_clf.fit(X_train, y_train)

calculate_metrics(knn_clf, X_test, y_test)

## Decision Trees

In [None]:
print("Decision Trees")
# Candidate features are permuted at each split, so pin random_state to make the
# fitted tree (and the printed metrics) reproducible; 42 matches the split seed.
dt_clf = DecisionTreeClassifier(random_state=42)

# fit the model
dt_clf.fit(X_train, y_train)

calculate_metrics(dt_clf, X_test, y_test)

## Random Forest

In [None]:
# use different number of trees in forest (comparing different hyperparameters)
forest_size = [10, 20, 50, 100]

# Seed used for every forest to get comparable results in every execution.
RF_SEED = 500

# Global seed kept as before for anything that still reads numpy's global RNG.
np.random.seed(RF_SEED)

for trees in forest_size:
    # set forest size
    print("Predicting with forest size " + str(trees))
    # Pass the seed explicitly: relying on the global RNG alone would make each
    # forest's randomness depend on the forests fitted before it in this loop.
    rf = RandomForestClassifier(n_estimators=trees, random_state=RF_SEED)

    # fit the model
    rf.fit(X_train, y_train)

    # `result` ends up holding the predictions of the last (largest) forest
    result = calculate_metrics(rf, X_test, y_test)
    print("--------\n")

## Plotting

In [None]:
# Scatter of the raw popularity score against the explicit flag.
# NOTE: `x` and `y` are rebound here (`y` no longer holds the class labels)
# and both are read again by the plotting code further down.
x, y = df["explict"], df["popularity"]
title = "Dataset Music V1 + unpredicted popularity"
p.disp_scatter(x, y, "explicit", "popularity", title)

In [None]:
plt.title("Dataset Music V1 + unpredicted popularity")

plt.xlabel("popularity")
plt.ylabel("song count")

# Only needed by the commented-out stem variant below.
lst = list(df.groupby("popularity"))

# plt.stem(list(range(0,100)), list(map(lambda x: len(x[1]),lst)))
# Count songs per class in the test labels and plot each count at its own class.
# A fixed range(0, 10) breaks (or mislabels bars) whenever the test split does
# not contain exactly the classes 0..9, e.g. when class 10 is present.
test_class_counts = pd.Series(y_test).value_counts().sort_index()
plt.bar(test_class_counts.index, test_class_counts.values)
plt.show()

In [None]:
import src.training.postprocessing as pp

plt.title("Dataset Music V1 + predicted popularity")

plt.xlabel("popularity")
plt.ylabel("song count")

# Take bar positions and heights from the same sorted count series, so every
# count is drawn at the class it belongs to (the iteration order of a set is
# not guaranteed to line up with the order of the counts).
predicted_class_counts = pd.Series(result).value_counts().sort_index()
plt.bar(predicted_class_counts.index, predicted_class_counts.values)
plt.show()

# `x` and `y` are the globals bound by the scatter-plot cell further up.
dummy = [x, y, "popularity", "song_count", "Plot Name"]

m = pp.get_metrics(knn_clf, X_test, y_test)
p.create_plots(m, [dummy], "music")

