In [None]:
import pandas as pd
import numpy as np

import os
import sys
from pathlib import Path
from dotenv import load_dotenv
from typing import List

import matplotlib.pyplot as plt
import seaborn as sns

# RUS
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# DATA_PATH must be configured: its parent directory is used as the project
# root below. Fail with an explicit message instead of the opaque TypeError
# that Path(None) would raise (or the silent "." an empty value would give).
_data_path_value = os.getenv("DATA_PATH")
if not _data_path_value:
    raise EnvironmentError(
        "DATA_PATH is not set; define it in the environment or in a .env file"
    )
DATA_PATH = Path(_data_path_value)

# only for .ipynb because relative imports don't work:
# switch the working directory to the project root so `src.*` can be imported
root_path = DATA_PATH.parent
os.chdir(str(root_path))
 
import src.training.plotting as p
import src.training.postprocessing as pp
import src.training.pre_training as t

from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


# import models
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Prepare Data

In [None]:
# Load the music feature table through the project's pre-training helper.
# NOTE(review): presumably a pandas DataFrame with a "popularity" column —
# later cells call df.values, df.corr and df["popularity"] on it; confirm.
df = t.get_music_df()
# df2 = t.get_lyric_df()

# print(df1)
# print(df2)

# df = df1.join(df2, on='song_id')
# df = df1.merge(df2, 'ts.song_id')

In [None]:
# Feature matrix: the first N_FEATURE_COLUMNS columns of the frame, selected
# by position from its array form.
# NOTE(review): positional slicing assumes the target column "popularity" is
# not among those columns — confirm the column order of df.
N_FEATURE_COLUMNS = 15
X = df.to_numpy()[:, :N_FEATURE_COLUMNS]
# X = df[["explict", "danceability", "energy", "loadness", "mode", "speechiness", "acousticness", "instrumentalness", "liveness", "valence"]]
# X = df[["danceability", "energy", "loadness"]]

# Target: every popularity value passed through the project's encoder.
y = df["popularity"].apply(t.encode_popularity)

In [None]:
# Random undersampling (RUS) of the features and the encoded popularity.
# The sampler is seeded so the same rows are selected on every run.
under_sampler = RandomUnderSampler(random_state=42)
X, y = under_sampler.fit_resample(X, y)

In [None]:
# PCA feature selection
# Column labels of the feature matrix; only the disabled PCA report uses them.
cols = pd.DataFrame(X).columns

# Standardization of X
# NOTE(review): the scaler is fitted on all samples before the train/test
# split performed in a later cell, so test rows influence the scaling
# statistics — consider fitting on the training split only.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# # Dimensionality Reduction
# reduce_to = 15
# pca = PCA(n_components=reduce_to)

# pca.fit(X, y)
# X = pca.transform(X)

# print("Amount explained:", sum(pca.explained_variance_ratio_))
# print("Amount explained in each PC:", pca.explained_variance_ratio_)

# descr = ["PC-" + str(x) for x in range(1, reduce_to + 1)]
# print(pd.DataFrame(pca.components_, columns=cols, index=descr))

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the
# shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# print(X_train.shape)
# print(X_test.shape)
# print(X_test)

## Feature Selection

In [None]:
# Pearson Correlation Coefficient
# Pairwise linear correlation between the columns of df, drawn as a heatmap.
pear_corr = df.corr(method='pearson')

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
heatmap = corr_ax.imshow(pear_corr, cmap='hot')

# Label both axes with the column names so each cell of the matrix can be
# attributed to a feature pair.
tick_positions = list(range(len(pear_corr.columns)))
corr_ax.set_xticks(tick_positions)
corr_ax.set_xticklabels(pear_corr.columns, rotation=90)
corr_ax.set_yticks(tick_positions)
corr_ax.set_yticklabels(pear_corr.index)
corr_ax.set_title("Pearson correlation between features")

# The colour bar maps colours back to correlation values.
corr_fig.colorbar(heatmap, ax=corr_ax)
plt.show()

In [None]:
# TODO use correct X, y values
# X_1, y_1 = RandomUnderSampler(random_state=42).fit_resample(df, y)

# # Scale features
# max_db = X_1['loadness'].max()
# min_db = X_1['loadness'].min()
# X_1['loadness'] = X_1['loadness'].apply(lambda x: abs(x/40))

# # Drop features with range outside [0, 1]
# X_1 = pd.DataFrame(X_1).drop(['key', 'time_signature', 'release_year', 'duration_ms', 'tempo'], axis=1)

# fig = plt.figure(figsize = (20, 25))
# j = 0
# for i in pd.DataFrame(X_1).columns:
#     plt.subplot(6, 4, j+1)
#     j += 1

#     sns.kdeplot(pd.DataFrame(X_1).query("popularity == 0")[i], color='b', label='pop=0')
#     sns.kdeplot(pd.DataFrame(X_1).query("popularity == 1")[i], color='#000000', label='pop=1')
#     sns.kdeplot(pd.DataFrame(X_1).query("popularity == 2")[i], color='#ff5959', label='pop=2')
#     sns.kdeplot(pd.DataFrame(X_1).query("popularity == 3")[i], color='#fffd86', label='pop=3')
#     sns.kdeplot(pd.DataFrame(X_1).query("popularity == 4")[i], color='#a7e81c', label='pop=4')
#     sns.kdeplot(pd.DataFrame(X_1).query("popularity == 5")[i], color='#65bf65', label='pop=5')
#     plt.legend(loc='best')
#     plt.ylim(0, 17)
#     plt.xlim(0, 1)

# fig.suptitle('Density Analysis')
# fig.tight_layout()
# fig.subplots_adjust(top=0.95)
# plt.show()

## Gaussian Naive Bayes

In [None]:
print("Gaussian Naive Bayes")

# Create the classifier and train it on the training split in one step
# (fit returns the estimator itself).
gaussian_clf = GaussianNB().fit(X_train, y_train)

# Hand the fitted model and the held-out split to the project's metric printer.
pp.print_metrics(gaussian_clf, X_test, y_test)

## SVM

In [None]:
# print("SVC")
# svc_clf = SVC()

# # fit the model
# svc_clf.fit(X_train, y_train)

# pp.print_metrics(svc_clf, X_test, y_test)

## Neural Network

In [None]:
print("Neural Network")
# Weight initialisation and batch shuffling are random; pin the seed (same
# value as the sampler and the split) so the printed metrics are reproducible.
nn_clf = MLPClassifier(random_state=42)

# fit the model
nn_clf.fit(X_train, y_train)

# Hand the fitted model and the held-out split to the project's metric printer.
pp.print_metrics(nn_clf, X_test, y_test)

## K-Neighbours Classifier

In [None]:
print("K-Neighbours Classifier")

# Default-configured k-nearest-neighbours model, trained on the training
# split (fit returns the estimator itself).
knn_clf = KNeighborsClassifier().fit(X_train, y_train)

# Hand the fitted model and the held-out split to the project's metric printer.
pp.print_metrics(knn_clf, X_test, y_test)

## Decision Trees

In [None]:
print("Decision Trees")
# Tree construction breaks ties between equally good splits at random; pin
# the seed (same value as the sampler and the split) so results are repeatable.
dt_clf = DecisionTreeClassifier(random_state=42)

# fit the model
dt_clf.fit(X_train, y_train)

# Hand the fitted model and the held-out split to the project's metric printer.
pp.print_metrics(dt_clf, X_test, y_test)

## Random Forest

In [None]:
# Forest sizes to compare (the number of trees is the hyperparameter under study).
forest_size = [10, 20, 50, 100, 250, 500, 1000]

# Seed NumPy's global generator so the forests' randomness yields comparable
# results on every execution.
np.random.seed(500)

# NOTE: after the loop `rf` stays bound to the last forest that was fitted,
# i.e. the one built with the final entry of forest_size.
for trees in forest_size:
    print("Predicting with forest size {}".format(trees))

    # Build a forest with the current number of trees and train it
    # (fit returns the estimator itself).
    rf = RandomForestClassifier(n_estimators=trees).fit(X_train, y_train)

    pp.print_metrics(rf, X_test, y_test)
    print("--------\n")

## Result Plotting

In [None]:
# Scatter of the raw "explict" column against the raw popularity values.
# NOTE(review): `y` is rebound here to the un-encoded popularity column and so
# shadows the encoded training target defined earlier; re-running earlier
# cells after this one would pick up the wrong `y`.
x, y = df["explict"], df["popularity"]
title = "Dataset Music V1 + unpredicted popularity"
p.disp_scatter(x, y, "explicit", "popularity", title)

In [None]:
# plt.title("Dataset Music V1 + unpredicted popularity")

# plt.xlabel("popularity")
# plt.ylabel("song count")
# plt.bar(list(set(y_predict)),pp.count_distribution(y_predict))
# plt.show()
# Predict labels for the held-out split. `rf` is whatever the random-forest
# loop bound last, i.e. the forest built with the final entry of forest_size.
y_predict = rf.predict(X_test)


In [None]:
# Confusion matrix
# Row-normalised (normalize="true") confusion matrix of the random forest on
# the held-out split. `rf` is the last forest fitted by the loop over
# forest_size, so the title reports its actual number of trees rather than a
# hard-coded value that can disagree with the plotted model.
fig, cax = plt.subplots(figsize=(10,10)) # subplot for larger size
cax.set_title(
    "Random Forest (size {}) Accuracy - Undersampling".format(rf.n_estimators),
    fontsize=15,
)
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is the replacement when upgrading.
plot_confusion_matrix(estimator=rf, X=X_test, y_true=y_test, cmap=plt.cm.Blues,normalize="true",values_format=".2f",ax=cax)

plt.show()

In [None]:
# Metrics of the fitted k-NN classifier on the held-out split.
m = pp.get_metrics(knn_clf, X_test, y_test)

# Number of test samples per encoded popularity class, ordered by class label.
# Going through a plain array works whether y_test is a NumPy array or a
# (named) pandas Series, unlike grouping a DataFrame by a column called 0.
class_counts = pd.Series(np.asarray(y_test).ravel()).value_counts().sort_index()
y_lst = class_counts.tolist()

# Plot specifications handed to the project's plotting helper. Each entry is
# (plot function, keyword arguments for it, x label, y label, plot name).
# The list is built as a literal: `[].append(...)` returns None, which made
# the subsequent append fail.
plist = [
    (
        plt.scatter,
        # Read the columns straight from df instead of relying on the
        # `x`/`y` globals left behind by an earlier plotting cell.
        {"x": df["explict"], "y": df["popularity"], "s": 5, "alpha": 0.5},
        "xlabel",
        "ylabel",
        "p_name",
    ),
    (
        plt.bar,
        # Bar positions come from the classes actually present, so x and
        # height always have the same length.
        {"x": class_counts.index.tolist(), "height": y_lst},
        "popularity",
        "song count",
        "Dataset Music V1 + unpredicted popularity",
    ),
]

p.plots_from_list(m, plist, "music", "test_plots_from_list_16")
