## Imports

In [109]:
import pandas as pd
import numpy as np

import os
import sys
from pathlib import Path
from dotenv import load_dotenv
from typing import List

load_dotenv()
DATA_PATH = Path(os.getenv("DATA_PATH"))

# only for .ipynb because relative imports don't work
root_path = (DATA_PATH.parent) 
os.chdir(str(root_path))
 
import src.training.pre_training as t
import src.training.postprocessing as pp

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import plot_confusion_matrix

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Preprocessing

## Load Data

In [110]:
df = t.get_artist_df()

# scaling
max_followers = df["followers"].max()
df["followers"] = df["followers"].apply(lambda x: x / (max_followers / 100))
df["genre_name"] = df["genre_name"].apply(t.encode_genres)

## Split data (X,y)

In [111]:
y = df["popularity"].apply(t.encode_popularity)
X = df.values[:, :2]

# print(X)
# print(y)

## Over-/Undersampling

In [112]:
# sampled and encoded popularity
X, y = RandomUnderSampler(random_state=42).fit_resample(X, y)
# X, y = RandomOverSampler(random_state=42).fit_resample(X, y)

In [113]:
# Plot distr
# fig = plt.figure(figsize=(5,5))
# ax = fig.add_subplot(111)
# ax.set_title("pop distr")
# ax.set_xlabel("popularity")
# ax.set_ylabel("count")

# plt.bar(list(set(y)), pp.count_distribution(y))

# print(pd.DataFrame(y).value_counts())


## Train/Test-Split

In [114]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(X_train.shape)
print(X_test.shape)

(200, 2)
(50, 2)


# Classification

## Gaussian Naive Bayes

In [115]:
print("Gaussian Naive Bayes")
gaussian_clf = GaussianNB()

# fit the model
gaussian_clf.fit(X_train, y_train)

pp.print_metrics(gaussian_clf, X_test, y_test)

Gaussian Naive Bayes
Weighted accuracy: 0.6
Weighted f1: 0.6007
Weighted recall: 0.6
Weighted precision: 0.6724
Contained classes in prediction: {1, 2, 3, 4, 5}
Contained classes in test: {1, 2, 3, 4, 5}


## SVM

In [116]:
print("SVC")
svc_clf = SVC()

# fit the model
svc_clf.fit(X_train, y_train)

pp.print_metrics(svc_clf, X_test, y_test)

SVC
Weighted accuracy: 0.38
Weighted f1: 0.3251
Weighted recall: 0.38
Weighted precision: 0.2857
Contained classes in prediction: {1, 3, 4, 5}
Contained classes in test: {1, 2, 3, 4, 5}


  _warn_prf(average, modifier, msg_start, len(result))


## Neural Network

In [117]:
print("Neural Network")
nn_clf = MLPClassifier()

# fit the model
nn_clf.fit(X_train, y_train)

pp.print_metrics(nn_clf, X_test, y_test)

Neural Network
Weighted accuracy: 0.48
Weighted f1: 0.4613
Weighted recall: 0.48
Weighted precision: 0.4633
Contained classes in prediction: {1, 2, 3, 4, 5}
Contained classes in test: {1, 2, 3, 4, 5}




## K-Neighbours Classifier

In [118]:
print("K-Neighbours Classifier")
knn_clf = KNeighborsClassifier()

# fit the model
knn_clf.fit(X_train, y_train)

pp.print_metrics(knn_clf, X_test, y_test)

K-Neighbours Classifier
Weighted accuracy: 0.58
Weighted f1: 0.577
Weighted recall: 0.58
Weighted precision: 0.602
Contained classes in prediction: {1, 2, 3, 4, 5}
Contained classes in test: {1, 2, 3, 4, 5}


## Decision Trees

In [119]:
print("Decision Trees")
dt_clf = DecisionTreeClassifier()

# fit the model
dt_clf.fit(X_train, y_train)

pp.print_metrics(dt_clf, X_test, y_test)

Decision Trees
Weighted accuracy: 0.66
Weighted f1: 0.6436
Weighted recall: 0.66
Weighted precision: 0.6676
Contained classes in prediction: {1, 2, 3, 4, 5}
Contained classes in test: {1, 2, 3, 4, 5}


## Random forest

In [121]:
# use different number of trees in forest (comparing different hyperparameters)
forest_size = [10,20,50,100,250]

# set seed for random state to get compareable results in every execution (forest randomness)
np.random.seed(500)

for trees in forest_size:
    # set forest size
    print("Predicting with forest size " + str(trees))
    rf = RandomForestClassifier(n_estimators=trees)

    # fit the model
    rf.fit(X_train, y_train)

    pp.print_metrics(rf, X_test, y_test)
    print("--------\n")

Predicting with forest size 10
Weighted accuracy: 0.6
Weighted f1: 0.6061
Weighted recall: 0.6
Weighted precision: 0.6503
Contained classes in prediction: {1, 2, 3, 4, 5}
Contained classes in test: {1, 2, 3, 4, 5}
--------

Predicting with forest size 20
Weighted accuracy: 0.66
Weighted f1: 0.6565
Weighted recall: 0.66
Weighted precision: 0.7313
Contained classes in prediction: {1, 2, 3, 4, 5}
Contained classes in test: {1, 2, 3, 4, 5}
--------

Predicting with forest size 50
Weighted accuracy: 0.6
Weighted f1: 0.5981
Weighted recall: 0.6
Weighted precision: 0.6415
Contained classes in prediction: {1, 2, 3, 4, 5}
Contained classes in test: {1, 2, 3, 4, 5}
--------

Predicting with forest size 100
Weighted accuracy: 0.64
Weighted f1: 0.6409
Weighted recall: 0.64
Weighted precision: 0.7062
Contained classes in prediction: {1, 2, 3, 4, 5}
Contained classes in test: {1, 2, 3, 4, 5}
--------

Predicting with forest size 250
Weighted accuracy: 0.64
Weighted f1: 0.6409
Weighted recall: 0.64
W

In [142]:
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, BaggingClassifier, HistGradientBoostingClassifier

ens_clf = VotingClassifier(estimators=[
    ('gauss', GaussianNB()), ('knn', KNeighborsClassifier()), ('rf', RandomForestClassifier(n_estimators=1000))
], voting='hard')

# ens_clf = BaggingClassifier(base_estimator=GaussianNB(),
#     n_estimators=5000, random_state=42)

ens_clf.fit(X_train, y_train)

pp.print_metrics(ens_clf, X_test, y_test)

Weighted accuracy: 0.62
Weighted f1: 0.6208
Weighted recall: 0.62
Weighted precision: 0.6862
Contained classes in prediction: {1, 2, 3, 4, 5}
Contained classes in test: {1, 2, 3, 4, 5}


# Model Evaluation

## Store model

In [None]:
pp.store_model_to_file(rf, "rf_size=100_prec=70", "artist")

## Plotting

In [None]:
import matplotlib.pyplot as plt

plt.title("Dataset Artists V1 + unpredicted popularity")
plt.xlabel("popularity")
plt.ylabel("artist count")

plt.bar(list(set(y_test)), pp.count_distribution(y_test))
plt.show()

In [None]:
# Confusion matrix
fig, cax = plt.subplots(figsize=(16, 16)) # subplot for larger size
# plot_confusion_matrix(estimator=knn_clf, X=X_test, y_true=y_test, cmap=plt.cm.Blues,normalize="true",values_format=".2f",ax=cax)
plot_confusion_matrix(estimator=knn_clf, X=X_test, y_true=y_test, cmap=plt.cm.Blues,normalize=None,values_format=".2f",ax=cax)

plt.show()