# Ensemble Model Training

# Imports

In [None]:
import os
from pathlib import Path

from dotenv import load_dotenv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read settings (DATA_PATH in particular) from a local .env file, if present.
load_dotenv()

# DATA_PATH must be configured; fail with an actionable message rather than
# the opaque TypeError that Path(None) would raise.
data_path_env = os.getenv("DATA_PATH")
if data_path_env is None:
    raise RuntimeError("DATA_PATH is not set; define it in the environment or in .env")
DATA_PATH = Path(data_path_env)

# only for .ipynb because relative imports don't work:
# switch the working directory to the parent of DATA_PATH
# (presumably the project root, so that the `src` package below resolves).
root_path = DATA_PATH.parent
os.chdir(str(root_path))

import src.training.postprocessing as pp
import src.training.pre_training as t
import src.training.plotting as p

from sklearn.metrics import recall_score, precision_score, accuracy_score, plot_confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# sklearn ensemble
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

# import models
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Data preparation and feature assignment

In [None]:
# Load the full dataset through the project helper in src.training.pre_training
# (presumably a pandas DataFrame, given the .drop/.iloc calls applied to it later).
df_complete = t.get_complete_df()

## Encode all features

In [None]:
# Drop identifier / bookkeeping columns so only the target and candidate
# features remain, then show the surviving column names.
df = df_complete.drop(['id', 'name', 'artist_id', 'generation', 'song_id', 'primary_artist_id', 'lyrics_skipped', 'lyrics_stored'], axis=1)
print(list(df))

# Song popularity: first column, passed through t.binary_popularity
# (presumably maps the score to a 0/1 label - confirm in pre_training).
# This column is used as the target further down.
df.iloc[:, 0] = df.iloc[:, 0].apply(t.binary_popularity)

# Artist popularity: second-to-last column, encoded with the same helper.
df.iloc[:, -2] = df.iloc[:, -2].apply(t.binary_popularity)

# Artist genres: last column, encoded by t.encode_genre
# (encoding scheme not visible here - see pre_training).
df.iloc[:, -1] = df.iloc[:, -1].apply(t.encode_genre)

# Rescale follower counts so that the largest value becomes 100.
max_followers = df["followers"].max()
df["followers"] = df["followers"].apply(lambda x: x / (max_followers / 100))

# Balance the classes by random undersampling (seeded): column 0 is the
# target, every other column is a feature.
# NOTE(review): scaling and resampling both happen before the train/test
# split, so the test set is balanced and scaled with full-data statistics.
X_all, y = RandomUnderSampler(random_state=42).fit_resample(df.iloc[:, 1:], df.iloc[:, 0])
# Print the class distribution after resampling (format defined by pp.count_distribution).
print(pp.count_distribution(y))

In [None]:
# pear_corr = df.corr(method='pearson')
# plt.imshow(pear_corr, cmap='PuBuGn', extent=["test", "r", "t", "s"])
# plt.show()
# # print(pear_corr)

## Train-Test-Split

In [None]:
# Hold out 20% of the resampled data for testing; the seed keeps the split repeatable.
split_parts = train_test_split(X_all, y, test_size=0.2, random_state=42)
X_train_all, X_test_all, y_train, y_test = split_parts

# Feature-group name -> matching slice of the train / test matrix
# (populated in the following cell).
X_train_dict = {}
X_test_dict = {}

## Split Dataframe into model types

In [None]:
# Positional column ranges of each feature group inside X_train_all / X_test_all.
# NOTE(review): the music range starts at column 1, so column 0 belongs to no
# group except 'all' - confirm this is intended.
feature_slices = {
    'music': slice(1, 15),     # music features
    'lyrics': slice(15, 19),   # lyrics features
    'artist': slice(-3, None), # artist features (last three columns)
}

for group, cols in feature_slices.items():
    X_train_dict[group] = X_train_all.iloc[:, cols]
    X_test_dict[group] = X_test_all.iloc[:, cols]

# all features: the unsliced matrices
X_train_dict['all'] = X_train_all
X_test_dict['all'] = X_test_all

In [None]:
# Pearson correlation coefficient, computed pairwise over the columns of df
pear_corr = df.corr(method='pearson')

# Draw the correlation matrix as a heatmap using the 'coolwarm' colormap.
# No tick labels are set, so cells are identified by position only.
plt.imshow(pear_corr, cmap='coolwarm')

# Models

In [None]:
# List of (fitted classifier, feature-group key) pairs; the key selects the
# matching entry of X_test_dict when the classifiers are asked to predict.
clf_list = []

### Music Model

In [None]:
# Random forest trained on the music feature group only.
# The seed is pinned to the same value (42) used for undersampling and the
# train/test split, so re-running the notebook reproduces the same model.
rf = RandomForestClassifier(n_estimators=250, random_state=42)
rf.fit(X_train_dict['music'], y_train)

# Register the model together with the feature group it must be fed.
clf_list.append((rf, 'music'))

### Lyrics Model

In [None]:
# Gaussian naive Bayes trained on the lyrics feature group only
# (fit() returns the estimator itself, so construction and training chain).
gaussian_clf = GaussianNB().fit(X_train_dict['lyrics'], y_train)

# Register the model together with the feature group it must be fed.
clf_list += [(gaussian_clf, 'lyrics')]

### Artist Model

In [None]:
# 5-nearest-neighbours classifier trained on the artist feature group only
# (fit() returns the estimator itself, so construction and training chain).
knn_clf = KNeighborsClassifier(n_neighbors=5).fit(X_train_dict['artist'], y_train)

# Register the model together with the feature group it must be fed.
clf_list += [(knn_clf, 'artist')]

### Complete Model

In [None]:
# Random forest trained on every feature at once (`nn_clf` is kept as an
# alias of the same object). The seed matches the random_state=42 used for
# undersampling and the train/test split so the run is reproducible.
complete_clf = nn_clf = RandomForestClassifier(n_estimators=100, random_state=42)
complete_clf.fit(X_train_dict['all'], y_train)

# NOTE(review): this REPLACES clf_list instead of appending to it, so the
# "voting" below uses only the complete model and the per-group models are
# discarded. Switch to clf_list.append((complete_clf, 'all')) if the full
# ensemble is meant to vote.
clf_list = [(complete_clf, 'all')]

# Simulate Voting Classifier

In [None]:
print(clf_list)

# One row of hard predictions per registered classifier, each classifier
# being fed the test columns of its own feature group.
per_model_preds = [model.predict(X_test_dict[group]) for model, group in clf_list]
predictions = np.asarray(per_model_preds)

# Average the votes across classifiers (uniform when weights is None) and
# round the mean to obtain the final label.
weights = None
pred_avg = np.average(predictions, axis=0, weights=weights).round()

# Report the headline metrics, rounded to four decimals.
acc = round(accuracy_score(y_test, pred_avg), 4)
prec = round(precision_score(y_test, pred_avg, average="weighted"), 4)
rec = round(recall_score(y_test, pred_avg, average="weighted"), 4)
print(f"Weighted accuracy: {acc}")
print(f"Weighted precision: {prec}")
print(f"Weighted recall: {rec}")

# Model Evaluation

## Confusion Matrix

In [None]:
# Scaled conf matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from mpl_toolkits.axes_grid1 import ImageGrid
import itertools

# Row-normalised confusion matrix: each row (true class) sums to 1.
cm = confusion_matrix(y_test, pred_avg, normalize='true')

# Tick labels for the binary task.
# (for the five-class variant use: classes = ["1", "2", "3", "4", "5"])
classes = ["0", "1"]
fig, ax = plt.subplots(1, 1)

# Fix the colour scale to [0, 1] so plots from different runs are comparable.
im = ax.imshow(cm, vmin=0, vmax=1, cmap="Blues")
ax.set_title("Random forest on complete dataset")
tick_marks = np.arange(len(classes))
ax.set_xticks(tick_marks)
ax.set_xticklabels(classes)
ax.set_yticks(tick_marks)
ax.set_yticklabels(classes)

# Write each cell's value into the plot; switch to white text on the
# darkest cells so the number stays readable.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    text_color = "white" if cm[i, j] > 0.8 else "black"
    ax.text(j, i, format(cm[i, j], '.2f'),
            horizontalalignment="center",
            color=text_color)

ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')

# Build the colorbar from the drawn image and attach it to its Axes.
# A free-standing ScalarMappable passed to plt.colorbar() has no Axes to
# take space from, which recent Matplotlib versions reject. The image
# already carries the same "Blues" colormap and 0..1 limits.
fig.colorbar(im, ax=ax)

plt.show()

## MDI

In [None]:
# Mean decrease in impurity (MDI): the forest-level importances, plus the
# spread of each feature's importance across the individual trees.
importances = complete_clf.feature_importances_
std = np.std([
    tree.feature_importances_ for tree in complete_clf.estimators_], axis=0)

# Display names: fix the "loadness" typo and disambiguate the artist
# popularity column.
display_names = {"loadness": "loudness", "popularity": "artist_popularity"}
feature_list = [display_names.get(feat, feat) for feat in list(X_all)]

# Sort by importance (descending) using ONE positional order for values,
# labels and error bars. Sorting only the Series while passing the unsorted
# `std` as yerr would draw each error bar on the wrong feature.
order = np.argsort(importances)[::-1]
forest_importances = pd.Series(
    importances[order], index=[feature_list[i] for i in order])

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std[order], ax=ax)
ax.set_title("Feature importances using MDI on V2")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

## MDA

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance (mean decrease in accuracy) of the complete model
# on the held-out test set: 10 shuffles per feature, seeded, 2 workers.
result = permutation_importance(
    complete_clf, X_test_dict['all'], y_test, n_repeats=10, random_state=42, n_jobs=2)

# Display names: fix the "loadness" typo and disambiguate the artist
# popularity column.
display_names = {"loadness": "loudness", "popularity": "artist_popularity"}
feature_list = [display_names.get(feat, feat) for feat in list(X_all)]

# Sort by mean importance (descending) using ONE positional order for
# values, labels and error bars. Sorting only the Series while passing the
# unsorted importances_std as yerr would misplace the error bars.
order = np.argsort(result.importances_mean)[::-1]
forest_importances = pd.Series(
    result.importances_mean[order], index=[feature_list[i] for i in order])

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std[order], ax=ax)
ax.set_title("Feature importances using MDA on V2 binary")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of the predicted-label distribution on the test set.
fig, ax = plt.subplots(figsize=(5, 5))
ax.set_title("Distribution of predicted popularity")
ax.set_xlabel("popularity")
ax.set_ylabel("count")

# x positions: the distinct predicted labels; heights: pp.count_distribution.
# NOTE(review): assumes count_distribution returns its counts in the same
# order as iterating set(pred_avg) - confirm in postprocessing.
distinct_labels = list(set(pred_avg))
label_counts = pp.count_distribution(pred_avg)
ax.bar(distinct_labels, label_counts)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.utils.multiclass import unique_labels

# Build the matrices directly from (y_true, y_pred). The previous
# identity-classifier workaround around plot_confusion_matrix handed y_test
# in as X and pred_avg as y_true, so the plotted matrix was transposed and
# the "true" normalisation ran over predictions. plot_confusion_matrix was
# also removed in scikit-learn 1.2.
cm_labels = unique_labels(y_test, pred_avg)

# First figure: normalised over the true labels; second: raw counts.
for cm_normalize in ("true", None):
    fig, cax = plt.subplots(figsize=(8, 8))  # subplot for larger size
    cax.set_title("Ensemble Classifier", fontsize=15)
    ensemble_cm = confusion_matrix(
        y_test, pred_avg, labels=cm_labels, normalize=cm_normalize)
    ConfusionMatrixDisplay(ensemble_cm, display_labels=cm_labels).plot(
        cmap=plt.cm.Blues, values_format=".2f", ax=cax)
    plt.show()


# Impact of artist popularity on song popularity

## Take songs by popular artists from X_test and set artist popularity to 0

In [None]:
# Songs from popular artists (artist popularity flag > 0).
# .copy() makes the subset an independent frame, so the assignment below
# is unambiguous: no SettingWithCopyWarning and X_test_all stays untouched.
X_test_popular = X_test_all[X_test_all['popularity'] > 0].copy()

# set popularity to 0
X_test_popular.loc[:, 'popularity'] = 0

## Overwrite test dict

In [None]:
# Re-point every feature-group view at the modified popular-artist subset,
# using the same positional column ranges as the original split.
group_columns = (
    ('music', slice(1, 15)),
    ('lyrics', slice(15, 19)),
    ('artist', slice(-3, None)),
)
for group, cols in group_columns:
    X_test_dict[group] = X_test_popular.iloc[:, cols]

X_test_dict['all'] = X_test_popular

# Sanity check: number of feature columns in the subset.
print(len(X_test_popular.columns))

## Predict popularity again

In [None]:
# Re-run every registered classifier on the modified subset held in X_test_dict.
predictions_new = np.asarray([clf.predict(X_test_dict[X_type]) for clf, X_type in clf_list])

weights = None
pred_avg_new = np.average(predictions_new, axis=0, weights=weights).round()

# pred_avg_new only covers the test rows whose artist popularity was > 0,
# so labels and the earlier predictions are restricted to the same rows
# before being compared.
popular_mask = np.asarray(X_test_all['popularity'] > 0)
y_test_popular = np.asarray(y_test)[popular_mask]

# Before / after predictions for the same first ten songs.
print(list(pred_avg[popular_mask])[:10])
print(pred_avg_new[:10])

# Score the NEW predictions (the earlier version re-scored the stale
# pred_avg against the full y_test, repeating the numbers printed above).
print("Weighted accuracy: " + str(round(accuracy_score(y_test_popular, pred_avg_new), 4)))
print("Weighted precision: " + str(round(precision_score(y_test_popular, pred_avg_new, average="weighted"), 4)))

## Check for differences in prediction

In [None]:
# pred_avg_new holds one prediction per test row with artist popularity > 0.
# Select the true labels of exactly those rows so that labels and
# predictions line up. Filtering y_test on its own value instead pairs
# predictions with unrelated songs.
popular_mask = np.asarray(X_test_all['popularity'] > 0)
y_test_popular = np.asarray(y_test)[popular_mask]

# (true label, new prediction) pairs for the songs that are truly popular
zipped = [(true, pred) for true, pred in zip(y_test_popular, pred_avg_new) if true == 1]

# list of popular songs that are predicted unpopular
l = [true for true, pred in zipped if pred == 0]

print("Popular predictions", len(l), "/", len(zipped))

In [None]:
# Bar chart of the predicted-label distribution after zeroing artist popularity.
fig, ax = plt.subplots(figsize=(5, 5))
ax.set_title("Distribution of predicted popularity")
ax.set_xlabel("popularity")
ax.set_ylabel("count")

# x positions: the distinct predicted labels; heights: pp.count_distribution.
# NOTE(review): assumes count_distribution returns its counts in the same
# order as iterating set(pred_avg_new) - confirm in postprocessing.
distinct_labels_new = list(set(pred_avg_new))
label_counts_new = pp.count_distribution(pred_avg_new)
ax.bar(distinct_labels_new, label_counts_new)