# Projet de Machine Learning

Notebook Python avec les codes utilisés pour le rapport final. 

## Importation des librairies 

In [None]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
import os

from functions import *
from tensorflow import keras

%matplotlib inline
# Matplotlib defaults first, then the seaborn theme applied on top of them.
# NOTE(review): sns.set() re-applies seaborn's plotting context, which
# presumably overrides the font.size chosen here — confirm.
plt.rcParams.update({'figure.figsize': (15, 6), 'font.size': 16})
sns.set(style="darkgrid", rc={'figure.figsize': (15, 6)})

# Folder that receives the exported figures (presumably the one save_fig
# writes to — verify against functions.py); created on first run.
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
os.makedirs(IMAGES_PATH, exist_ok=True)

In [None]:
# HTML is imported from the public IPython.display module rather than the
# internal IPython.core.display path.
from IPython.display import HTML

# Inject a CSS rule that centres rendered PNG outputs in their output cell.
HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
</style>
""")

# Obtenir les données

In [None]:
spotify_data = pd.read_csv("data/spotify-extr.txt", sep=" ")

# Description de l'ensemble du jeu de données

In [None]:
spotify_data.head()

In [None]:
spotify_data.info()

Les variables explicatives sont :
* `valence` : la positivité de la chanson, vaut 1 si la chanson est très joyeuse, 0 sinon ;
* `year` : année de sortie ;
* `acousticness` : mesure "l'acousticité" de la chanson ;
* `danceability` : mesure la "dançabilité" d'une chanson ;
* `duration` : durée d'une chanson en millisecondes ;
* `energy` : l'énergie de la chanson, vaut 1 si la chanson est très énergétique, 0 sinon ;
* `instrumentalness` : taux d'instrumentalisation, vaut 1 s'il n'y a aucune voix présente dans la chanson, 0 sinon ;
* `key` : tonalité de la musique (ex : A=la), ne prend pas en compte la distinction majeur/mineur ;
* `liveness` : taux de prestation en live, vaut 1 si la chanson ne comporte que de la musique (sans sons à intérêts non-musicaux), 0 sinon ;
* `loudness` : intensité sonore de la chanson
* `mode` : variable binaire qui indique si la chanson commence par une progression d'accords majeure (1) ou non (0)
* `speechiness` : taux de vocaux dans la chanson, vaut 1 si la chanson comporte de la voix tout le long, 0 sinon ;
* `tempo` :  tempo de la chanson en beats par minute (bpm)

Notre objectif consiste à prédire la valeur de `pop.class` et de `popularity`, c'est-à-dire la popularité d'une chanson, soit comme un entier entre 0 et 100, soit comme une classe $A$, $B$, $C$ ou $D$.

In [None]:
spotify_data.describe()

In [None]:
# Separate frame holding only the three qualitative columns. It is built before
# the categorical conversion below, so it keeps the dtypes read from the file.
# Later cells reuse it for the class counts and for the qualitative column names.
data_qual = spotify_data[["pop.class", "mode", "key"]]
data_qual.head()

On transforme les variables qualitatives en catégories pour mieux traiter les données.

In [None]:
# `key` and `mode` are nominal: no order between their values.
spotify_data["key"] = pd.Categorical(spotify_data["key"], ordered=False)
spotify_data["mode"] = pd.Categorical(spotify_data["mode"], ordered=False)
# `pop.class` is ordinal; the category list fixes the order D < C < B < A.
spotify_data["pop.class"] = pd.Categorical(spotify_data["pop.class"],
                                           ordered=True,
                                           categories=['D', 'C', 'B', 'A'])

In [None]:
spotify_data.dtypes

# Analyses uni et multidimensionnelles

## Variables qualitatives

On commence par analyser les variables qualitatives `pop.class`, `key` et `mode`.

<b>Classe de popularité</b>

Cette variable a été créée en amont de l'obtention des données. C'est notre variable à prédire en classification.

In [None]:
# value_counts() sorts by decreasing frequency; [::-1] reverses that order, so
# the bars go from the rarest class to the most frequent one.
pop_class_count = data_qual["pop.class"].value_counts().iloc[::-1]

plt.figure(figsize=(12, 6))
sns.barplot(x=pop_class_count.index, y=pop_class_count.values)
#plt.title("Fréquence des classes de popularité", fontsize=14)
plt.ylabel("Nombre d'occurences", fontsize=13)
plt.xlabel("Classe", fontsize=13)
save_fig("pop_class_frequencies")
plt.show()

<b>Clé</b>

In [None]:
fig, ax = plt.subplots()

# Share of each key in percent, sorted from the rarest to the most frequent.
key_count = spotify_data['key'].value_counts(
    normalize=True, sort=True, ascending=True) * 100
# The labels come from the SAME series as the bar lengths. They used to be
# built from a second value_counts() call sorted the other way round, which
# paired every percentage with the wrong key.
y_ticks = key_count.index.astype(str)

# `order` pins the bar order to the sorted labels.
sns.barplot(x=key_count.values, y=y_ticks, order=list(y_ticks),
            orient='h', ax=ax)
ax.set_xlabel("% d'occurences", fontsize=12, weight='bold')
ax.set_ylabel('Clé', fontsize=12, weight='bold')
ax.set_xticks(ticks=range(0, 16, 1))
# Only resize the tick labels; their text is already correct.
ax.tick_params(axis='y', labelsize=12)

# Write the percentage just to the right of each bar.
rects = ax.patches
for rect in rects:
    x_value = rect.get_width()
    y_value = rect.get_y() + rect.get_height() / 2
    label = f'{x_value:.1f}%'

    ax.annotate(label, (x_value, y_value),
                xytext=(5, 0),
                textcoords="offset points",
                va='center',
                ha='left')

#plt.title("Distribution de 'key'", fontsize=14)
save_fig('keys_frequencies')
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.barplot(x='key', y='popularity', data=spotify_data)
#plt.title("Popularité selon la clé", fontsize=14)
plt.ylabel("Popularité")
plt.xlabel("Clé")
save_fig("popularity_by_key")
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.boxplot(x='key', y='popularity', data=spotify_data)
#plt.title("Popularité selon la clé", fontsize=14)
plt.ylabel("Popularité")
plt.xlabel("Clé")
#save_fig("popularity_by_key")
plt.show()

<b>Mode</b>

In [None]:
mode_count = spotify_data["mode"].value_counts()

plt.figure(figsize=(12, 6))
sns.barplot(x=mode_count.index, y=mode_count.values)
#plt.title("Fréquence des modes", fontsize=14)
plt.ylabel("Nombre d'occurences", fontsize=13)
plt.xlabel("Mode", fontsize=13)
save_fig("mode_frequencies")
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.barplot(x='mode', y='popularity', data=spotify_data)
#plt.title("Fréquence des modes", fontsize=14)
plt.ylabel("Popularité selon le mode", fontsize=13)
plt.xlabel("Mode", fontsize=13)
save_fig("popularity_by_mode")
plt.show()

In [None]:
# Distribution of popularity for each value of `mode`.
plt.figure(figsize=(12, 6))
sns.boxplot(x='mode', y='popularity', data=spotify_data)
#plt.title("Popularité selon le mode", fontsize=14)
plt.ylabel("Popularité")
# The x-axis shows `mode`; the former "Clé" label was a copy-paste leftover
# from the key boxplot.
plt.xlabel("Mode")
#save_fig("popularity_by_mode_boxplot")
plt.show()

On regroupe toutes les variables qualitatives en un barplot :

In [None]:
sns.barplot(x='mode', y='popularity', hue='key', data=spotify_data)
#plt.title("Popularité selon la clé et le mode", fontsize=14)
save_fig("popularity_by_key_and_mode")
plt.show()

In [None]:
sns.boxplot(x='mode', y='popularity', hue='key', data=spotify_data)
#plt.title("Popularité selon la clé et le mode", fontsize=14)
#save_fig("popularity_by_key_and_mode")
plt.show()

## Variables quantitatives

On commence par visualiser la corrélation entre les variables quantitatives :

In [None]:
# Keep every column except the three qualitative ones. sort=False preserves the
# original column order instead of sorting the names alphabetically.
data_quant = spotify_data[spotify_data.columns.difference(
    ['key', 'mode', 'pop.class'], sort=False)]
data_quant.keys()

In [None]:
# Pairwise correlations between the quantitative variables.
corr_matrix = data_quant.corr()
# Diverging palette so that negative and positive correlations get distinct hues.
cmap = sns.diverging_palette(240, 10, as_cmap=True)

plt.figure(figsize=(10, 10))
# The colour scale is pinned to [-1, 1] and centred on 0.
sns.heatmap(corr_matrix, cmap=cmap, center=0, vmin=-1, vmax=1)
#plt.title("Matrice de corrélation")
save_fig("correlation_square_matrix")
plt.show()

Ce graphique nous montre qu'il y a certaines variables qui ont une forte corrélation. Par exemple, il y a une forte corrélation négative entre les variables `energy` et `acousticness`. Cela a du sens vu que les chansons acoustiques sont plus tranquilles (moins énergiques) que celles qui ne sont pas acoustiques. De même, `energy` et `loudness` sont positivement corrélées, ce qui est attendu vu que les chansons bruyantes ont souvent plus d'énergie.
<br>
On voit aussi que plus une chanson est acoustique, moins elle est populaire, vu que les variables `acousticness` et `popularity` ont une forte corrélation négative.

In [None]:
# Rank the variables by the absolute value of their correlation with `popularity`.
series = corr_matrix['popularity'].abs().sort_values(ascending=False)
print("Les variables les plus corrélées avec la variable 'popularity' sont : ")
for name, strength in series.items():
    # Keep only values in [0.2, 1): 1 is popularity against itself, and
    # anything under 0.2 is treated as too weak to report.
    if not (0.2 <= strength < 1):
        continue
    print(f'{name:17} --> {strength: .2f} (abs)')

Voici leurs distributions avec boxplot :

In [None]:
# plt.scatter(spotify_data['energy'], spotify_data['acousticness'], alpha=.5)
# plt.xlabel("Energy")
# plt.ylabel("Acousticness")
# plt.title("Acousticité des chansons en fonction de leur énergie")
# save_fig("acousticness_by_energy")
# plt.show()

In [None]:
# histograms_plot(data_quant, data_quant.columns, 4, 3)
# save_fig("quantitative_data_histograms")
# plt.show()

In [None]:
# 'seaborn-poster' was renamed 'seaborn-v0_8-poster' in matplotlib 3.6 and the
# old name was removed afterwards, so use whichever one this installation has.
for style_name in ('seaborn-v0_8-poster', 'seaborn-poster'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# One panel (boxplot on top of a histogram) per quantitative variable, two
# panels per row. The row count follows the number of columns instead of being
# hard-coded, so the loop can no longer index past the last column.
n_panel_cols = 2
n_panel_rows = max(1, -(-len(data_quant.columns) // n_panel_cols))  # ceiling

fig = plt.figure(figsize=(22, 28))
outer = fig.add_gridspec(n_panel_rows, n_panel_cols, wspace=0.1, hspace=0.5,
                         left=0.03, right=0.98, bottom=0.03, top=0.98)

for a, feature in enumerate(data_quant.columns):
    i, j = divmod(a, n_panel_cols)
    inner = outer[i, j].subgridspec(2, 1, wspace=0.2, hspace=0,
                                    height_ratios=[0.15, 0.85])
    axs = inner.subplots(sharex=True)

    sns.boxplot(data=data_quant, x=feature, orient='h', ax=axs[0])
    # The second column (index 1) gets a finer binning than the others.
    sns.histplot(data=data_quant, x=feature,
                 bins=50 if a != 1 else 100,
                 ax=axs[1], kde=True)

    # Draw a black frame around the stacked pair of axes.
    for side in ('top', 'right', 'left'):
        axs[0].spines[side].set_color('black')

    axs[1].set_title("Distribution de '" + feature + "'", y=1.2, fontsize=14)
    for side in ('bottom', 'right', 'left'):
        axs[1].spines[side].set_color('black')

#fig.suptitle('Distribution des variables quantitatives', y=1.01, fontsize=20)
plt.show()

Voici une étude plus approfondie de chaque variable quantitative :

<b>Acousticness</b>

In [None]:
# Mean popularity for every distinct acousticness value.
ax_data = (spotify_data
           .groupby('acousticness')['popularity']
           .mean()
           .reset_index())
sns.scatterplot(x='acousticness', y='popularity', data=ax_data, color='blue')
#plt.title("Acousticité")
plt.ylabel('Popularité moyenne', fontsize=12)
plt.tight_layout()
plt.show()

<b>Danceability</b>

In [None]:
ax_data = spotify_data.groupby(
    'danceability')['popularity'].mean().to_frame().reset_index()
sns.scatterplot(x='danceability', y='popularity', data=ax_data, color='blue')
#plt.title('Dançabilité')
plt.ylabel('Popularité moyenne', fontsize=12)
plt.tight_layout()
plt.show()

<b>Duration</b>

On convertit la durée des chansons en minutes pour en tirer plus d'informations.

In [None]:
# Milliseconds -> minutes (60 000 ms per minute). The column is overwritten in
# place, so running this cell a second time would divide by 60 000 again.
spotify_data['duration'] = spotify_data['duration'] / 60000
spotify_data['duration'].describe()

In [None]:
spotify_data['duration'].hist(bins=50)
plt.show()

On voit que la chanson la plus longue du jeu de données dure 45 minutes ; on choisit donc de séparer les chansons longues des chansons courtes au seuil de 8 minutes pour mieux visualiser les durées.

In [None]:
long_songs = spotify_data.loc[spotify_data['duration'] > 8]
short_songs = spotify_data.loc[spotify_data['duration'] <= 8]

In [None]:
sns.histplot(short_songs['duration'], kde=False)
#plt.title(f'Chansons courtes (<=8 min) : {short_songs.shape[0]} chansons')
plt.xticks(range(0, 9, 1))
plt.xlim(0, 9)
plt.show()

In [None]:
sns.histplot(long_songs['duration'], kde=False, bins=60)
#plt.title(f'Chansons longues (>8 min) : {long_songs.shape[0]} chansons')
plt.show()

In [None]:
# Two panels sharing the y-axis: short songs on the left, long songs on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

# Mean popularity for every distinct duration among the short songs.
ax1_data = short_songs.groupby(
    'duration')['popularity'].mean().to_frame().reset_index()
ax1 = sns.scatterplot(x='duration',
                      y='popularity',
                      data=ax1_data,
                      color='blue',
                      ax=ax1)
ax1.set_xticks(range(0, 9, 1))
ax1.set_xlim(0, 9)
ax1.set_title('Chansons courtes')

# Same aggregation for the long songs.
ax2_data = long_songs.groupby(
    'duration')['popularity'].mean().to_frame().reset_index()
ax2 = sns.scatterplot(x=ax2_data['duration'],
                      y=ax2_data['popularity'],
                      color='green',
                      ax=ax2)
ax2.set_xticks(range(8, 49, 4))
ax2.set_xlim(6, 50)
ax2.set_title('Chansons longues')
# The y-axis is shared, so it is labelled once, on the left panel.
ax1.set_ylabel('Popularité moyenne', fontsize=12)
plt.tight_layout()
plt.show()

<b>Energy</b>

In [None]:
ax_data = spotify_data.groupby(
    'energy')['popularity'].mean().to_frame().reset_index()
sns.scatterplot(x='energy', y='popularity', data=ax_data, color='blue')
#plt.title('Énergie')
plt.ylabel('Popularité moyenne', fontsize=12)
plt.tight_layout()
plt.show()

<b>Instrumentalness</b>

In [None]:
spotify_data['instrumentalness'].describe()

In [None]:
# Number of tracks whose instrumentalness is exactly 0. This cell used to
# filter on `tempo` (a duplicate of the check in the Tempo section), which did
# not match the discussion of zero instrumentalness that follows.
spotify_data.loc[spotify_data['instrumentalness'] == 0].shape

On transformera la variable `instrumentalness` en trois sous-variables en raison de sa distribution inégale : il y a beaucoup de chansons ayant une valeur nulle d'instrumentalité, ce qui ne correspond pas vraiment à la réalité.

In [None]:
# sns.histplot(spotify_data['instrumentalness'], kde=False, bins=50)
# plt.vlines(0.1, ymin=0, ymax=7000, linestyles='dashed',
#            linewidths=1., color='grey')
# plt.vlines(0.95, ymin=0, ymax=7000, linestyles='dashed',
#            linewidths=1., color='grey')
# plt.annotate('1ère limite (0.10)', xy=(0.1, 6500),
#              xytext=(0.2, 6500),
#              arrowprops=dict(facecolor='black', shrink=0.05))
# plt.annotate('2ème limite (0.95)', xy=(0.95, 6500),
#              xytext=(0.7, 6500),
#              arrowprops=dict(facecolor='black', shrink=0.05))
# plt.show()

In [None]:
# ax_data = spotify_data.groupby(
#     'instrumentalness')['popularity'].mean().to_frame().reset_index()
# sns.scatterplot(x='instrumentalness', y='popularity',
#                 data=ax_data, color='blue')
# #plt.title('Instrumentalité')
# plt.ylabel('Popularité moyenne', fontsize=12)
# plt.tight_layout()
# plt.show()

In [None]:
# criteria = [
#     spotify_data['instrumentalness'].between(0, 0.1),
#     spotify_data['instrumentalness'].between(0.100001, 0.95),
#     spotify_data['instrumentalness'].between(0.950001, 1)
# ]

# values = np.arange(1, 4)
# spotify_data['instrumentalness_criteria'] = np.select(criteria, values, 0)

Puis on supprime la variable `instrumentalness`.

In [None]:
# del spotify_data['instrumentalness']

# spotify_data.head()

In [None]:
# plt.figure(figsize=(8, 6))
# sns.histplot(spotify_data['instrumentalness_criteria'], kde=False, bins=3)
# plt.xticks(np.arange(1, 4))
# plt.show()

In [None]:
# spotify_data['instrumentalness_criteria'].value_counts()

<b>Liveness</b>

In [None]:
ax_data = spotify_data.groupby(
    'liveness')['popularity'].mean().to_frame().reset_index()
sns.scatterplot(x='liveness', y='popularity', data=ax_data, color='blue')
#plt.title('liveness')
plt.ylabel('Mean Popularity', fontsize=12)
plt.tight_layout()
plt.show()

<b>Popularity</b> (variable à prédire)

In [None]:
# Top panel: every track. Bottom panel: only tracks with popularity > 0, to see
# the distribution without the spike at zero.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 11))
ax1 = sns.histplot(spotify_data['popularity'], ax=ax1, bins=50)
ax2 = sns.histplot(spotify_data.loc[spotify_data['popularity'] > 0, 'popularity'],
                   ax=ax2, bins=50)
# Same x-range on both panels so they can be compared directly.
ax1.set_xlim(0, 100)
ax2.set_xlim(0, 100)
ax1.set_xlabel('')
ax1.set_title('Haut : Toutes les données\nBas : Popularité > 0', fontsize=12)
plt.show()

On voit qu'il y a un nombre important de chansons ayant 0 comme popularité. En effet, ces chansons sont sorties peu de temps avant l'extraction de la base de données, et leur popularité n'avait donc pas encore été déterminée.

In [None]:
fig, ax = plt.subplots(figsize=(15, 4))
ax = spotify_data.groupby('year')['popularity'].mean().plot()
ax.set_title('Popularité moyenne au cours des années')
ax.set_ylabel('Popularité moyenne', fontsize=12)
ax.set_xlabel('Année')
ax.xaxis.set_tick_params(labelsize=10)
ax.set_xticks(range(1920, 2021, 5))
plt.show()

<b>Tempo</b>

In [None]:
sns.jointplot(x='tempo', y='popularity', data=spotify_data, height=10)
plt.show()

In [None]:
spotify_data.loc[spotify_data['tempo'] == 0].shape

On voit qu'il y a 13 chansons pour lesquelles `tempo` vaut 0 ce qui n'est pas possible.

In [None]:
# Tempo of the tracks whose tempo is strictly positive, selected with a single
# .loc call (row mask + column label).
corrected_tempo = spotify_data.loc[spotify_data['tempo'] > 0, 'tempo']
corrected_tempo.describe()

In [None]:
# Histogram of tempo with hand-placed annotations.
# NOTE(review): 114.55, 35.37, 214.42 and the count of 13 are hard-coded;
# presumably they are the median, min and max of corrected_tempo and the number
# of zero-tempo rows — confirm, and recompute if the data changes.
fig, ax = plt.subplots(figsize=(15, 6))
ax = sns.histplot(spotify_data['tempo'], bins=200, kde=False)
ax.set_ylabel('Fréquences', fontsize=12)

ax.text(s='13\nOutliers', x=5, y=40, fontdict={'size': 12, 'c': 'darkred'})
ax.text(s='Valeurs sans 0', x=125, y=160,
        fontdict={'size': 12, 'c': 'darkred'})
ax.text(s='Médiane\ncorrigée\n114.55', x=116, y=40,
        fontdict={'size': 10, 'c': 'darkgreen', 'weight': 'bold'})

# Green dashed line at the hard-coded median, orange dashed lines at the
# hard-coded bounds of the non-zero values.
ax.axvline(x=114.55, ymin=0, ymax=0.7, color='green',
           linestyle='dashed', linewidth=2)
ax.axvline(x=35.37, ymin=0, ymax=1, color='orange',
           linestyle='dashed', linewidth=3)
ax.axvline(x=214.42, ymin=0, ymax=1, color='orange',
           linestyle='dashed', linewidth=3)

# Double-headed arrow spanning the two orange lines.
ax.annotate("", xy=(35.37, 150), xytext=(214.42, 150),
            arrowprops=dict(arrowstyle="<->",
                            color='r',
                            linestyle='dashed',
                            linewidth=2))
# Arrow pointing down at the bar located at tempo = 0.
ax.annotate("", xy=(0, 30), xytext=(0, 50),
            arrowprops=dict(arrowstyle="->",
                            color='r',
                            linestyle='dashed',
                            linewidth=3))
plt.show()

<b>Year</b>

In [None]:
sns.jointplot(x='year', y='popularity', data=spotify_data, height=10)
plt.suptitle("Joint plot de la popularité selon l'année de sortie", y=1.02)
save_fig("jointplot_of_popularity_by_year")
plt.show()

Quelques plots de corrélations :

In [None]:
# from pandas.plotting import scatter_matrix

# attributes = ["acousticness", "energy", "loudness", "popularity"]
# scatter_matrix(spotify_data[attributes],
#                alpha=0.2,
#                figsize=(20, 15),
#                diagonal='kde')
# plt.suptitle("Nuage de points de quelques variables", fontsize=20)
# save_fig("scatter_matrix_plot")
# plt.show()

Pour mieux visualiser la durée de la chanson, nous avons décidé de lui appliquer le logarithme naturel afin de réduire les valeurs, tout en gardant l'ordre de croissance.

In [None]:
# spotify_data["log_duration"] = np.log(spotify_data["duration"])
# spotify_data["log_duration"].hist(bins=50)
# plt.title("Log de la durée")
# save_fig("log_of_duration")
# plt.show()

In [None]:
# spotify_data["tempo_norm"] = (
#     spotify_data["tempo"] -
#     spotify_data["tempo"].mean()) / spotify_data["tempo"].std()
# spotify_data["tempo_norm"].hist(bins=50)
# plt.title("Variable 'tempo' normalisée")
# save_fig("scaled_tempo")
# plt.show()

In [None]:
# spotify_data["dance_norm"] = (
#     spotify_data["danceability"] -
#     spotify_data["danceability"].mean()) / spotify_data["danceability"].std()
# spotify_data["dance_norm"].hist(bins=50)
# plt.title("Variable 'danceability' normalisée")
# save_fig("scaled_danceability")
# plt.show()

In [None]:
# del spotify_data["duration"]
# del spotify_data["tempo"]
# del spotify_data["danceability"]
# spotify_data.head()

In [None]:
spotify_data.keys()

Cette cellule prend assez de temps à s'exécuter.

In [None]:
for i in ['key', 'mode']:
    sns.pairplot(spotify_data, hue=i)
    plt.show()

In [None]:
# sns.pairplot(spotify_data)
# #plt.suptitle("Pair plot des données", fontsize=20, y=1.02)
# save_fig("pairplot_of_dataset")
# plt.show()

# ACP

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

# Quantitative predictors for the PCA: every column that is not one of the
# qualitative ones, minus the regression target.
qualitative_columns = set(data_qual.keys())
attributs = [feature for feature in spotify_data.keys()
             if feature not in qualitative_columns]
attributs.remove('popularity')
print(attributs)

In [None]:
# Standardise the quantitative predictors before the PCA.
X_new = spotify_data[attributs]
X_scaled = scale(X_new)
# No n_components is given; C holds the coordinates of the individuals on the
# fitted components.
pca = PCA()
C = pca.fit_transform(X_scaled)

In [None]:
x = np.arange(pca.explained_variance_.size)
cumsum = np.cumsum(pca.explained_variance_ratio_)
var_ratio = pca.explained_variance_ratio_

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20, 10), sharex=True)

# Left panel: share of the variance carried by each principal component.
ax[0].bar(x, var_ratio)
ax[0].plot(var_ratio, color='black')
ax[0].set_ylabel("Pourcentage de la variance expliquée", fontsize=16)
#ax[0].set_title("Part de la variance expliquée", fontsize=15)

# Right panel: cumulative share.
ax[1].bar(x, cumsum, width=.7)
ax[1].plot(x, cumsum)
ax[1].set_ylabel("Variance partagée", fontsize=16)
#ax[1].set_title("Somme cumulée de la part de la variance", fontsize=15)

# Label every bar with its height as a percentage. The value is formatted with
# one decimal; the previous str(...)[:4] slicing truncated instead of rounding
# (6.8999... became "6.89") and turned 100.0 into "100.".
for axis in ax:
    for p in axis.patches:
        text = f'{p.get_height() * 100:.1f}%'
        axis.annotate(text=text,
                      xy=(p.get_x() + p.get_width() / 2.,
                          p.get_height() + 0.01),
                      fontsize='large', ha='center', va='center')

fig.text(0.5, -0.01, "Composantes Principales", ha='center', fontsize=20)
#plt.suptitle("Analyse de la variance des composantes principales", fontsize=22)
save_fig("explained_var_ratio_and_cumulative")
plt.show()

In [None]:
plt.figure(figsize=(9, 9))
plt.boxplot(C)
plt.axhline(color='grey', linewidth=1, linestyle='--')
#plt.title("Boxplot des variables de l'ACP")
save_fig("boxplot_of_variances")
plt.show()

1. Sélection de variables :
On sélectionne les 6 premières composantes principales.
Variance expliquée par les valeurs propres : 80% de variance expliquée à partir de 6 CP
On observe un coude sur le graphe des variances expliquées à partir de la 6e CP.
Boxplots : étendue des boxplots relativement stable à partir de la 5 ou 6e CP, la médiane des boxplots devient relativement identique.

In [None]:
# Individuals projected on the first two principal components, coloured by
# popularity class (the hue column is read from spotify_data).
plt.figure(figsize=(12, 12))
sc = sns.scatterplot(x=C[:, 0], y=C[:, 1], hue='pop.class',
                     data=spotify_data, alpha=.7, legend=True)
sc.legend().set_title('Classe de popularité')
# Grey lines through the origin to mark the two axes.
plt.axvline(color="grey")
plt.axhline(color="grey")
#plt.title("Nuage de points des individus de l'ACP")
save_fig("scatterplot_of_individuals")
plt.show()

2. Nuage de points des individus:
On observe 2 groupes distincts : 1 grand et un plus petit.

In [None]:
plot_corr_circle(X_new, pca, 1, 2)
save_fig("pca_components_1_2")
plt.show()

In [None]:
plot_corr_circle(X_new, pca, 1, 3)
save_fig("pca_components_1_3")
plt.show()

3. Cercle des correlations  (dim 1 et dim 2):
Axe des abscisses : Dimension 1
Axe des ordonnées : Dimension 2

Variables représentées par les flèches.

Speechiness : entièrement expliquée par la dimension 2.
Log_duration et speechiness sont très proches de l'axe des ordonnées : variables expliquées en majorité par la dimension 2.
Instrumentalness, acousticness, loudness : essentiellement expliquées par la dimension 1.

Acousticness et loudness : flèches sur le même axe. Variables inversement corrélées. En accord avec le graphe des corrélations.

Axe 2 : "divise" les flèches en 2 ?
À droite du graphe : dans les valeurs positives, on retrouve les chansons plus calmes / acoustiques / instrumentales
A gauche du graphe : dans les valeurs négatives , on retrouve les chansons plus "loud", dançantes

# Préparation des données

In [None]:
spotify_pop_class = spotify_data[["pop.class"]]
spotify_key = spotify_data[["key"]]

In [None]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# Turn the popularity classes into integer codes, flattened to one value per
# row of spotify_data.
ordinal_encoder = OrdinalEncoder()
spotify_pop_class_encoded = (
    ordinal_encoder
    .fit_transform(spotify_pop_class)
    .reshape(spotify_data.shape[0])
    .astype(int)
)
print(spotify_pop_class_encoded[:10])

In [None]:
label_encoder = LabelEncoder()
spotify_key_encoded = label_encoder.fit_transform(spotify_key.values.ravel())
print(spotify_key_encoded[:10])

In [None]:
# Replace the categorical columns with their integer codes. spotify_data is
# modified in place, so cells above that expect the categorical dtype see the
# encoded values if they are re-run after this one.
spotify_data["key"] = spotify_key_encoded
spotify_data["pop.class"] = spotify_pop_class_encoded

In [None]:
# Every column except the two targets is used as a predictor.
target_columns = ('popularity', 'pop.class')
features = [feature for feature in spotify_data.keys()
            if feature not in target_columns]
print(features)

X = spotify_data[features]
# Targets as flat 1-D arrays: class codes for classification, raw popularity
# for regression.
y_class = spotify_data[["pop.class"]].values.ravel()
y_reg = spotify_data[["popularity"]].values.ravel()

In [None]:
X.head()

In [None]:
print(y_reg[:5])
print(y_class[:15])

# Apprentissage

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
def get_NN_model(n_inputs, n_outputs, problem=None):
    """Build a dense network with three hidden ReLU layers (100, 150, 100 units).

    Parameters
    ----------
    n_inputs : int
        Number of input features.
    n_outputs : int
        Number of units in the output layer.
    problem : {'regression', 'classification', None}, default None
        * 'regression': linear output layer, mean squared error loss, mean
          absolute error reported as metric.
        * 'classification': softmax output layer, sparse categorical
          cross-entropy loss (integer labels), accuracy reported as metric.
        * None: only the hidden layers are built and the model is returned
          uncompiled, as before.

    Returns
    -------
    keras.models.Sequential
        The model, compiled with SGD (learning rate 1e-3) unless `problem`
        is None.

    Raises
    ------
    ValueError
        If `problem` is not one of the values listed above. Such values used
        to fall through silently and return a model without an output layer.
    """
    if problem not in (None, 'regression', 'classification'):
        raise ValueError(
            "problem must be 'regression', 'classification' or None, "
            "got %r" % (problem,))

    model = keras.models.Sequential()
    model.add(keras.layers.Dense(100,
                                 input_dim=n_inputs,
                                 activation='relu'))
    model.add(keras.layers.Dense(150, activation='relu'))
    model.add(keras.layers.Dense(100, activation='relu'))

    if problem is None:
        return model

    if problem == 'regression':
        output_activation = 'linear'
        loss = 'mean_squared_error'
        # Accuracy is meaningless for a continuous target, so report MAE.
        metrics = ['mae']
    else:
        output_activation = 'softmax'
        loss = 'sparse_categorical_crossentropy'
        metrics = ['accuracy']

    model.add(keras.layers.Dense(n_outputs, activation=output_activation))
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    # that recent Keras releases reject.
    model.compile(loss=loss,
                  optimizer=keras.optimizers.SGD(learning_rate=1e-3),
                  metrics=metrics)

    return model

## Classification

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC

from sklearn.metrics import roc_auc_score, classification_report, accuracy_score

In [None]:
classes = ['A', 'B', 'C', 'D']

In [None]:
# 75 % / 25 % train-test split for the classification task; the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train_class, y_test_class = train_test_split(
    X, y_class, test_size=0.25, random_state=42
)

In [None]:
scaler = StandardScaler()

# The scaler is fitted on the training set only and then applied to the test
# set, so no test-set statistics enter the fitted transformation.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Régression logistique

#### Multinomiale

In [None]:
# NOTE(review): Popularity_DIC / labels are not used by the cells visible here,
# and the keys start at 1 whereas the encoded classes presumably start at 0 —
# confirm before using this mapping to decode predictions.
Popularity_DIC = {1: "A", 2: "B", 3: "C", 4: "D"}
labels = Popularity_DIC.values()

# 10-fold cross-validated search over C for an L1-penalised multinomial
# logistic regression.
param = [{"C": [0.5, 1, 5, 10, 12, 15, 30]}]
LM = GridSearchCV(LogisticRegression(penalty="l1",
                                     solver='saga',
                                     multi_class='multinomial',
                                     max_iter=4000,
                                     # saga shuffles the data; a fixed seed
                                     # makes the search repeatable.
                                     random_state=42),
                  param, cv=10)
LM.fit(X_train_scaled, y_train_class)

LM.best_params_["C"]

In [None]:
print("Meilleur score = %f, Meilleur paramètre = %s" %
      (LM.best_score_, LM.best_params_))

In [None]:
LM_Predict = LM.predict(X_test_scaled)
LM_Accuracy = accuracy_score(y_test_class, LM_Predict)
print("Précision :" + str(LM_Accuracy))

In [None]:
plot_cf_matrix(y_test_class, LM_Predict, classes, cmap='Blues', draw_mosaic=False)
#plt.title("Confusion Matrix of LM")
save_fig("confusion_matrix_of_LM")
plt.show()

#### OVR

In [None]:
# 10-fold cross-validated search over C for an L1-penalised one-vs-rest
# logistic regression.
param = [{"C": [0.5, 1, 5, 10, 12, 15, 30]}]
LOVR = GridSearchCV(LogisticRegression(penalty="l1",
                                       solver='liblinear',
                                       multi_class='ovr',
                                       # liblinear shuffles the data; a fixed
                                       # seed makes the search repeatable.
                                       random_state=42),
                    param, cv=10)
LOVR.fit(X_train_scaled, y_train_class)

LOVR.best_params_["C"]

In [None]:
print("Meilleur score = %f, Meilleur paramètre = %s" %
      (LOVR.best_score_, LOVR.best_params_))

In [None]:
LOVR_Predict = LOVR.predict(X_test_scaled)
LOVR_Accuracy = accuracy_score(y_test_class, LOVR_Predict)
print("Précision :" + str(LOVR_Accuracy))

In [None]:
plot_cf_matrix(y_test_class, LOVR_Predict, classes, cmap='Blues', draw_mosaic=False)
#plt.title("Confusion Matrix of LOVR")
save_fig("confusion_matrix_of_LOVR")
plt.show()

### Random Forest

In [None]:
#param=[{"n_estimators":np.arange(100,500,100),"max_features":list(range(2,10,1))}]
# 5-fold cross-validated search over the number of features tried per split.
param = [{"max_features": list(range(2, 10))}]
# random_state fixes the bootstrap samples and feature draws, so the selected
# max_features and the scores are repeatable (the decision tree and LinearSVC
# cells already use the same seed).
RFC_Model = GridSearchCV(RandomForestClassifier(n_estimators=500,
                                                n_jobs=-1,
                                                random_state=42),
                         param, cv=5, verbose=3)
RFC_Model.fit(X_train, y_train_class)

In [None]:
print("Meilleur score = %f, Meilleur paramètre = %s" %
      (RFC_Model.best_score_, RFC_Model.best_params_))

In [None]:
RFC_Predict = RFC_Model.predict(X_test)
RFC_Accuracy = accuracy_score(y_test_class, RFC_Predict)
print("Précision : " + str(RFC_Accuracy))

In [None]:
plot_cf_matrix(y_test_class, RFC_Predict, classes, cmap='Blues', draw_mosaic=False)
#plt.title("Confusion Matrix of RFC")
save_fig("confusion_matrix_of_RFC")
plt.show()

In [None]:
# feature_df_rf = pd.DataFrame({
#     'Importance': RFC_Model.feature_importances_,
#     'Features': features
# })

# feature_df_rf.sort_values(by='Importance', ascending=False)

In [None]:
print(classification_report(y_test_class, RFC_Predict))

### Decision Trees

In [None]:
# 'auto' is no longer listed: for classification trees it was an alias of
# 'sqrt' (so it only duplicated that grid point), it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it makes the fit raise.
param = [{
    'min_samples_split': range(2, 203, 10),
    'max_features': [None, 'sqrt', 'log2']
}]

DT_Model = GridSearchCV(DecisionTreeClassifier(random_state=42),
                        param, cv=5, verbose=3)

DT_Model.fit(X_train, y_train_class)

In [None]:
print("Meilleur score = %f, Meilleurs paramètres = %s" %
      (DT_Model.best_score_, DT_Model.best_params_))

In [None]:
DT_Predict = DT_Model.predict(X_test)
DT_Accuracy = accuracy_score(y_test_class, DT_Predict)
print("Précision : " + str(DT_Accuracy))

In [None]:
plot_cf_matrix(y_test_class, DT_Predict, classes, cmap='Blues', draw_mosaic=False)
#plt.title("Confusion Matrix of Decision Trees")
save_fig("confusion_matrix_of_DT")
plt.show()

In [None]:
# feature_df_dt = pd.DataFrame({
#     'Importance': DT_Model.feature_importances_,
#     'Features': features
# })

# feature_df_dt.sort_values(by='Importance', ascending=False)

### LinearSVC

#### OVR

In [None]:
LSVC_Model = LinearSVC(multi_class='ovr', max_iter=4000, random_state=42)
LSVC_Model.fit(X_train_scaled, y_train_class)
LSVC_Prediction = LSVC_Model.predict(X_test_scaled)

In [None]:
plot_cf_matrix(y_test_class, LSVC_Prediction, classes, cmap='Blues', draw_mosaic=False)
#plt.title("Confusion Matrix of LSVC OVR")
save_fig("confusion_matrix_of_LSVC_OVR")
plt.show()

In [None]:
print(classification_report(y_test_class, LSVC_Prediction))

#### Crammer-Singer

In [None]:
LSVC_Model = LinearSVC(multi_class='crammer_singer',
                       max_iter=4000,
                       random_state=42)
LSVC_Model.fit(X_train_scaled, y_train_class)
LSVC_Prediction = LSVC_Model.predict(X_test_scaled)

In [None]:
plot_cf_matrix(y_test_class, LSVC_Prediction, classes, cmap='Blues', draw_mosaic=False)
#plt.title("Confusion Matrix of LSVC C-S")
save_fig("confusion_matrix_of_LSVC_CS")
plt.show()

In [None]:
print(classification_report(y_test_class, LSVC_Prediction, zero_division=0))

### SVC

In [None]:
param = [{
    "C": [0.5, 1., 2., 5., 10., 15., 30.]
}]
SVC_Model = GridSearchCV(SVC(), param, cv=5, verbose=3)
SVC_Model.fit(X_train_scaled, y_train_class)

SVC_Model.best_params_["C"]

In [None]:
print("Meilleur score = %f, Meilleur paramètre = %s" %
      (SVC_Model.best_score_, SVC_Model.best_params_))

In [None]:
SVC_Predict = SVC_Model.predict(X_test_scaled)
SVC_Accuracy = accuracy_score(y_test_class, SVC_Predict)
print("Précision :" + str(SVC_Accuracy))

In [None]:
# Confusion matrix of the tuned SVC on the test set.
plot_cf_matrix(y_test_class, SVC_Predict, classes, cmap='Blues', draw_mosaic=False)
#plt.title("Confusion Matrix of SVC")
save_fig("confusion_matrix_of_SVC")
plt.show()

In [None]:
print(classification_report(y_test_class, SVC_Predict))

### Réseaux de neurones

In [None]:
# n_hidden = 10
# n_features = 10
# n_classes = 4
# keras_model = Sequential()
# keras_model.add(Dense(n_hidden, input_dim=n_features, activation='sigmoid'))
# keras_model.add(Dense(n_classes, activation='softmax'))

# keras_model.compile(optimizer=SGD(lr=3),
#                     loss='categorical_crossentropy',
#                     metrics=['accuracy'])
# history = keras_model.fit(X_train, y_train, validation_data=(X_valid, y_valid))

In [None]:
# Seed TensorFlow before the layers are created so that the weight
# initialisation is repeatable from one run to the next.
tf.random.set_seed(42)

# One input per feature column, one output unit per popularity class.
n_inputs, n_outputs = X_train.shape[1], 4
model = get_NN_model(n_inputs, n_outputs, 'classification')
model.summary()

In [None]:
MM_scaler = MinMaxScaler()

# NOTE: this overwrites the X_train_scaled / X_test_scaled produced earlier by
# StandardScaler. Re-running one of the earlier model cells after this one
# would feed it min-max-scaled data instead.
X_train_scaled = MM_scaler.fit_transform(X_train)
X_test_scaled = MM_scaler.transform(X_test)

In [None]:
# The test split is passed as validation data: it only feeds the val_* curves
# stored in `history`, and nothing in this notebook uses them to choose the
# number of epochs.
history = model.fit(X_train_scaled, y_train_class, epochs=200, batch_size=30,
                    validation_data=(X_test_scaled, y_test_class), verbose=0)

In [None]:
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.grid(True)
plt.ylim(0, 1)
save_fig("keras_learning_curves_plot_class")
plt.show()

In [None]:
# The softmax layer outputs one score per class; argmax over the last axis
# keeps the index of the highest one as the predicted class.
NN_Predict = np.argmax(model.predict(X_test_scaled), axis=-1)

plot_cf_matrix(y_test_class, NN_Predict, classes, cmap='Blues', draw_mosaic=False)
#plt.title("Confusion Matrix of NN")
save_fig("confusion_matrix_of_NN")
plt.show()

In [None]:
# Per-class precision / recall / F1 of the neural network on the test set.
# zero_division=0 mirrors the Linear SVC report earlier in the notebook: a class that is
# never predicted gets a score of 0 without emitting an UndefinedMetricWarning.
print(classification_report(y_test_class, NN_Predict, zero_division=0))

## Régression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR, LinearSVR

from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score

# Metric functions handed to plot_results for every regression model in this section.
reg_metrics = (
    mean_squared_error,
    r2_score,
    explained_variance_score,
)

In [None]:
# Seeded 75 % / 25 % split of the features and the regression target.
# NOTE(review): the regression cells below also pass y_test_class to plot_results; this
# presumes the classification split used the same X, test_size and random_state - confirm.
X_train, X_test, y_train_reg, y_test_reg = train_test_split(X, y_reg, random_state=42, test_size=0.25)

In [None]:
# Standardise the features: statistics are learned on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Régression linéaire

In [None]:
# Linear regression baseline, trained on the unscaled features.
LR_Model = LinearRegression().fit(X_train, y_train_reg)
LR_Predict = LR_Model.predict(X_test)

# plot_results / save_fig are project helpers; the figure is saved before being shown.
plot_results(reg_metrics, y_test_reg, y_test_class, LR_Predict)
#plt.title("Results of Linear Regression Predictions")
save_fig("results_of_LR_predictions")
plt.show()

### Random Forest

In [None]:
# Random forest with default hyper-parameters, trained on the unscaled features.
# The forest is seeded (same seed as the train/test split) so that the metrics and the
# saved figure are identical from one run of the notebook to the next.
RF_Model = RandomForestRegressor(random_state=42)
RF_Model.fit(X_train, y_train_reg)
RF_Predict = RF_Model.predict(X_test)

# plot_results / save_fig are project helpers; the figure is saved before being shown.
plot_results(reg_metrics, y_test_reg, y_test_class, RF_Predict)
#plt.title("Results of Random Forest Predictions")
save_fig("results_of_RF_predictions")
plt.show()

Optimisation par validation croisée des valeurs de *max_features* et *min_samples_split*.

In [None]:
# Grid of random-forest hyper-parameters explored by the 5-fold cross-validated search.
param = [{
    "max_features": list(range(2, 10)),
    "min_samples_split": list(range(2, 14))
}]

# Seeded estimator: without it every candidate is fitted with a different random forest,
# so the selected parameters could change between runs.
RF_Model_Optim = GridSearchCV(RandomForestRegressor(random_state=42), param, cv=5, n_jobs=-1)
# fit() returns the search object itself, so RF_Optim and RF_Model_Optim are the same object.
RF_Optim = RF_Model_Optim.fit(X_train, y_train_reg)
RF_Predict_Optim = RF_Optim.predict(X_test)
# Optimal parameters. No `scoring` is given, so best_score_ is the estimator's default
# score (R² for scikit-learn regressors), averaged over the folds. It is printed as is:
# the previous `1. - best_score_` was not the "best score" the label announces.
print("Meilleur score = %f, Meilleur paramètre = %s" %
      (RF_Optim.best_score_, RF_Optim.best_params_))

In [None]:
# Plot the tuned random-forest predictions with the project helper plot_results.
plot_results(reg_metrics, y_test_reg, y_test_class, RF_Predict_Optim)
#plt.title("Results of Random Forest Predictions with Optimal Parameters")
# save_fig is a project helper (presumably writes the current figure to disk - confirm),
# so it is called before plt.show().
save_fig("results_of_RF_predictions_optim")
plt.show()

In [None]:
# NOTE(review): disabled code - nothing in this cell is executed. It would tabulate the
# random-forest feature importances; `features` is not defined in the visible cells.
# feature_df_rf = pd.DataFrame({
#     'Importance': RF_Model.feature_importances_,
#     'Features': features
# })

# feature_df_rf.sort_values(by='Importance', ascending=False)

### Decision Trees

In [None]:
# Decision tree with default hyper-parameters, trained on the unscaled features.
# The tree is seeded because scikit-learn permutes the features randomly at each split,
# so equally good splits can otherwise be resolved differently between runs.
DT_Model = DecisionTreeRegressor(random_state=42)
DT_Model.fit(X_train, y_train_reg)
DT_Predict = DT_Model.predict(X_test)

# plot_results / save_fig are project helpers; the figure is saved before being shown.
plot_results(reg_metrics, y_test_reg, y_test_class, DT_Predict)
#plt.title("Results of Decision Tree Predictions")
save_fig("results_of_DT_predictions")
plt.show()

Optimisation par validation croisée des valeurs de *max_depth* et *min_samples_split*.

In [None]:
# Grid of decision-tree hyper-parameters explored by the 10-fold cross-validated search.
param = [{
    "max_depth": list(range(2, 10)),
    "min_samples_split": list(range(2, 10))
}]

# Seeded estimator so that the search selects the same parameters on every run.
DT_Model_Optim = GridSearchCV(DecisionTreeRegressor(random_state=42), param, cv=10, n_jobs=-1)
# fit() returns the search object itself, so DT_Optim and DT_Model_Optim are the same object.
DT_Optim = DT_Model_Optim.fit(X_train, y_train_reg)
DT_Predict_Optim = DT_Optim.predict(X_test)
# Optimal parameters. No `scoring` is given, so best_score_ is the estimator's default
# score (R² for scikit-learn regressors), averaged over the folds. It is printed as is:
# the previous `1. - best_score_` was not the "best score" the label announces.
print("Meilleur score = %f, Meilleur paramètre = %s" %
      (DT_Optim.best_score_, DT_Optim.best_params_))

In [None]:
# Plot the tuned decision-tree predictions with the project helper plot_results.
plot_results(reg_metrics, y_test_reg, y_test_class, DT_Predict_Optim)
#plt.title("Results of Decision Tree Predictions with Optimal Parameters")
# save_fig is a project helper (presumably writes the current figure to disk - confirm),
# so it is called before plt.show().
save_fig("results_of_DT_predictions_optim")
plt.show()

In [None]:
# NOTE(review): disabled code - nothing in this cell is executed. It would tabulate the
# decision-tree feature importances; `features` is not defined in the visible cells.
# feature_df_dt = pd.DataFrame({
#     'Importance': DT_Model.feature_importances_,
#     'Features': features
# })

# feature_df_dt.sort_values(by='Importance', ascending=False)

### Linear SVR

In [None]:
# Linear support-vector regression with default hyper-parameters, trained on the
# standardised features. LinearSVR shuffles the data with a pseudo-random generator,
# so it is seeded to make the fitted model reproducible.
LSVR_Model = LinearSVR(random_state=42)
LSVR_Model.fit(X_train_scaled, y_train_reg)
LSVR_Predict = LSVR_Model.predict(X_test_scaled)

# plot_results / save_fig are project helpers; the figure is saved before being shown.
plot_results(reg_metrics, y_test_reg, y_test_class, LSVR_Predict)
#plt.title("Results of Linear SVR Predictions")
save_fig("results_of_LSVR_predictions")
plt.show()

Optimisation de la pénalisation (paramètre $C$) par validation croisée.

In [None]:
# Candidate values of the penalty parameter C explored by the 10-fold cross-validated search.
param = [{
    "C": [0.4, 0.5, 0.6, 0.8, 1., 1.4]
}]

# Seeded estimator so that the search selects the same C on every run.
LSVR_Model_Optim = GridSearchCV(LinearSVR(random_state=42), param, cv=10)
# fit() returns the search object itself, so LSVR_Optim and LSVR_Model_Optim are the same object.
LSVR_Optim = LSVR_Model_Optim.fit(X_train_scaled, y_train_reg)
LSVR_Predict_Optim = LSVR_Optim.predict(X_test_scaled)
# Optimal parameter. No `scoring` is given, so best_score_ is the estimator's default
# score (R² for scikit-learn regressors), averaged over the folds. It is printed as is:
# the previous `1. - best_score_` was not the "best score" the label announces.
print("Meilleur score = %f, Meilleur paramètre = %s" %
      (LSVR_Optim.best_score_, LSVR_Optim.best_params_))

In [None]:
# Plot the tuned linear-SVR predictions with the project helper plot_results.
plot_results(reg_metrics, y_test_reg, y_test_class, LSVR_Predict_Optim)
#plt.title("Results of Linear SVR Predictions with Optimal Parameter")
# save_fig is a project helper (presumably writes the current figure to disk - confirm),
# so it is called before plt.show().
save_fig("results_of_LSVR_predictions_optim")
plt.show()

In [None]:
# NOTE(review): disabled code - nothing in this cell is executed. It would tabulate the
# linear-SVR coefficients; `features` is not defined in the visible cells.
# feature_df_lsvr = pd.DataFrame({
#     'Coefficients': LSVR_Model.coef_,
#     'Features': features
# })

# feature_df_lsvr.sort_values(by='Coefficients', ascending=False)

### SVR

In [None]:
# Support-vector regression with default hyper-parameters, trained on the standardised features.
SVR_Model = SVR().fit(X_train_scaled, y_train_reg)
SVR_Predict = SVR_Model.predict(X_test_scaled)

# plot_results / save_fig are project helpers; the figure is saved before being shown.
plot_results(reg_metrics, y_test_reg, y_test_class, SVR_Predict)
#plt.title("Results of SVR Predictions")
save_fig("results_of_SVR_predictions")
plt.show()

Optimisation de la pénalisation (paramètre $C$) par validation croisée.

In [None]:
# Candidate values of the penalty parameter C explored by the 10-fold cross-validated search.
param = [{
    "C": [0.4, 0.5, 0.6, 0.8, 1, 1.4]
}]

SVR_Model_Optim = GridSearchCV(SVR(), param, cv=10)
# fit() returns the search object itself, so SVR_Optim and SVR_Model_Optim are the same object.
SVR_Optim = SVR_Model_Optim.fit(X_train_scaled, y_train_reg)
SVR_Predict_Optim = SVR_Optim.predict(X_test_scaled)
# Optimal parameter. No `scoring` is given, so best_score_ is the estimator's default
# score (R² for scikit-learn regressors), averaged over the folds. It is printed as is:
# the previous `1. - best_score_` was not the "best score" the label announces.
print("Meilleur score = %f, Meilleur paramètre = %s" %
      (SVR_Optim.best_score_, SVR_Optim.best_params_))

In [None]:
# Plot the tuned SVR predictions with the project helper plot_results.
plot_results(reg_metrics, y_test_reg, y_test_class, SVR_Predict_Optim)
#plt.title("Results of SVR Predictions with Optimal Parameter")
# save_fig is a project helper (presumably writes the current figure to disk - confirm),
# so it is called before plt.show().
save_fig("results_of_SVR_predictions_optim")
plt.show()

### Réseaux de neurones

In [None]:
# Network dimensions: one input per feature column of X_train, a single output unit.
n_inputs = X_train.shape[1]
n_outputs = 1
# get_NN_model is a project helper; the third argument selects the 'regression' variant.
# NOTE: this rebinds `model`, replacing the classification network built earlier.
model = get_NN_model(n_inputs, n_outputs, 'regression')
model.summary()

In [None]:
# Fit the min-max scaler on the training features only, then apply it to both splits.
# NOTE: this rebinds X_train_scaled / X_test_scaled, which held the standardised
# features used by the SVR cells.
MM_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = MM_scaler.transform(X_train)
X_test_scaled = MM_scaler.transform(X_test)

In [None]:
# Train the regression network silently (verbose=0) for 200 epochs.
# NOTE(review): the test split is passed as validation data, so the validation curves
# are computed on the same samples later used for the final evaluation.
history = model.fit(
    X_train_scaled,
    y_train_reg,
    validation_data=(X_test_scaled, y_test_reg),
    batch_size=30,
    epochs=200,
    verbose=0,
)

In [None]:
# One curve per quantity recorded in history.history, drawn with a grid.
# The y-axis is left unclamped here (the limit below stays disabled).
pd.DataFrame(history.history).plot(figsize=(8, 5), grid=True)
#plt.ylim(0, 1)
save_fig("keras_learning_curves_plot_reg")
plt.show()

In [None]:
# The network has a single output unit, so predict() returns a 2-D array with one column.
# It is flattened to a 1-D vector so that plot_results receives predictions of the same
# shape as those of the scikit-learn regressors above (avoids accidental broadcasting
# against the 1-D target).
NN_Predict = model.predict(X_test_scaled).ravel()

# plot_results / save_fig are project helpers; the figure is saved before being shown.
plot_results(reg_metrics, y_test_reg, y_test_class, NN_Predict)
#plt.title("Results of NN Predictions")
save_fig("results_of_NN_predictions")
plt.show()