In [None]:
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from collections.abc import Iterable
import ast
from helper import plot_missing_values, replace_titles_with_ids, replace_ids_with_titles, fill_top_genre, pca_group, make_pcs
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import math
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold

**Exploring the datasets**

In [None]:
# Load the four raw tables and report their dimensions.
# File locations, all relative to the local 'data' directory.
genres_path = os.path.join('data', 'genres.csv')
echonest_path = os.path.join('data', 'echonest_features.tsv')
tracks_path = os.path.join('data', 'tracks.tsv')
spectral_path = os.path.join('data', 'spectral_features.tsv')

# genres is read with the default (comma) separator; the rest are tab-separated.
genres = pd.read_csv(genres_path)
echonest = pd.read_csv(echonest_path, sep='\t')
tracks = pd.read_csv(tracks_path, sep='\t')
spectral = pd.read_csv(spectral_path, sep='\t')

# For every table the feature count is the column count minus one
# (the original notes this as excluding the label column).
genres_n_lines = len(genres)
genres_n_features = len(genres.columns) - 1
print(f"Genres dataset has {genres_n_lines} lines and {genres_n_features} features.")

echonest_n_lines = len(echonest)
echonest_n_features = len(echonest.columns) - 1
print(f"Echonest dataset has {echonest_n_lines} lines and {echonest_n_features} features.")

tracks_n_lines = len(tracks)
tracks_n_features = len(tracks.columns) - 1
print(f"Tracks dataset has {tracks_n_lines} lines and {tracks_n_features} features.")

spectral_n_lines = len(spectral)
spectral_n_features = len(spectral.columns) - 1
print(f"Spectral dataset has {spectral_n_lines} lines and {spectral_n_features} features.")


# Count missing values per column and print only the columns that have any.
na_counts = genres.isnull().sum()
print("NA values in each column of genres dataset:")
print(na_counts.loc[na_counts.gt(0)])

na_counts_echonest = echonest.isnull().sum()
print("NA values in each column of echonest dataset:")
print(na_counts_echonest.loc[na_counts_echonest.gt(0)])

na_counts_tracks = tracks.isnull().sum()
print("NA values in each column of tracks dataset:")
print(na_counts_tracks.loc[na_counts_tracks.gt(0)])

na_counts_spectral = spectral.isnull().sum()
print("NA values in each column of spectral dataset:")
print(na_counts_spectral.loc[na_counts_spectral.gt(0)])

# One panel per dataset on a 2x2 grid: horizontal bars showing the percentage
# of missing values for every column that has at least one missing value.
# NOTE(review): `datasets` rebinds the name imported by
# `from sklearn import datasets` at the top of the notebook.
datasets = {'Genres': genres, 'Echonest': echonest, 'Tracks': tracks, 'Spectral': spectral}
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, (name, df) in zip(axes.flatten(), datasets.items()):
    # Share of missing values per column, expressed on a 0-100 scale.
    missing_pct = df.isna().mean().mul(100)
    missing_pct = missing_pct[missing_pct > 0].sort_values(ascending=False)
    if not missing_pct.empty:
        sns.barplot(x=missing_pct.values, y=missing_pct.index, palette='viridis', ax=ax)
        ax.set_xlabel('Percent missing (%)')
        ax.set_title(f'{name}: Missing values by column')
        # Write the percentage just to the right of each bar.
        for row, pct in enumerate(missing_pct.values):
            ax.text(pct + 0.5, row, f'{pct:.1f}%', va='center')
        continue
    # Nothing missing: show a placeholder message instead of an empty chart.
    ax.text(0.5, 0.5, 'No missing values', ha='center', va='center', fontsize=12)
    ax.set_title(f'{name}: 0 missing columns')
    ax.set_xlabel('')
    ax.set_yticks([])
fig.suptitle('Percent Missing Values per Column', fontsize=16)
# Leave room at the top of the figure for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()


In [None]:
# Replace the genre titles stored in tracks['genre_top'] with the matching
# ids from the genres table.
id_columns = ['genre_id', 'genre_parent_id', 'genre_title']
genres_id = genres[id_columns]

# NOTE(review): the return value is discarded, so this presumably updates
# `tracks` in place -- confirm against helper.replace_titles_with_ids.
replace_titles_with_ids(tracks, 'genre_top', genres_id)

# Store the ids with the nullable integer dtype so missing genres stay missing.
genre_top_ids = tracks['genre_top'].astype('Int64')
tracks['genre_top'] = genre_top_ids

# Persist the id-based version of the tracks table.
tracks.to_csv('data/tracks_id.tsv', sep='\t', index=False)


In [None]:
# Next we build a version of the tracks table whose genres are titles rather
# than ids. The 'genres' and 'genres_all' cells hold Python literals stored as
# text, so they are first parsed into real Python objects.


def _parse_genre_list(value):
    """Parse a text cell with ast.literal_eval; return anything else unchanged.

    Leaving non-string values untouched keeps this cell safe to re-run (the
    values are already parsed the second time) and lets missing values pass
    through instead of making ast.literal_eval raise.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


for list_column in ('genres', 'genres_all'):
    tracks[list_column] = tracks[list_column].apply(_parse_genre_list)

# Sanity check: show which Python types the parsed columns now contain.
print(tracks['genres'].apply(type).unique())
print(tracks['genres_all'].apply(type).unique())

# Work on a copy so `tracks` keeps its id-based genre columns, then convert
# each genre column from ids to titles and save the result.
tracks_titles = tracks.copy()
for genre_column in ('genre_top', 'genres', 'genres_all'):
    tracks_titles = replace_ids_with_titles(tracks_titles, genre_column, genres_id)

tracks_titles.to_csv('data/tracks_titles.tsv', sep='\t', index=False)


In [None]:
# Map every genre id to the root of its branch in the genre tree by following
# genre_parent_id links upward until a root is reached.
parent_lookup = dict(zip(genres['genre_id'], genres['genre_parent_id']))


def _find_root_genre(genre_id, parents):
    """Return the top-most known ancestor of `genre_id`.

    `parents` maps genre id -> parent id. The climb stops when the parent is
    missing (None/NaN), when the parent id is not itself a known genre (for
    example a sentinel value), or when the parent was already visited. The
    last rule covers self-parents and longer cycles, which would otherwise
    loop forever.
    """
    current = genre_id
    visited = {current}
    while True:
        parent = parents.get(current)
        # No parent recorded: `current` is a root.
        if parent is None or pd.isna(parent):
            break
        # Parent id is not a genre of the table: keep the last real genre.
        if parent not in parents:
            break
        # Self-loop or cycle: stop instead of climbing endlessly.
        if parent in visited:
            break
        visited.add(parent)
        current = parent
    return current


mapping_genre_parent = {
    gid: _find_root_genre(gid, parent_lookup) for gid in genres['genre_id']
}

# Number of distinct root genres found.
print(len(set(mapping_genre_parent.values())))



In [None]:
# Fill genre_top using the genre -> root mapping, then report how many
# tracks still have no top genre.
tracks_filled = fill_top_genre(tracks, mapping_genre_parent)
remaining_missing_top = tracks_filled['genre_top'].isnull().sum()
print(remaining_missing_top)

In [None]:
# Persist the tracks table with filled top genres (tab-separated, no index column).
tracks_filled.to_csv(path_or_buf='data/tracks_filled.tsv', sep='\t', index=False)

In [None]:
# Re-plot missing values, this time with the filled tracks table.
datasets = {
    'Genres': genres,
    'Echonest': echonest,
    'Tracks Filled': tracks_filled,
    'Spectral': spectral,
}
plot_missing_values(datasets)

In [None]:
# Merge the datasets on 'track_id'.

# Rows without an album title or a track title are discarded before merging.
required_columns = ['album_title', 'title']
tracks_filled = tracks_filled.dropna(subset=required_columns, how='any')

# 1st merge: attach spectral features to every remaining track (left join,
# so tracks without spectral features are kept with missing values).
tracks_spectral = tracks_filled.merge(spectral, on='track_id', how='left')

# 2nd merge: keep only the tracks that also have Echonest features (inner join).
tracks_echonest = echonest.merge(tracks_spectral, on='track_id', how='inner')

In [None]:
# Impute missing values in artist_latitude and artist_longitude.


def _flag_and_fill_artist_location(frame):
    """Flag rows with an unknown artist location, then zero-fill the coordinates.

    Adds an 'artist_location_unknown' column (1 when latitude or longitude is
    missing, else 0) and replaces missing coordinates with 0. The flag looks
    at both coordinates because both are zero-filled: a row that lacks only
    its longitude would otherwise look like a real location.
    The frame is modified in place.
    """
    coordinate_columns = ['artist_latitude', 'artist_longitude']
    location_missing = frame[coordinate_columns].isna().any(axis=1)
    frame['artist_location_unknown'] = location_missing.astype(int)
    for column in coordinate_columns:
        frame[column] = frame[column].fillna(0)


_flag_and_fill_artist_location(tracks_echonest)
tracks_echonest.to_csv('data/tracks_echonest.tsv', sep='\t', index=False)

_flag_and_fill_artist_location(tracks_spectral)
tracks_spectral.to_csv('data/tracks_spectral.tsv', sep='\t', index=False)


In [None]:
# Plot the missing values of the merged tables next to the filled tracks table.
datasets = {
    'Tracks_Echonest': tracks_echonest,
    'Tracks_Spectral': tracks_spectral,
    'Tracks_Filled': tracks_filled,
}

plot_missing_values(datasets)

In [None]:
# Drop every row of tracks_spectral that has a missing value in any column,
# then overwrite the saved tracks_spectral file with the cleaned table.
tracks_spectral = tracks_spectral.dropna(axis=0, how='any')
tracks_spectral.to_csv('data/tracks_spectral.tsv', sep='\t', index=False)

# Check the result: per-column count of missing values after cleaning.
remaining_missing = tracks_spectral.isnull().sum()
print(remaining_missing)

In [None]:
# Build the "_titles" variants of the datasets, where genres are titles and
# not numbers. Datasets without "_titles" in their name keep ids in
# genres, genres_all and genre_top.
genre_columns = ('genre_top', 'genres', 'genres_all')


def _titled_copy(frame, output_path):
    """Return a copy of `frame` with genre ids replaced by titles, saved to `output_path`."""
    titled = frame.copy()
    for genre_column in genre_columns:
        titled = replace_ids_with_titles(titled, genre_column, genres_id)
    titled.to_csv(output_path, sep='\t', index=False)
    return titled


tracks_filled_titles = _titled_copy(tracks_filled, 'data/tracks_filled_titles.tsv')
tracks_echonest_titles = _titled_copy(tracks_echonest, 'data/tracks_echonest_titles.tsv')
tracks_spectral_titles = _titled_copy(tracks_spectral, 'data/tracks_spectral_titles.tsv')


In [None]:
# Summary statistics of the Echonest numeric features, computed on the rows
# of the merged tracks_echonest table.
# The previous version first derived numeric_cols from tracks_echonest and
# then overwrote it before use; only the effective computation is kept.
print("Statistiques")

# Numeric Echonest columns, excluding identifier columns that are not features.
numeric_cols = echonest.select_dtypes(include=['number']).columns
numeric_cols = [c for c in numeric_cols if c not in ['track_id', 'id']]

# NOTE(review): this assumes the Echonest column names survive the merge
# unchanged (no suffix added for names shared with the tracks table) -- confirm.
print(tracks_echonest[numeric_cols].describe())



In [None]:
# Histogram of every numeric Echonest column (track_id excluded), three per row.
numeric_cols = echonest.select_dtypes(include=['number']).columns
numeric_cols = [c for c in numeric_cols if c not in ['track_id']]

n = len(numeric_cols)
# At least one row, so plt.subplots does not fail when there is nothing to plot.
rows = max(1, math.ceil(n / 3))

# squeeze=False always returns a 2-D array of axes, whatever the grid size.
fig, axes = plt.subplots(rows, 3, figsize=(15, 5 * rows), squeeze=False)
axes = axes.flatten()

for i, col in enumerate(numeric_cols):
    echonest[col].hist(bins=40, ax=axes[i], alpha=0.7)
    axes[i].set_title(col)

# Hide the leftover grid cells when the column count is not a multiple of 3.
for unused_ax in axes[n:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Summary statistics and histograms of the numeric spectral columns
# (identifier columns excluded), three histograms per row.
numeric_cols = spectral.select_dtypes(include=['number']).columns
numeric_cols = [c for c in numeric_cols if c not in ['track_id', 'id']]

print("Statistiques (Spectral)")
print(spectral[numeric_cols].describe())

n = len(numeric_cols)
# At least one row, so plt.subplots does not fail when there is nothing to plot.
rows = max(1, math.ceil(n / 3))

# The figure height grows with the number of rows (same sizing as the Echonest
# histograms) so the panels stay readable when there are many features.
# squeeze=False always returns a 2-D array of axes, whatever the grid size.
fig, axes = plt.subplots(rows, 3, figsize=(15, 5 * rows), squeeze=False)
axes = axes.flatten()

for i, col in enumerate(numeric_cols):
    spectral[col].hist(bins=40, ax=axes[i], alpha=0.7)
    axes[i].set_title(col)

# Hide the leftover grid cells when the column count is not a multiple of 3.
for unused_ax in axes[n:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Rank the numeric columns of tracks_spectral by variance to spot
# near-constant features.
# NOTE(review): the variances are computed on unscaled columns, so they depend
# on each feature's unit; identifier columns such as track_id are not excluded.
X = tracks_spectral.select_dtypes(include=[float, int])

# Variance of every numeric column.
variances = X.var()

# Sorted by increasing variance (lowest first).
variance_df = pd.DataFrame({
    'feature': variances.index,
    'variance': variances.values
}).sort_values('variance', ascending=True)

print("=== 15 FEATURES AUX PLUS FAIBLES VARIANCES ===")
print(variance_df.head(15))

# Bar chart of the (up to) 20 lowest variances. The bar positions follow the
# number of rows actually available, so the plot also works when the table
# has fewer than 20 numeric columns.
lowest = variance_df.head(20)
positions = range(len(lowest))
plt.figure(figsize=(12, 8))
plt.barh(positions, lowest['variance'], color='red', alpha=0.7)
plt.yticks(positions, lowest['feature'])
plt.xlabel('Variance')
plt.title(f'{len(lowest)} Features aux plus FAIBLES variances (à éliminer)')
plt.gca().invert_yaxis()  # lowest variance at the top
plt.tight_layout()
plt.show()

# Candidate features to remove: variance below the threshold (example value: 1).
low_var_threshold = 1
low_var_features = variance_df[variance_df['variance'] < low_var_threshold]['feature'].tolist()
print(f"\nFeatures à éliminer (variance < {low_var_threshold}): {len(low_var_features)}")
print(low_var_features)


In [None]:
# Correlation matrix of the numeric columns of tracks_spectral (identifier
# columns excluded).
numeric_cols = tracks_spectral.select_dtypes(include=['number']).columns
numeric_cols = [c for c in numeric_cols if c not in ['track_id', 'id']]

# The correlations are computed on tracks_spectral itself: the columns are
# selected from it and the feature reduction that follows is applied to it.
# The previous version indexed tracks_echonest with these columns, i.e. a
# different table restricted to the tracks that have Echonest features.
corr_matrix = tracks_spectral[numeric_cols].corr()
print(corr_matrix)

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Matrice de corrélation - Features audio')
plt.tight_layout()
plt.show()

In [None]:
# Groups whose members have a pairwise correlation > 0.98 (median vs. mean).
group1 = ['spectral_bandwidth_median_01', 'spectral_bandwidth_mean_01']
group2 = ['spectral_centroid_median_01', 'spectral_centroid_mean_01']
group3 = ['spectral_rolloff_median_01', 'spectral_rolloff_mean_01']

# Groups with correlation > 0.85.
group4 = ['spectral_rolloff_mean_01', 'spectral_bandwidth_mean_01', 'spectral_centroid_mean_01']
group5 = ['interest', 'listens', 'favorites']

# Group with correlation > 0.65.
group6 = [
    'spectral_bandwidth_skew_01',
    'spectral_centroid_kurtosis_01',
    'spectral_bandwidth_kurtosis_01',
    'spectral_centroid_skew_01',
    'spectral_rolloff_skew_01',
    'spectral_rolloff_kurtosis_01',
]

# Remove the redundant variables: the median (first entry) of each
# median/mean pair is dropped and the mean is kept.
to_drop = [pair[0] for pair in (group1, group2, group3)]
tracks_spectral_reduced = tracks_spectral.drop(columns=to_drop)

In [None]:
# Run the pca_group helper on each correlated group of the reduced table.
# Per the original note, this applies a PCA to each group and displays the
# variance; the third argument is a label identifying the group.
# NOTE(review): behaviour is defined in helper.pca_group -- confirm what is displayed.
pca_group(tracks_spectral_reduced, group4, "4 (moyennes spectrales)")
pca_group(tracks_spectral_reduced, group5, "5 (popularité)")
pca_group(tracks_spectral_reduced, group6, "6 (skew/kurtosis)")

In [None]:

# Replace each correlated group by principal components:
# group4 -> 1 PC, group5 -> 1 PC, group6 -> 3 PC.
# NOTE(review): the last two make_pcs arguments are presumably the number of
# components and a prefix for the new columns -- confirm in helper.make_pcs.
# NOTE(review): this cell rebinds tracks_spectral_reduced and drops the group
# columns, so it is not meant to be run twice in a row.
pc_plan = [
    (group4, 1, 'g4'),
    (group5, 1, 'g5'),
    (group6, 3, 'g6'),
]
for group_columns, n_components, prefix in pc_plan:
    tracks_spectral_reduced = make_pcs(tracks_spectral_reduced, group_columns, n_components, prefix)

# Then remove the original columns of these groups.
replaced_columns = group4 + group5 + group6
tracks_spectral_reduced = tracks_spectral_reduced.drop(columns=replaced_columns)


In [None]:
# Save the reduced table with the pandas defaults: comma-separated and with
# the index written as the first column.
# NOTE(review): the other exports in this notebook are tab-separated and use
# index=False -- confirm that the difference is intended.
tracks_spectral_reduced.to_csv(path_or_buf='data/tracks_spectral_reduced.csv')

In [None]:
# Display the column names of the reduced table (shown as the cell output).
tracks_spectral_reduced.columns