Import modules

In [1]:
import random

from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter

from scipy.spatial.distance import cdist

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score


Load df

In [2]:
df = pd.read_excel('df_classique.xlsx')

# Group by playlist name
groups = df.groupby('playlist_name')


In [3]:
def compute_gap(data, k):
  """
  Compute the gap statistic for a given value of k.

  Parameters:
  - data: the data to cluster, with shape (n_samples, n_features)
  - k: the number of clusters

  Returns:
  - gap: the gap statistic for the given value of k
  """
  # Compute the WCSS for the real data
  kmeans = KMeans(n_clusters=k, random_state=42).fit(data)
  wcss = kmeans.inertia_

  # Compute the null reference distribution by shuffling the data and
  # re-assigning it to clusters
  n_samples, n_features = data.shape
  wcss_null = []

  for _ in range(20):
    data_shuffled = np.random.permutation(data)
    wcss_null.append(KMeans(n_clusters=k).fit(data_shuffled).inertia_)
  
  wcss_null = np.array(wcss_null)
  
  # Compute the gap statistic and gap*
  gap = np.log(np.mean(wcss_null)) - np.log(wcss)

  # Compute the standard deviation of the null reference distribution
  gap_std = np.std(np.log(wcss_null))

  return gap, gap_std


In [4]:
# Initialize scaler
scaler = MinMaxScaler()

# Initialize dictionaries to store clustering results and optimal ks
reduced_data_dict_6 = {}
reduced_data_dict_5 = {}
reduced_data_dict_4 = {}

optimal_num_k_6 = {}
optimal_num_k_5 = {}
optimal_num_k_4 = {}

optimal_num_k_2_6 = {}
optimal_num_k_2_5 = {}
optimal_num_k_2_4 = {}

# Initialize PCA objects
pca_6 = PCA(n_components=6)
pca_5 = PCA(n_components=5)
pca_4 = PCA(n_components=4)

# Loop over each group and apply PCA with the specified number of components
for group_name, group in groups:
    # Subset data
    subset = group.loc[:, "danceability":"duration_ms"]

    scaled_columns = scaler.fit_transform(subset)

    # Apply PCA 
    reduced_data_6 = pca_6.fit_transform(scaled_columns)
    reduced_data_5 = pca_5.fit_transform(scaled_columns)
    reduced_data_4 = pca_4.fit_transform(scaled_columns)

    reduced_data_dict_6[group_name] = reduced_data_6
    reduced_data_dict_5[group_name] = reduced_data_5
    reduced_data_dict_4[group_name] = reduced_data_4

    # Initialize lists to store the gap statistics and error bars for different values of k
    group_gaps_6 = []
    group_errors_6 = []

    group_gaps_5 = []
    group_errors_5 = []

    group_gaps_4 = []
    group_errors_4 = []

    # Loop over different values of k
    for k in range(1, 11):
        # Compute the gap statistic and standard deviation for the current value of k
        gap_6, gap_std_6 = compute_gap(reduced_data_6, k)
        group_gaps_6.append(gap_6)
        group_errors_6.append(gap_std_6)

        gap_5, gap_std_5 = compute_gap(reduced_data_5, k)
        group_gaps_5.append(gap_5)
        group_errors_5.append(gap_std_5)

        gap_4, gap_std_4 = compute_gap(reduced_data_4, k)
        group_gaps_4.append(gap_4)
        group_errors_4.append(gap_std_4)

    # Find the optimal number of components based on the gap statistic criterion
    optimal_k_6 = None
    for i in range(1, len(group_gaps_6) - 1):
        s_k = group_errors_6[i]
        threshold = s_k * np.sqrt(1 + 1 / 20)
        if group_gaps_6[i] >= group_gaps_6[i + 1] - threshold:
            optimal_k_6 = i + 1
            break

    if optimal_k_6 is None:
        optimal_k_6 = np.argmax(group_gaps_6) + 1

    optimal_k_5 = None
    for i in range(1, len(group_gaps_5) - 1):
        s_k = group_errors_5[i]
        threshold = s_k * np.sqrt(1 + 1 / 20)
        if group_gaps_5[i] >= group_gaps_5[i + 1] - threshold:
            optimal_k_5 = i + 1
            break

    if optimal_k_5 is None:
        optimal_k_5 = np.argmax(group_gaps_5) + 1

    optimal_k_4 = None
    for i in range(1, len(group_gaps_4) - 1):
        s_k = group_errors_4[i]
        threshold = s_k * np.sqrt(1 + 1 / 20)
        if group_gaps_4[i] >= group_gaps_4[i + 1] - threshold:
            optimal_k_4 = i + 1
            break

    if optimal_k_4 is None:
        optimal_k_4 = np.argmax(group_gaps_4) + 1

    optimal_num_k_6[group_name] = optimal_k_6
    optimal_num_k_5[group_name] = optimal_k_5
    optimal_num_k_4[group_name] = optimal_k_4

    # Find the optimal number of components based on the gap* statistic criterion
    optimal_k_2_6 = np.argmax(group_gaps_6) + 1
    optimal_k_2_5 = np.argmax(group_gaps_5) + 1
    optimal_k_2_4 = np.argmax(group_gaps_4) + 1

    optimal_num_k_2_6[group_name] = optimal_k_2_6
    optimal_num_k_2_5[group_name] = optimal_k_2_5
    optimal_num_k_2_4[group_name] = optimal_k_2_4

    print(f"Group: {group_name}, Optimal k 6 (gap): {optimal_num_k_6[group_name]}, Optimal k 6 (gap*): {optimal_num_k_2_6[group_name]} and Optimal k 5 (gap): {optimal_num_k_5[group_name]}, Optimal k 5 (gap*): {optimal_num_k_2_5[group_name]}, Optimal k 4 (gap): {optimal_num_k_4[group_name]},Optimal k 4 (gap*): {optimal_num_k_2_4[group_name]}")


Group: Anime Classical, Optimal k 6 (gap): 2, Optimal k 6 (gap*): 8 and Optimal k 5 (gap): 2, Optimal k 5 (gap*): 7, Optimal k 4 (gap): 3,Optimal k 4 (gap*): 9
Group: Atmospheric Piano, Optimal k 6 (gap): 2, Optimal k 6 (gap*): 9 and Optimal k 5 (gap): 3, Optimal k 5 (gap*): 8, Optimal k 4 (gap): 2,Optimal k 4 (gap*): 10
Group: Atmospheric Sci-fi Soundtracks, Optimal k 6 (gap): 2, Optimal k 6 (gap*): 9 and Optimal k 5 (gap): 2, Optimal k 5 (gap*): 7, Optimal k 4 (gap): 4,Optimal k 4 (gap*): 8
Group: Baroque Classics, Optimal k 6 (gap): 2, Optimal k 6 (gap*): 9 and Optimal k 5 (gap): 2, Optimal k 5 (gap*): 8, Optimal k 4 (gap): 2,Optimal k 4 (gap*): 8
Group: Calming Carols, Optimal k 6 (gap): 2, Optimal k 6 (gap*): 7 and Optimal k 5 (gap): 3, Optimal k 5 (gap*): 7, Optimal k 4 (gap): 5,Optimal k 4 (gap*): 7
Group: Calming Classical, Optimal k 6 (gap): 2, Optimal k 6 (gap*): 6 and Optimal k 5 (gap): 3, Optimal k 5 (gap*): 10, Optimal k 4 (gap): 4,Optimal k 4 (gap*): 9
Group: Chilled Clas

In [5]:
results_df = pd.DataFrame()

# Loop over each group and apply PCA with the specified number of components
for group_name, group in groups:
    # Get reduced data
    reduced_data_6 = reduced_data_dict_6[group_name]
    reduced_data_5 = reduced_data_dict_5[group_name]
    reduced_data_4 = reduced_data_dict_4[group_name]

    # Perform k-means clustering for both configurations
    kmeans_6 = KMeans(n_clusters=optimal_num_k_6[group_name], random_state=42).fit(reduced_data_6)
    kmeans_2_6 = KMeans(n_clusters=optimal_num_k_2_6[group_name], random_state=42).fit(reduced_data_6)

    kmeans_5 = KMeans(n_clusters=optimal_num_k_5[group_name], random_state=42).fit(reduced_data_5)
    kmeans_2_5 = KMeans(n_clusters=optimal_num_k_2_5[group_name], random_state=42).fit(reduced_data_5)

    kmeans_4 = KMeans(n_clusters=optimal_num_k_4[group_name], random_state=42).fit(reduced_data_4)
    kmeans_2_4 = KMeans(n_clusters=optimal_num_k_2_4[group_name], random_state=42).fit(reduced_data_4)

    # Get cluster assignments for both configurations
    cluster_assignments_6 = kmeans_6.labels_
    cluster_assignments_2_6 = kmeans_2_6.labels_

    cluster_assignments_5 = kmeans_5.labels_
    cluster_assignments_2_5 = kmeans_2_5.labels_

    cluster_assignments_4 = kmeans_4.labels_
    cluster_assignments_2_4 = kmeans_2_4.labels_

    # Compute silhouette scores for both configurations
    silhouette_score_6 = silhouette_score(reduced_data_6, cluster_assignments_6)
    silhouette_score_2_6 = silhouette_score(reduced_data_6, cluster_assignments_2_6)

    silhouette_score_5 = silhouette_score(reduced_data_5, cluster_assignments_5)
    silhouette_score_2_5 = silhouette_score(reduced_data_5, cluster_assignments_2_5)

    silhouette_score_4 = silhouette_score(reduced_data_4, cluster_assignments_4)
    silhouette_score_2_4 = silhouette_score(reduced_data_4, cluster_assignments_2_4)

    results_df = results_df.append({
        'playlist_name': group_name,
        'nb_cluster_6': optimal_num_k_6[group_name],
        'nb_cluster_6_2': optimal_num_k_2_6[group_name],
        'silhouette_score_6': silhouette_score_6,
        'silhouette_score_6_2': silhouette_score_2_6,
        'nb_cluster_5': optimal_num_k_5[group_name],
        'nb_cluster_5_2': optimal_num_k_2_5[group_name],
        'silhouette_score_5': silhouette_score_5,
        'silhouette_score_5_2': silhouette_score_2_5,
        'nb_cluster_4': optimal_num_k_4[group_name],
        'nb_cluster_4_2': optimal_num_k_2_4[group_name],
        'silhouette_score_4': silhouette_score_4,
        'silhouette_score_4_2': silhouette_score_2_4
    }, ignore_index=True)

print(results_df)


  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = resul

                               playlist_name  nb_cluster_6  nb_cluster_6_2  \
0                            Anime Classical             2               8   
1                          Atmospheric Piano             2               9   
2             Atmospheric Sci-fi Soundtracks             2               9   
3                           Baroque Classics             2               9   
4                             Calming Carols             2               7   
5                          Calming Classical             2               6   
6                          Chilled Classical             3               9   
7                   Chilled Classical Covers             3               9   
8                             Choir Classics             2               9   
9                         Cinematic Chillout             3               6   
10                         Classical Cooking             2              10   
11                        Classical Cooldown             3      

  results_df = results_df.append({


In [6]:
results_df.to_excel('robust_classique.xlsx')
