# Data Management

## Setup

In [1]:
# Import modules
import ast
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from collections import Counter

from scipy.spatial.distance import cdist

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances

from tqdm import tqdm


In [2]:
# Load data
df = pd.read_csv('/Users/julienmbarki/Documents/Doctorat/Publications/Article 2/Data/Code/data_management/db_soundcharts/charts_tracks_info_23-24.csv')


## Clean data

In [3]:
# Collapse to weekly level playlists data
# Determine weekday
df['collection_date'] = pd.to_datetime(df['collection_date'])

df['weekday'] = df['collection_date'].dt.weekday

# Filter to keep only rows that fall on Fridays
df_weekly = df[df['weekday'] == 4].copy()
df_weekly.drop(columns=['weekday'], inplace=True)

# Print data
display(df_weekly)


Unnamed: 0,playlist_id,playlist_name,collection_date,playlist_type,track_position,track_name,track_uuid,collection_date_clean,artist_name,artist_id,label_name,label_type,track_date,track_features,track_genres
250,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-06 00:00:00+00:00,Charts,1,Petit génie,5c0b0187-6e00-4499-b310-36e730252d13,2023-10-06 00:00:00+00:00,"['Abou Debeing', 'Alonzo', 'Imen Es', 'lossa',...","['11e81bc3-36f8-dd20-b110-a0369fe50396', '11e8...","['2054 Records', 'Fulgu Prod', 'Next Génération']","['unknown', 'Other independent labels: ""Indie""...",2023-08-04T00:00:00+00:00,"{'acousticness': 0.13, 'danceability': 0.8, 'e...",['hip-hop & rap']
251,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-06 00:00:00+00:00,Charts,2,LAISSE MOI,065bda79-ba47-41cb-b44f-487741dc2ab9,2023-10-06 00:00:00+00:00,['KeBlack'],['11e81bc8-7bb4-47b0-ac67-a0369fe50396'],['Dayeden'],['unknown'],2023-09-08T00:00:00+00:00,"{'acousticness': 0.46, 'danceability': 0.77, '...",['hip-hop & rap']
252,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-06 00:00:00+00:00,Charts,3,Saiyan,f1928fd5-134e-444e-a66c-7b5541a9f23f,2023-10-06 00:00:00+00:00,"[""Heuss L'enfoiré"", 'Gazo']","['11e81bce-22bb-888e-9f62-a0369fe50396', '59a6...","['150 Prod', 'Straw Production']","['unknown', 'unknown']",2023-06-02T00:00:00+00:00,"{'acousticness': 0.36, 'danceability': 0.78, '...",['hip-hop & rap']
253,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-06 00:00:00+00:00,Charts,4,Casanova,be8682de-a811-4754-b310-9c816d3d6b56,2023-10-06 00:00:00+00:00,"['Soolking', 'Gazo']","['11e81bba-d00e-f2d2-a406-a0369fe50396', '59a6...",['Pandor Music'],['unknown'],2023-07-06T00:00:00+00:00,"{'acousticness': 0.68, 'danceability': 0.8, 'e...","['hip-hop & rap', 'r&b, funk & soul']"
254,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-06 00:00:00+00:00,Charts,5,Meridian,a899c70e-bcfd-42e3-80b7-0d0280c63c43,2023-10-06 00:00:00+00:00,"['Dave', 'Tiakola']","['11e81bc7-3e47-39ce-afd6-a0369fe50396', '3498...","['Neighbourhood', 'Dave']","['Other independent labels: ""Indie""', 'Self re...",2023-08-24T00:00:00+00:00,"{'acousticness': 0.23, 'danceability': 0.91, '...",['hip-hop & rap']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36440,11e84493-6b26-e814-a42e-a0369fe50396,Viral 50 - France,2024-09-27 00:00:00+00:00,Charts,46,PYTHON FUNK,643d4aae-96a3-421a-a70b-1e3ae46cc4f3,2024-09-27 00:00:00+00:00,"['RD12', 'Sayfalse']","['148a2f14-933c-43d1-b3e6-c0c57aaaba75', '3352...",['0to8'],['Self released'],2024-08-23T00:00:00+00:00,"{'acousticness': 0.16, 'danceability': 0.6, 'e...","['latin', 'r&b, funk & soul']"
36441,11e84493-6b26-e814-a42e-a0369fe50396,Viral 50 - France,2024-09-27 00:00:00+00:00,Charts,47,Mamma Mia (feat. Mentissa) (Techno Mix),74d09c6a-7969-49ce-a5eb-9df074f889f3,2024-09-27 00:00:00+00:00,"['Mentissa', 'BENNETT']","['850e33b7-a367-4c63-bbdf-689d00ddb7fb', 'ad31...","['Warner', 'BENNETT']","['Warner', 'Self released']",2024-09-06T00:00:00+00:00,"{'acousticness': 0.33, 'danceability': 0.71, '...",['electronic']
36442,11e84493-6b26-e814-a42e-a0369fe50396,Viral 50 - France,2024-09-27 00:00:00+00:00,Charts,48,"Yamore (feat. Cesária Evora, Benja (NL) & Fran...",ab00c003-467a-49e7-8872-a30141b6ee28,2024-09-27 00:00:00+00:00,"['Cesária Évora', 'Salif Keita', 'MoBlack', 'F...","['11e81bc7-e901-b2f4-857e-a0369fe50396', '11e8...",['Universal'],['Universal'],2024-07-19T00:00:00+00:00,,['others']
36443,11e84493-6b26-e814-a42e-a0369fe50396,Viral 50 - France,2024-09-27 00:00:00+00:00,Charts,49,Champs Elysées,aa45fafb-6e64-4dfd-a993-4db16959e9b1,2024-09-27 00:00:00+00:00,"['Gunna', 'Toosii']","['11e81bc9-9a0b-a932-b7d7-a0369fe50396', '11e8...",['Universal'],['Universal'],2024-09-06T00:00:00+00:00,"{'acousticness': 0.18, 'danceability': 0.79, '...",['hip-hop & rap']


In [4]:
# Normalize track_features
# Convert track features to dict
def string_to_list(s):
    try:
        return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        return []

df_weekly['track_features'] = df_weekly['track_features'].apply(string_to_list)
df_weekly['track_features'] = df_weekly['track_features'].apply(dict)

# Normalize features
df_features = pd.json_normalize(df_weekly['track_features'])
print(df_features)

# Concatenate the new columns back to the original DataFrame
df_final = pd.concat(
    [df_weekly.reset_index(drop=True), df_features.reset_index(drop=True)],
    axis=1
)
df_final.drop(columns='track_features', inplace=True)

# Print and export data
df_final.to_csv('charts_tracks_info_23-24_weekly_normalized.csv', index=False)
display(df_final)


      acousticness  danceability  energy  instrumentalness   key  liveness  \
0             0.13          0.80    0.58              0.00   1.0      0.22   
1             0.46          0.77    0.72              0.00  10.0      0.13   
2             0.36          0.78    0.81              0.00   7.0      0.21   
3             0.68          0.80    0.83              0.00   7.0      0.12   
4             0.23          0.91    0.57              0.00   4.0      0.32   
...            ...           ...     ...               ...   ...       ...   
5195          0.16          0.60    0.95              0.97   9.0      0.22   
5196          0.33          0.71    0.80              0.00   1.0      0.45   
5197           NaN           NaN     NaN               NaN   NaN       NaN   
5198          0.18          0.79    0.55              0.00   4.0      0.25   
5199          0.82          0.43    0.32              0.12   8.0      0.26   

      loudness  mode  speechiness   tempo  timeSignature  valen

Unnamed: 0,playlist_id,playlist_name,collection_date,playlist_type,track_position,track_name,track_uuid,collection_date_clean,artist_name,artist_id,...,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,timeSignature,valence
0,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-06 00:00:00+00:00,Charts,1,Petit génie,5c0b0187-6e00-4499-b310-36e730252d13,2023-10-06 00:00:00+00:00,"['Abou Debeing', 'Alonzo', 'Imen Es', 'lossa',...","['11e81bc3-36f8-dd20-b110-a0369fe50396', '11e8...",...,0.58,0.00,1.0,0.22,-5.93,0.0,0.24,126.05,4.0,0.97
1,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-06 00:00:00+00:00,Charts,2,LAISSE MOI,065bda79-ba47-41cb-b44f-487741dc2ab9,2023-10-06 00:00:00+00:00,['KeBlack'],['11e81bc8-7bb4-47b0-ac67-a0369fe50396'],...,0.72,0.00,10.0,0.13,-7.88,0.0,0.35,120.03,4.0,0.70
2,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-06 00:00:00+00:00,Charts,3,Saiyan,f1928fd5-134e-444e-a66c-7b5541a9f23f,2023-10-06 00:00:00+00:00,"[""Heuss L'enfoiré"", 'Gazo']","['11e81bce-22bb-888e-9f62-a0369fe50396', '59a6...",...,0.81,0.00,7.0,0.21,-4.57,0.0,0.03,125.03,4.0,0.66
3,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-06 00:00:00+00:00,Charts,4,Casanova,be8682de-a811-4754-b310-9c816d3d6b56,2023-10-06 00:00:00+00:00,"['Soolking', 'Gazo']","['11e81bba-d00e-f2d2-a406-a0369fe50396', '59a6...",...,0.83,0.00,7.0,0.12,-4.58,0.0,0.21,132.04,4.0,0.76
4,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-06 00:00:00+00:00,Charts,5,Meridian,a899c70e-bcfd-42e3-80b7-0d0280c63c43,2023-10-06 00:00:00+00:00,"['Dave', 'Tiakola']","['11e81bc7-3e47-39ce-afd6-a0369fe50396', '3498...",...,0.57,0.00,4.0,0.32,-5.95,0.0,0.10,115.03,4.0,0.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5195,11e84493-6b26-e814-a42e-a0369fe50396,Viral 50 - France,2024-09-27 00:00:00+00:00,Charts,46,PYTHON FUNK,643d4aae-96a3-421a-a70b-1e3ae46cc4f3,2024-09-27 00:00:00+00:00,"['RD12', 'Sayfalse']","['148a2f14-933c-43d1-b3e6-c0c57aaaba75', '3352...",...,0.95,0.97,9.0,0.22,2.20,1.0,0.21,105.01,4.0,0.49
5196,11e84493-6b26-e814-a42e-a0369fe50396,Viral 50 - France,2024-09-27 00:00:00+00:00,Charts,47,Mamma Mia (feat. Mentissa) (Techno Mix),74d09c6a-7969-49ce-a5eb-9df074f889f3,2024-09-27 00:00:00+00:00,"['Mentissa', 'BENNETT']","['850e33b7-a367-4c63-bbdf-689d00ddb7fb', 'ad31...",...,0.80,0.00,1.0,0.45,-4.62,0.0,0.19,140.05,4.0,0.48
5197,11e84493-6b26-e814-a42e-a0369fe50396,Viral 50 - France,2024-09-27 00:00:00+00:00,Charts,48,"Yamore (feat. Cesária Evora, Benja (NL) & Fran...",ab00c003-467a-49e7-8872-a30141b6ee28,2024-09-27 00:00:00+00:00,"['Cesária Évora', 'Salif Keita', 'MoBlack', 'F...","['11e81bc7-e901-b2f4-857e-a0369fe50396', '11e8...",...,,,,,,,,,,
5198,11e84493-6b26-e814-a42e-a0369fe50396,Viral 50 - France,2024-09-27 00:00:00+00:00,Charts,49,Champs Elysées,aa45fafb-6e64-4dfd-a993-4db16959e9b1,2024-09-27 00:00:00+00:00,"['Gunna', 'Toosii']","['11e81bc9-9a0b-a932-b7d7-a0369fe50396', '11e8...",...,0.55,0.00,4.0,0.25,-6.96,0.0,0.15,122.96,4.0,0.18


## PCA dimensionnality reduction

In [5]:
# Load data
df = pd.read_csv('charts_tracks_info_23-24_weekly_normalized.csv')

# Drop NAs
selected_columns = [
    "danceability", "energy", "loudness", "speechiness", 
    "acousticness", "instrumentalness", "liveness", "valence", 
    "tempo"
]
df = df.dropna(subset=selected_columns)
print(len(df))

# Group by playlist_name
groups = df.groupby(['playlist_name', 'collection_date'], as_index=False)


5096


In [6]:
# Select the number of components
scaler = MinMaxScaler()

selected_columns = [
    "danceability", "energy", "loudness", "speechiness", 
    "acousticness", "instrumentalness", "liveness", "valence", 
    "tempo"
]

optimal_num_components = []

for group_name, group in groups:
    # Subset and scale
    subset = group.loc[:, selected_columns]
    scaled_columns = scaler.fit_transform(subset)

    # Apply PCA
    pca = PCA()
    pca.fit(scaled_columns)
    
    # Calculate cumulative explained variance
    cumulative_variance = pca.explained_variance_ratio_.cumsum()
    optimal_components = next(i for i, var in enumerate(cumulative_variance) if var >= 0.8) + 1
    optimal_num_components.append(optimal_components)
    print(group_name, optimal_components)

    # Plot scree plot
    '''
    plt.plot(range(1, pca.n_components_ + 1), cumulative_variance, 'bo-', linewidth=2)
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance Ratio')
    plt.title(group_name)
    plt.show()
    '''

# Calculate the most modal value
most_common_optimal = Counter(optimal_num_components).most_common(1)
most_modal_value = most_common_optimal[0][0]

print("Most modal value of optimal components:", most_modal_value)


('Top 50 - France', '2023-10-06 00:00:00+00:00') 5
('Top 50 - France', '2023-10-13 00:00:00+00:00') 5
('Top 50 - France', '2023-10-20 00:00:00+00:00') 5
('Top 50 - France', '2023-10-27 00:00:00+00:00') 5
('Top 50 - France', '2023-11-03 00:00:00+00:00') 5
('Top 50 - France', '2023-11-10 00:00:00+00:00') 5
('Top 50 - France', '2023-11-17 00:00:00+00:00') 5
('Top 50 - France', '2023-11-24 00:00:00+00:00') 5
('Top 50 - France', '2023-12-01 00:00:00+00:00') 5
('Top 50 - France', '2023-12-08 00:00:00+00:00') 5
('Top 50 - France', '2023-12-15 00:00:00+00:00') 5
('Top 50 - France', '2023-12-22 00:00:00+00:00') 5
('Top 50 - France', '2023-12-29 00:00:00+00:00') 5
('Top 50 - France', '2024-01-05 00:00:00+00:00') 5
('Top 50 - France', '2024-01-12 00:00:00+00:00') 5
('Top 50 - France', '2024-01-19 00:00:00+00:00') 5
('Top 50 - France', '2024-01-26 00:00:00+00:00') 5
('Top 50 - France', '2024-02-02 00:00:00+00:00') 5
('Top 50 - France', '2024-02-09 00:00:00+00:00') 5
('Top 50 - France', '2024-02-16

In [7]:
# Apply PCA to reduce dimensionality
# Loop over each group and apply PCA with optimal number of components
scaler = MinMaxScaler()

selected_columns = [
    "danceability", "energy", "loudness", "speechiness", 
    "acousticness", "instrumentalness", "liveness", "valence", 
    "tempo"
]

reduced_data_dict = {}

for group_name, group in groups:
    # Subset data
    subset = group.loc[:, selected_columns]

    # Scale the specified columns
    scaled_columns = scaler.fit_transform(subset)

    # Apply PCA with the optimal number of components
    n_components = 5
    pca = PCA(n_components=n_components)
    reduced_data = pca.fit_transform(scaled_columns)
    reduced_data_dict[group_name] = reduced_data


## Clustering

### Elbow method

In [None]:
# Apply the elbow method to determine the optimal number of clusters
# Calculate the elbow
wcss = {}

for k in range(1, 11):
  for group_name, group in groups:
    reduced_data = reduced_data_dict[group_name]
    
    kmeans = KMeans(n_clusters=k, random_state=42).fit(reduced_data)
    
    if group_name not in wcss:
      wcss[group_name] = []

    wcss[group_name].append(kmeans.inertia_)

# Plot the elbow
for group_name, values in wcss.items():
  plt.plot(range(1, 11), values, label=group_name)
  plt.xlabel('Number of clusters (k)')
  plt.ylabel('Within-cluster sum of squares (WCSS)')
  plt.legend()
  plt.show()


### Gap statistic method

In [8]:
# Set random seeds
random.seed(42)
np.random.seed(42)


In [9]:
def compute_gap(data, k):
    # Compute the WCSS for the real data
    kmeans = KMeans(n_clusters=k, random_state=42).fit(data)
    wcss = kmeans.inertia_

    # Compute the null reference distribution by shuffling the data and
    # re-assigning it to clusters
    n_samples, n_features = data.shape
    wcss_null = []

    for _ in range(20):
        data_shuffled = np.random.permutation(data)
        wcss_null.append(KMeans(n_clusters=k).fit(data_shuffled).inertia_)
  
    wcss_null = np.array(wcss_null)
  
    # Compute the gap statistic and gap*
    gap = np.mean(np.log(wcss_null)) - np.log(wcss)

    # Compute the standard deviation of the null reference distribution
    gap_std = np.std(np.log(wcss_null))

    return gap, gap_std


In [10]:
# Calculate the optimal k
# Loop over each group and generate scree plot
optimal_num_k = {}
optimal_num_k_2 = {}

for group_name, group in tqdm(groups):
    group_gaps = []
    group_errors = []

    for k in range(1, 11):
        reduced_data = reduced_data_dict[group_name]

        # Compute the gap statistic and standard deviation for the current value of k
        gap, gap_std = compute_gap(reduced_data, k)
        group_gaps.append(gap)
        group_errors.append(gap_std)

    # Find the optimal number of components based on the gap statistic criterion
    optimal_k = None
    for i in range(1, len(group_gaps) - 1):
        s_k = group_errors[i]
        threshold = s_k * np.sqrt(1 + 1 / 20)
        if group_gaps[i] >= group_gaps[i + 1] - threshold:
            optimal_k = i + 1
            break

    if optimal_k is None:
        optimal_k = np.argmax(group_gaps) + 1

    optimal_num_k[group_name] = optimal_k

    # Find the optimal number of components based on the gap* statistic criterion
    for i in range(1, len(group_gaps)):
        optimal_k_2 = np.argmax(group_gaps) + 1

    optimal_num_k_2[group_name] = optimal_k_2

    # Plot the gap statistics
    '''
    plt.plot(range(1, 11), group_gaps, label=group_name)

    # Set labels and display the plot
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Gap statistic')
    plt.legend()
    plt.show()
    
    print(f"Group: {group_name}, Optimal k (gap): {optimal_num_k[group_name]}, Optimal k (gap*): {optimal_num_k_2[group_name]}")
    '''


100%|██████████| 104/104 [06:27<00:00,  3.73s/it]


### Apply KMeans

In [11]:
# Apply KMeans with optimal number of clusters
df = pd.DataFrame()

kmeans_dict = {}
kmeans_2_dict = {}
cluster_labels_dict = {}
cluster_labels_2_dict = {}

for group_name, group in groups:
    reduced_data = reduced_data_dict[group_name]

    k = optimal_num_k[group_name]
    k_2 = optimal_num_k_2[group_name]

    # Apply KMeans with optimal number of clusters
    kmeans = KMeans(n_clusters=k, random_state=42).fit(reduced_data)
    kmeans_2 = KMeans(n_clusters=k_2, random_state=42).fit(reduced_data)
        
    kmeans_dict[group_name] = kmeans
    kmeans_2_dict[group_name] = kmeans_2

    cluster_labels_dict[group_name] = kmeans.labels_
    cluster_labels_2_dict[group_name] = kmeans_2.labels_

    print(f"Group {group_name}: {kmeans.labels_}")

    # Assign cluster labels to original group data
    group = group.assign(cluster=kmeans.labels_)
    group = group.assign(cluster_2=kmeans_2.labels_)

    # Append group to original dataframe
    df = pd.concat([df, group])
    
    # Plot clusters
    '''
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=kmeans.labels_, cmap='viridis')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title(group_name)
    plt.show()
    '''
display(df)


Group ('Top 50 - France', '2023-10-06 00:00:00+00:00'): [0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 1 0 0 0 0]
Group ('Top 50 - France', '2023-10-13 00:00:00+00:00'): [1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1]
Group ('Top 50 - France', '2023-10-20 00:00:00+00:00'): [0 1 0 1 0 1 0 0 0 0 1 1 0 0 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 1
 1 0 0 1 0 1 1 0 1 0 1 1]
Group ('Top 50 - France', '2023-10-27 00:00:00+00:00'): [1 1 1 1 0 1 0 1 1 1 1 1 0 1 0 0 1 0 0 1 1 0 1 1 1 0 0 1 0 1 1 0 1 0 0 0 1
 1 1 1 0 1 1 0 1 0 1 1 0]
Group ('Top 50 - France', '2023-11-03 00:00:00+00:00'): [0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 1 0]
Group ('Top 50 - France', '2023-11-10 00:00:00+00:00'): [1 1 1 1 1 0 0 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1 0 1 0 0 1 1 1 1 1 1 0 1 0 0 1
 1 0 0 0 0 0 1 1 1 1 0 1]
Group ('Top 50 - France', '2023-11-17 00:00:00+00:00'): [2

Unnamed: 0,playlist_id,playlist_name,collection_date,playlist_type,track_position,track_name,track_uuid,collection_date_clean,artist_name,artist_id,...,key,liveness,loudness,mode,speechiness,tempo,timeSignature,valence,cluster,cluster_2
0,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-06 00:00:00+00:00,Charts,1,Petit génie,5c0b0187-6e00-4499-b310-36e730252d13,2023-10-06 00:00:00+00:00,"['Abou Debeing', 'Alonzo', 'Imen Es', 'lossa',...","['11e81bc3-36f8-dd20-b110-a0369fe50396', '11e8...",...,1.0,0.22,-5.93,0.0,0.24,126.05,4.0,0.97,0,3
1,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-06 00:00:00+00:00,Charts,2,LAISSE MOI,065bda79-ba47-41cb-b44f-487741dc2ab9,2023-10-06 00:00:00+00:00,['KeBlack'],['11e81bc8-7bb4-47b0-ac67-a0369fe50396'],...,10.0,0.13,-7.88,0.0,0.35,120.03,4.0,0.70,0,7
2,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-06 00:00:00+00:00,Charts,3,Saiyan,f1928fd5-134e-444e-a66c-7b5541a9f23f,2023-10-06 00:00:00+00:00,"[""Heuss L'enfoiré"", 'Gazo']","['11e81bce-22bb-888e-9f62-a0369fe50396', '59a6...",...,7.0,0.21,-4.57,0.0,0.03,125.03,4.0,0.66,0,6
3,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-06 00:00:00+00:00,Charts,4,Casanova,be8682de-a811-4754-b310-9c816d3d6b56,2023-10-06 00:00:00+00:00,"['Soolking', 'Gazo']","['11e81bba-d00e-f2d2-a406-a0369fe50396', '59a6...",...,7.0,0.12,-4.58,0.0,0.21,132.04,4.0,0.76,0,0
4,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-06 00:00:00+00:00,Charts,5,Meridian,a899c70e-bcfd-42e3-80b7-0d0280c63c43,2023-10-06 00:00:00+00:00,"['Dave', 'Tiakola']","['11e81bc7-3e47-39ce-afd6-a0369fe50396', '3498...",...,4.0,0.32,-5.95,0.0,0.10,115.03,4.0,0.59,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5194,11e84493-6b26-e814-a42e-a0369fe50396,Viral 50 - France,2024-09-27 00:00:00+00:00,Charts,45,Le taf du loup,e7655582-cc32-451f-9ff8-3b3a0931d9b2,2024-09-27 00:00:00+00:00,['Saïf'],['e6706e43-e78d-43dc-b845-51505526bca7'],...,8.0,0.15,-6.32,0.0,0.42,164.02,3.0,0.39,2,3
5195,11e84493-6b26-e814-a42e-a0369fe50396,Viral 50 - France,2024-09-27 00:00:00+00:00,Charts,46,PYTHON FUNK,643d4aae-96a3-421a-a70b-1e3ae46cc4f3,2024-09-27 00:00:00+00:00,"['RD12', 'Sayfalse']","['148a2f14-933c-43d1-b3e6-c0c57aaaba75', '3352...",...,9.0,0.22,2.20,1.0,0.21,105.01,4.0,0.49,1,0
5196,11e84493-6b26-e814-a42e-a0369fe50396,Viral 50 - France,2024-09-27 00:00:00+00:00,Charts,47,Mamma Mia (feat. Mentissa) (Techno Mix),74d09c6a-7969-49ce-a5eb-9df074f889f3,2024-09-27 00:00:00+00:00,"['Mentissa', 'BENNETT']","['850e33b7-a367-4c63-bbdf-689d00ddb7fb', 'ad31...",...,1.0,0.45,-4.62,0.0,0.19,140.05,4.0,0.48,0,5
5198,11e84493-6b26-e814-a42e-a0369fe50396,Viral 50 - France,2024-09-27 00:00:00+00:00,Charts,49,Champs Elysées,aa45fafb-6e64-4dfd-a993-4db16959e9b1,2024-09-27 00:00:00+00:00,"['Gunna', 'Toosii']","['11e81bc9-9a0b-a932-b7d7-a0369fe50396', '11e8...",...,4.0,0.25,-6.96,0.0,0.15,122.96,4.0,0.18,2,7


## Variables creation

### HH-Index

In [12]:
groups = df.groupby(['playlist_name', 'collection_date'], as_index=False)


In [13]:
# Calculate the shares of each cluster
shares_perc = {}

for group_name, group in groups:
    shares = group['cluster'].value_counts(normalize=True)
    shares_perc[group_name] = shares * 100

# Calculate the HHI
hhi = {}
hhi_2 = {}

for group_name, group in groups:
  shares = group['cluster'].value_counts(normalize=True)
  hhi[group_name] = sum((shares*100)**2)

  shares_2 = group['cluster_2'].value_counts(normalize=True)
  hhi_2[group_name] = sum((shares_2*100)**2)

print(hhi)
print(hhi_2)

print("Minimum HHI: ", min(hhi.values()))
print("Mean HHI: ", np.mean(list(hhi.values())))
print("Median HHI: ", np.median(list(hhi.values())))
print("Maximum HHI: ", max(hhi.values()))
print("Standard deviation: ", np.std(list(hhi.values())))


{('Top 50 - France', '2023-10-06 00:00:00+00:00'): 7850.895460224906, ('Top 50 - France', '2023-10-13 00:00:00+00:00'): 7551.020408163265, ('Top 50 - France', '2023-10-20 00:00:00+00:00'): 5168.679716784673, ('Top 50 - France', '2023-10-27 00:00:00+00:00'): 5251.978342357351, ('Top 50 - France', '2023-11-03 00:00:00+00:00'): 7850.895460224906, ('Top 50 - France', '2023-11-10 00:00:00+00:00'): 5168.679716784673, ('Top 50 - France', '2023-11-17 00:00:00+00:00'): 4052.4781341107873, ('Top 50 - France', '2023-11-24 00:00:00+00:00'): 4002.4989587671807, ('Top 50 - France', '2023-12-01 00:00:00+00:00'): 7001.2494793835895, ('Top 50 - France', '2023-12-08 00:00:00+00:00'): 7001.2494793835895, ('Top 50 - France', '2023-12-15 00:00:00+00:00'): 5512.0, ('Top 50 - France', '2023-12-22 00:00:00+00:00'): 3944.0, ('Top 50 - France', '2023-12-29 00:00:00+00:00'): 4077.4677217825893, ('Top 50 - France', '2024-01-05 00:00:00+00:00'): 5251.978342357351, ('Top 50 - France', '2024-01-12 00:00:00+00:00'): 

### Euclidean distances

In [14]:
# Calculate distances between cluster centroids
distances_dict = {}
distances_dict_2 = {}

for group_name, group in groups:
    # Get the cluster centroids for the group
    cluster_centroids = kmeans_dict[group_name].cluster_centers_
    cluster_centroids_2 = kmeans_2_dict[group_name].cluster_centers_

    # Calculate pairwise euclidean distances between cluster centroids
    distances = pairwise_distances(cluster_centroids)
    distances_2 = pairwise_distances(cluster_centroids_2)

    # Calculate mean pairwise distance
    mean_distance = distances.mean()
    mean_distance_2 = distances_2.mean()

    # Print mean pairwise distance for the group
    print(f"Group {group_name}: Mean pairwise distance between cluster centroids = {mean_distance}")

    distances_dict[group_name] = mean_distance
    distances_dict_2[group_name] = mean_distance_2

# Calculate max, min, mean, median and standard deviation of distances
distances_list = [d for distances in distances_dict.values() for d in distances.flatten() if not np.isnan(d)]
print(f"Max distance: {np.max(distances_list):.5f}")
print(f"Min distance: {np.min(distances_list):.5f}")
print(f"Mean distance: {np.mean(distances_list):.5f}")
print(f"Median distance: {np.median(distances_list):.5f}")
print(f"Standard deviation of distances: {np.std(distances_list):.5f}")


Group ('Top 50 - France', '2023-10-06 00:00:00+00:00'): Mean pairwise distance between cluster centroids = 0.42108650268321135
Group ('Top 50 - France', '2023-10-13 00:00:00+00:00'): Mean pairwise distance between cluster centroids = 0.39325267529858626
Group ('Top 50 - France', '2023-10-20 00:00:00+00:00'): Mean pairwise distance between cluster centroids = 0.28268480547416125
Group ('Top 50 - France', '2023-10-27 00:00:00+00:00'): Mean pairwise distance between cluster centroids = 0.29187649155121603
Group ('Top 50 - France', '2023-11-03 00:00:00+00:00'): Mean pairwise distance between cluster centroids = 0.4477359390348635
Group ('Top 50 - France', '2023-11-10 00:00:00+00:00'): Mean pairwise distance between cluster centroids = 0.32645185826195733
Group ('Top 50 - France', '2023-11-17 00:00:00+00:00'): Mean pairwise distance between cluster centroids = 0.5576211434084019
Group ('Top 50 - France', '2023-11-24 00:00:00+00:00'): Mean pairwise distance between cluster centroids = 0.5614

In [15]:
# Calculate distances bewteen tracks
scaler = MinMaxScaler()

selected_columns = [
    "danceability", "energy", "loudness", "speechiness", 
    "acousticness", "instrumentalness", "liveness", "valence", 
    "tempo"
]

distances_dict_3 = {}

for group_name, group in groups:
    subset = group.loc[:, selected_columns]
    scaled_columns = scaler.fit_transform(subset)

    pairwise_dist = pairwise_distances(scaled_columns)
    mean_distance = pairwise_dist.mean()
    
    # Print mean pairwise distance for the group
    print(f"Group {group_name}: Mean pairwise distance between tracks = {mean_distance}")

    distances_dict_3[group_name] = mean_distance

# Calculate max, min, mean, median and standard deviation of distances
distances_list = [d for distances in distances_dict_3.values() for d in distances.flatten() if not np.isnan(d)]
print(f"Max distance: {np.max(distances_list):.5f}")
print(f"Min distance: {np.min(distances_list):.5f}")
print(f"Mean distance: {np.mean(distances_list):.5f}")
print(f"Median distance: {np.median(distances_list):.5f}")
print(f"Standard deviation of distances: {np.std(distances_list):.5f}")


Group ('Top 50 - France', '2023-10-06 00:00:00+00:00'): Mean pairwise distance between tracks = 0.9168930786606451
Group ('Top 50 - France', '2023-10-13 00:00:00+00:00'): Mean pairwise distance between tracks = 0.8653531950604364
Group ('Top 50 - France', '2023-10-20 00:00:00+00:00'): Mean pairwise distance between tracks = 0.8616593633301834
Group ('Top 50 - France', '2023-10-27 00:00:00+00:00'): Mean pairwise distance between tracks = 0.8499858291604053
Group ('Top 50 - France', '2023-11-03 00:00:00+00:00'): Mean pairwise distance between tracks = 0.8534855551149709
Group ('Top 50 - France', '2023-11-10 00:00:00+00:00'): Mean pairwise distance between tracks = 0.8922581634261537
Group ('Top 50 - France', '2023-11-17 00:00:00+00:00'): Mean pairwise distance between tracks = 0.874695150514425
Group ('Top 50 - France', '2023-11-24 00:00:00+00:00'): Mean pairwise distance between tracks = 0.8843299780015722
Group ('Top 50 - France', '2023-12-01 00:00:00+00:00'): Mean pairwise distance be

### Stirling diversity index

In [16]:
# k-alpha Rao-Stirling index
stirling_index_dict = {}

for group_name, group in groups:
    clusters = group.groupby('cluster')

    total_index = 0

    cluster_centroids = kmeans_dict[group_name].cluster_centers_
    
    for i, (cluster_i_name, cluster_i) in enumerate(clusters):
        for j, (cluster_j_name, cluster_j) in enumerate(clusters):
            if i >= j:
                continue
            
            centroid_i = cluster_centroids[i]
            centroid_j = cluster_centroids[j]
            dist = cdist([centroid_i], [centroid_j], 'euclidean')

            share_i = len(cluster_i) / len(group)
            share_j = len(cluster_j) / len(group)

            index = dist * share_i * share_j

            total_index += index

    stirling_index_dict[group_name] = total_index

print(stirling_index_dict)

print("Minimum index: ", min(stirling_index_dict.values()))
print("Mean index: ", np.mean(list(stirling_index_dict.values())))
print("Median index: ", np.median(list(stirling_index_dict.values())))
print("Maximum index: ", max(stirling_index_dict.values()))
print("Standard deviation: ", np.std(list(stirling_index_dict.values())))


{('Top 50 - France', '2023-10-06 00:00:00+00:00'): array([[0.09049589]]), ('Top 50 - France', '2023-10-13 00:00:00+00:00'): array([[0.09630678]]), ('Top 50 - France', '2023-10-20 00:00:00+00:00'): array([[0.13657408]]), ('Top 50 - France', '2023-10-27 00:00:00+00:00'): array([[0.13858359]]), ('Top 50 - France', '2023-11-03 00:00:00+00:00'): array([[0.09622313]]), ('Top 50 - France', '2023-11-10 00:00:00+00:00'): array([[0.15771935]]), ('Top 50 - France', '2023-11-17 00:00:00+00:00'): array([[0.20871584]]), ('Top 50 - France', '2023-11-24 00:00:00+00:00'): array([[0.21326257]]), ('Top 50 - France', '2023-12-01 00:00:00+00:00'): array([[0.12024049]]), ('Top 50 - France', '2023-12-08 00:00:00+00:00'): array([[0.12447802]]), ('Top 50 - France', '2023-12-15 00:00:00+00:00'): array([[0.13937365]]), ('Top 50 - France', '2023-12-22 00:00:00+00:00'): array([[0.20624844]]), ('Top 50 - France', '2023-12-29 00:00:00+00:00'): array([[0.21090923]]), ('Top 50 - France', '2024-01-05 00:00:00+00:00'): 

In [17]:
# k-beta Rao-Stirling index
stirling_index_2_dict = {}

for group_name, group in groups:
    clusters = group.groupby('cluster_2')

    total_index = 0

    cluster_centroids = kmeans_2_dict[group_name].cluster_centers_
    
    for i, (cluster_i_name, cluster_i) in enumerate(clusters):
        for j, (cluster_j_name, cluster_j) in enumerate(clusters):
            if i >= j:
                continue
            
            centroid_i = cluster_centroids[i]
            centroid_j = cluster_centroids[j]
            dist = cdist([centroid_i], [centroid_j], 'euclidean')

            share_i = len(cluster_i) / len(group)
            share_j = len(cluster_j) / len(group)

            index = dist * share_i * share_j

            total_index += index

    stirling_index_2_dict[group_name] = total_index

print(stirling_index_2_dict)

print("Minimum index: ", min(stirling_index_2_dict.values()))
print("Mean index: ", np.mean(list(stirling_index_2_dict.values())))
print("Median index: ", np.median(list(stirling_index_2_dict.values())))
print("Maximum index: ", max(stirling_index_2_dict.values()))
print("Standard deviation: ", np.std(list(stirling_index_2_dict.values())))


{('Top 50 - France', '2023-10-06 00:00:00+00:00'): array([[0.344137]]), ('Top 50 - France', '2023-10-13 00:00:00+00:00'): array([[0.31660785]]), ('Top 50 - France', '2023-10-20 00:00:00+00:00'): array([[0.31360775]]), ('Top 50 - France', '2023-10-27 00:00:00+00:00'): array([[0.2961431]]), ('Top 50 - France', '2023-11-03 00:00:00+00:00'): array([[0.32758602]]), ('Top 50 - France', '2023-11-10 00:00:00+00:00'): array([[0.3384068]]), ('Top 50 - France', '2023-11-17 00:00:00+00:00'): array([[0.33222883]]), ('Top 50 - France', '2023-11-24 00:00:00+00:00'): array([[0.34518327]]), ('Top 50 - France', '2023-12-01 00:00:00+00:00'): array([[0.33767041]]), ('Top 50 - France', '2023-12-08 00:00:00+00:00'): array([[0.33020658]]), ('Top 50 - France', '2023-12-15 00:00:00+00:00'): array([[0.33671639]]), ('Top 50 - France', '2023-12-22 00:00:00+00:00'): array([[0.3360477]]), ('Top 50 - France', '2023-12-29 00:00:00+00:00'): array([[0.31148642]]), ('Top 50 - France', '2024-01-05 00:00:00+00:00'): array

## Append to dataframe

In [18]:
# Append playlist-level indicators
data_final = []

for group_name, group in groups:
    # Variety
    nb_clusters = optimal_num_k[group_name]
    nb_clusters_2 = optimal_num_k_2[group_name]

    # Balance
    hh_index = hhi[group_name]
    hh_index_2 = hhi_2[group_name]

    # Distparity
    distances = distances_dict[group_name]
    distances_2 = distances_dict_2[group_name]
    distances_3 = distances_dict_3[group_name]

    # Diversity
    stirling_index = stirling_index_dict[group_name]
    stirling_index_2 = stirling_index_2_dict[group_name]
    
    # Metadata playlist
    playlist_id = group['playlist_id'].iloc[0]
    playlist_name = group['playlist_name'].iloc[0]
    playlist_type = group['playlist_type'].iloc[0]
    collection_date = group['collection_date'].iloc[0]
    nb_tracks = len(group)

    dates = pd.to_datetime(group['track_date'])
    mean_track_date = dates.mean()

    # Append playlist-level indicators
    data_final.append({
        'playlist_id': playlist_id,
        'playlist_name': playlist_name,
        'collection_date': collection_date,
        'mean_track_date': mean_track_date,
        'playlist_type': playlist_type,
        'nb_tracks': nb_tracks,
        'nb_clusters': nb_clusters,
        'nb_clusters_2': nb_clusters_2,
        'hh_index': hh_index,
        'hh_index_2': hh_index_2,
        'distances': distances,
        'distances_2': distances_2,
        'distances_3': distances_3,
        'stirling_index': stirling_index,
        'stirling_index_2': stirling_index_2
    })

# Convert to DataFrame
final_data_df = pd.DataFrame(data_final)
display(final_data_df)


Unnamed: 0,playlist_id,playlist_name,collection_date,mean_track_date,playlist_type,nb_tracks,nb_clusters,nb_clusters_2,hh_index,hh_index_2,distances,distances_2,distances_3,stirling_index,stirling_index_2
0,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-06 00:00:00+00:00,2022-12-21 10:46:31.836734720+00:00,Charts,49,2,10,7850.895460,1353.602666,0.421087,0.791851,0.916893,[[0.09049589145545066]],[[0.3441370020476721]]
1,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-13 00:00:00+00:00,2022-12-17 11:45:18.367346944+00:00,Charts,49,2,10,7551.020408,1445.231154,0.393253,0.813050,0.865353,[[0.09630677762414355]],[[0.3166078538612369]]
2,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-20 00:00:00+00:00,2022-12-15 01:57:33.061224448+00:00,Charts,49,2,8,5168.679717,1745.106206,0.282685,0.740350,0.861659,[[0.13657408344440947]],[[0.313607749701486]]
3,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-10-27 00:00:00+00:00,2023-01-01 10:17:08.571428608+00:00,Charts,49,2,7,5251.978342,1928.363182,0.291876,0.734196,0.849986,[[0.13858359032419254]],[[0.2961431043284729]]
4,11e84493-6b0e-6dc0-a8d5-a0369fe50396,Top 50 - France,2023-11-03 00:00:00+00:00,2021-10-07 16:09:47.755101952+00:00,Charts,49,2,9,7850.895460,1311.953353,0.447736,0.718217,0.853486,[[0.09622313392002896]],[[0.3275860211187086]]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,11e84493-6b26-e814-a42e-a0369fe50396,Viral 50 - France,2024-08-30 00:00:00+00:00,2023-07-16 16:54:32.727272704+00:00,Charts,44,2,5,7644.628099,2489.669421,0.459385,0.724420,0.937626,[[0.1082023198327935]],[[0.2888285040801568]]
100,11e84493-6b26-e814-a42e-a0369fe50396,Viral 50 - France,2024-09-06 00:00:00+00:00,2022-03-15 18:40:00+00:00,Charts,45,2,7,7688.888889,2059.259259,0.529418,0.906750,0.916984,[[0.1223543328202901]],[[0.3215595379794304]]
101,11e84493-6b26-e814-a42e-a0369fe50396,Viral 50 - France,2024-09-13 00:00:00+00:00,2023-08-06 21:52:00+00:00,Charts,45,2,6,5555.555556,2009.876543,0.330805,0.702943,0.909112,[[0.1470244363607327]],[[0.3040540370547298]]
102,11e84493-6b26-e814-a42e-a0369fe50396,Viral 50 - France,2024-09-20 00:00:00+00:00,2023-03-07 04:30:00+00:00,Charts,48,3,7,3880.208333,1579.861111,0.570318,0.788977,0.950311,[[0.23179181649222128]],[[0.35070962138132544]]


In [19]:
# Export data
final_data_df.to_csv("charts_playlists_23-24_final.csv", index=False)
