# Data Management

## Setup

In [None]:
# Import modules
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter

from scipy.spatial.distance import cdist

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances


In [None]:
# Read the track-level dataset from the Excel workbook
data_path = '/Users/julienmbarki/Documents/Doctorat/Publications/Article 2/Data/Code/data_requests/df_voyage.xlsx'
df = pd.read_excel(data_path)

# Split the tracks into one group per playlist
groups = df.groupby(by='playlist_name')


## PCA dimensionality reduction

In [None]:
# For every playlist, find how many principal components are needed for the
# cumulative explained variance ratio to reach 0.8, then report the count that
# occurs most often across playlists
scaler = MinMaxScaler()

optimal_num_components = []

for playlist, tracks in groups:
    # Min-max scale the audio-feature columns of this playlist
    features = scaler.fit_transform(tracks.loc[:, "danceability":"time_signature"])

    # Fit a PCA that keeps every component
    pca = PCA().fit(features)

    # Position of the first cumulative ratio >= 0.8, converted to a 1-based count
    explained = np.cumsum(pca.explained_variance_ratio_)
    n_needed = int(np.flatnonzero(explained >= 0.8)[0]) + 1
    optimal_num_components.append(n_needed)

    # Cumulative explained-variance curve of this playlist
    component_axis = range(1, pca.n_components_ + 1)
    plt.plot(component_axis, explained, 'bo-', linewidth=2)
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance Ratio')
    plt.title(playlist)
    plt.show()

# Most frequent component count over all playlists
most_common_optimal = Counter(optimal_num_components).most_common(1)
most_modal_value = most_common_optimal[0][0]

print("Most modal value of optimal components:", most_modal_value)


In [None]:
# Project the tracks of every playlist onto their leading principal components
scaler = MinMaxScaler()

# Number of components kept for every playlist. Hard-coded here; presumably
# the modal value printed by the component-selection step -- TODO confirm
n_components = 6

reduced_data_dict = {}

for playlist, tracks in groups:
    # Min-max scale the audio-feature columns of this playlist
    features = tracks.loc[:, "danceability":"time_signature"]
    features_scaled = scaler.fit_transform(features)

    # Fit the PCA and store the component scores under the playlist name
    pca = PCA(n_components=n_components)
    reduced_data_dict[playlist] = pca.fit_transform(features_scaled)


## Clustering

### Elbow method

In [None]:
# Elbow method: k-means inertia (WCSS) for k = 1..10 on the PCA scores of
# every playlist
k_values = range(1, 11)

wcss = {
    group_name: [
        KMeans(n_clusters=k, random_state=42).fit(reduced_data_dict[group_name]).inertia_
        for k in k_values
    ]
    for group_name, _ in groups
}

# One elbow curve per playlist, each shown in its own figure
for group_name, inertias in wcss.items():
    plt.plot(k_values, inertias, label=group_name)
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Within-cluster sum of squares (WCSS)')
    plt.legend()
    plt.show()


### Gap statistic method

In [None]:
# Seed the standard-library and the NumPy global random generators with the
# same value
SEED = 42
random.seed(SEED)
np.random.seed(SEED)


In [None]:
# Define a function to compute the gap statistic
def compute_gap(data, k, n_refs=20):
    """Return the gap statistic of a k-means partition of ``data`` into ``k`` clusters.

    The gap compares the log within-cluster sum of squares (WCSS) of the
    observed data with its average over ``n_refs`` reference data sets that
    contain no cluster structure.

    Each reference data set is drawn uniformly inside the bounding box of the
    observed features (per-column minimum and maximum). Permuting the rows of
    ``data`` is not a valid reference: it yields the very same point cloud, so
    its WCSS equals the observed one and the gap collapses to noise.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Observations to cluster.
    k : int
        Number of clusters.
    n_refs : int, default 20
        Number of reference data sets.

    Returns
    -------
    gap : float
        ``mean(log(WCSS_reference)) - log(WCSS_observed)``.
    gap_std : float
        Standard deviation of ``log(WCSS_reference)``. It is NOT multiplied by
        ``sqrt(1 + 1 / n_refs)``; callers apply that factor themselves.

    Notes
    -----
    Reference data are drawn from NumPy's global random state, so results are
    reproducible once ``np.random.seed`` has been called.
    """
    data = np.asarray(data, dtype=float)

    # Log-WCSS of the observed data
    observed = KMeans(n_clusters=k, random_state=42).fit(data)
    log_wcss = np.log(observed.inertia_)

    # Bounding box of the observed features, used to draw the reference data
    lower = data.min(axis=0)
    upper = data.max(axis=0)

    log_wcss_null = np.empty(n_refs)
    for b in range(n_refs):
        reference = np.random.uniform(lower, upper, size=data.shape)
        fitted = KMeans(n_clusters=k, random_state=42).fit(reference)
        log_wcss_null[b] = np.log(fitted.inertia_)

    # Gap statistic and spread of the reference distribution
    gap = log_wcss_null.mean() - log_wcss
    gap_std = log_wcss_null.std()

    return gap, gap_std


In [None]:
# Select the number of clusters of every playlist from its gap curve
#   optimal_num_k   : first k with Gap(k) >= Gap(k+1) - s_{k+1}, falling back
#                     to the k that maximises the gap when no k qualifies
#   optimal_num_k_2 : k that maximises the gap
optimal_num_k = {}
optimal_num_k_2 = {}

for group_name, group in groups:
    reduced_data = reduced_data_dict[group_name]

    # Gap statistic and reference standard deviation for k = 1..10
    # (list position i holds the values for k = i + 1)
    group_gaps = []
    group_errors = []

    for k in range(1, 11):
        gap, gap_std = compute_gap(reduced_data, k)
        group_gaps.append(gap)
        group_errors.append(gap_std)

    # Gap criterion. The tolerance must come from the NEXT k (s_{k+1}), i.e.
    # position i + 1; 20 is the number of reference data sets used by
    # compute_gap.
    # NOTE(review): the scan starts at position 1 (k = 2), so k = 1 can only
    # be returned by the argmax fallback below -- confirm this is intended.
    optimal_k = None
    for i in range(1, len(group_gaps) - 1):
        s_next = group_errors[i + 1] * np.sqrt(1 + 1 / 20)
        if group_gaps[i] >= group_gaps[i + 1] - s_next:
            optimal_k = i + 1
            break

    if optimal_k is None:
        optimal_k = int(np.argmax(group_gaps)) + 1

    optimal_num_k[group_name] = optimal_k

    # Gap* criterion: the k with the largest gap (computed once, it does not
    # depend on any loop index)
    optimal_k_2 = int(np.argmax(group_gaps)) + 1

    optimal_num_k_2[group_name] = optimal_k_2

    # Plot the gap curve of this playlist
    plt.plot(range(1, 11), group_gaps, label=group_name)

    # Set labels and display the plot
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Gap statistic')
    plt.legend()
    plt.show()

    print(f"Group: {group_name}, Optimal k (gap): {optimal_num_k[group_name]}, Optimal k (gap*): {optimal_num_k_2[group_name]}")


### Apply KMeans

In [None]:
# Fit the final k-means models of every playlist (one per selection criterion)
# and attach the resulting cluster labels to the tracks
kmeans_dict = {}
kmeans_2_dict = {}
cluster_labels_dict = {}
cluster_labels_2_dict = {}

# Labelled playlists are collected here and concatenated once after the loop;
# concatenating inside the loop would re-copy the growing frame every time
labelled_groups = []

for group_name, group in groups:
    reduced_data = reduced_data_dict[group_name]

    k = optimal_num_k[group_name]
    k_2 = optimal_num_k_2[group_name]

    # Fit k-means with the number of clusters chosen by each criterion
    kmeans = KMeans(n_clusters=k, random_state=42).fit(reduced_data)
    kmeans_2 = KMeans(n_clusters=k_2, random_state=42).fit(reduced_data)

    kmeans_dict[group_name] = kmeans
    kmeans_2_dict[group_name] = kmeans_2

    cluster_labels_dict[group_name] = kmeans.labels_
    cluster_labels_2_dict[group_name] = kmeans_2.labels_

    print(f"Group {group_name}: {kmeans.labels_}")

    # Add both label columns to a copy of the playlist's rows
    labelled_groups.append(
        group.assign(cluster=kmeans.labels_, cluster_2=kmeans_2.labels_)
    )

    # Plot the clusters on the first two principal components
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=kmeans.labels_, cmap='viridis')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title(group_name)
    plt.show()

# Rebuild the track-level dataframe, now carrying the cluster labels
# (an empty frame is kept when there is no playlist at all)
df = pd.concat(labelled_groups) if labelled_groups else pd.DataFrame()

display(df)


## Variables creation

### HH-Index

In [None]:
# Group the tracks of the current dataframe by playlist
groups = df.groupby(by='playlist_name')


In [None]:
# Percentage share of every cluster within each playlist
shares_perc = {
    group_name: group['cluster'].value_counts(normalize=True) * 100
    for group_name, group in groups
}

# Herfindahl-Hirschman index per playlist: sum of the squared percentage
# shares, computed for both sets of cluster labels
hhi = {}
hhi_2 = {}

for group_name, group in groups:
    for target, label_column in ((hhi, 'cluster'), (hhi_2, 'cluster_2')):
        cluster_shares = group[label_column].value_counts(normalize=True)
        target[group_name] = sum((cluster_shares * 100) ** 2)

print(hhi)
print(hhi_2)

# Summary statistics of the index built on the 'cluster' labels
hhi_values = list(hhi.values())

print("Minimum HHI: ", min(hhi_values))
print("Mean HHI: ", np.mean(hhi_values))
print("Median HHI: ", np.median(hhi_values))
print("Maximum HHI: ", max(hhi_values))
print("Standard deviation: ", np.std(hhi_values))


### Euclidean distances

In [None]:
# Calculate distances between cluster centroids
def _mean_centroid_distance(centroids):
    """Return the mean Euclidean distance over all distinct pairs of centroids.

    Only the entries above the diagonal of the pairwise matrix are averaged.
    Averaging the whole square matrix would include the zero self-distances on
    the diagonal and shrink the mean by a factor (k - 1) / k that depends on
    the number of clusters k. With fewer than two centroids there is no pair
    and 0.0 is returned.
    """
    dist_matrix = pairwise_distances(centroids)
    distinct_pairs = dist_matrix[np.triu_indices_from(dist_matrix, k=1)]
    if distinct_pairs.size == 0:
        return np.float64(0.0)
    return distinct_pairs.mean()


distances_dict = {}
distances_dict_2 = {}

for group_name, group in groups:
    # Centroids of both k-means models fitted for the playlist
    cluster_centroids = kmeans_dict[group_name].cluster_centers_
    cluster_centroids_2 = kmeans_2_dict[group_name].cluster_centers_

    # Mean distance between distinct centroids
    mean_distance = _mean_centroid_distance(cluster_centroids)
    mean_distance_2 = _mean_centroid_distance(cluster_centroids_2)

    # Print mean pairwise distance for the group
    print(f"Group {group_name}: Mean pairwise distance between cluster centroids = {mean_distance}")

    distances_dict[group_name] = mean_distance
    distances_dict_2[group_name] = mean_distance_2

# Max, min, mean, median and standard deviation of the per-playlist means
# (the dictionary holds one scalar per playlist)
distances_list = [d for d in distances_dict.values() if not np.isnan(d)]
print(f"Max distance: {np.max(distances_list):.5f}")
print(f"Min distance: {np.min(distances_list):.5f}")
print(f"Mean distance: {np.mean(distances_list):.5f}")
print(f"Median distance: {np.median(distances_list):.5f}")
print(f"Standard deviation of distances: {np.std(distances_list):.5f}")


In [None]:
# Calculate distances between tracks
def _mean_track_distance(features):
    """Return the mean Euclidean distance over all distinct pairs of rows.

    Only the entries above the diagonal of the pairwise matrix are averaged.
    Averaging the whole square matrix would include the zero self-distances on
    the diagonal and shrink the mean by a factor (n - 1) / n that depends on
    the number of tracks n. With fewer than two rows there is no pair and 0.0
    is returned.
    """
    dist_matrix = pairwise_distances(features)
    distinct_pairs = dist_matrix[np.triu_indices_from(dist_matrix, k=1)]
    if distinct_pairs.size == 0:
        return np.float64(0.0)
    return distinct_pairs.mean()


scaler = MinMaxScaler()

distances_dict_3 = {}

for group_name, group in groups:
    # Min-max scale the audio-feature columns of this playlist
    subset = group.loc[:, "danceability":"time_signature"]
    scaled_columns = scaler.fit_transform(subset)

    # Mean distance between distinct tracks
    mean_distance = _mean_track_distance(scaled_columns)

    # Print mean pairwise distance for the group
    print(f"Group {group_name}: Mean pairwise distance between tracks = {mean_distance}")

    distances_dict_3[group_name] = mean_distance

# Max, min, mean, median and standard deviation of the per-playlist means
# (the dictionary holds one scalar per playlist)
distances_list = [d for d in distances_dict_3.values() if not np.isnan(d)]
print(f"Max distance: {np.max(distances_list):.5f}")
print(f"Min distance: {np.min(distances_list):.5f}")
print(f"Mean distance: {np.mean(distances_list):.5f}")
print(f"Median distance: {np.median(distances_list):.5f}")
print(f"Standard deviation of distances: {np.std(distances_list):.5f}")


### Stirling diversity index

In [None]:
# k-alpha Rao-Stirling index
# For every playlist: sum, over all distinct pairs of clusters (i < j), of the
# Euclidean distance between the two centroids times the two cluster shares.
# A playlist with a single cluster has no pair and gets 0.0.
stirling_index_dict = {}

for group_name, group in groups:
    cluster_centroids = kmeans_dict[group_name].cluster_centers_

    # Share of the playlist's tracks in each cluster, indexed by cluster label
    shares = group['cluster'].value_counts(normalize=True).sort_index()
    labels = [int(label) for label in shares.index]

    total_index = 0.0

    for position, label_i in enumerate(labels):
        for label_j in labels[position + 1:]:
            # Centroids are looked up by cluster label, not by loop position,
            # so shares and centroids always refer to the same cluster.
            # cdist returns a 1x1 matrix; [0, 0] extracts the scalar distance
            # so that the index stays a plain number.
            dist = cdist(
                [cluster_centroids[label_i]],
                [cluster_centroids[label_j]],
                'euclidean',
            )[0, 0]

            total_index += dist * shares.loc[label_i] * shares.loc[label_j]

    stirling_index_dict[group_name] = total_index

print(stirling_index_dict)

print("Minimum index: ", min(stirling_index_dict.values()))
print("Mean index: ", np.mean(list(stirling_index_dict.values())))
print("Median index: ", np.median(list(stirling_index_dict.values())))
print("Maximum index: ", max(stirling_index_dict.values()))
print("Standard deviation: ", np.std(list(stirling_index_dict.values())))


In [None]:
# k-beta Rao-Stirling index
# For every playlist: sum, over all distinct pairs of clusters (i < j) of the
# 'cluster_2' labelling, of the Euclidean distance between the two centroids
# times the two cluster shares. A playlist with a single cluster has no pair
# and gets 0.0.
stirling_index_2_dict = {}

for group_name, group in groups:
    cluster_centroids = kmeans_2_dict[group_name].cluster_centers_

    # Share of the playlist's tracks in each cluster, indexed by cluster label
    shares = group['cluster_2'].value_counts(normalize=True).sort_index()
    labels = [int(label) for label in shares.index]

    total_index = 0.0

    for position, label_i in enumerate(labels):
        for label_j in labels[position + 1:]:
            # Centroids are looked up by cluster label, not by loop position,
            # so shares and centroids always refer to the same cluster.
            # cdist returns a 1x1 matrix; [0, 0] extracts the scalar distance
            # so that the index stays a plain number.
            dist = cdist(
                [cluster_centroids[label_i]],
                [cluster_centroids[label_j]],
                'euclidean',
            )[0, 0]

            total_index += dist * shares.loc[label_i] * shares.loc[label_j]

    stirling_index_2_dict[group_name] = total_index

print(stirling_index_2_dict)

print("Minimum index: ", min(stirling_index_2_dict.values()))
print("Mean index: ", np.mean(list(stirling_index_2_dict.values())))
print("Median index: ", np.median(list(stirling_index_2_dict.values())))
print("Maximum index: ", max(stirling_index_2_dict.values()))
print("Standard deviation: ", np.std(list(stirling_index_2_dict.values())))


## Append to dataframe

In [None]:
# Append playlist-level indicators
def _as_scalar(value):
    """Collapse a NumPy scalar or a size-1 array into a plain Python scalar.

    Guards against indicators that were accumulated as 1x1 arrays, which would
    otherwise end up as array objects inside dataframe cells.
    """
    return np.asarray(value).item()


# One dataframe per playlist: its tracks plus the playlist-level indicators
# repeated on every row. Columns are added in a vectorised way instead of
# rebuilding each track as a dictionary with iterrows().
panel_data = []

for group_name, group in groups:
    indicators = {
        # Variety
        'nb_clusters': optimal_num_k[group_name],
        'nb_clusters_2': optimal_num_k_2[group_name],
        # Balance
        'hh_index': hhi[group_name],
        'hh_index_2': hhi_2[group_name],
        # Disparity
        'distances': distances_dict[group_name],
        'distances_2': distances_dict_2[group_name],
        'distances_3': distances_dict_3[group_name],
        # Diversity
        'stirling_index': stirling_index_dict[group_name],
        'stirling_index_2': stirling_index_2_dict[group_name],
    }
    indicators = {name: _as_scalar(value) for name, value in indicators.items()}

    # Broadcast every indicator to all tracks of the playlist
    panel_data.append(group.assign(**indicators))

# Stack the playlists into a single dataframe with a fresh 0..n-1 index
# (an empty frame is kept when there is no playlist at all)
panel_data_df = pd.concat(panel_data, ignore_index=True) if panel_data else pd.DataFrame()
display(panel_data_df)


In [None]:
# Write the final panel to an Excel workbook (relative path), leaving out the
# dataframe index
output_path = "df_voyage_final.xlsx"
panel_data_df.to_excel(output_path, index=False)
