Import modules

In [1]:
import random

from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter

from scipy.spatial.distance import cdist

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score


Load the dataset and data management

In [None]:
random.seed(42)

np.random.seed(42)


In [None]:
df = pd.read_excel('df_afro.xlsx')

# Group by playlist name
groups = df.groupby('playlist_name')


PCA

Select the number of components

In [None]:
# Initialize scaler
scaler = MinMaxScaler()

# Store optimal number of components for each group
optimal_num_components = []

for group_name, group in groups:
    # Subset data
    subset = group.loc[:, "danceability":"duration_ms"]
    
    # Scale the specified columns
    scaled_columns = scaler.fit_transform(subset)

    # Perform PCA to reduce dimensionality of data
    pca = PCA()
    pca.fit(scaled_columns)
    
    # Determine the optimal number of components
    cumulative_variance = pca.explained_variance_ratio_.cumsum()
    optimal_components = next(i for i, var in enumerate(cumulative_variance) if var >= 0.8) + 1
    optimal_num_components.append(optimal_components)

    # Plot scree plot
    plt.plot(range(1, pca.n_components_ + 1), cumulative_variance, 'bo-', linewidth=2)
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance Ratio')
    plt.title(group_name)
    plt.show()

# Calculate the most modal value
most_common_optimal = Counter(optimal_num_components).most_common(1)
most_modal_value = most_common_optimal[0][0]

print("Most modal value of optimal components:", most_modal_value)


Apply PCA

In [None]:
# Loop over each group and apply PCA with optimal number of components
reduced_data_dict = {}

for group_name, group in groups:
    # Subset data
    subset = group.loc[:, "danceability":"duration_ms"]

    # Scale the specified columns
    scaled_columns = scaler.fit_transform(subset)

    # Apply PCA with the optimal number of components
    n_components = 6
    pca = PCA(n_components=n_components)
    reduced_data = pca.fit_transform(scaled_columns)
    reduced_data_dict[group_name] = reduced_data


Clustering

Elbow method

In [None]:
# Apply the elbow method to determine the optimal number of clusters
wcss = {}

for k in range(1, 11):
  for group_name, group in groups:
    reduced_data = reduced_data_dict[group_name]
    
    kmeans = KMeans(n_clusters=k, random_state=42).fit(reduced_data)
    
    if group_name not in wcss:
      wcss[group_name] = []

    wcss[group_name].append(kmeans.inertia_)

for group_name, values in wcss.items():
  plt.plot(range(1, 11), values, label=group_name)
  plt.xlabel('Number of clusters (k)')
  plt.ylabel('Within-cluster sum of squares (WCSS)')
  plt.legend()
  plt.show()


Gap statistic method

In [None]:
def compute_gap(data, k):
  """
  Compute the gap statistic for a given value of k.

  Parameters:
  - data: the data to cluster, with shape (n_samples, n_features)
  - k: the number of clusters

  Returns:
  - gap: the gap statistic for the given value of k
  """
  # Compute the WCSS for the real data
  kmeans = KMeans(n_clusters=k, random_state=42).fit(data)
  wcss = kmeans.inertia_

  # Compute the null reference distribution by shuffling the data and
  # re-assigning it to clusters
  n_samples, n_features = data.shape
  wcss_null = []

  for _ in range(20):
    data_shuffled = np.random.permutation(data)
    wcss_null.append(KMeans(n_clusters=k).fit(data_shuffled).inertia_)
  
  wcss_null = np.array(wcss_null)
  
  # Compute the gap statistic and gap*
  gap = np.log(np.mean(wcss_null)) - np.log(wcss)

  # Compute the standard deviation of the null reference distribution
  gap_std = np.std(np.log(wcss_null))

  return gap, gap_std


In [None]:
# Loop over each group and generate scree plot
optimal_num_k = {}
optimal_num_k_2 = {}

for group_name, group in groups:
    # Initialize lists to store the gap statistics and error bars for different values of k
    group_gaps = []
    group_errors = []

    # Loop over different values of k
    for k in range(1, 11):
        reduced_data = reduced_data_dict[group_name]

        # Compute the gap statistic and standard deviation for the current value of k
        gap, gap_std = compute_gap(reduced_data, k)
        group_gaps.append(gap)
        group_errors.append(gap_std)

    # Find the optimal number of components based on the gap statistic criterion
    optimal_k = None
    for i in range(1, len(group_gaps) - 1):
        s_k = group_errors[i]
        threshold = s_k * np.sqrt(1 + 1 / 20)
        if group_gaps[i] >= group_gaps[i + 1] - threshold:
            optimal_k = i + 1
            break

    if optimal_k is None:
        optimal_k = np.argmax(group_gaps) + 1

    optimal_num_k[group_name] = optimal_k

    # Find the optimal number of components based on the gap* statistic criterion
    optimal_k_2 = np.argmax(group_gaps) + 1

    optimal_num_k_2[group_name] = optimal_k_2

    # Plot the gap statistics
    plt.plot(range(1, 11), group_gaps, label=group_name)

    # Set labels and display the plot
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Gap statistic')
    plt.legend()
    plt.show()

    print(f"Group: {group_name}, Optimal k (gap): {optimal_num_k[group_name]}, Optimal k (gap*): {optimal_num_k_2[group_name]}")


Apply KMeans

In [None]:
# Create empty dataframe to store results
df = pd.DataFrame()

# Loop over each group and apply KMeans with optimal number of clusters
kmeans_dict = {}
kmeans_2_dict = {}
cluster_labels_dict = {}
cluster_labels_2_dict = {}

for group_name, group in groups:
    # Get reduced data
    reduced_data = reduced_data_dict[group_name]

    # Get optimal number of clusters
    k = optimal_num_k[group_name]
    k_2 = optimal_num_k_2[group_name]

    # Apply KMeans with optimal number of clusters
    kmeans = KMeans(n_clusters=k, random_state=42).fit(reduced_data)
    kmeans_2 = KMeans(n_clusters=k_2, random_state=42).fit(reduced_data)
    
    # Store KMeans object in dictionary
    kmeans_dict[group_name] = kmeans
    kmeans_2_dict[group_name] = kmeans_2

    cluster_labels_dict[group_name] = kmeans.labels_
    cluster_labels_2_dict[group_name] = kmeans_2.labels_

    # Print cluster labels
    print(f"Group {group_name}: {kmeans.labels_}")

    # Assign cluster labels to original group data
    group = group.assign(cluster=kmeans.labels_)
    group = group.assign(cluster_2=kmeans_2.labels_)

    # Append group to original dataframe
    df = pd.concat([df, group])
    
    # Plot clusters
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=kmeans.labels_, cmap='viridis')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title(group_name)
    plt.show()

display(df)


Data analysis

% shares

In [None]:
groups = df.groupby('playlist_name')

shares_perc = {}

for group_name, group in groups:
    shares = group['cluster'].value_counts(normalize=True)
    shares_perc[group_name] = shares * 100

print(shares_perc)


HH-Index

In [None]:
groups = df.groupby('playlist_name')

hhi = {}
hhi_2 = {}

for group_name, group in groups:
  shares = group['cluster'].value_counts(normalize=True)
  shares_2 = group['cluster_2'].value_counts(normalize=True)

  hhi[group_name] = sum((shares*100)**2)
  hhi_2[group_name] = sum((shares_2*100)**2)

print(hhi)
print (hhi_2)

print("Minimum HHI: ", min(hhi.values()))
print("Mean HHI: ", np.mean(list(hhi.values())))
print("Median HHI: ", np.median(list(hhi.values())))
print("Maximum HHI: ", max(hhi.values()))
print("Standard deviation: ", np.std(list(hhi.values())))


Euclidean distances

In [None]:
# create an empty dataframe to store the results
results_df = pd.DataFrame()

# loop over each playlist and compute the weighted Stirling index
for group_name, group in df.groupby('playlist_name'):
    # Subset data
    subset = group.loc[:, "danceability":"duration_ms"]

    # Scale the specified columns
    scaled_columns = scaler.fit_transform(subset)

    # Calculate pairwise euclidean distances between cluster centroids
    distances = pairwise_distances(scaled_columns)

    # Calculate mean pairwise distance
    mean_distance = distances.mean()

    # get all info
    nb_artists = len(group['artist_name'].unique())
    hh_index = hhi[group_name]
    hh_index_2 = hhi_2[group_name]
    playlist_followers = group['playlist_followers'].iloc[0]
    track_popularity = group['popularity_track'].mean()
    artist_popularity = group['popularity_artist'].mean()
    nb_tracks = len(group)
    nb_clusters = optimal_num_k[group_name]
    nb_clusters_2 = optimal_num_k_2[group_name]
    

    results_df = results_df.append({'playlist_name': group_name, 'playlist_followers': playlist_followers,
                                    'track_pop': track_popularity, 'artist_pop': artist_popularity,
                                    'nb_tracks': nb_tracks, 'nb_artists': nb_artists, 'nb_dimensions':n_components, 
                                    'nb_clusters': nb_clusters, 'nb_clusters_2': nb_clusters_2,
                                    'hhi': hh_index, 'hhi_2': hh_index_2, 'mean_distance': mean_distance},
                                   ignore_index=True)


# print the results dataframe
print(results_df)

Parse final results into excel

In [None]:
results_df.to_excel("df_detente_final_2.xlsx", index=False)


Robustness checks

In [45]:
df = pd.read_excel('df_electro.xlsx')

# Group by playlist name
groups = df.groupby('playlist_name')


In [46]:
# Initialize scaler
scaler = MinMaxScaler()

# Initialize dictionaries to store clustering results
reduced_data_dict_5 = {}
reduced_data_dict_4 = {}

# Loop over each group and apply PCA with the specified number of components
for group_name, group in groups:
    # Subset data
    subset = group.loc[:, "danceability":"duration_ms"]

    scaled_columns = scaler.fit_transform(subset)

    # Apply PCA with 5 dimensions
    pca_5 = PCA(n_components=5)
    reduced_data_5 = pca_5.fit_transform(scaled_columns)

    # Apply PCA with 4 dimensions
    pca_4 = PCA(n_components=4)
    reduced_data_4 = pca_4.fit_transform(scaled_columns)

    reduced_data_dict_5[group_name] = reduced_data_5
    reduced_data_dict_4[group_name] = reduced_data_4
    

In [47]:
def compute_gap(data, k):
  """
  Compute the gap statistic for a given value of k.

  Parameters:
  - data: the data to cluster, with shape (n_samples, n_features)
  - k: the number of clusters

  Returns:
  - gap: the gap statistic for the given value of k
  """
  # Compute the WCSS for the real data
  kmeans = KMeans(n_clusters=k, random_state=42).fit(data)
  wcss = kmeans.inertia_

  # Compute the null reference distribution by shuffling the data and
  # re-assigning it to clusters
  n_samples, n_features = data.shape
  wcss_null = []

  for _ in range(20):
    data_shuffled = np.random.permutation(data)
    wcss_null.append(KMeans(n_clusters=k).fit(data_shuffled).inertia_)
  
  wcss_null = np.array(wcss_null)
  
  # Compute the gap statistic and gap*
  gap = np.log(np.mean(wcss_null)) - np.log(wcss)

  # Compute the standard deviation of the null reference distribution
  gap_std = np.std(np.log(wcss_null))

  return gap, gap_std

In [48]:
# Loop over each group and generate scree plot
optimal_num_k_pca_5 = {}
optimal_num_k_2_pca_5 = {}

optimal_num_k_pca_4 = {}
optimal_num_k_2_pca_4 = {}

for group_name, group in groups:
    # Initialize lists to store the gap statistics and error bars for different values of k
    group_gaps_5 = []
    group_errors_5 = []

    group_gaps_4 = []
    group_errors_4 = []

    # Loop over different values of k
    for k in range(1, 11):
        reduced_data_5 = reduced_data_dict_5[group_name]
        reduced_data_4 = reduced_data_dict_4[group_name]

        # Compute the gap statistic and standard deviation for the current value of k
        gap_5, gap_std_5 = compute_gap(reduced_data_5, k)
        group_gaps_5.append(gap_5)
        group_errors_5.append(gap_std_5)

        gap_4, gap_std_4 = compute_gap(reduced_data_4, k)
        group_gaps_4.append(gap_4)
        group_errors_4.append(gap_std_4)

    # Find the optimal number of components based on the gap statistic criterion
    optimal_k_5 = None
    for i in range(1, len(group_gaps_5) - 1):
        s_k = group_errors_5[i]
        threshold = s_k * np.sqrt(1 + 1 / 20)
        if group_gaps_5[i] >= group_gaps_5[i + 1] - threshold:
            optimal_k_5 = i + 1
            break

    if optimal_k_5 is None:
        optimal_k_5 = np.argmax(group_gaps_5) + 1

    optimal_num_k_pca_5[group_name] = optimal_k_5

    optimal_k_4 = None
    for i in range(1, len(group_gaps_4) - 1):
        s_k = group_errors_4[i]
        threshold = s_k * np.sqrt(1 + 1 / 20)
        if group_gaps_4[i] >= group_gaps_4[i + 1] - threshold:
            optimal_k_4 = i + 1
            break

    if optimal_k_4 is None:
        optimal_k_4 = np.argmax(group_gaps_4) + 1

    optimal_num_k_pca_5[group_name] = optimal_k_5
    optimal_num_k_pca_4[group_name] = optimal_k_4

    # Find the optimal number of components based on the gap* statistic criterion
    optimal_k_2_5 = np.argmax(group_gaps_5) + 1
    optimal_k_2_4 = np.argmax(group_gaps_4) + 1

    optimal_num_k_2_pca_5[group_name] = optimal_k_2_5
    optimal_num_k_2_pca_4[group_name] = optimal_k_2_4

    print(f"Group: {group_name}, Optimal k 5 (gap): {optimal_num_k_pca_5[group_name]}, Optimal k 4 (gap*): {optimal_num_k_2_pca_5[group_name]} and Optimal k 4 (gap): {optimal_num_k_pca_4[group_name]}, Optimal k 4 (gap*): {optimal_num_k_2_pca_4[group_name]}")

Group: 80s Dance Hits, Optimal k 5 (gap): 2, Optimal k 4 (gap*): 9 and Optimal k 4 (gap): 2, Optimal k 4 (gap*): 10
Group: 90s Dance Party, Optimal k 5 (gap): 2, Optimal k 4 (gap*): 6 and Optimal k 4 (gap): 3, Optimal k 4 (gap*): 9
Group: AMAPIANO grooves, Optimal k 5 (gap): 2, Optimal k 4 (gap*): 9 and Optimal k 4 (gap): 2, Optimal k 4 (gap*): 8
Group: Altar, Optimal k 5 (gap): 2, Optimal k 4 (gap*): 7 and Optimal k 4 (gap): 2, Optimal k 4 (gap*): 7
Group: Banger, Optimal k 5 (gap): 2, Optimal k 4 (gap*): 9 and Optimal k 4 (gap): 2, Optimal k 4 (gap*): 6
Group: Big Room Dance, Optimal k 5 (gap): 3, Optimal k 4 (gap*): 3 and Optimal k 4 (gap): 3, Optimal k 4 (gap*): 10
Group: Chill Tracks, Optimal k 5 (gap): 3, Optimal k 4 (gap*): 10 and Optimal k 4 (gap): 3, Optimal k 4 (gap*): 8
Group: Dance Classics, Optimal k 5 (gap): 2, Optimal k 4 (gap*): 7 and Optimal k 4 (gap): 4, Optimal k 4 (gap*): 8
Group: Dance Hits, Optimal k 5 (gap): 2, Optimal k 4 (gap*): 8 and Optimal k 4 (gap): 2, Opti

In [49]:
results_df = pd.DataFrame()

# Loop over each group and apply PCA with the specified number of components
for group_name, group in groups:
    # Get reduced data
    reduced_data_5 = reduced_data_dict_5[group_name]
    reduced_data_4 = reduced_data_dict_4[group_name]

    # Perform k-means clustering for both configurations
    kmeans_5 = KMeans(n_clusters=optimal_num_k_pca_5[group_name], random_state=42).fit(reduced_data_5)
    kmeans_2_5 = KMeans(n_clusters=optimal_num_k_2_pca_5[group_name], random_state=42).fit(reduced_data_5)

    kmeans_4 = KMeans(n_clusters=optimal_num_k_pca_4[group_name], random_state=42).fit(reduced_data_4)
    kmeans_2_4 = KMeans(n_clusters=optimal_num_k_2_pca_4[group_name], random_state=42).fit(reduced_data_4)

    # Get cluster assignments for both configurations
    cluster_assignments_5 = kmeans_5.labels_
    cluster_assignments_2_5 = kmeans_2_5.labels_

    cluster_assignments_4 = kmeans_4.labels_
    cluster_assignments_2_4 = kmeans_2_4.labels_

    # Compute silhouette scores for both configurations
    silhouette_score_5 = silhouette_score(reduced_data_5, cluster_assignments_5)
    silhouette_score_2_5 = silhouette_score(reduced_data_5, cluster_assignments_2_5)

    silhouette_score_4 = silhouette_score(reduced_data_4, cluster_assignments_4)
    silhouette_score_2_4 = silhouette_score(reduced_data_4, cluster_assignments_2_4)

    results_df = results_df.append({
        'playlist_name': group_name,
        'nb_cluster_5': optimal_num_k_pca_5[group_name],
        'nb_cluster_5_2': optimal_num_k_2_pca_5[group_name],
        'silhouette_score_5': silhouette_score_5,
        'silhouette_score_5_2': silhouette_score_2_5,
        'nb_cluster_4': optimal_num_k_pca_4[group_name],
        'nb_cluster_4_2': optimal_num_k_2_pca_4[group_name],
        'silhouette_score_4': silhouette_score_4,
        'silhouette_score_4_2': silhouette_score_2_4
    }, ignore_index=True)

print(results_df)


  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = resul

              playlist_name  nb_cluster_5  nb_cluster_5_2  silhouette_score_5  \
0            80s Dance Hits             2               9            0.215823   
1           90s Dance Party             2               6            0.333600   
2          AMAPIANO grooves             2               9            0.194626   
3                     Altar             2               7            0.370072   
4                    Banger             2               9            0.290572   
5            Big Room Dance             3               3            0.304428   
6              Chill Tracks             3              10            0.301699   
7            Dance Classics             2               7            0.383150   
8                Dance Hits             2               8            0.297221   
9          Dance Hits 2000s             4              10            0.289556   
10         Dance Hits 2010s             2               9            0.195942   
11       Dance Hits of 2010 

  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({


In [50]:
results_df.to_excel('robust_electro.xlsx')
