# Data Management

## Setup

In [1]:
# Import modules
import ast
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from collections import Counter

from scipy.spatial.distance import cdist

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances

from tqdm import tqdm


In [2]:
# Load the raw track-level playlist data
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
raw_tracks_path = '/Users/julienmbarki/Documents/Doctorat/Publications/Article 2/Data/Code/data_management/db_soundcharts/major_tracks_info_22-23.csv'
df = pd.read_csv(raw_tracks_path)


## Clean data

In [3]:
# Collapse the playlist snapshots to one observation per week
# Parse the snapshot date and derive its weekday (Monday = 0, ..., Sunday = 6)
df['collection_date'] = pd.to_datetime(df['collection_date'])
df['weekday'] = df['collection_date'].dt.weekday

# Keep the Friday snapshots only (weekday == 4); the helper column is not
# needed in the weekly frame, so it is dropped from the filtered result
is_friday = df['weekday'] == 4
df_weekly = df.loc[is_friday].drop(columns=['weekday'])

# Show the weekly data
display(df_weekly)


Unnamed: 0,playlist_id,playlist_name,collection_date,playlist_type,track_position,track_name,track_uuid,collection_date_clean,first_appearance_date,artist_name,artist_id,artist_first_track,artist_isnew,label_name,label_type,track_date,track_features,isrc,track_genres
973,11e84480-ad44-deb2-8ac8-a0369fe50396,"Rap Fr | Rap Francais 2024 | Hits Rap | SDM, ...",2022-10-07 00:00:00+00:00,Major label,1,Balader,ac2caabe-ee53-45e0-b70a-02ec68cf7b03,2022-10-07 00:00:00+00:00,2022-10-01 00:00:00+00:00,"['Soolking', 'Niska']","['11e81bba-d00e-f2d2-a406-a0369fe50396', '11e8...","['2016-04-13T00:00:00+00:00', '2013-11-12T00:0...","[False, False]","['Hyper Focal', 'Affranchis']","['Other independent labels: ""Indie""', 'Other i...",2022-05-25T00:00:00+00:00,"{'acousticness': 0.32, 'danceability': 0.77, '...",FR,['hip-hop & rap']
974,11e84480-ad44-deb2-8ac8-a0369fe50396,"Rap Fr | Rap Francais 2024 | Hits Rap | SDM, ...",2022-10-07 00:00:00+00:00,Major label,2,DIE,0ec15f62-cb25-4377-a641-d74931272f3d,2022-10-07 00:00:00+00:00,2022-10-01 00:00:00+00:00,['Gazo'],['59a6ff7c-c650-11e8-9647-549f35141000'],['1900-01-01T00:00:00+00:00'],[False],"['Epic', 'BSB Productions']","['Sony', 'Self released']",2022-07-01T00:00:00+00:00,"{'acousticness': 0.23, 'danceability': 0.7, 'e...",FR,['hip-hop & rap']
975,11e84480-ad44-deb2-8ac8-a0369fe50396,"Rap Fr | Rap Francais 2024 | Hits Rap | SDM, ...",2022-10-07 00:00:00+00:00,Major label,3,"Plus belle la vie, plus belle la mort",d7e50ec4-656a-484f-b7b0-e17417e2df79,2022-10-07 00:00:00+00:00,2022-10-02 00:00:00+00:00,"['Dosseh', 'Tiakola']","['11e81bbe-a3cb-bb5c-841c-a0369fe50396', '3498...","['2024-12-18T14:42:20+00:00', '2019-04-26T00:0...","[True, False]",['SPKTAQLR'],"['Other independent labels: ""Indie""']",2022-09-28T00:00:00+00:00,"{'acousticness': 0.34, 'danceability': 0.75, '...",FR,['hip-hop & rap']
976,11e84480-ad44-deb2-8ac8-a0369fe50396,"Rap Fr | Rap Francais 2024 | Hits Rap | SDM, ...",2022-10-07 00:00:00+00:00,Major label,4,FADE UP,74e54b31-a373-4cc4-8cc9-0d52adc7ef97,2022-10-07 00:00:00+00:00,2022-10-01 00:00:00+00:00,"['Hamza', 'SCH', 'Zeg P']","['11e81bbc-db5c-de54-b94b-a0369fe50396', '11e8...","['2001-10-09T00:00:00+00:00', '2000-01-01T00:0...","[False, False, True]","['All Points', 'On Da Trax']","['Believe', 'Other independent labels: ""Indie""']",2022-06-24T00:00:00+00:00,"{'acousticness': 0.57, 'danceability': 0.76, '...",FR,['hip-hop & rap']
977,11e84480-ad44-deb2-8ac8-a0369fe50396,"Rap Fr | Rap Francais 2024 | Hits Rap | SDM, ...",2022-10-07 00:00:00+00:00,Major label,5,Quand j'y repense,19392a13-9b64-4796-906a-1355d408d337,2022-10-07 00:00:00+00:00,2022-10-02 00:00:00+00:00,"['Mig', 'Tiakola']","['11e81bbf-8634-19e4-9e14-a0369fe50396', '3498...","['2019-11-28T00:00:00+00:00', '2019-04-26T00:0...","[False, False]","['Royal Music', 'Adebizik']","['unknown', 'unknown']",2022-09-27T00:00:00+00:00,"{'acousticness': 0.65, 'danceability': 0.58, '...",FR,['hip-hop & rap']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2773569,11e84480-ad6e-2cf4-a2cd-a0369fe50396,"Été 2024 ☀️ Piscine, soleil & fête | Summer hi...",2023-09-29 00:00:00+00:00,Major label,126,Wellerman,8aeff98a-2d97-4002-b6bb-656694c06402,2023-09-29 00:00:00+00:00,2022-10-01 00:00:00+00:00,"['220 KID', 'Billen Ted', 'Nathan Evans']","['662c18ec-f52e-11e8-800f-549f35141000', 'c86c...","['2018-04-26T00:00:00+00:00', '2020-11-04T00:0...","[False, True, False]",['Polydor'],['Universal'],2021-01-21T00:00:00+00:00,"{'acousticness': 0.04, 'danceability': 0.72, '...",GB,['electronic']
2773570,11e84480-ad6e-2cf4-a2cd-a0369fe50396,"Été 2024 ☀️ Piscine, soleil & fête | Summer hi...",2023-09-29 00:00:00+00:00,Major label,127,Moula max,f068ce2e-1fee-4b5e-9eb8-842e0dec5fc5,2023-09-29 00:00:00+00:00,2022-10-01 00:00:00+00:00,"['L’Algérino', ""Heuss L'enfoiré""]","['11e81bc4-2d6b-e288-ad9b-a0369fe50396', '11e8...","['2024-12-18T14:49:58+00:00', '1820-06-15T00:0...","[True, False]",['Only Pro'],"['Other independent labels: ""Indie""']",2020-07-01T00:00:00+00:00,"{'acousticness': 0.06, 'danceability': 0.82, '...",FR,['hip-hop & rap']
2773571,11e84480-ad6e-2cf4-a2cd-a0369fe50396,"Été 2024 ☀️ Piscine, soleil & fête | Summer hi...",2023-09-29 00:00:00+00:00,Major label,128,VIDA LOCA,f8a47e5c-efa8-4c7e-b697-638a4a4cd99b,2023-09-29 00:00:00+00:00,2022-10-01 00:00:00+00:00,"['Nicky Jam', 'Tyga', 'The Black Eyed Peas']","['11e81bc7-b8eb-c74e-b743-a0369fe50396', '11e8...","['No valid track date found', 'No valid track ...","[False, False, False]","['Epic', 'BEP Music']","['Sony', 'unknown']",2020-06-18T00:00:00+00:00,"{'acousticness': 0.02, 'danceability': 0.89, '...",US,"['latin', 'pop']"
2773572,11e84480-ad6e-2cf4-a2cd-a0369fe50396,"Été 2024 ☀️ Piscine, soleil & fête | Summer hi...",2023-09-29 00:00:00+00:00,Major label,129,ロコ・コンティーゴ,ff5b9dd5-9852-45c2-8fe8-570db43ed4a0,2023-09-29 00:00:00+00:00,2022-10-01 00:00:00+00:00,"['DJ Snake', 'J Balvin', 'Tyga']","['11e81bc3-b108-2dd2-9858-a0369fe50396', '11e8...","['2005-01-17T00:00:00+00:00', '1999-08-23T00:0...","[False, False, False]","['DJ Snake Music Productions', 'Interscpe']","['Self released', 'unknown']",2019-07-25T00:00:00+00:00,"{'acousticness': 0.28, 'danceability': 0.69, '...",US,"['electronic', 'hip-hop & rap']"


In [4]:
# Normalize track_features
# Convert track features to dict
def string_to_list(s):
    """Parse a string holding a Python literal and return the resulting object.

    The value is evaluated with ``ast.literal_eval``. Despite the function's
    name, the result is whatever literal the string encodes (a dict, a list,
    a number, ...).

    Returns:
        The parsed object, or an empty list when ``ast.literal_eval`` raises
        ``ValueError`` or ``SyntaxError``.
    """
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError):
        # Unparseable input is mapped to an empty list instead of failing
        return []
    else:
        return parsed

# Parse the stringified feature records into dicts.
# Values that are already dicts are passed through unchanged so that this cell
# can be re-run safely: string_to_list() falls back to [] for anything
# ast.literal_eval rejects, and dict([]) would otherwise replace every
# previously parsed record with an empty one.
df_weekly['track_features'] = df_weekly['track_features'].apply(
    lambda value: value if isinstance(value, dict) else dict(string_to_list(value))
)

# Normalize features: one column per feature key
df_features = pd.json_normalize(df_weekly['track_features'])
print(df_features)

# Concatenate the new columns back to the original DataFrame.
# Both indexes are reset so that rows are aligned by position, and the raw
# feature column is dropped because it is now redundant.
df_final = pd.concat(
    [df_weekly.reset_index(drop=True), df_features.reset_index(drop=True)],
    axis=1
).drop(columns='track_features')

# Print and export data
df_final.to_csv('major_tracks_info_22-23_weekly_normalized.csv', index=False)
display(df_final)


        acousticness  danceability  energy  instrumentalness   key  liveness  \
0               0.32          0.77    0.78              0.00   4.0      0.12   
1               0.23          0.70    0.63              0.00   8.0      0.12   
2               0.34          0.75    0.75              0.00   2.0      0.23   
3               0.57          0.76    0.60              0.00   4.0      0.13   
4               0.65          0.58    0.69              0.00   4.0      0.14   
...              ...           ...     ...               ...   ...       ...   
395161          0.04          0.72    0.89              0.00   0.0      0.07   
395162          0.06          0.82    0.80              0.00   8.0      0.14   
395163          0.02          0.89    0.62              0.00  10.0      0.33   
395164          0.28          0.69    0.76              0.00  11.0      0.10   
395165          0.09          0.58    0.88              0.21  10.0      0.08   

        loudness  mode  speechiness   t

Unnamed: 0,playlist_id,playlist_name,collection_date,playlist_type,track_position,track_name,track_uuid,collection_date_clean,first_appearance_date,artist_name,...,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,timeSignature,valence
0,11e84480-ad44-deb2-8ac8-a0369fe50396,"Rap Fr | Rap Francais 2024 | Hits Rap | SDM, ...",2022-10-07 00:00:00+00:00,Major label,1,Balader,ac2caabe-ee53-45e0-b70a-02ec68cf7b03,2022-10-07 00:00:00+00:00,2022-10-01 00:00:00+00:00,"['Soolking', 'Niska']",...,0.78,0.00,4.0,0.12,-5.77,0.0,0.07,86.68,5.0,0.83
1,11e84480-ad44-deb2-8ac8-a0369fe50396,"Rap Fr | Rap Francais 2024 | Hits Rap | SDM, ...",2022-10-07 00:00:00+00:00,Major label,2,DIE,0ec15f62-cb25-4377-a641-d74931272f3d,2022-10-07 00:00:00+00:00,2022-10-01 00:00:00+00:00,['Gazo'],...,0.63,0.00,8.0,0.12,-7.16,0.0,0.04,130.97,4.0,0.55
2,11e84480-ad44-deb2-8ac8-a0369fe50396,"Rap Fr | Rap Francais 2024 | Hits Rap | SDM, ...",2022-10-07 00:00:00+00:00,Major label,3,"Plus belle la vie, plus belle la mort",d7e50ec4-656a-484f-b7b0-e17417e2df79,2022-10-07 00:00:00+00:00,2022-10-02 00:00:00+00:00,"['Dosseh', 'Tiakola']",...,0.75,0.00,2.0,0.23,-5.97,0.0,0.04,161.95,4.0,0.56
3,11e84480-ad44-deb2-8ac8-a0369fe50396,"Rap Fr | Rap Francais 2024 | Hits Rap | SDM, ...",2022-10-07 00:00:00+00:00,Major label,4,FADE UP,74e54b31-a373-4cc4-8cc9-0d52adc7ef97,2022-10-07 00:00:00+00:00,2022-10-01 00:00:00+00:00,"['Hamza', 'SCH', 'Zeg P']",...,0.60,0.00,4.0,0.13,-5.99,0.0,0.09,129.96,4.0,0.56
4,11e84480-ad44-deb2-8ac8-a0369fe50396,"Rap Fr | Rap Francais 2024 | Hits Rap | SDM, ...",2022-10-07 00:00:00+00:00,Major label,5,Quand j'y repense,19392a13-9b64-4796-906a-1355d408d337,2022-10-07 00:00:00+00:00,2022-10-02 00:00:00+00:00,"['Mig', 'Tiakola']",...,0.69,0.00,4.0,0.14,-5.90,0.0,0.22,139.94,4.0,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395161,11e84480-ad6e-2cf4-a2cd-a0369fe50396,"Été 2024 ☀️ Piscine, soleil & fête | Summer hi...",2023-09-29 00:00:00+00:00,Major label,126,Wellerman,8aeff98a-2d97-4002-b6bb-656694c06402,2023-09-29 00:00:00+00:00,2022-10-01 00:00:00+00:00,"['220 KID', 'Billen Ted', 'Nathan Evans']",...,0.89,0.00,0.0,0.07,-3.26,0.0,0.05,119.93,4.0,0.44
395162,11e84480-ad6e-2cf4-a2cd-a0369fe50396,"Été 2024 ☀️ Piscine, soleil & fête | Summer hi...",2023-09-29 00:00:00+00:00,Major label,127,Moula max,f068ce2e-1fee-4b5e-9eb8-842e0dec5fc5,2023-09-29 00:00:00+00:00,2022-10-01 00:00:00+00:00,"['L’Algérino', ""Heuss L'enfoiré""]",...,0.80,0.00,8.0,0.14,-3.73,0.0,0.09,128.09,4.0,0.76
395163,11e84480-ad6e-2cf4-a2cd-a0369fe50396,"Été 2024 ☀️ Piscine, soleil & fête | Summer hi...",2023-09-29 00:00:00+00:00,Major label,128,VIDA LOCA,f8a47e5c-efa8-4c7e-b697-638a4a4cd99b,2023-09-29 00:00:00+00:00,2022-10-01 00:00:00+00:00,"['Nicky Jam', 'Tyga', 'The Black Eyed Peas']",...,0.62,0.00,10.0,0.33,-5.71,0.0,0.05,127.98,4.0,0.77
395164,11e84480-ad6e-2cf4-a2cd-a0369fe50396,"Été 2024 ☀️ Piscine, soleil & fête | Summer hi...",2023-09-29 00:00:00+00:00,Major label,129,ロコ・コンティーゴ,ff5b9dd5-9852-45c2-8fe8-570db43ed4a0,2023-09-29 00:00:00+00:00,2022-10-01 00:00:00+00:00,"['DJ Snake', 'J Balvin', 'Tyga']",...,0.76,0.00,11.0,0.10,-2.99,1.0,0.22,192.09,4.0,0.37


## PCA dimensionality reduction

In [5]:
# Audio features used for the PCA and clustering steps
selected_columns = [
    "danceability", "energy", "loudness", "speechiness",
    "acousticness", "instrumentalness", "liveness", "valence",
    "tempo"
]

# Load the normalized weekly data and drop tracks missing any selected feature
df = pd.read_csv('major_tracks_info_22-23_weekly_normalized.csv').dropna(subset=selected_columns)
print(len(df))

# One group per playlist snapshot (playlist name x collection date)
snapshot_keys = ['playlist_name', 'collection_date']
groups = df.groupby(snapshot_keys, as_index=False)


358489


In [6]:
# Select the number of components
# For every playlist snapshot, fit a full PCA on the min-max scaled features
# and record the smallest number of components whose cumulative explained
# variance ratio reaches 0.8.
scaler = MinMaxScaler()

selected_columns = [
    "danceability", "energy", "loudness", "speechiness",
    "acousticness", "instrumentalness", "liveness", "valence",
    "tempo"
]

optimal_num_components = []

for group_name, group in groups:
    # Scale the feature columns of this snapshot (the scaler is refit per group)
    features_scaled = scaler.fit_transform(group[selected_columns])

    # Fit a PCA that keeps every component
    pca = PCA().fit(features_scaled)

    # First position where the cumulative variance reaches the threshold;
    # positions are 0-based, hence the + 1
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    optimal_components = 1 + next(
        position
        for position, explained in enumerate(cumulative_variance)
        if explained >= 0.8
    )
    optimal_num_components.append(optimal_components)
    print(group_name, optimal_components)

    # Scree plot (disabled)
    # plt.plot(range(1, pca.n_components_ + 1), cumulative_variance, 'bo-', linewidth=2)
    # plt.xlabel('Number of Components')
    # plt.ylabel('Cumulative Explained Variance Ratio')
    # plt.title(group_name)
    # plt.show()

# Most frequent per-snapshot component count
most_common_optimal = Counter(optimal_num_components).most_common(1)
most_modal_value, _modal_count = most_common_optimal[0]

print("Most modal value of optimal components:", most_modal_value)


('#NUITVIE', '2022-10-07 00:00:00+00:00') 5
('#NUITVIE', '2022-10-14 00:00:00+00:00') 5
('#NUITVIE', '2022-10-21 00:00:00+00:00') 5
('#NUITVIE', '2022-10-28 00:00:00+00:00') 5
('#NUITVIE', '2022-11-04 00:00:00+00:00') 5
('#NUITVIE', '2022-11-11 00:00:00+00:00') 5
('#NUITVIE', '2022-11-18 00:00:00+00:00') 5
('#NUITVIE', '2022-11-25 00:00:00+00:00') 5
('#NUITVIE', '2022-12-02 00:00:00+00:00') 5
('#NUITVIE', '2022-12-09 00:00:00+00:00') 5
('#NUITVIE', '2022-12-16 00:00:00+00:00') 5
('#NUITVIE', '2022-12-23 00:00:00+00:00') 5
('#NUITVIE', '2022-12-30 00:00:00+00:00') 5
('#NUITVIE', '2023-01-06 00:00:00+00:00') 5
('#NUITVIE', '2023-01-13 00:00:00+00:00') 5
('#NUITVIE', '2023-01-20 00:00:00+00:00') 5
('#NUITVIE', '2023-01-27 00:00:00+00:00') 5
('#NUITVIE', '2023-02-03 00:00:00+00:00') 5
('#NUITVIE', '2023-02-10 00:00:00+00:00') 5
('#NUITVIE', '2023-02-17 00:00:00+00:00') 5
('#NUITVIE', '2023-02-24 00:00:00+00:00') 5
('#NUITVIE', '2023-03-03 00:00:00+00:00') 5
('#NUITVIE', '2023-03-10 00:00:0

In [7]:
# Apply PCA to reduce dimensionality
# Every playlist snapshot is scaled and projected on a fixed number of
# principal components; the projections are stored per snapshot.
scaler = MinMaxScaler()

selected_columns = [
    "danceability", "energy", "loudness", "speechiness",
    "acousticness", "instrumentalness", "liveness", "valence",
    "tempo"
]

# Number of components kept for every snapshot
# NOTE(review): hard-coded; presumably the modal value found in the previous cell -- confirm.
n_components = 5

reduced_data_dict = {}

for group_name, group in groups:
    # Min-max scale the feature columns of this snapshot
    scaled_columns = scaler.fit_transform(group[selected_columns])

    # Project the scaled features on the principal components
    pca = PCA(n_components=n_components)
    reduced_data_dict[group_name] = pca.fit_transform(scaled_columns)


## Clustering

### Elbow method

In [None]:
# Apply the elbow method to determine the optimal number of clusters
# For every playlist snapshot, fit KMeans for k = 1..10 on the PCA-reduced
# data and keep the within-cluster sum of squares of each fit.
cluster_range = range(1, 11)
wcss = {}

for group_name, group in groups:
    reduced_data = reduced_data_dict[group_name]
    wcss[group_name] = [
        KMeans(n_clusters=k, random_state=42).fit(reduced_data).inertia_
        for k in cluster_range
    ]

# Plot the elbow curve, one figure per snapshot
for group_name, values in wcss.items():
    plt.plot(cluster_range, values, label=group_name)
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Within-cluster sum of squares (WCSS)')
    plt.legend()
    plt.show()


### Gap statistic method

In [8]:
# Seed Python's and NumPy's global random number generators with the same value
SEED = 42
random.seed(SEED)
np.random.seed(SEED)


In [9]:
def compute_gap(data, k, n_refs=20):
    """Compute the gap statistic of a KMeans clustering with ``k`` clusters.

    The gap compares the log within-cluster sum of squares (WCSS) of ``data``
    with its expectation under a null reference distribution without cluster
    structure. Reference data sets are drawn uniformly over the bounding box
    of the observed features. Merely permuting the rows of ``data`` would
    leave the point set unchanged, so it cannot serve as a null reference.

    Args:
        data: 2-D array-like of shape (n_samples, n_features).
        k: Number of clusters.
        n_refs: Number of reference data sets to draw (default 20).

    Returns:
        Tuple ``(gap, gap_std)``: the gap statistic and the standard
        deviation of the log-WCSS over the reference data sets.

    Note:
        Reference sets are drawn from NumPy's global random generator, so
        results are reproducible only if ``np.random.seed`` was called.
        A WCSS of exactly 0 makes the logarithm infinite.
    """
    data = np.asarray(data)
    n_samples, n_features = data.shape

    # Compute the WCSS for the real data
    kmeans = KMeans(n_clusters=k, random_state=42).fit(data)
    wcss = kmeans.inertia_

    # Bounding box of the observed data, one (min, max) pair per feature
    lower = data.min(axis=0)
    upper = data.max(axis=0)

    # Compute the null reference distribution: cluster uniformly distributed
    # data sets of the same shape and record their WCSS
    wcss_null = np.empty(n_refs)
    for ref_idx in range(n_refs):
        reference = np.random.uniform(lower, upper, size=(n_samples, n_features))
        wcss_null[ref_idx] = KMeans(n_clusters=k, random_state=42).fit(reference).inertia_

    log_wcss_null = np.log(wcss_null)

    # Gap statistic: expected log-WCSS under the null minus the observed one
    gap = np.mean(log_wcss_null) - np.log(wcss)

    # Standard deviation of the null reference distribution
    gap_std = np.std(log_wcss_null)

    return gap, gap_std


In [10]:
# Calculate the optimal k
# For every playlist snapshot, compute the gap statistic for k = 1..10 and
# derive two candidate numbers of clusters:
#   - optimal_num_k   (gap):  first k with Gap(k) >= Gap(k+1) - s_{k+1}
#   - optimal_num_k_2 (gap*): the k with the largest gap statistic
optimal_num_k = {}
optimal_num_k_2 = {}

for group_name, group in tqdm(groups):
    # The reduced data does not depend on k, so look it up once per group
    reduced_data = reduced_data_dict[group_name]

    group_gaps = []
    group_errors = []

    for k in range(1, 11):
        # Compute the gap statistic and standard deviation for the current value of k
        gap, gap_std = compute_gap(reduced_data, k)
        group_gaps.append(gap)
        group_errors.append(gap_std)

    # k that maximizes the gap statistic (list position i holds k = i + 1)
    optimal_k_2 = int(np.argmax(group_gaps)) + 1

    # Find the optimal number of clusters based on the gap statistic criterion.
    # The standard error must be the one of k + 1 (position i + 1), inflated by
    # sqrt(1 + 1/B), with B = 20 reference data sets drawn in compute_gap.
    # NOTE(review): the search starts at position 1, so k = 1 is never
    # selected by this rule -- confirm that this is intended.
    optimal_k = None
    for i in range(1, len(group_gaps) - 1):
        s_next = group_errors[i + 1] * np.sqrt(1 + 1 / 20)
        if group_gaps[i] >= group_gaps[i + 1] - s_next:
            optimal_k = i + 1
            break

    # No k satisfied the rule: fall back to the maximizer of the gap statistic
    if optimal_k is None:
        optimal_k = optimal_k_2

    optimal_num_k[group_name] = optimal_k
    optimal_num_k_2[group_name] = optimal_k_2

    # Plot the gap statistics (disabled)
    # plt.plot(range(1, 11), group_gaps, label=group_name)
    #
    # # Set labels and display the plot
    # plt.xlabel('Number of clusters (k)')
    # plt.ylabel('Gap statistic')
    # plt.legend()
    # plt.show()
    #
    # print(f"Group: {group_name}, Optimal k (gap): {optimal_num_k[group_name]}, Optimal k (gap*): {optimal_num_k_2[group_name]}")


  0%|          | 3/3172 [00:19<5:41:21,  6.46s/it]


KeyboardInterrupt: 

### Apply KMeans

In [None]:
# Apply KMeans with optimal number of clusters
# Each playlist snapshot is clustered twice on its PCA-reduced data: once with
# the k from the gap criterion and once with the k from the gap* criterion.
# The fitted models and labels are kept per snapshot, and the labels are
# attached to the track rows as the columns `cluster` and `cluster_2`.
kmeans_dict = {}
kmeans_2_dict = {}
cluster_labels_dict = {}
cluster_labels_2_dict = {}

# Labelled groups are collected here and concatenated once after the loop;
# concatenating inside the loop copies the growing frame on every iteration.
labelled_groups = []

for group_name, group in groups:
    reduced_data = reduced_data_dict[group_name]

    k = optimal_num_k[group_name]
    k_2 = optimal_num_k_2[group_name]

    # Apply KMeans with optimal number of clusters
    kmeans = KMeans(n_clusters=k, random_state=42).fit(reduced_data)
    kmeans_2 = KMeans(n_clusters=k_2, random_state=42).fit(reduced_data)

    kmeans_dict[group_name] = kmeans
    kmeans_2_dict[group_name] = kmeans_2

    cluster_labels_dict[group_name] = kmeans.labels_
    cluster_labels_2_dict[group_name] = kmeans_2.labels_

    print(f"Group {group_name}: {kmeans.labels_}")

    # Assign cluster labels to original group data
    labelled_groups.append(
        group.assign(cluster=kmeans.labels_, cluster_2=kmeans_2.labels_)
    )

    # Plot clusters (disabled)
    # plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=kmeans.labels_, cmap='viridis')
    # plt.xlabel('PC1')
    # plt.ylabel('PC2')
    # plt.title(group_name)
    # plt.show()

# Rebuild the track-level dataframe from the labelled groups
df = pd.concat(labelled_groups) if labelled_groups else pd.DataFrame()
display(df)


## Variables creation

### HH-Index

In [12]:
# Re-group the current `df` by playlist snapshot (playlist name x collection date)
snapshot_keys = ['playlist_name', 'collection_date']
groups = df.groupby(snapshot_keys, as_index=False)


In [None]:
# Calculate the cluster shares and the Herfindahl-Hirschman index (HHI)
# For each playlist snapshot: shares are the percentages of tracks per cluster,
# and the HHI is the sum of the squared percentage shares, computed for both
# cluster assignments (`cluster` and `cluster_2`).
shares_perc = {}
hhi = {}
hhi_2 = {}

for group_name, group in groups:
    shares = group['cluster'].value_counts(normalize=True)
    shares_2 = group['cluster_2'].value_counts(normalize=True)

    shares_perc[group_name] = shares * 100

    hhi[group_name] = sum((shares * 100) ** 2)
    hhi_2[group_name] = sum((shares_2 * 100) ** 2)

print(hhi)
print(hhi_2)

# Summary statistics of the HHI based on `cluster`
hhi_values = list(hhi.values())

print("Minimum HHI: ", min(hhi_values))
print("Mean HHI: ", np.mean(hhi_values))
print("Median HHI: ", np.median(hhi_values))
print("Maximum HHI: ", max(hhi_values))
print("Standard deviation: ", np.std(hhi_values))


### Euclidean distances

In [None]:
# Calculate distances between cluster centroids
# For each playlist snapshot, average the euclidean distances between every
# pair of distinct cluster centroids, for both clusterings.
distances_dict = {}
distances_dict_2 = {}

for group_name, group in groups:
    # Get the cluster centroids for the group
    cluster_centroids = kmeans_dict[group_name].cluster_centers_
    cluster_centroids_2 = kmeans_2_dict[group_name].cluster_centers_

    # Calculate pairwise euclidean distances between cluster centroids
    distances = pairwise_distances(cluster_centroids)
    distances_2 = pairwise_distances(cluster_centroids_2)

    # Keep each distinct pair once (strict upper triangle). Averaging the full
    # square matrix would also count the zero self-distances on the diagonal
    # and shrink the mean by a factor that depends on the number of clusters.
    pair_distances = distances[np.triu_indices_from(distances, k=1)]
    pair_distances_2 = distances_2[np.triu_indices_from(distances_2, k=1)]

    # Calculate mean pairwise distance; a single centroid has no pair, which
    # is reported as 0.0
    mean_distance = float(pair_distances.mean()) if pair_distances.size else 0.0
    mean_distance_2 = float(pair_distances_2.mean()) if pair_distances_2.size else 0.0

    # Print mean pairwise distance for the group
    print(f"Group {group_name}: Mean pairwise distance between cluster centroids = {mean_distance}")

    distances_dict[group_name] = mean_distance
    distances_dict_2[group_name] = mean_distance_2

# Calculate max, min, mean, median and standard deviation of distances
distances_list = np.array(list(distances_dict.values()), dtype=float)
distances_list = distances_list[~np.isnan(distances_list)]
print(f"Max distance: {np.max(distances_list):.5f}")
print(f"Min distance: {np.min(distances_list):.5f}")
print(f"Mean distance: {np.mean(distances_list):.5f}")
print(f"Median distance: {np.median(distances_list):.5f}")
print(f"Standard deviation of distances: {np.std(distances_list):.5f}")


In [None]:
# Calculate distances between tracks
# For each playlist snapshot, average the euclidean distances between every
# pair of distinct tracks in the min-max scaled feature space.
scaler = MinMaxScaler()

selected_columns = [
    "danceability", "energy", "loudness", "speechiness", 
    "acousticness", "instrumentalness", "liveness", "valence", 
    "tempo"
]

distances_dict_3 = {}

for group_name, group in groups:
    # Scale the feature columns of this snapshot (the scaler is refit per group)
    subset = group.loc[:, selected_columns]
    scaled_columns = scaler.fit_transform(subset)

    pairwise_dist = pairwise_distances(scaled_columns)

    # Keep each distinct pair of tracks once (strict upper triangle) so that
    # the zero self-distances on the diagonal do not bias the mean downwards
    pair_distances = pairwise_dist[np.triu_indices_from(pairwise_dist, k=1)]

    # A snapshot with a single track has no pair, which is reported as 0.0
    mean_distance = float(pair_distances.mean()) if pair_distances.size else 0.0

    # Print mean pairwise distance for the group
    print(f"Group {group_name}: Mean pairwise distance between tracks = {mean_distance}")

    distances_dict_3[group_name] = mean_distance

# Calculate max, min, mean, median and standard deviation of distances
distances_list = np.array(list(distances_dict_3.values()), dtype=float)
distances_list = distances_list[~np.isnan(distances_list)]
print(f"Max distance: {np.max(distances_list):.5f}")
print(f"Min distance: {np.min(distances_list):.5f}")
print(f"Mean distance: {np.mean(distances_list):.5f}")
print(f"Median distance: {np.median(distances_list):.5f}")
print(f"Standard deviation of distances: {np.std(distances_list):.5f}")


### Stirling diversity index

In [None]:
# k-alpha Rao-Stirling index
# For each playlist snapshot: sum over all pairs of distinct clusters (i < j)
# of d_ij * p_i * p_j, where d_ij is the euclidean distance between the two
# cluster centroids and p_i, p_j are the shares of tracks in each cluster.
stirling_index_dict = {}

for group_name, group in groups:
    # Share of tracks per cluster, ordered by cluster label
    shares = group['cluster'].value_counts(normalize=True).sort_index()
    labels = shares.index.to_numpy().astype(int)

    # Pick the centroids by cluster label rather than by enumeration position,
    # so that each share is always paired with the centroid of its own cluster
    cluster_centroids = kmeans_dict[group_name].cluster_centers_[labels]

    # Distance and share-product matrices over all cluster pairs
    dist = cdist(cluster_centroids, cluster_centroids, 'euclidean')
    share_products = np.outer(shares.to_numpy(), shares.to_numpy())

    # Sum each distinct pair once (strict upper triangle). The result is a
    # plain float -- also 0.0 when there is a single cluster -- so the values
    # can be aggregated and exported as numbers.
    pair_idx = np.triu_indices(len(labels), k=1)
    total_index = float((dist * share_products)[pair_idx].sum())

    stirling_index_dict[group_name] = total_index

print(stirling_index_dict)

print("Minimum index: ", min(stirling_index_dict.values()))
print("Mean index: ", np.mean(list(stirling_index_dict.values())))
print("Median index: ", np.median(list(stirling_index_dict.values())))
print("Maximum index: ", max(stirling_index_dict.values()))
print("Standard deviation: ", np.std(list(stirling_index_dict.values())))


In [None]:
# k-beta Rao-Stirling index
# Same measure as the k-alpha index, based on the `cluster_2` assignment:
# sum over all pairs of distinct clusters (i < j) of d_ij * p_i * p_j, where
# d_ij is the euclidean distance between the two cluster centroids and
# p_i, p_j are the shares of tracks in each cluster.
stirling_index_2_dict = {}

for group_name, group in groups:
    # Share of tracks per cluster, ordered by cluster label
    shares = group['cluster_2'].value_counts(normalize=True).sort_index()
    labels = shares.index.to_numpy().astype(int)

    # Pick the centroids by cluster label rather than by enumeration position,
    # so that each share is always paired with the centroid of its own cluster
    cluster_centroids = kmeans_2_dict[group_name].cluster_centers_[labels]

    # Distance and share-product matrices over all cluster pairs
    dist = cdist(cluster_centroids, cluster_centroids, 'euclidean')
    share_products = np.outer(shares.to_numpy(), shares.to_numpy())

    # Sum each distinct pair once (strict upper triangle). The result is a
    # plain float -- also 0.0 when there is a single cluster -- so the values
    # can be aggregated and exported as numbers.
    pair_idx = np.triu_indices(len(labels), k=1)
    total_index = float((dist * share_products)[pair_idx].sum())

    stirling_index_2_dict[group_name] = total_index

print(stirling_index_2_dict)

print("Minimum index: ", min(stirling_index_2_dict.values()))
print("Mean index: ", np.mean(list(stirling_index_2_dict.values())))
print("Median index: ", np.median(list(stirling_index_2_dict.values())))
print("Maximum index: ", max(stirling_index_2_dict.values()))
print("Standard deviation: ", np.std(list(stirling_index_2_dict.values())))


## Append to dataframe

In [None]:
# Build one record of playlist-level indicators per playlist snapshot
data_final = []

for group_name, group in groups:
    # Mean release date of the tracks in the snapshot
    mean_track_date = pd.to_datetime(group['track_date']).mean()

    data_final.append({
        # Playlist metadata (taken from the first track row of the snapshot)
        'playlist_id': group['playlist_id'].iloc[0],
        'playlist_name': group['playlist_name'].iloc[0],
        'collection_date': group['collection_date'].iloc[0],
        'mean_track_date': mean_track_date,
        'playlist_type': group['playlist_type'].iloc[0],
        'nb_tracks': len(group),
        # Variety: number of clusters
        'nb_clusters': optimal_num_k[group_name],
        'nb_clusters_2': optimal_num_k_2[group_name],
        # Balance: Herfindahl-Hirschman index
        'hh_index': hhi[group_name],
        'hh_index_2': hhi_2[group_name],
        # Disparity: mean distances between centroids and between tracks
        'distances': distances_dict[group_name],
        'distances_2': distances_dict_2[group_name],
        'distances_3': distances_dict_3[group_name],
        # Diversity: Rao-Stirling index
        'stirling_index': stirling_index_dict[group_name],
        'stirling_index_2': stirling_index_2_dict[group_name]
    })

# Convert to DataFrame
final_data_df = pd.DataFrame(data_final)
display(final_data_df)


In [19]:
# Write the playlist-level indicators to CSV (without the index column)
final_output_path = "major_playlists_22-23_final.csv"
final_data_df.to_csv(final_output_path, index=False)
