In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import umap
from scipy.stats import ttest_ind, ks_2samp, mannwhitneyu
from cliffs_delta import cliffs_delta

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Input locations: fill these in before running the cells below.

# Folder containing the audio embeddings for the EMD dataset
folder_path_EMD = ""
# Folder containing the audio embeddings for the PMD dataset
folder_path_PMD = ""
# Playlist CSV for the EMD dataset
playlist_path_EMD = ""
# Playlist CSV for the PMD dataset
playlist_path_PMD = ""

# Function to process a folder and return a DataFrame
def process_audio_embeddings(folder_path, playlist_path):
    """Build a DataFrame of flattened audio embeddings, one row per playlist occurrence.

    Each ``.npz`` file in ``folder_path`` is matched to the playlist through
    the first 22 characters of its file name. A track that appears ``k`` times
    in the playlist's ``'Track ID'`` column contributes ``k`` identical rows;
    files whose ID does not occur in the playlist contribute none and are not
    opened at all.

    Parameters
    ----------
    folder_path : str
        Directory holding the ``.npz`` files. Every file that matches the
        playlist must store its array under the key ``'embeddings'``.
    playlist_path : str
        CSV file with a ``'Track ID'`` column.

    Returns
    -------
    pandas.DataFrame
        Column ``'Track ID'`` followed by columns ``'0'``, ``'1'``, ... holding
        the flattened embedding values. Rows follow sorted file-name order.
        When no file matches the playlist, an empty frame containing only the
        ``'Track ID'`` column is returned.
    """
    # Load playlist CSV and count how often each Track ID occurs in it
    playlist_df = pd.read_csv(playlist_path)
    track_id_counts = playlist_df['Track ID'].value_counts()

    # One list per output row: [track_id, v0, v1, ...]
    data = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and anything order-sensitive computed from it) reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.npz'):
            continue

        # Track ID is the first 22 characters of the file name
        track_id = file_name[:22]

        # Plain int so the list repetition below is an ordinary sequence repeat
        duplication_count = int(track_id_counts.get(track_id, 0))
        if duplication_count == 0:
            # Not in the playlist: skip before paying for the file load
            continue

        with np.load(os.path.join(folder_path, file_name)) as npz_file:
            embeddings = npz_file['embeddings']

        # Flatten once, then repeat the row for every playlist occurrence.
        # The repeated entries reference the same list; pd.DataFrame copies
        # the values, so the resulting rows are independent.
        row = [track_id] + embeddings.flatten().tolist()
        data.extend([row] * duplication_count)

    # Nothing matched: return an empty, well-formed frame instead of failing
    # on data[0] with an uninformative IndexError.
    if not data:
        return pd.DataFrame(columns=['Track ID'])

    # Column count is taken from the first row.
    # NOTE(review): assumes every embedding flattens to the same length - confirm.
    column_names = ['Track ID'] + [f'{i}' for i in range(len(data[0]) - 1)]
    return pd.DataFrame(data, columns=column_names)

# Build one embeddings DataFrame per dataset from its folder / playlist pair
embeddings_df_EMD = process_audio_embeddings(
    folder_path=folder_path_EMD,
    playlist_path=playlist_path_EMD,
)
embeddings_df_PMD = process_audio_embeddings(
    folder_path=folder_path_PMD,
    playlist_path=playlist_path_PMD,
)

In [4]:
# Keep only the numeric embedding columns; 'Track ID' is an identifier, not a feature
EMD_embeddings = embeddings_df_EMD.drop(columns='Track ID').to_numpy()
PMD_embeddings = embeddings_df_PMD.drop(columns='Track ID').to_numpy()

# Stack EMD rows on top of PMD rows so a single UMAP is fitted on both
# datasets and they end up in the same 2-D space
combined_embeddings = np.concatenate([EMD_embeddings, PMD_embeddings], axis=0)

# Project to 2 dimensions (seeded via random_state)
umap_reducer = umap.UMAP(n_components=2, random_state=42)
reduced_embeddings = umap_reducer.fit_transform(combined_embeddings)

# The first n_EMD rows belong to EMD, the remainder to PMD (same order as the stack above)
n_EMD = len(EMD_embeddings)
reduced_EMD, reduced_PMD = reduced_embeddings[:n_EMD], reduced_embeddings[n_EMD:]

In [8]:
# Slice each UMAP axis once per dataset; every test below compares EMD vs PMD
# on a single axis. Variable suffixes _0/_1 are the array column indices; the
# printed labels call them "Component 1" and "Component 2".
emd_axis_0, emd_axis_1 = reduced_EMD[:, 0], reduced_EMD[:, 1]
pmd_axis_0, pmd_axis_1 = reduced_PMD[:, 0], reduced_PMD[:, 1]

# --- First UMAP axis (column 0) ---
u_stat_0, p_value_u_0 = mannwhitneyu(emd_axis_0, pmd_axis_0, alternative='two-sided')  # two-sided Mann-Whitney U
ks_stat_0, p_value_ks_0 = ks_2samp(emd_axis_0, pmd_axis_0)                             # two-sample Kolmogorov-Smirnov
delta_0, magnitude_0 = cliffs_delta(emd_axis_0, pmd_axis_0)                            # Cliff's delta + magnitude label

# --- Second UMAP axis (column 1) ---
u_stat_1, p_value_u_1 = mannwhitneyu(emd_axis_1, pmd_axis_1, alternative='two-sided')
ks_stat_1, p_value_ks_1 = ks_2samp(emd_axis_1, pmd_axis_1)
delta_1, magnitude_1 = cliffs_delta(emd_axis_1, pmd_axis_1)

# Report, grouped by test
print(f"Mann-Whitney U-Test for UMAP Component 1: U-statistic = {u_stat_0}, p-value = {p_value_u_0}")
print(f"Mann-Whitney U-Test for UMAP Component 2: U-statistic = {u_stat_1}, p-value = {p_value_u_1}")

print(f"Kolmogorov-Smirnov Test for UMAP Component 1: KS-statistic = {ks_stat_0}, p-value = {p_value_ks_0}")
print(f"Kolmogorov-Smirnov Test for UMAP Component 2: KS-statistic = {ks_stat_1}, p-value = {p_value_ks_1}")

print(f"Cliff's Delta for UMAP Component 1: Delta = {delta_0:.4f}, Effect Size = {magnitude_0}")
print(f"Cliff's Delta for UMAP Component 2: Delta = {delta_1:.4f}, Effect Size = {magnitude_1}")

Mann-Whitney U-Test for UMAP Component 1: U-statistic = 8921356.0, p-value = 1.9931276763203524e-05
Mann-Whitney U-Test for UMAP Component 2: U-statistic = 6840232.0, p-value = 1.450394096417345e-36
Kolmogorov-Smirnov Test for UMAP Component 1: KS-statistic = 0.11538926049795616, p-value = 2.0346613364331402e-18
Kolmogorov-Smirnov Test for UMAP Component 2: KS-statistic = 0.13363991081382387, p-value = 1.42097727059554e-24
Cliff's Delta for UMAP Component 1: Delta = 0.0626, Effect Size = negligible
Cliff's Delta for UMAP Component 2: Delta = -0.1853, Effect Size = small
