In [207]:
import pandas as pd
import numpy as np
from ast import literal_eval

# Load necessary data

In [235]:
# Track metadata (JSON)
df_meta = pd.read_json("../data/metadata_full.json")

# Audio features; the CSV carries an "Unnamed: 0" column that is discarded here
df_audio_features = (
    pd.read_csv("../outputs/df_audio_features.csv")
    .drop(columns=["Unnamed: 0"])
)

# Annotated songs with strategy counts; same "Unnamed: 0" column discarded
df_annotated = (
    pd.read_csv("../outputs/annotated_compiled_songs_with_strategy_counts.csv")
    .drop(columns=["Unnamed: 0"])
)

In [236]:
def _parse_list_literal(value):
    """Evaluate strings that look like a list literal; return anything else unchanged."""
    if isinstance(value, str) and value.strip().startswith('['):
        return literal_eval(value)
    return value

# The CSV round-trip stores lists as text; turn them back into Python lists
df_annotated['strategy'] = df_annotated['strategy'].apply(_parse_list_literal)

# Merge metadata and audio features data

In [210]:
# Manually set artist, source URL and title for the track at row label 162 ("missing_song")
missing_song_fields = {
    'artist': 'Snarky Puppy',
    'audiosourcewebpage': 'https://open.spotify.com/track/3Vj7fRefltLc0zcNnjux4e',
    'title': 'Snarky Puppy - The Curtain - Live From Dordrecht, Het Energiehuis  2014',
}
for column, value in missing_song_fields.items():
    df_audio_features.loc[162, column] = value

In [211]:
def merge_dfs(df_m, df_af):
    '''
    Merge metadata and audio feature dataframes.

    Metadata rows whose "url" occurs in df_af["audiosourcewebpage"] are joined
    on that url. The remaining metadata rows are joined on the song title
    (df_af["title"] == df_m["name"]) instead.

    Parameters
    ----------
    df_m : pd.DataFrame
        Metadata; must contain the columns "url" and "name".
    df_af : pd.DataFrame
        Audio features; must contain the columns "audiosourcewebpage" and "title".

    Returns
    -------
    pd.DataFrame
        URL-matched rows followed by title-matched rows, with a fresh
        0..n-1 index. An audio-feature row may appear in both parts, so
        callers that need one row per track still have to de-duplicate.
    '''

    # Boolean masks select rows by position, so this also works when df_m
    # has repeated index labels (label-based .loc would duplicate such rows).
    has_matching_url = df_m["url"].isin(df_af["audiosourcewebpage"])
    df_m_matching = df_m[has_matching_url]
    df_m_missing = df_m[~has_matching_url]

    # metadata rows with a known url: merge on the url
    df_merged_urls = df_af.merge(
        df_m_matching, how="inner",
        left_on="audiosourcewebpage", right_on="url"
    )

    # metadata rows whose url is unknown to df_af: fall back to the song title
    df_merged_title = df_af.merge(
        df_m_missing, how="inner",
        left_on="title", right_on="name"
    )

    # Both merges number their rows from 0; renumber so the combined frame
    # does not carry repeated index labels.
    merged_all = pd.concat([df_merged_urls, df_merged_title], ignore_index=True)

    return merged_all

In [212]:
# Join metadata onto the audio features
df_merged = merge_dfs(df_meta, df_audio_features)

# A track can show up more than once; keep only its first row per audio source URL
is_repeated_track = df_merged.duplicated(subset='audiosourcewebpage', keep='first')
df_merged_unique = df_merged[~is_repeated_track]

# Filter columns

In [213]:
# Columns kept from the merged frame, grouped by origin.
# The order matters: the groups are concatenated in exactly this sequence.

metadata_columns = [
    'url',
    'name',
    'artists',
    'artist_y',  # TODO: has to be renamed; should we keep both artists and artist?
    'genres',
    'duration',
    'year',
    'popularity',
]

lowlevel_columns = [
    'lowlevel.dissonance.mean', 'lowlevel.dissonance.stdev',
    'lowlevel.dynamic_complexity',
    'lowlevel.hfc.mean', 'lowlevel.hfc.stdev',
    'lowlevel.loudness_ebu128.integrated',
    'lowlevel.loudness_ebu128.loudness_range',
    'lowlevel.melbands_crest.mean', 'lowlevel.melbands_crest.stdev',
    'lowlevel.melbands_flatness_db.mean', 'lowlevel.melbands_flatness_db.stdev',
    'lowlevel.melbands_kurtosis.mean', 'lowlevel.melbands_kurtosis.stdev',
    'lowlevel.melbands_skewness.mean', 'lowlevel.melbands_skewness.stdev',
    'lowlevel.melbands_spread.mean', 'lowlevel.melbands_spread.stdev',
    'lowlevel.pitch_salience.mean', 'lowlevel.pitch_salience.stdev',
    'lowlevel.spectral_centroid.mean', 'lowlevel.spectral_centroid.stdev',
    'lowlevel.spectral_complexity.mean', 'lowlevel.spectral_complexity.stdev',
    'lowlevel.spectral_decrease.mean', 'lowlevel.spectral_decrease.stdev',
    'lowlevel.spectral_energy.mean', 'lowlevel.spectral_energy.stdev',
    'lowlevel.spectral_flux.mean', 'lowlevel.spectral_flux.stdev',
    'lowlevel.spectral_kurtosis.mean', 'lowlevel.spectral_kurtosis.stdev',
    'lowlevel.spectral_rms.mean', 'lowlevel.spectral_rms.stdev',
    'lowlevel.spectral_rolloff.mean', 'lowlevel.spectral_rolloff.stdev',
    'lowlevel.spectral_skewness.mean', 'lowlevel.spectral_skewness.stdev',
    'lowlevel.spectral_spread.mean', 'lowlevel.spectral_spread.stdev',
    'lowlevel.spectral_strongpeak.mean', 'lowlevel.spectral_strongpeak.stdev',
    'lowlevel.zerocrossingrate.mean', 'lowlevel.zerocrossingrate.stdev',
    'lowlevel.mfcc.mean',
    'lowlevel.spectral_contrast_coeffs.mean', 'lowlevel.spectral_contrast_coeffs.stdev',
    'lowlevel.spectral_contrast_valleys.mean', 'lowlevel.spectral_contrast_valleys.stdev',
]

rhythm_columns = [
    'rhythm.beats_loudness.mean', 'rhythm.beats_loudness.stdev',
    'rhythm.bpm',
    'rhythm.danceability',
    'rhythm.onset_rate',
]

tonal_columns = [
    'tonal.chords_changes_rate',
    'tonal.chords_number_rate',
    'tonal.chords_strength.mean', 'tonal.chords_strength.stdev',
    'tonal.key_krumhansl.strength',
    'tonal.key_krumhansl.key',
    'tonal.key_krumhansl.scale',
]

columns_to_include = metadata_columns + lowlevel_columns + rhythm_columns + tonal_columns

In [239]:
# Keep every row, but only the selected columns (in the order listed)
df_filtered = df_merged_unique.loc[:, columns_to_include]

Examine columns with NaN values and remove these features

In [240]:
# Number of NaN entries per column, showing only the columns that have any
nan_counts = df_filtered.isna().sum()
print(nan_counts[nan_counts > 0])

# Share of tracks whose genre list is empty
has_empty_genres = df_filtered['genres'].apply(lambda x: isinstance(x, list) and len(x) == 0)
print(len(df_filtered[has_empty_genres])/len(df_filtered))


lowlevel.spectral_spread.mean     439
lowlevel.spectral_spread.stdev    212
dtype: int64
0.42289039767216297


In [241]:
# Absolute number of tracks whose genre list is empty
empty_genres_mask = df_filtered['genres'].apply(lambda x: isinstance(x, list) and len(x) == 0)
len(df_filtered[empty_genres_mask])

436

In [242]:
# Left out because of missing information: "genres" is an empty list for a
# large share of tracks, and the two spectral_spread columns contain NaN entries
incomplete_cols = [
    "genres",
    "lowlevel.spectral_spread.mean",
    "lowlevel.spectral_spread.stdev",
]

# Left out due to their array-like structure
array_like_cols = [
    "lowlevel.spectral_contrast_coeffs.mean",
    "lowlevel.spectral_contrast_coeffs.stdev",
    "lowlevel.spectral_contrast_valleys.mean",
    "lowlevel.spectral_contrast_valleys.stdev",
]

cols_to_leave_out = incomplete_cols + array_like_cols

df_filtered = df_filtered.drop(columns=cols_to_leave_out)

Create a feature for each value in lowlevel.mfcc.mean

In [245]:
def _mfcc_to_list(cell):
    """Evaluate strings that look like a list literal; return anything else unchanged."""
    if isinstance(cell, str) and cell.strip().startswith('['):
        return literal_eval(cell)
    return cell

# Make sure every entry is an actual list rather than its text representation
df_filtered['lowlevel.mfcc.mean'] = df_filtered['lowlevel.mfcc.mean'].apply(_mfcc_to_list)

# Spread positions 0..12 of each list over one scalar column per position
for position in range(13):
    column_name = f"lowlevel.mfcc.mean.{position}"
    df_filtered[column_name] = df_filtered["lowlevel.mfcc.mean"].apply(
        lambda coeffs, k=position: coeffs[k]
    )

# The list-valued source column is no longer needed
df_filtered = df_filtered.drop(columns="lowlevel.mfcc.mean")

# Annotate each track with its strategy (or strategies)

In [246]:
# Label columns needed for the merge, taken straight from df_annotated.
# A de-duplicated frame used to be assigned to `strategies_df` first, but it was
# overwritten by this selection right away and never had any effect, so that
# dead assignment is gone. Duplicated urls are removed after the merge, below.
strategies_df = df_annotated[['url', 'strategy', 'response_id']]

# Merge labels with df_filtered: the right join keeps every track of df_filtered,
# and drop_duplicates then keeps the first row per url.
# NOTE(review): later cells address df_labeled rows by hard-coded index labels, and
# those labels come from the index produced by this merge + drop_duplicates.
# De-duplicating df_annotated *before* the merge would renumber the rows whenever
# df_annotated holds repeated urls, so the order of operations is left as is.
df_labeled = strategies_df.merge(df_filtered, how='right', on='url').drop_duplicates('url', keep='first')

In [247]:
# Manually and algorithmically find mismatching tracks and labels

# Track title as it appears in df_labeled['name'] -> the spelling to search for in the
# annotations' 'song_name' column when the original title has no exact match there.
# (Despite the variable name, both keys and values are titles, not urls.)
map_urls = {
    "わたしのアール": "私のアール",
    "Deriheru Yondara Kimi ga Kita - Original Ver.": "デリヘル呼んだら君が来た",
    "mirrorball": "Mirrorball"
}

def find_missing_strategies(df_labeled, all_batches_annotated, strategies_df, title_aliases=None):
    """
    Fill in url and strategy for the rows of df_labeled that have no strategy.

    For every row whose 'strategy' is missing, the track's 'name' is looked up in
    all_batches_annotated['song_name']; if there is no exact match, the alternative
    spelling from title_aliases is tried. The row's 'url' is replaced by the url of
    the first matching annotation, and its 'strategy' is set to the first strategy
    found for that url in strategies_df, falling back to all_batches_annotated
    (None if neither has one).

    Parameters
    ----------
    df_labeled : pd.DataFrame
        Needs the columns 'name', 'url' and 'strategy'. Modified in place.
    all_batches_annotated : pd.DataFrame
        Needs the columns 'song_name', 'url' and 'strategy'.
    strategies_df : pd.DataFrame
        Needs the columns 'url' and 'strategy'.
    title_aliases : dict, optional
        Title -> alternative title. Defaults to the module-level map_urls.

    Returns
    -------
    pd.DataFrame
        The same df_labeled object that was passed in.

    Raises
    ------
    KeyError
        If an unlabeled track matches no annotation under its own title and either
        has no alias or its alias matches no annotation either.
    """
    if title_aliases is None:
        title_aliases = map_urls

    # Snapshot the titles of unlabeled rows first, so the loop is not affected by
    # the writes it makes to df_labeled.
    unlabeled_titles = df_labeled.loc[df_labeled['strategy'].isna(), 'name'].copy()

    for i, title in unlabeled_titles.items():
        url_matches = all_batches_annotated.loc[all_batches_annotated['song_name'] == title, 'url']

        if len(url_matches) == 0:
            if title not in title_aliases:
                raise KeyError(
                    f"No annotation found for track {title!r} and no alternative title is registered for it"
                )
            alt_title = title_aliases[title]
            url_matches = all_batches_annotated.loc[all_batches_annotated['song_name'] == alt_title, 'url']
            if len(url_matches) == 0:
                raise KeyError(
                    f"No annotation found for track {title!r} or its alternative title {alt_title!r}"
                )

        new_url = url_matches.iloc[0]
        df_labeled.at[i, 'url'] = new_url

        # Prefer the strategy listed in strategies_df; otherwise take it from the annotations
        strategy_matches = strategies_df.loc[strategies_df['url'] == new_url, 'strategy']
        if len(strategy_matches) > 0:
            new_strategy = strategy_matches.iloc[0]
        else:
            fallback_matches = all_batches_annotated.loc[all_batches_annotated['url'] == new_url, 'strategy']
            new_strategy = fallback_matches.iloc[0] if len(fallback_matches) > 0 else None

        df_labeled.at[i, 'strategy'] = new_strategy

    return df_labeled

In [248]:
df_labeled = find_missing_strategies(df_labeled, df_annotated, strategies_df)

# Response ids filled in by hand, keyed by the row label in df_labeled
manual_response_ids = {
    43: "R_2q7KRMeEj2Ki93e",
    132: "R_2oS0VRFGHi3daXX",
    411: "R_2oS0VRFGHi3daXX",
    764: "R_8eLKaSUgqXjGFkl",
    831: "R_88HQeKMOYExgqkB",
    897: "R_2cD4oWRk402VtjH",
    922: "R_8eLKaSUgqXjGFkl",
}
for row_label, response_id in manual_response_ids.items():
    df_labeled.loc[row_label, 'response_id'] = response_id

In [249]:
# Add "meta." as a prefix to metadata columns.
# The columns are named explicitly instead of being taken by position
# (df_labeled.columns[5:8]), so the rename no longer depends on the column order
# produced by the earlier merge. These are the three columns that slice selects
# when the notebook is run top to bottom.
# NOTE(review): 'popularity' sits right after 'year' and is not prefixed — confirm
# whether that is intended before adding it here, since downstream files would change.
meta_columns = ['artist_y', 'duration', 'year']
meta_rename_mapping = {col: f"meta.{col}" for col in meta_columns}
df_labeled = df_labeled.rename(columns=meta_rename_mapping)

In [226]:
# Persist the labeled dataset (row index not written)
df_labeled.to_csv(
    "../outputs/full_dataset_without_openl3.csv",
    index=False,
)

In [250]:
# Columns that are not carried over into the PCA dataset
columns_not_for_pca = [
    "url",
    "response_id",
    "name",
    "artists",
    "meta.artist_y",
    "tonal.key_krumhansl.key",
    "tonal.key_krumhansl.scale",
]
df_labeled_cleaned = df_labeled.drop(columns=columns_not_for_pca)

def process_multi_strategies(x):
    """
    Reduce a strategy entry to a single strategy label.

    - a string is returned unchanged
    - a one-element list (e.g. [('solace', 2)], a song that occurs more than once
      but always for the same strategy) yields the first item of that element
    - a list with more than one element yields NaN
    - anything else, including an empty list, yields None
    """
    if isinstance(x, str):
        return x

    if not isinstance(x, list):
        return None

    n_entries = len(x)
    if n_entries == 1:
        return x[0][0]
    if n_entries > 1:
        return np.nan
    return None
    
# Collapse every strategy entry to a single label (or a missing value)
single_strategy = df_labeled_cleaned['strategy'].apply(process_multi_strategies)
df_labeled_cleaned['strategy'] = single_strategy

# Keep only tracks that ended up with a label; this removes, among others,
# the tracks corresponding to multiple strategies
has_strategy = df_labeled_cleaned['strategy'].notna()
df_labeled_cleaned = df_labeled_cleaned[has_strategy]

# NOTE(review): unlike the other exports in this notebook, this one also writes the row index
df_labeled_cleaned.to_csv("../data/pca_dataset.csv")

# Incorporate Open L3 audio features

In [229]:
df_openl3 = pd.read_csv("../data/openl3_full_dataset.csv")

# Split track_id at its first ' - ' only: the part before it becomes the artist,
# everything after it (further ' - ' included) becomes the title
df_openl3[['artist', 'title']] = df_openl3['track_id'].str.split(
    ' - ',
    n=1,
    expand=True,
)

In [230]:
# Work on a copy so that df_labeled keeps the original names
df_labeled_copy = df_labeled.copy()

# Remove question marks and slashes from the track names
cleaned_names = df_labeled_copy['name']
for unwanted_char in ('?', '/'):
    cleaned_names = cleaned_names.str.replace(unwanted_char, '', regex=False)
df_labeled_copy['name'] = cleaned_names

# Remove slashes from the artist names
cleaned_artists = df_labeled_copy['meta.artist_y'].str.replace('/', '', regex=False)
df_labeled_copy['meta.artist_y'] = cleaned_artists

In [231]:
# Manually re-encode track names, keyed by the row label in df_labeled_copy
manual_track_names = {
    22: 'The Gadfly Suite, Op. 97a- Romance',
    78: "Tchaikovsky- The Nutcracker, Op. 71, Act I, Scene 1- No. 4, Dancing Scene. Arrival of Drosselmeyer",
    91: "Stay Alive - From 'The Secret Life of Walter Mitty' Soundtrack",
    172: "Mrs. Robinson - From 'The Graduate' Soundtrack",
    225: "Requiem, K. 626- Lacrimosa",
    283: "Violin Concerto No. 1- II. Crotchet = c. 108",
    327: "Moon Halo - Honkai Impact 3Rd 'Everlasting Flames' Animated Short Theme",
    410: "Piano Concerto No. 2 in C Minor, Op. 18- 2. Adagio sostenuto",
    428: "Coconut Mall (From 'Mario Kart Wii')",
    898: "Scheherazade- The Tale of the Kalendar Prince",
    971: "Liebestraume, S541R211 - No. 3- Nocturne in A-Flat Major",
}
for row_label, track_name in manual_track_names.items():
    df_labeled_copy.loc[row_label, 'name'] = track_name

In [232]:
# Merge Open L3 features to df_labeled: rows are paired on (artist, title);
# with the default join type only tracks present in both frames are kept
df_af_openl3 = df_openl3.merge(
    df_labeled_copy,
    left_on=["artist", "title"],
    right_on=["meta.artist_y", "name"],
)

# Drop the join keys and remaining identifier columns, then order the columns alphabetically
columns_to_discard = ['artist', 'title', 'url', 'name', 'artists', 'meta.artist_y']
df_af_openl3 = df_af_openl3.drop(columns=columns_to_discard)
df_af_openl3 = df_af_openl3.sort_index(axis=1)

df_af_openl3.to_csv("../outputs/full_dataset_with_openl3.csv", index=False)

In [233]:
def drop_l3_embeddings(df, n_dims=512):
    """
    Return a copy of df without the embedding columns "e0" ... "e{n_dims - 1}".

    A column counts as an embedding column when its label is a string made of
    the letter "e" followed only by decimal digits whose value is below n_dims.
    All other columns, including non-string labels, are kept.

    Parameters
    ----------
    df : pd.DataFrame
    n_dims : int, default 512
        Number of embedding dimensions; the default reproduces the 0..511 range.

    Returns
    -------
    pd.DataFrame
        New frame; df itself is not modified.
    """
    def is_l3(col):
        # Column labels are not guaranteed to be strings
        if not isinstance(col, str) or not col.startswith("e"):
            return False
        suffix = col[1:]
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscript digits, which int() cannot parse
        if not suffix.isdecimal():
            return False
        return 0 <= int(suffix) < n_dims

    cols_to_drop = [c for c in df.columns if is_l3(c)]
    return df.drop(columns=cols_to_drop)

# Strip the embedding columns, then the track id and the strategy label as well
without_embeddings = drop_l3_embeddings(df_af_openl3)
dropl3 = without_embeddings.drop(columns=["track_id", "strategy"])