In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to all matplotlib figures drawn after this point.
sns.set()

In [13]:
# Genre labels used throughout the notebook: each one selects a per-genre
# pickle file to load and later becomes a 0/1 label column.
GENRES = [
    "classical",
    "country",
    "edm",
    "hip-hop",
    "jazz",
    "pop",
    "rap",
    "rnb",
    "rock",
]

In [14]:
# Load every per-genre dataset and stack them into one frame.
# NOTE(review): pickle files can execute arbitrary code when loaded; only read
# files produced by this project.
# Reading into a list and concatenating once avoids three problems of the
# accumulate-in-a-loop form: rebinding the loop variable `genre` from a string
# to a DataFrame, re-copying the growing frame on every iteration, and
# concatenating onto an empty DataFrame.
genre_frames = [
    pd.read_pickle(f"../data/spotify_dataset_{genre}.pkl") for genre in GENRES
]
all_tracks = pd.concat(genre_frames)

# Finding all duplicates
# `duplicated()` flags every repeat of an index label after its first
# occurrence, so this counts the extra rows, not the distinct repeated labels.
all_dupes = all_tracks[all_tracks.index.duplicated()]
all_dupes.shape

(835, 16)

In [15]:
# Creating multilabels
# Work on a copy: the later in-place edits to `df` then leave `all_tracks`
# untouched, and this cell can be re-run to rebuild `df` from the loaded data.
df = all_tracks.copy()

# Add one indicator column per genre, initialised to 0. Assigning column by
# column avoids relying on a nested-list indexer to create the new columns.
for genre in GENRES:
    df[genre] = 0
df.sample(n=1)

Unnamed: 0,artists,name,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,...,time_signature,classical,country,edm,hip-hop,jazz,pop,rap,rnb,rock
3QFInJAm9eyaho5vBzxInN,"[Baby Keem, Kendrick Lamar]",family ties (with Kendrick Lamar),rap,0.711,0.611,1,-5.453,1,0.33,0.00588,...,4,0,0,0,0,0,0,0,0,0


In [16]:
# Flag each row in the indicator column that matches its own `genre` value
for genre in GENRES:
    rows_with_genre = df['genre'].eq(genre)
    df.loc[rows_with_genre, genre] = 1
df.sample(n=3)

Unnamed: 0,artists,name,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,...,time_signature,classical,country,edm,hip-hop,jazz,pop,rap,rnb,rock
6EafRI0UtUidrZbbKpFRYq,"[Aluna, Diplo, Durante, AlunaGeorge]",Forget About Me,edm,0.721,0.894,3,-7.59,0,0.0488,0.0428,...,4,0,0,1,0,0,0,0,0,0
2MJz8BxxMsERULatmBikDH,[Pink Sweat$],At My Worst,pop,0.813,0.415,0,-5.926,1,0.0349,0.777,...,4,0,0,0,0,0,1,0,0,0
4I3573QbRpER4HPzyFSFJa,[The Raconteurs],Salute Your Solution,rock,0.266,0.946,9,-2.855,1,0.171,0.00781,...,4,0,0,0,0,0,0,0,0,1


In [17]:
# The indicator columns now carry the genre information, so the original
# string column is redundant; remove it from `df` in place.
df.drop(columns=['genre'], inplace=True)
df.sample(n=1)

Unnamed: 0,artists,name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,time_signature,classical,country,edm,hip-hop,jazz,pop,rap,rnb,rock
3oyc1mIdCBGaU55wX7otqM,[Elvis Costello & The Attractions],Pump It Up - 2021 Remaster,0.645,0.809,11,-6.12,1,0.0385,0.00921,0.00108,...,4,0,0,0,0,0,0,0,0,1


In [18]:
# Merge duplicates together
# Collapse rows that share an index label into one row of genre indicators.
# `max` (rather than `sum`) keeps every label a 0/1 flag even when the same
# track is listed more than once under the same genre.
tracks_and_genres = df.groupby(level=0)[GENRES].max()
tracks_and_genres

Unnamed: 0,classical,country,edm,hip-hop,jazz,pop,rap,rnb,rock
003vvx7Niy0yvhvHt4a68B,0,0,0,0,0,0,0,0,1
00762tXbSj2w2bQMVydJZn,0,0,0,1,0,0,0,0,0
00CXUMREit80f2McJsjcIz,1,0,0,0,0,0,0,0,0
00KyYtT6NaXwbPecina5Pj,0,1,0,0,0,0,0,0,0
00Mb3DuaIH1kjrwOku9CGU,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
7zI3nF4mahUbVKjxRjhmEt,0,0,0,0,1,0,0,0,0
7zSEUE2ebqU2zS3ibaNw8r,0,0,1,0,0,0,0,0,0
7zjEyeBsaw9gV0jofJLfOM,0,0,0,1,0,0,1,0,0
7zmleW3XZx0uUsL2CkFuDe,0,0,0,0,0,1,0,1,0


In [19]:
# Keep the first row seen for each index label and strip the genre indicator
# columns, leaving only the descriptive / audio-feature columns.
tracks_without_dupes = (
    df.loc[~df.index.duplicated(keep='first')]
    .drop(columns=GENRES)
)
tracks_without_dupes.sample(n=3)

Unnamed: 0,artists,name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0vQdNTM18KusMJJrXWXDAy,"[Quintino, Thomas Gold]",Quechua,0.592,0.983,5,-3.429,0,0.084,0.000611,0.00764,0.175,0.375,131.967,179091,4
4djIFfof5TpbSGRZUpsTXq,[COIN],Talk Too Much,0.598,0.888,4,-5.119,1,0.0807,0.00947,0.0,0.347,0.482,103.966,187133,4
3Up1MsHqcxUVgB2iuqFRq2,"[Kasbo, Frida Sundemo]",Shut The World Out,0.496,0.679,8,-6.893,1,0.0345,0.0308,0.00327,0.316,0.224,105.059,268966,4


In [20]:
# Attach the per-track genre indicators to the de-duplicated feature rows by
# joining the two frames on their shared index.
all_tracks_cleaned = tracks_without_dupes.merge(
    tracks_and_genres,
    how="inner",
    left_index=True,
    right_index=True,
)
all_tracks_cleaned.sample(n=3)

Unnamed: 0,artists,name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,time_signature,classical,country,edm,hip-hop,jazz,pop,rap,rnb,rock
6rdmyLJtIibFRaCU7XRg3x,[Josh Gracin],Love Like,0.651,0.646,1,-5.305,1,0.0264,0.0243,0.0,...,4,0,1,0,0,0,0,0,0,0
2WN7xpcY4zmcqF57HFEGZY,"[Belly, The Weeknd, Young Thug]",Better Believe (ft The Weeknd),0.799,0.569,0,-7.821,1,0.318,0.154,0.0,...,4,0,0,0,0,0,0,1,0,0
0jTTjg6q4jsd2RhuHwTvLj,[Ashley Tisdale],He Said She Said,0.694,0.709,7,-6.205,0,0.103,0.0966,0.0,...,4,0,0,0,0,0,1,0,0,0


In [21]:
# Spot-check two tracks that were duplicated before merging: each should now
# appear once, with more than one genre column set.
all_tracks_cleaned.loc[
    [
        '1rIKgCH4H52lrvDcz50hS8',
        '2wxO3ZydJTZRi4Bbc02q7i',
    ]
]

Unnamed: 0,artists,name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,time_signature,classical,country,edm,hip-hop,jazz,pop,rap,rnb,rock
1rIKgCH4H52lrvDcz50hS8,[Zara Larsson],Lush Life,0.694,0.712,7,-3.923,0,0.046,0.132,0.0,...,4,0,0,1,0,0,1,0,0,0
2wxO3ZydJTZRi4Bbc02q7i,"[Scarface, Johnny P, 2Pac]",Smile,0.7,0.402,5,-9.93,0,0.278,0.0498,1e-06,...,4,0,0,0,1,0,0,1,0,0


In [11]:
# Persist the merged multi-label dataset next to the per-genre input files.
# NOTE(review): this cell's recorded execution count (11) is lower than the
# cells above it (12-21), so the saved file may predate the current pipeline;
# re-run the notebook top to bottom before relying on the pickle.
all_tracks_cleaned.to_pickle("../data/spotify_dataset_all.pkl")