In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Switch matplotlib over to seaborn's default plot styling for the whole notebook.
sns.set()

In [2]:
# Genre labels handled by this notebook. Each label is used both to build the
# per-genre pickle file name and as the name of that genre's indicator column.
# Kept as a list (not a tuple) because it is used directly as a column selector.
GENRES = [
    "classical",
    "country",
    "edm",
    "hip-hop",
    "jazz",
    "pop",
    "rap",
    "rnb",
    "rock",
]

In [3]:
# Load every per-genre dataset and stack them into one frame.
# The frames are collected first and concatenated once: concatenating inside
# the loop re-copies the accumulated frame on every pass, and the original loop
# also rebound `genre` from a label string to a DataFrame.
# NOTE(review): read_pickle can execute code embedded in the file; only load
# pickles produced by this project.
genre_frames = [
    pd.read_pickle(f"../data/spotify_dataset_{genre}.pkl") for genre in GENRES
]
all_tracks = pd.concat(genre_frames)

# Finding all duplicates: rows whose index value has already appeared earlier
# in the stacked frame (the first occurrence of each value is not flagged).
all_dupes = all_tracks[all_tracks.index.duplicated()]
all_dupes.shape

(833, 16)

In [4]:
# Creating multilabels: add one indicator column per genre, initialised to 0.
# `assign` returns a new frame, so `all_tracks` keeps the data exactly as it was
# loaded and this cell can be re-run at any time. A plain `df = all_tracks`
# would only alias the frame, and the nested-list key `[[GENRES]]` is not a
# documented column selector.
df = all_tracks.assign(**{label: 0 for label in GENRES})
df.sample(n=1)

Unnamed: 0,artists,name,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,...,time_signature,classical,country,edm,hip-hop,jazz,pop,rap,rnb,rock
1QV6tiMFM6fSOKOGLMHYYg,[Lady Gaga],Poker Face,pop,0.851,0.806,4,-4.62,1,0.0787,0.118,...,4,0,0,0,0,0,0,0,0,0


In [5]:
# Switch on each row's own genre: wherever the `genre` column equals a label,
# write 1 into the indicator column of that same name.
for label in GENRES:
    is_label = df['genre'] == label
    df.loc[is_label, label] = 1
df.sample(n=3)

Unnamed: 0,artists,name,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,...,time_signature,classical,country,edm,hip-hop,jazz,pop,rap,rnb,rock
7B5p2XX9Prn5OkI5mye6gE,[Donny McCaslin],Stadium Jazz,jazz,0.434,0.851,4,-8.736,1,0.085,0.0133,...,4,0,0,0,0,1,0,0,0,0
1NNDZc9BHaWe9JqIsPRlNV,[Next],Wifey - Club Mix/Dirty Version,rnb,0.831,0.713,7,-6.967,0,0.104,0.0678,...,4,0,0,0,0,0,0,0,1,0
5OCFyuIk7vF5wE58T5v9jZ,[Marty Stuart],Burn Me Down,country,0.551,0.785,7,-9.405,1,0.0549,0.0204,...,4,0,1,0,0,0,0,0,0,0


In [6]:
# The indicator columns now carry each row's genre, so the original `genre`
# column is redundant and is removed from `df` in place.
df.drop(columns=['genre'], inplace=True)
df

Unnamed: 0,artists,name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,time_signature,classical,country,edm,hip-hop,jazz,pop,rap,rnb,rock
5bu9A6uphPWg39RC3ZKeku,"[Johann Sebastian Bach, Glenn Gould]","Goldberg Variations, BWV 988: Aria",0.4540,0.01390,4,-29.966,0,0.0514,0.995000,0.943000,...,4,1,0,0,0,0,0,0,0,0
1Oo7XEqkrwifJn17aA0ocF,"[Johannes Brahms, Arcadi Volodos]","Sechs Klavierstücke, Op. 118: II. Intermezzo i...",0.2990,0.00626,9,-30.923,1,0.0478,0.994000,0.901000,...,4,1,0,0,0,0,0,0,0,0
1upQiytDIEZfl9ItruoXuC,"[George Frideric Handel, Alexander Briger, Aca...",Handel / Orch. Hale: Keyboard Suite No. 4 in D...,0.0939,0.03360,2,-24.041,0,0.0606,0.927000,0.830000,...,3,1,0,0,0,0,0,0,0,0
47DAIw0cnLLHEXcuUHHC6a,"[Franz Joseph Haydn, Alban Berg Quartett]","Haydn: String Quartet in C Major, Op. 76 No. 3...",0.5000,0.12800,7,-18.042,1,0.0489,0.953000,0.618000,...,4,1,0,0,0,0,0,0,0,0
3DNRdudZ2SstnDCVKFdXxG,"[Ludwig van Beethoven, Paul Lewis]","Sonata No. 14 ""Moonlight"" in C-Sharp Minor"", O...",0.1840,0.00527,1,-37.264,0,0.0432,0.995000,0.887000,...,3,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6vUssnJuFxgw1EFG717Kd5,[The Zombies],This Will Be Our Year [Mono],0.4000,0.51600,9,-9.317,1,0.0316,0.779000,0.000024,...,3,0,0,0,0,0,0,0,0,1
1p50Ir1Hm6Sa1urjaLGi7L,"[Anthrax, Public Enemy]",Bring The Noise,0.6010,0.83100,2,-8.582,1,0.1700,0.000860,0.000000,...,4,0,0,0,0,0,0,0,0,1
7LbfuQVct78YoghmoPtsQ8,[Grateful Dead],Casey Jones - 2013 Remaster,0.6710,0.40500,0,-10.052,1,0.0392,0.385000,0.000000,...,4,0,0,0,0,0,0,0,0,1
3T76zPJz3tWL27FrjJe2ot,[Jack White],Lazaretto,0.3810,0.83800,6,-6.562,0,0.0709,0.004600,0.000038,...,4,0,0,0,0,0,0,0,0,1


In [7]:
# Merge duplicates together: collapse all rows that share an index value into
# a single row of genre indicators.
# `max` is used instead of `sum` so every indicator stays in {0, 1} even when
# the same track shows up more than once under one genre; when no such repeats
# exist the result is the same as summing.
tracks_and_genres = df.groupby(df.index)[GENRES].max()
tracks_and_genres

Unnamed: 0,classical,country,edm,hip-hop,jazz,pop,rap,rnb,rock
003vvx7Niy0yvhvHt4a68B,0,0,0,0,0,0,0,0,1
00762tXbSj2w2bQMVydJZn,0,0,0,1,0,0,0,0,0
00CXUMREit80f2McJsjcIz,1,0,0,0,0,0,0,0,0
00KyYtT6NaXwbPecina5Pj,0,1,0,0,0,0,0,0,0
00Mb3DuaIH1kjrwOku9CGU,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
7zSEUE2ebqU2zS3ibaNw8r,0,0,1,0,0,0,0,0,0
7zVZF0OgDJkFJohnlKwEpa,0,1,0,0,0,0,0,0,0
7zjEyeBsaw9gV0jofJLfOM,0,0,0,0,0,0,1,0,0
7zmleW3XZx0uUsL2CkFuDe,0,0,0,0,0,0,0,1,0


In [8]:
# Get unique tracks with audio features only: keep the first row for every
# index value, then strip the genre indicator columns.
is_repeat = df.index.duplicated(keep='first')
tracks_without_dupes = df.loc[~is_repeat].drop(columns=GENRES)
tracks_without_dupes.sample(n=3)

Unnamed: 0,artists,name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
3kZoay4ANo86ehb6s4RwS9,"[Chamillionaire, Krayzie Bone]",Ridin',0.787,0.799,8,-4.68,0,0.0993,0.189,0.0,0.3,0.835,143.052,303053,4
5DRUgJmwLvCHQjiFzb4LSQ,[Mad Season],I Don't Know Anything,0.341,0.912,6,-4.03,0,0.0393,0.000577,0.0,0.428,0.416,177.051,300240,4
0PG9fbaaHFHfre2gUVo7AN,"[Cardi B, Bruno Mars]",Please Me,0.747,0.57,1,-6.711,1,0.081,0.0642,0.0,0.0832,0.65,133.992,200890,4


In [9]:
# Merge unique tracks with multilabels: inner-join the de-duplicated feature
# rows with the per-track genre indicators, matching on the index of both frames.
all_tracks_cleaned = tracks_without_dupes.merge(
    tracks_and_genres,
    how="inner",
    left_index=True,
    right_index=True,
)
all_tracks_cleaned.sample(n=3)

Unnamed: 0,artists,name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,time_signature,classical,country,edm,hip-hop,jazz,pop,rap,rnb,rock
48fPdAwGVoSkGK8GSez9yx,"[Alison Wonderland, Valentino Khan]",Anything,0.768,0.908,5,-5.014,0,0.0486,0.184,0.546,...,4,0,0,1,0,0,0,0,0,0
0E0EAgEgdVDqzPtouCOqQk,[Joe Henderson],Felicidade,0.683,0.285,9,-16.486,0,0.0472,0.812,0.587,...,4,0,0,0,0,1,0,0,0,0
4MflGTO2ZTcSQ12bWcyRgI,[Andy Grammer],"Honey, I'm Good.",0.752,0.775,9,-7.289,1,0.0546,0.0324,0.0,...,4,0,0,0,0,0,1,0,0,0


In [10]:
# Inspect two previously duplicated tracks and check if more than one genre is listed
inspected_ids = ['1rIKgCH4H52lrvDcz50hS8', '2wxO3ZydJTZRi4Bbc02q7i']
all_tracks_cleaned.loc[inspected_ids]

Unnamed: 0,artists,name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,time_signature,classical,country,edm,hip-hop,jazz,pop,rap,rnb,rock
1rIKgCH4H52lrvDcz50hS8,[Zara Larsson],Lush Life,0.694,0.712,7,-3.923,0,0.046,0.132,0.0,...,4,0,0,1,0,0,1,0,0,0
2wxO3ZydJTZRi4Bbc02q7i,"[Scarface, Johnny P, 2Pac]",Smile,0.7,0.402,5,-9.93,0,0.278,0.0498,1e-06,...,4,0,0,0,1,0,0,1,0,0


In [11]:
# Persist the merged frame (features plus genre indicators) as a pickle.
cleaned_path = "../data/spotify_dataset_all.pkl"
all_tracks_cleaned.to_pickle(cleaned_path)