In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
sns.set()

Read data

In [2]:
tracks = pd.read_pickle("../../data/spotify_dataset_all.pkl")

Remove unnecessary variables

In [3]:
cleaned_tracks = tracks.drop(['artists', 'name'], axis=1)
cleaned_tracks.head(n=1)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,time_signature,classical,country,edm,hip-hop,jazz,pop,rap,rnb,rock
5bu9A6uphPWg39RC3ZKeku,0.454,0.0139,4,-29.966,0,0.0514,0.995,0.943,0.0736,0.244,...,4,1,0,0,0,0,0,0,0,0


Normalise

In [4]:
GENRES = ['classical', 'country', 'edm', 'hip-hop', 'jazz', 'pop', 'rap', 'rnb', 'rock']

In [5]:
normaliser = MinMaxScaler()

In [13]:
cleaned_tracks_no_genres = cleaned_tracks.drop(GENRES, axis=1)
normalised_tracks = normaliser.fit_transform(cleaned_tracks_no_genres)
cleaned_tracks_no_genres = pd.DataFrame(normalised_tracks, columns=cleaned_tracks_no_genres.columns).set_index(cleaned_tracks_no_genres.index)
cleaned_tracks = pd.merge(cleaned_tracks_no_genres, cleaned_tracks[GENRES], left_index=True, right_index=True)

cleaned_tracks.sample(n=3)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,time_signature,classical,country,edm,hip-hop,jazz,pop,rap,rnb,rock
3xKsGYkJKy0bbQuUHRYrei,0.388286,0.970916,0.454545,0.886077,1.0,0.199947,0.000324,0.000686,0.081935,0.505877,...,0.75,0,0,0,0,0,0,0,0,1
3yXzqYnNtLHXDOuTjZonXL,0.645336,0.562739,0.909091,0.83103,0.0,0.647775,0.70281,0.0,0.154755,0.367523,...,0.75,0,0,0,0,0,0,0,1,0
53QF56cjZA9RTuuMZDrSA6,0.457701,0.302989,0.363636,0.754261,1.0,0.05429,0.696786,0.0,0.105034,0.119942,...,0.5,0,0,0,0,0,1,0,0,0


Lookup proportions

In [14]:
counts = {}

for genre in GENRES:
    count = cleaned_tracks.loc[cleaned_tracks[genre] == 1].shape[0]
    counts[genre] = count

counts

{'classical': 432,
 'country': 608,
 'edm': 980,
 'hip-hop': 969,
 'jazz': 499,
 'pop': 1220,
 'rap': 885,
 'rnb': 735,
 'rock': 1136}

Resample to get equal counts of all genres

In [9]:
resampled_df = pd.DataFrame()

for genre in GENRES:
    genre_df = cleaned_tracks.loc[cleaned_tracks[genre] == 1].sample(n=420)
    resampled_df = pd.concat([resampled_df, genre_df])

resampled_df.sample(n=9)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,time_signature,classical,country,edm,hip-hop,jazz,pop,rap,rnb,rock
1SHixA41Vy0TTlJJSX9Gbo,0.317787,0.207714,0.909091,0.581753,1.0,0.043432,0.951807,0.934827,0.068987,0.31551,...,0.75,1,0,0,0,0,0,0,0,0
5KBUiiDRPsocZlY2v3EtRk,0.872017,0.641967,0.909091,0.830538,0.0,0.369703,0.061543,0.000108,0.211726,0.766982,...,0.75,0,0,0,1,0,0,0,0,0
5IUtvfNvOyVYZUa6AJFrnP,0.781996,0.510588,0.363636,0.85132,0.0,0.051642,0.143571,0.0,0.105034,0.098096,...,0.75,0,0,0,0,0,1,0,0,0
3gEm37rTZf4mf3xmr3aTZp,0.073753,0.003786,0.363636,0.281034,1.0,0.050847,0.963855,0.63442,0.061115,0.0,...,0.75,1,0,0,0,0,0,0,0,0
4pDU8T52yTGZutyDQd8qSF,0.404555,0.482507,0.454545,0.830336,1.0,0.028072,0.639557,0.0,0.178579,0.182357,...,0.75,0,1,0,0,0,0,0,0,0
51rXHuKN8Loc4sUlKPODgH,0.633406,0.706152,0.090909,0.867355,1.0,0.732521,0.000813,0.0,0.1185,0.366483,...,0.75,0,0,0,1,0,0,1,0,0
2TScNFVGQ6PQS9cGLMyULI,0.364425,0.419325,0.181818,0.7651,1.0,0.025159,0.374496,0.0,0.284235,0.708728,...,0.75,0,1,0,0,0,0,0,0,0
0BKNINLyNmQUvs9biD0ynB,0.366594,0.335082,0.909091,0.728506,0.0,0.088718,0.830321,0.763747,0.110213,0.301987,...,0.75,0,0,0,0,1,0,0,0,0
7C5egNfLY7qELLhPnQIFtP,0.112798,0.168601,0.818182,0.410028,1.0,0.046081,0.995984,0.954175,0.079967,0.087694,...,1.0,1,0,0,0,0,0,0,0,0


Prepare data for models

In [10]:
X = resampled_df.drop(GENRES, axis=1).to_numpy()
y = resampled_df.loc[:, GENRES].to_numpy()

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Models

In [12]:
from skmultilearn.problem_transform import LabelPowerset
from sklearn.linear_model import LogisticRegression

cls = LabelPowerset(LogisticRegression(max_iter=400))

cls.fit(X_train, y_train)

pred = cls.predict(X_test)

accuracy_score(y_test, pred)

0.49603174603174605