In [1]:
import spotipy
import spotipy.util as util
import spotipy.oauth2 as oauth2

In [2]:
# Spotify user id; passed to sp.user_playlists to list this account's playlists
username = '1223827041'

In [9]:
# The OAuth token is requested at run time rather than stored in the notebook, so no
# account secrets end up in the file. spotipy picks up the app credentials from the
# SPOTIPY_CLIENT_ID, SPOTIPY_CLIENT_SECRET and SPOTIPY_REDIRECT_URI environment variables.
token = util.prompt_for_user_token(username)
if not token:
    # prompt_for_user_token returns None when authorization does not complete
    raise RuntimeError('Could not obtain a Spotify OAuth token for user ' + username)

sp = spotipy.Spotify(auth=token)
playlists = sp.user_playlists(username)

In [12]:
# I will be dealing with two playlists worth of songs
# My goal is to predict which playlist a given song would be on
# In other words, I am predicting the genre of that song
# show the name and id of the first two playlists returned for this user
for playlist in playlists['items'][:2]:
    playlist_name = playlist['name']
    playlist_id = playlist['id']
    print(playlist_name, playlist_id)

Hot Country 37i9dQZF1DX1lVhptIYRda
RapCaviar 37i9dQZF1DX0XUsuxWHRQd


In [13]:
# ids and names of the first two playlists, kept in the same order in both lists
selected_playlists = playlists['items'][:2]
playlist_ids = [entry['id'] for entry in selected_playlists]
playlist_labels = [entry['name'] for entry in selected_playlists]

In [14]:
# defining functions for our different labels
def genre2label(genre):
    """Return the integer class label for a playlist (genre) name.

    'Hot Country' maps to 0 and 'RapCaviar' maps to 1; any other name
    raises KeyError.
    """
    known_genres = ('Hot Country', 'RapCaviar')
    labels_by_genre = {name: position for position, name in enumerate(known_genres)}
    return labels_by_genre[genre]

def label2genre(label):
    """Return the playlist (genre) name for an integer class label.

    0 maps to 'Hot Country' and 1 maps to 'RapCaviar'; any other label
    raises KeyError.
    """
    known_genres = ('Hot Country', 'RapCaviar')
    # dict lookup (not tuple indexing) so unknown labels such as -1 raise KeyError
    genres_by_label = dict(enumerate(known_genres))
    return genres_by_label[label]

In [15]:
from tqdm import tqdm_notebook as tqdm
import pandas as pd

# Spotify returns more fields per song than these;
# only the ones listed here are used as model features (order = column order)
desired_features = [
    'acousticness',
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]

# this function iterates through the tracks of playlist
# and returns a dataframe containing all the values of the feature categories
def get_playlist_features(playlist_id, genre, desired_features):
    """Collect the audio features of every track on a playlist.

    Parameters
    ----------
    playlist_id : str
        Id of the playlist to read; it is looked up under the 'spotify' user.
    genre : str
        Playlist name; converted with ``genre2label`` into the value stored
        in the 'label' column. Unknown names raise KeyError before any
        request is made.
    desired_features : list of str
        Keys to pull out of each track's audio-features record, in the
        order they should appear as columns.

    Returns
    -------
    pandas.DataFrame
        One row per track for which audio features were returned, with
        columns ``desired_features + ['label']``. The frame is empty (but
        still has those columns) when no track yields features.
    """
    label = genre2label(genre)

    # The tracks endpoint is paged; follow the 'next' links so playlists
    # longer than one page are not silently truncated.
    page = sp.user_playlist_tracks('spotify', playlist_id)
    tracks = list(page['items'])
    while page.get('next'):
        page = sp.next(page)
        tracks.extend(page['items'])

    features = []
    for item in tqdm(tracks):
        track = item.get('track')
        # Entries without a track object or a track id cannot be looked up; skip them.
        if not track or not track.get('id'):
            continue
        audio_features = sp.audio_features(track['id'])[0]
        if audio_features:
            row = [audio_features[key] for key in desired_features]
            # the class label is always the last column
            row.append(label)
            features.append(row)

    # Passing the columns to the constructor also works when `features` is empty,
    # whereas assigning df.columns afterwards raises a length-mismatch error.
    return pd.DataFrame(features, columns=list(desired_features) + ['label'])

In [16]:
# Get the song features of each playlist and combine them into a single dataframe.
# The per-playlist frames are collected first and concatenated once, instead of
# repeatedly concatenating onto an empty seed frame inside the loop.
playlist_frames = []
for playlist_id, playlist_label in zip(playlist_ids, playlist_labels):
    print(playlist_label)
    playlist_frames.append(
        get_playlist_features(playlist_id, playlist_label, desired_features))

# row indices restart at 0 for each playlist here; they are not renumbered by this step
data = pd.concat(playlist_frames)

Hot Country

RapCaviar



In [17]:
# discard the existing row index (drop=True keeps it from becoming a column)
# and renumber the rows of the combined frame from 0
data = data.reset_index(drop=True)
# (rows, columns) of the combined dataset
data.shape

(100, 8)

In [19]:
# due to the small number of songs and features,
# I will use a simple model, a Decision Tree,
# to try and predict the genre of a song given its audio features

# if we had more data, perhaps something like a Neural Network would be appropriate
# but here I show that accurate predictions can still be made given a small amount of data

# a decision tree is useful in that the "why" of the prediction can be explicitly analyzed,
# whereas a neural network is more of a black box and it is unknown how the model may have made certain predictions

from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# fixed seed so the split, the fitted tree and the reported accuracy are reproducible
RANDOM_STATE = 42

# select features and target by column name rather than by position
X = data[desired_features]
y = data['label']

# hold out 20% of the songs for testing; stratify so both playlists stay
# represented in the same proportion in the train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

dec_tree = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)
dec_tree.fit(X_train, y_train)
preds = dec_tree.predict(X_test)

# fraction of held-out songs assigned to the correct playlist
print(accuracy_score(y_test, preds))

0.95


In [20]:
# creating a visual model of the decision tree (Graphviz .dot file)
# class_names must follow the classifier's own class order, so the names are
# derived from dec_tree.classes_ instead of relying on the playlist listing order
tree_class_names = [label2genre(class_label) for class_label in dec_tree.classes_]
tree.export_graphviz(dec_tree, out_file='dec_tree.dot', feature_names=desired_features,
                     class_names=tree_class_names, filled=True, impurity=False)