# CS109A Final Project - Spotify Baseline Model

## Introduction - Features

In [11]:
import pandas as pd
import os
import glob
import csv
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
#import pd.rpy.common as com
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.cm as cm
import random


%matplotlib inline

In [12]:
# get path to the folder
DATA_PATH = "../Songs_features/"
All_Files = glob.glob('{}*.csv'.format(DATA_PATH))

# Train Set

In [13]:
train_files, test_files = train_test_split(All_Files, test_size=0.2, random_state=42)
params = ["duration_ms", "danceability", "energy", "key", "mode", "instrumentalness", "liveness", "valence", "tempo", "popularity"]
count = 0
for n,file in enumerate(train_files[0:100]):
    df_songs_temp = pd.read_csv(file)
    df_songs_temp = df_songs_temp.dropna()
    
    df_songs_playlist_ids = list(set(df_songs_temp['pid']))
    
    for play_id in df_songs_playlist_ids:
        df_songs_temp_restricted = df_songs_temp[df_songs_temp["pid"] == play_id][params].copy()
        if count == 0:
            playlist_mean_params = [df_songs_temp_restricted.mean(axis=0).values]
            count += 1
        else:
            playlist_mean_params = np.append(playlist_mean_params,[df_songs_temp_restricted.mean(axis=0).values],axis=0)
    # check execution
    if (n+1) % 50 == 0:
        print((n+1), "files done!")


50 files done!
100 files done!


# Create labels for Playlists in train set

In [14]:
clusterer = KMeans(n_clusters=4, random_state=0)
clusterer.fit(playlist_mean_params)
cluster_labels = clusterer.fit_predict(playlist_mean_params)

# Create labels for Songs contained in Playlists in train set

In [15]:
params2 = ["track_uri", "duration_ms", "danceability", "energy", "key", "mode", "instrumentalness", "liveness", "valence", "tempo", "popularity"]
count = 0
for n,file in enumerate(train_files[0:10]):
    df_songs_temp = pd.read_csv(file)
    df_songs_temp = df_songs_temp.dropna()
    
    df_songs_playlist_ids = list(set(df_songs_temp['pid']))
    
    for play_id in df_songs_playlist_ids:

        df_songs_temp_restricted = df_songs_temp[df_songs_temp["pid"] == play_id][params2].copy()
        df_songs_temp_restricted["cluster_id"] = cluster_labels[n]
        if count == 0:
            full_frame = df_songs_temp_restricted.copy()
            count += 1
        else:
            full_frame = pd.concat([full_frame,df_songs_temp_restricted])
    # check execution
    print((n+1), "files done!")

1 files done!
2 files done!
3 files done!
4 files done!
5 files done!
6 files done!
7 files done!
8 files done!
9 files done!
10 files done!


# Fit a Random Forest Classifier on the Song contained in training Playlists

In [16]:
X_train = full_frame[params]
y_train = full_frame["cluster_id"]

clf = RandomForestClassifier(n_estimators=50, max_depth=20, max_features='sqrt')
clf.fit(X_train,y_train)

#compute accuracy
random_forest_train_score = clf.score(X_train, y_train)
print("Accuracy on train set",random_forest_train_score)

Accuracy on train set 0.6117055580282125


# Test set

In [17]:
count = 0
for n,file in enumerate(test_files[0:1]):
    df_songs_temp = pd.read_csv(file)
    df_songs_temp = df_songs_temp.dropna()
    
    df_songs_playlist_ids = list(set(df_songs_temp['pid']))
    
    for play_id in df_songs_playlist_ids:
        df_songs_temp_restricted = df_songs_temp[df_songs_temp["pid"] == play_id][params].copy()
        if count == 0:
            playlist_mean_params_test = [df_songs_temp_restricted.mean(axis=0).values]
            count += 1
        else:
            playlist_mean_params_test = np.append(playlist_mean_params_test,[df_songs_temp_restricted.mean(axis=0).values],axis=0)
    # check execution
    if (n+1) % 100 == 0:
        print((n+1), "files done!")

# Create Labels for Playlists and hence for songs contained in playlist using KMeans for the train set

In [18]:
cluster_labels_test = clusterer.predict(playlist_mean_params_test)

# Create labels for Songs contained in Playlists in test set

In [28]:
params2 = ["track_uri", "duration_ms", "danceability", "energy", "key", "mode", "instrumentalness", "liveness", "valence", "tempo", "popularity"]
count = 0
for n,file in enumerate(test_files[0:1]):
    df_songs_temp = pd.read_csv(file)
    df_songs_temp = df_songs_temp.dropna()
    
    df_songs_playlist_ids = list(set(df_songs_temp['pid']))
    
    for play_id in df_songs_playlist_ids:
        df_songs_temp_restricted = df_songs_temp[df_songs_temp["pid"] == play_id][params2].copy()
        df_songs_temp_restricted["cluster_id"] = cluster_labels[n]
        if count == 0:
            full_frame_test = df_songs_temp_restricted.copy()
            count += 1
        else:
            full_frame_test = pd.concat([full_frame_test,df_songs_temp_restricted])
    
    # check execution
print((n+1), "files done!")

1 files done!


# Predict Song labels in Test Playlist using Random Forest Classifier

In [29]:
X_test = full_frame_test[params]
y_test = full_frame_test["cluster_id"]

#compute accuracy
random_forest_test_score = clf.score(X_test, y_test)
print("Accuracy on test set",random_forest_test_score)


Accuracy on test set 0.9930741335235718
