# CS109A Final Project - Spotify Baseline Model

## Introduction - Features

In [86]:
import pandas as pd
import os
import glob
import csv
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
#import pd.rpy.common as com
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.cm as cm
import random


%matplotlib inline

In [3]:
# get path to the folder
DATA_PATH = "../Songs_features/"
All_Files = glob.glob('{}*.csv'.format(DATA_PATH))

# Train Set

In [113]:
train_files, test_files = train_test_split(All_Files, test_size=0.2, random_state=42)
params = ["duration_ms", "danceability", "energy", "key", "mode", "instrumentalness", "liveness", "valence", "tempo", "popularity"]
count = 0
for n,file in enumerate(train_files):
    df_songs_temp = pd.read_csv(file)
    df_songs_temp = df_songs_temp.dropna()
    
    df_songs_playlist_ids = list(set(df_songs_temp['pid']))
    
    for play_id in df_songs_playlist_ids:
        df_songs_temp_restricted = df_songs_temp[df_songs_temp["pid"] == play_id][params].copy()
        
        if play_id == 0:
            playlist_mean_params_temp = [df_songs_temp_restricted.mean(axis=0).values]
        else:
            playlist_mean_params_temp = np.append(playlist_mean_params_temp,[df_songs_temp_restricted.mean(axis=0).values],axis=0)
        
        if play_id == len(df_songs_playlist_ids):
            if count == 0:
                playlist_mean_params = playlist_mean_params_temp.copy()
                count += 1
            else:
                playlist_mean_params = np.append(playlist_mean_params,[playlist_mean_params_temp],axis=0)
        
        #if count == 0:
        #    playlist_mean_params = [df_songs_temp_restricted.mean(axis=0).values]
        #    count += 1
        #else:
        #    playlist_mean_params = np.append(playlist_mean_params,[df_songs_temp_restricted.mean(axis=0).values],axis=0)
    # check execution
    if (n+1) % 100 == 0:
        print((n+1), "files done!")

100 files done!
200 files done!
300 files done!
400 files done!
500 files done!
600 files done!
700 files done!
800 files done!


# Create labels for Playlists in train set

In [114]:
clusterer = KMeans(n_clusters=4, random_state=0)
clusterer.fit(playlist_mean_params)
cluster_labels = clusterer.fit_predict(playlist_mean_params)

# Create labels for Songs contained in Playlists in train set

In [115]:
params2 = ["track_uri", "duration_ms", "danceability", "energy", "key", "mode", "instrumentalness", "liveness", "valence", "tempo", "popularity"]
count = 0
for n,file in enumerate(train_files):
    df_songs_temp = pd.read_csv(file)
    df_songs_temp = df_songs_temp.dropna()
    
    df_songs_playlist_ids = list(set(df_songs_temp['pid']))
    
    for play_id in df_songs_playlist_ids:

        df_songs_temp_restricted = df_songs_temp[df_songs_temp["pid"] == play_id][params2].copy()
        df_songs_temp_restricted["cluster_id"] = cluster_labels[n]
        if play_id == 0:
            full_frame_temp = df_songs_temp_restricted.copy()
        else:
            full_frame_temp = pd.concat([full_frame_temp,df_songs_temp_restricted])
            
        if play_id == len(df_songs_playlist_ids):
            if count == 0:
                full_frame = full_frame_temp.copy()
                count += 1
            else:
                full_frame = pd.concat([full_frame,full_frame_temp])
    # check execution
    if (n+1) % 100 == 0:
        print((n+1), "files done!")

100 files done!
200 files done!
300 files done!
400 files done!
500 files done!
600 files done!
700 files done!
800 files done!


# Fit a Random Forest Classifier on the Song contained in training Playlists

In [116]:
X_train = full_frame[params]
y_train = full_frame["cluster_id"]

clf = RandomForestClassifier(n_estimators=50, max_depth=20, max_features='sqrt')
clf.fit(X_train,y_train)

#compute accuracy
random_forest_train_score = clf.score(X_train, y_train)
print("Accuracy on train set",random_forest_train_score)

Accuracy on train set 1.0


# Test Set

In [117]:
train_files, test_files = train_test_split(All_Files, test_size=0.2, random_state=42)
params = ["duration_ms", "danceability", "energy", "key", "mode", "instrumentalness", "liveness", "valence", "tempo", "popularity"]
count = 0
for n,file in enumerate(test_files):
    df_songs_temp = pd.read_csv(file)
    df_songs_temp = df_songs_temp.dropna()
    
    df_songs_playlist_ids = list(set(df_songs_temp['pid']))
    
    for play_id in df_songs_playlist_ids:
        df_songs_temp_restricted = df_songs_temp[df_songs_temp["pid"] == play_id][params].copy()
        

        if play_id == 0:
            playlist_mean_params_temp = [df_songs_temp_restricted.mean(axis=0).values]
        else:
            playlist_mean_params_temp = np.append(playlist_mean_params_temp,[df_songs_temp_restricted.mean(axis=0).values],axis=0)
        
        if play_id == len(df_songs_playlist_ids):
            if count == 0:
                playlist_mean_params_test = playlist_mean_params_temp.copy()
                count += 1
            else:
                playlist_mean_params_test = np.append(playlist_mean_params_test,[playlist_mean_params_temp],axis=0)
    # check execution
    if (n+1) % 50 == 0:
        print((n+1), "files done!")

50 files done!
100 files done!
150 files done!
200 files done!


# Create Labels for Playlists and hence for songs contained in playlist using KMeans for the train set

In [118]:
cluster_labels_test = clusterer.predict(playlist_mean_params_test)

# Create labels for Songs contained in Playlists in test set

In [119]:
params2 = ["track_uri", "duration_ms", "danceability", "energy", "key", "mode", "instrumentalness", "liveness", "valence", "tempo", "popularity"]
count = 0
for n,file in enumerate(test_files):
    df_songs_temp = pd.read_csv(file)
    df_songs_temp = df_songs_temp.dropna()
    
    df_songs_playlist_ids = list(set(df_songs_temp['pid']))
    
    for play_id in df_songs_playlist_ids:
        df_songs_temp_restricted = df_songs_temp[df_songs_temp["pid"] == play_id][params2].copy()
        df_songs_temp_restricted["cluster_id"] = cluster_labels[n]
        if play_id == 0:
            full_frame_temp = df_songs_temp_restricted.copy()
        else:
            full_frame_temp = pd.concat([full_frame_temp,df_songs_temp_restricted])
        
        if play_id == len(df_songs_playlist_ids):
            if count == 0:
                full_frame_test = full_frame_temp.copy()
                count += 1
            else:
                full_frame_test = pd.concat([full_frame_test,full_frame_temp])
    # check execution
    if (n+1) % 50 == 0:
        print((n+1), "files done!")

50 files done!
100 files done!
150 files done!
200 files done!


# Predict Song labels in Test Playlist using Random Forest Classifier

In [120]:
X_test = full_frame_test[params]
y_test = full_frame_test["cluster_id"]

#compute accuracy
random_forest_test_score = clf.score(X_test, y_test)
print("Accuracy on test set",random_forest_test_score)

Accuracy on test set 1.0
