In [2]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate
import scipy.stats as sps

from sklearn import tree
## Update to move

In [3]:
songs = pd.read_csv('../data/SpotifyCleaned.csv')
songs.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Opera,Giuseppe Verdi,"Stiffelio, Act III: Ei fugge! … Lina, pensai c...",7EsKYeHtTc4H4xWiTqSVZA,21,0.986,0.313,490867,0.231,0.000431,C#,0.0964,-14.287,Major,0.0547,86.001,4/4,0.0886
1,Opera,Giacomo Puccini,Madama Butterfly / Act 1: ... E soffitto e pareti,7MfmRBvqaW0I6UTxXnad8p,18,0.972,0.36,176797,0.201,0.028,D#,0.133,-19.794,Major,0.0581,131.798,4/4,0.369
2,Opera,Giacomo Puccini,"Turandot / Act 2: Gloria, gloria, o vincitore",7pBo1GDhIysyUMFXiDVoON,10,0.935,0.168,266184,0.47,0.0204,C,0.363,-8.415,Major,0.0383,75.126,3/4,0.0696
3,Opera,Giuseppe Verdi,"Rigoletto, Act IV: Venti scudi hai tu detto?",02mvYZX5aKNzdqEo6jF20m,17,0.961,0.25,288573,0.00605,0.0,D,0.12,-33.44,Major,0.048,76.493,4/4,0.038
4,Opera,Giuseppe Verdi,"Don Carlo / Act 4: ""Ella giammai m'amò!""",03TW0jwGMGhUabAjOpB1T9,19,0.985,0.142,629760,0.058,0.146,D,0.0969,-23.625,Major,0.0493,172.935,4/4,0.0382


In [4]:
songs_class = songs.copy(deep = True)
songs_class['is_popular'] = 0

## Tunable Hyperparameter
popularity_threshold = 60

songs_class.loc[songs_class.popularity > popularity_threshold, "is_popular"] = 1

In [5]:
category_columns = ['genre', 'artist_name', 'track_name', 'key', 'mode', 'time_signature']

for label in category_columns:
    songs_class[label] = LabelEncoder().fit(songs_class[label]).transform(songs_class[label])

In [6]:
features = ['genre', 'artist_name', 'track_name',
       'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence']

X = songs_class[features]

y = songs_class.is_popular

In [7]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=131) # 80% training and 20% test

In [8]:
#Tunable Hyperparameters
n_estimators = [25, 30, 35, 40, 45] # 100 Default
max_depth = [15, 20, 25, 30] # None for unlimited (default) #n_features = 16 
# Bootstrapping = True found to consistently slightly increase prediction probability

In [9]:
for i in n_estimators:
    for j in max_depth:          
        # Create Decision Tree classifer object
        clf = RandomForestClassifier(n_estimators = i, max_depth = j,
                         criterion="entropy", random_state = 131)

        # Train Decision Tree Classifer
        clf = clf.fit(X_train,y_train)

        #Predict the response for test dataset
        y_pred = clf.predict(X_test)

        print("n_estimators = ", i , ', max_depth = ', j , ', Accuracy: ', metrics.accuracy_score(y_test, y_pred))

n_estimators =  25 , max_depth =  15 , Accuracy:  0.9326869896216287
n_estimators =  25 , max_depth =  20 , Accuracy:  0.9329147281777662
n_estimators =  25 , max_depth =  25 , Accuracy:  0.9330448644955591
n_estimators =  25 , max_depth =  30 , Accuracy:  0.9326219214627323
n_estimators =  30 , max_depth =  15 , Accuracy:  0.9326544555421804
n_estimators =  30 , max_depth =  20 , Accuracy:  0.9327195237010769
n_estimators =  30 , max_depth =  25 , Accuracy:  0.9332075348928002
n_estimators =  30 , max_depth =  30 , Accuracy:  0.9328496600188698
n_estimators =  35 , max_depth =  15 , Accuracy:  0.9326544555421804
n_estimators =  35 , max_depth =  20 , Accuracy:  0.9332726030516967
n_estimators =  35 , max_depth =  25 , Accuracy:  0.933175000813352
n_estimators =  35 , max_depth =  30 , Accuracy:  0.9329797963366626
n_estimators =  40 , max_depth =  15 , Accuracy:  0.9326869896216287
n_estimators =  40 , max_depth =  20 , Accuracy:  0.9331424667339038
n_estimators =  40 , max_depth =  2

In [10]:
## The Strongest Predictor is n = 30, depth = 20, with accuracy 92.73,
## but values hovering around that area also produce similar results
print('Best number of estimators: 30')
print('Best Maximum Depth: 20')
print('Bootstraping consistently out performs not bootstrapping')

Best number of estimators: 30
Best Maximum Depth: 20
Bootstraping consistently out performs not bootstrapping


The Strongest Predictor is n = 35, depth = 20, with accuracy 93.33, but values hovering around that area also produce similar results Best number of estimators: 35 Best Maximum Depth: 20 Bootstraping consistently out performs not bootstrapping

In [11]:
# The Regressor requires different Labels

yi_test = pd.DataFrame(y_test)
yr_test = yi_test.join(songs_class['popularity']).iloc[:, 1]

yi_train = pd.DataFrame(y_train)
yr_train = yi_train.join(songs_class['popularity']).iloc[:, 1]

In [12]:
#Tunable Hyperparameters
n_estimators1 = [150, 250, 350] # 100 Default
max_depth1 = [15, 20, 25] # None for unlimited (default) #n_features = 16 
# Bootstrapping = True found to consistently slightly increase prediction probability

In [13]:
for i in n_estimators1:
    for j in max_depth1:          
        # Create Decision Tree classifer object
        reg = RandomForestRegressor(n_estimators = i, max_depth = j, random_state = 131)

        # Train Decision Tree Classifer
        reg = reg.fit(X_train,yr_train)

        #Predict the response for test dataset
        #y_pred = reg.predict(X_test)

        print("n_estimators = ", i , ', max_depth = ', j , ', Accuracy: ', reg.score(X_test, yr_test))

n_estimators =  150 , max_depth =  15 , Accuracy:  0.7426836623517061
n_estimators =  150 , max_depth =  20 , Accuracy:  0.7467652753058265
n_estimators =  150 , max_depth =  25 , Accuracy:  0.7466652679812829


KeyboardInterrupt: 