In [4]:
import time
import json
import pandas as pd
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB, CategoricalNB

from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from keras.wrappers.scikit_learn import KerasClassifier
#from scikeras.wrappers import KerasClassifier

In [19]:
def cross_val_analysis(model, x_train, y_train, k = 10, output = True):
    kfold = StratifiedKFold(n_splits=k, shuffle=True)
    results = cross_val_score(model, x_train, y_train, cv=kfold)
    if output:
        print("Cross-Validation Score Standardized: %.2f%% (%.2f%%)" %
            (results.mean() * 100, results.std() * 100))
    return (results.mean() * 100)

In [9]:
#trainingData = pd.DataFrame("DadosSpotify.csv")

## **PREPARACAO**



In [10]:
trainingData = pd.read_csv("DadosSpotify.csv") 

In [11]:
trainingData

Unnamed: 0,id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,target,song_title,artist
0,0,0.01020,0.833,204600,0.434,0.021900,2,0.1650,-8.795,1,0.4310,150.062,4.0,0.286,1,Mask Off,Future
1,1,0.19900,0.743,326933,0.359,0.006110,1,0.1370,-10.401,1,0.0794,160.083,4.0,0.588,1,Redbone,Childish Gambino
2,2,0.03440,0.838,185707,0.412,0.000234,2,0.1590,-7.148,1,0.2890,75.044,4.0,0.173,1,Xanny Family,Future
3,3,0.60400,0.494,199413,0.338,0.510000,5,0.0922,-15.236,1,0.0261,86.468,4.0,0.230,1,Master Of None,Beach House
4,4,0.18000,0.678,392893,0.561,0.512000,5,0.4390,-11.648,0,0.0694,174.004,4.0,0.904,1,Parallel Lines,Junior Boys
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012,2012,0.00106,0.584,274404,0.932,0.002690,1,0.1290,-3.501,1,0.3330,74.976,4.0,0.211,0,Like A Bitch - Kill The Noise Remix,Kill The Noise
2013,2013,0.08770,0.894,182182,0.892,0.001670,1,0.0528,-2.663,1,0.1310,110.041,4.0,0.867,0,Candy,Dillon Francis
2014,2014,0.00857,0.637,207200,0.935,0.003990,0,0.2140,-2.467,1,0.1070,150.082,4.0,0.470,0,Habit - Dack Janiels & Wenzday Remix,Rain Man
2015,2015,0.00164,0.557,185600,0.992,0.677000,1,0.0913,-2.735,1,0.1330,150.011,4.0,0.623,0,First Contact,Twin Moons


In [12]:
train, test = train_test_split(trainingData, test_size = 0.15)

In [13]:
target_features = ["danceability", "loudness", "valence", "energy", "instrumentalness", "acousticness", "key", "speechiness", "duration_ms"]

x_total = trainingData[target_features]
y_total = trainingData["target"]
x_train = train[target_features]
y_train = train["target"]
x_test = test[target_features]
y_test = test["target"]

In [14]:
x_train


Unnamed: 0,danceability,loudness,valence,energy,instrumentalness,acousticness,key,speechiness,duration_ms
1305,0.406,-4.564,0.260,0.936,0.242000,0.000168,2,0.1100,180872
1615,0.623,-2.632,0.247,0.975,0.160000,0.007320,3,0.1090,192000
656,0.375,-5.673,0.394,0.804,0.694000,0.000258,7,0.0372,212947
314,0.446,-6.088,0.585,0.752,0.000000,0.017400,6,0.3680,241800
615,0.798,-10.488,0.640,0.481,0.000000,0.030200,7,0.3140,216585
...,...,...,...,...,...,...,...,...,...
1740,0.685,-6.939,0.806,0.872,0.000000,0.007990,7,0.0321,209560
1970,0.549,-6.282,0.462,0.576,0.000002,0.036400,9,0.0263,202787
1284,0.370,-4.774,0.117,0.723,0.000000,0.032600,0,0.0364,235227
1425,0.501,-6.248,0.615,0.775,0.000000,0.027300,7,0.4360,269720


In [15]:
y_train

1305    0
1615    0
656     1
314     1
615     1
       ..
1740    0
1970    0
1284    0
1425    0
927     1
Name: target, Length: 1714, dtype: int64

In [16]:
x_test

Unnamed: 0,danceability,loudness,valence,energy,instrumentalness,acousticness,key,speechiness,duration_ms
2001,0.484,-5.652,0.2660,0.609,0.000000,0.002620,11,0.0303,224320
788,0.564,-5.897,0.6950,0.846,0.001750,0.006920,4,0.0437,212986
228,0.745,-5.754,0.9460,0.857,0.200000,0.000686,0,0.0393,384907
189,0.633,-7.672,0.3570,0.492,0.000470,0.048800,2,0.0524,224720
345,0.636,-6.195,0.4440,0.720,0.017800,0.089300,11,0.0432,235813
...,...,...,...,...,...,...,...,...,...
1033,0.726,-6.846,0.6660,0.595,0.000433,0.148000,11,0.0393,231173
191,0.430,-4.816,0.2200,0.436,0.000000,0.170000,8,0.1190,154414
704,0.743,-8.224,0.3980,0.633,0.000000,0.007230,9,0.2350,259683
542,0.354,-10.304,0.0745,0.228,0.800000,0.854000,0,0.0296,476164


In [17]:
y_test

2001    0
788     1
228     1
189     1
345     1
       ..
1033    0
191     1
704     1
542     1
1687    0
Name: target, Length: 303, dtype: int64

## Model Evaluation

**Random Forest**

In [20]:
# random forest combines multiple decision trees and uses bagging
# (training multiple trees on different sections of training data, averaging the result)
rfc = RandomForestClassifier(n_estimators=400, random_state=1)
ts_train_start = time.time()
rfc.fit(x_train, y_train)
ts_train_end = time.time()

ts_inference_start = ts_train_end
rfc_pred = rfc.predict(x_test)
ts_inference_end = time.time()

score = accuracy_score(y_test, rfc_pred) * 100
print("Accuracy using Random Forest: ", round(score, 2), "%")
print('Mean squared error: %.2f' % mean_squared_error(y_test, rfc_pred))
print("Time Spent: {}sec total, {}sec training, {}sec inference".format(
    round(ts_inference_end - ts_train_start, 3), round(ts_train_end - ts_train_start, 3), round(ts_inference_end - ts_inference_start, 3)))
ts_cv_start = time.time()
cv_score = cross_val_analysis(rfc, x_total, y_total)
ts_cv_end = time.time()
print("Time Spent (Cross-Val): {}sec total".format(round(ts_cv_end - ts_cv_start, 3)))

Accuracy using Random Forest:  74.92 %
Mean squared error: 0.25
Time Spent: 2.664sec total, 2.586sec training, 0.078sec inference
Cross-Validation Score Standardized: 77.79% (2.89%)
Time Spent (Cross-Val): 21.09sec total


**Gradient Boosting**

In [21]:
# I <3 gradient boost
gbc = GradientBoostingClassifier(n_estimators=400, learning_rate=.1, max_depth=2, random_state=1)
ts_train_start = time.time()
gbc.fit(x_train, y_train)
ts_train_end = time.time()

ts_inference_start = ts_train_end
gbc_pred = gbc.predict(x_test)
ts_inference_end = time.time()

score = accuracy_score(y_test, gbc_pred) * 100
print("Accuracy using Gradient Boost: ", round(score, 2), "%")
print('Mean squared error: %.2f' % mean_squared_error(y_test, gbc_pred))
print("Time Spent: {}sec total, {}sec training, {}sec inference".format(
    round(ts_inference_end - ts_train_start, 3), round(ts_train_end - ts_train_start, 3), round(ts_inference_end - ts_inference_start, 3)))
ts_cv_start = time.time()
cv_score = cross_val_analysis(gbc, x_total, y_total)
ts_cv_end = time.time()
print("Time Spent (Cross-Val): {}sec total".format(round(ts_cv_end - ts_cv_start, 3)))

Accuracy using Gradient Boost:  73.27 %
Mean squared error: 0.27
Time Spent: 2.196sec total, 2.19sec training, 0.006sec inference
Cross-Validation Score Standardized: 75.01% (3.05%)
Time Spent (Cross-Val): 15.55sec total


**Gradient Boosting w/ Manual Stacking (Random Forest)**

In [22]:
x_train_rfc = (x_train.iloc[0:int(len(x_train) / 2)]).copy()
y_train_rfc = (y_train.iloc[0:int(len(y_train) / 2)]).copy()
x_train_gbc = (x_train.iloc[int(len(x_train) / 2):len(x_train)]).copy()
y_train_gbc = (y_train.iloc[int(len(y_train) / 2):len(y_train)]).copy()

In [23]:
x_test_rfc = x_train_gbc
y_test_rfc = y_train_gbc
x_test_gbc = x_test.copy()
y_test_gbc = y_test.copy()

In [24]:
rfc_stack = RandomForestClassifier(n_jobs=1, random_state=1)
ts_train_start = time.time()
rfc_stack.fit(x_train_rfc, y_train_rfc)
ts_train_end = time.time()

ts_inference_start = ts_train_end
rfc_stack_pred_train = rfc_stack.predict(x_test_rfc)
ts_inference_end = time.time()

score = accuracy_score(y_test_rfc, rfc_stack_pred_train) * 100
print("Accuracy using Random Forest: ", round(score, 2), "%")
print('Mean squared error: %.2f' %
      mean_squared_error(y_test_rfc, rfc_stack_pred_train))
print("Time Spent: {}sec total, {}sec training, {}sec inference".format(
    round(ts_inference_end - ts_train_start, 3), round(ts_train_end - ts_train_start, 3), round(ts_inference_end - ts_inference_start, 3)))
ts_cv_start = time.time()
cv_score = cross_val_analysis(rfc_stack, x_total, y_total)
ts_cv_end = time.time()
print("Time Spent (Cross-Val): {}sec total".format(round(ts_cv_end - ts_cv_start, 3)))

Accuracy using Random Forest:  75.26 %
Mean squared error: 0.25
Time Spent: 0.302sec total, 0.277sec training, 0.025sec inference
Cross-Validation Score Standardized: 77.05% (3.33%)
Time Spent (Cross-Val): 6.828sec total


In [25]:
rfc_stack_pred_test = rfc_stack.predict(x_test_gbc)

In [26]:
# stack
x_train_gbc['rfc_stack_pred'] = rfc_stack_pred_train.tolist()
x_test_gbc['rfc_stack_pred'] = rfc_stack_pred_test.tolist()

In [27]:
x_train_gbc

Unnamed: 0,danceability,loudness,valence,energy,instrumentalness,acousticness,key,speechiness,duration_ms,rfc_stack_pred
1354,0.633,-4.749,0.493,0.926,0.540000,0.01060,11,0.0971,245053,0
160,0.427,-6.595,0.462,0.465,0.000055,0.46100,6,0.0291,206387,0
1664,0.591,-6.720,0.482,0.649,0.000000,0.02940,3,0.0417,228347,0
954,0.663,-11.013,0.899,0.602,0.000088,0.26900,9,0.1320,361173,1
1752,0.725,-5.959,0.595,0.487,0.000011,0.26000,8,0.0368,296693,0
...,...,...,...,...,...,...,...,...,...,...
1740,0.685,-6.939,0.806,0.872,0.000000,0.00799,7,0.0321,209560,0
1970,0.549,-6.282,0.462,0.576,0.000002,0.03640,9,0.0263,202787,0
1284,0.370,-4.774,0.117,0.723,0.000000,0.03260,0,0.0364,235227,0
1425,0.501,-6.248,0.615,0.775,0.000000,0.02730,7,0.4360,269720,1


In [28]:
x_test_gbc

Unnamed: 0,danceability,loudness,valence,energy,instrumentalness,acousticness,key,speechiness,duration_ms,rfc_stack_pred
2001,0.484,-5.652,0.2660,0.609,0.000000,0.002620,11,0.0303,224320,0
788,0.564,-5.897,0.6950,0.846,0.001750,0.006920,4,0.0437,212986,1
228,0.745,-5.754,0.9460,0.857,0.200000,0.000686,0,0.0393,384907,1
189,0.633,-7.672,0.3570,0.492,0.000470,0.048800,2,0.0524,224720,1
345,0.636,-6.195,0.4440,0.720,0.017800,0.089300,11,0.0432,235813,1
...,...,...,...,...,...,...,...,...,...,...
1033,0.726,-6.846,0.6660,0.595,0.000433,0.148000,11,0.0393,231173,1
191,0.430,-4.816,0.2200,0.436,0.000000,0.170000,8,0.1190,154414,1
704,0.743,-8.224,0.3980,0.633,0.000000,0.007230,9,0.2350,259683,1
542,0.354,-10.304,0.0745,0.228,0.800000,0.854000,0,0.0296,476164,1


In [29]:
gbc_stack = GradientBoostingClassifier(n_estimators=100, learning_rate=.1, max_depth=1, random_state=1)
ts_train_start = time.time()
gbc_stack.fit(x_train_gbc, y_train_gbc)
ts_train_end = time.time()

ts_inference_start = ts_train_end
gbc_stack_pred = gbc_stack.predict(x_test_gbc)
ts_inference_end = time.time()

score = accuracy_score(y_test_gbc, gbc_stack_pred) * 100
print("Accuracy using Gradient Boost: ", round(score, 2), "%")
print('Mean squared error: %.2f' % mean_squared_error(y_test_gbc, gbc_stack_pred))
print("Time Spent: {}sec total, {}sec training, {}sec inference".format(
    round(ts_inference_end - ts_train_start, 3), round(ts_train_end - ts_train_start, 3), round(ts_inference_end - ts_inference_start, 3)))

Accuracy using Gradient Boost:  75.91 %
Mean squared error: 0.24
Time Spent: 0.197sec total, 0.192sec training, 0.005sec inference


**Stacking Classifier**


In [30]:
base_learners = [
  ('rf_1', RandomForestClassifier(n_estimators=400, random_state=1)),
  ('rf_2', GradientBoostingClassifier(n_estimators=400,
   learning_rate=.1, max_depth=2, random_state=1))
]

stk = StackingClassifier(estimators=base_learners, final_estimator=LogisticRegression())
ts_train_start = time.time()
stk.fit(x_train, y_train)
ts_train_end = time.time()

ts_inference_start = ts_train_end
stk_pred = stk.predict(x_test)
ts_inference_end = time.time()

score = accuracy_score(y_test, stk_pred) * 100
print("Accuracy using Gradient Boost: ", round(score, 2), "%")
print('Mean squared error: %.2f' % mean_squared_error(y_test, stk_pred))
print("Time Spent: {}sec total, {}sec training, {}sec inference".format(
    round(ts_inference_end - ts_train_start, 3), round(ts_train_end - ts_train_start, 3), round(ts_inference_end - ts_inference_start, 3)))
ts_cv_start = time.time()
cv_score = cross_val_analysis(stk, x_total, y_total)
ts_cv_end = time.time()
print("Time Spent (Cross-Val): {}sec total".format(round(ts_cv_end - ts_cv_start, 3)))

Accuracy using Gradient Boost:  74.26 %
Mean squared error: 0.26
Time Spent: 19.153sec total, 18.886sec training, 0.267sec inference
Cross-Validation Score Standardized: 76.50% (2.94%)
Time Spent (Cross-Val): 199.087sec total


In [31]:
base_learners = [
    ('rf_1', RandomForestClassifier(n_estimators=400, random_state=1)),
    ('rf_2', KNeighborsClassifier(n_neighbors=25)),
]

stk = StackingClassifier(estimators=base_learners,
                         final_estimator=LogisticRegression())
ts_train_start = time.time()
stk.fit(x_train, y_train)
ts_train_end = time.time()

ts_inference_start = ts_train_end
stk_pred = stk.predict(x_test)
ts_inference_end = time.time()

score = accuracy_score(y_test, stk_pred) * 100
print("Accuracy using Gradient Boost: ", round(score, 2), "%")
print('Mean squared error: %.2f' % mean_squared_error(y_test, stk_pred))
print("Time Spent: {}sec total, {}sec training, {}sec inference".format(
    round(ts_inference_end - ts_train_start, 3), round(ts_train_end - ts_train_start, 3), round(ts_inference_end - ts_inference_start, 3)))
ts_cv_start = time.time()
cv_score = cross_val_analysis(stk, x_total, y_total)
ts_cv_end = time.time()
print("Time Spent (Cross-Val): {}sec total".format(round(ts_cv_end - ts_cv_start, 3)))

Accuracy using Gradient Boost:  74.92 %
Mean squared error: 0.25
Time Spent: 10.401sec total, 10.331sec training, 0.07sec inference
Cross-Validation Score Standardized: 76.70% (3.43%)
Time Spent (Cross-Val): 106.055sec total


In [32]:
base_learners = [
  ('rf_1', RandomForestClassifier(n_estimators=10, random_state=42)),
  ('rf_2', KNeighborsClassifier(n_neighbors=5)),             
  ('rf_3', GradientBoostingClassifier(n_estimators=100, learning_rate=.1, max_depth=1, random_state=1))
]

stk = StackingClassifier(estimators=base_learners, final_estimator=LogisticRegression())
ts_train_start = time.time()
stk.fit(x_train, y_train)
ts_train_end = time.time()

ts_inference_start = ts_train_end
stk_pred = stk.predict(x_test)
ts_inference_end = time.time()


score = accuracy_score(y_test, stk_pred) * 100
print("Accuracy using Gradient Boost: ", round(score, 2), "%")
print('Mean squared error: %.2f' % mean_squared_error(y_test, stk_pred))
print("Time Spent: {}sec total, {}sec training, {}sec inference".format(
    round(ts_inference_end - ts_train_start, 3), round(ts_train_end - ts_train_start, 3), round(ts_inference_end - ts_inference_start, 3)))
ts_cv_start = time.time()
cv_score = cross_val_analysis(stk, x_total, y_total)
ts_cv_end = time.time()
print("Time Spent (Cross-Val): {}sec total".format(round(ts_cv_end - ts_cv_start, 3)))

Accuracy using Gradient Boost:  74.59 %
Mean squared error: 0.25
Time Spent: 1.39sec total, 1.381sec training, 0.009sec inference
Cross-Validation Score Standardized: 74.22% (2.63%)
Time Spent (Cross-Val): 14.55sec total


## **Neural Networks**


**Baseline**

In [33]:
def keras_nn_baseline_model_small():
  model = Sequential()
  model.add(Dense(9, input_dim=9, activation='relu'))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [34]:
def keras_nn_baseline_model_big():
  model = Sequential()
  model.add(Dense(9, input_dim=9, activation='relu'))
  model.add(Dense(18, input_dim=9, activation='relu'))
  model.add(Dense(90, input_dim=18, activation='sigmoid'))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [35]:
def keras_nn_baseline_model_shaped():
  model = Sequential()
  model.add(Dense(5, input_dim=9, activation='relu'))
  model.add(Dense(60, input_dim=5, activation='sigmoid'))
  model.add(Dense(5, input_dim=60, activation='relu'))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [36]:
keras_nn_baseline_models = [
    keras_nn_baseline_model_small, keras_nn_baseline_model_big, keras_nn_baseline_model_shaped
]
keras_nn_baseline_model_names = [
    "keras_nn_baseline_model_small", "keras_nn_baseline_model_big", "keras_nn_baseline_model_shaped"
]

# keras_nn_baseline_models = [keras_nn_baseline_model_shaped]
# keras_nn_baseline_model_names = ["keras_nn_baseline_model_shaped"]

num_training_epochs = 100  # 200
batch_size = 5

**Pipeline w/ Normalization**


In [37]:
for keras_nn_baseline_model, keras_nn_baseline_model_name in zip(keras_nn_baseline_models, keras_nn_baseline_model_names):
    # build pipeline
    estimators = []
    estimators.append(('standardize', StandardScaler()))
    estimators.append(('mlp', KerasClassifier(build_fn=keras_nn_baseline_model,
                    epochs=num_training_epochs, batch_size=batch_size, verbose=0)))
    keras_pipeline = Pipeline(estimators)
    print('Running Training & Inference for Keras NN Model "{}"'.format(
        keras_nn_baseline_model_name))
    # train model
    ts_train_start = time.time()
    keras_pipeline.fit(x_train, y_train)
    ts_train_end = time.time()
    # inference w model
    ts_inference_start = ts_train_end
    keras_pipeline_pred = keras_pipeline.predict(x_test)
    ts_inference_end = time.time()
    # validation accuracy
    score = accuracy_score(y_test, keras_pipeline_pred) * 100
    print("Accuracy using Keras: ", round(score, 2), "%")
    print('Mean squared error: %.2f' % mean_squared_error(y_test, stk_pred))
    print("Time Spent: {}sec total, {}sec training, {}sec inference".format(
        round(ts_inference_end - ts_train_start, 3), round(ts_train_end - ts_train_start, 3), round(ts_inference_end - ts_inference_start, 3)))
    print(" ")

  


Running Training & Inference for Keras NN Model "keras_nn_baseline_model_small"
Accuracy using Keras:  68.98 %
Mean squared error: 0.25
Time Spent: 83.669sec total, 83.502sec training, 0.167sec inference
 
Running Training & Inference for Keras NN Model "keras_nn_baseline_model_big"


  


Accuracy using Keras:  73.6 %
Mean squared error: 0.25
Time Spent: 82.696sec total, 82.568sec training, 0.128sec inference
 
Running Training & Inference for Keras NN Model "keras_nn_baseline_model_shaped"


  


Accuracy using Keras:  69.31 %
Mean squared error: 0.25
Time Spent: 51.17sec total, 51.015sec training, 0.155sec inference
 
