Business Understanding: 
1) How do audio features impact the popularity of songs on Spotify?
2) Can a song's audio features indicate the likely popularity of the 
   song before it is released?

In [1]:
#import the necessary libraries
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor

from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

Gather (see Understanding the Data.ipynb for Assess and Clean Data steps)

In [2]:
#add the playlists from the stored Data file
all_features = pd.read_csv(r'C:\\Users\\Jessica\\Spot50-Data-Analysis\\Data\\Features.csv', 
                          sep = ',', error_bad_lines = False, index_col = False)


Clean: Scale the audio features to the same range

In [3]:
scaler = MinMaxScaler()
col_names = all_features.columns.tolist()
scaled_features = all_features.copy()
scaled_features[col_names] = scaler.fit_transform(all_features[col_names])
scaled_features.describe()

Unnamed: 0,Length,Popularity,Acousticness,Danceability,Energy,Instrumentalness,Liveness,Loudness,Speechiness,Tempo,Valence
count,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0
mean,0.479954,0.721618,0.226932,0.602592,0.590506,0.02024386,0.319941,0.664389,0.111672,0.421139,0.505486
std,0.230432,0.183951,0.252111,0.222625,0.199482,0.1170957,0.234006,0.206107,0.15322,0.211484,0.256201
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.283693,0.644068,0.0182,0.47117,0.448286,0.0,0.169353,0.555319,0.017834,0.265034,0.326279
50%,0.479225,0.762712,0.13154,0.651565,0.610159,8.849077e-07,0.228426,0.67863,0.044979,0.435674,0.480674
75%,0.625931,0.847458,0.352102,0.764003,0.747246,7.798588e-05,0.397843,0.818733,0.134436,0.560037,0.704437
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Analyze: Begin Data Modeling/Evaluation

In [4]:
#split features data into train and test data sets
y = scaled_features['Popularity']
X = scaled_features.drop(['Popularity'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33,
                                                    random_state=42)

In [5]:
#Determine the mean square error for Lasso Regression on the data set
model = Lasso()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X_train, y_train, 
                         scoring='neg_root_mean_squared_error', cv=cv, 
                         n_jobs=-1)
scores = np.absolute(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Mean MSE: 0.146 (0.054)


In [6]:
#Determine the mean square error for Ridge Regression on the data set
model = Ridge()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X_train, y_train, 
                         scoring='neg_root_mean_squared_error', cv=cv, 
                         n_jobs=-1)
scores = np.absolute(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Mean MSE: 0.147 (0.052)


In [7]:
#Determine the mean square error for XGB Regression on the data set
model = XGBRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X_train, y_train, 
                         scoring='neg_root_mean_squared_error', cv=cv, 
                         n_jobs=-1)
scores = np.absolute(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Mean MSE: 0.170 (0.057)


In [8]:
#Determine the mean square error for Random Forest Regression on the data set
model = RandomForestRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X_train, y_train, 
                         scoring='neg_root_mean_squared_error', cv=cv, 
                         n_jobs=-1)
scores = np.absolute(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Mean MSE: 0.154 (0.053)


In [9]:
#Determine the mean square error for Decision Tree Regression on the data set
model = DecisionTreeRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X_train, y_train, 
                         scoring='neg_root_mean_squared_error', cv=cv, 
                         n_jobs=-1)
scores = np.absolute(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Mean MSE: 0.192 (0.058)


In [10]:
#Determine the mean square error for Support Vector Regression on the data set
model = SVR()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X_train, y_train, 
                         scoring='neg_root_mean_squared_error', cv=cv, 
                         n_jobs=-1)
scores = np.absolute(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Mean MSE: 0.155 (0.051)


Use stacked regressors to see if we can improve the model

In [11]:
#create a list of level 0 estimators for stacking comparisons
level0 = list()
level0.append(('rfr', RandomForestRegressor()))
level0.append(('knn', KNeighborsRegressor()))
level0.append(('dtr', DecisionTreeRegressor()))
level0.append(('svm', SVR()))


In [12]:
def evaluate_model(model, X, y):

	'''
    FUNCTION: output the mean squared error of a regression model
    INPUTS: model- the stacked regression model to score
			X- popularity
			y- audio features
    OUTPUT: mean squared error of the model
    '''
	
	cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
	scores = cross_val_score(model, X, y, 
							scoring='neg_root_mean_squared_error', cv=cv, 
							n_jobs=-1, error_score='raise')
	scores = np.absolute(scores)
	return scores

In [13]:
import itertools
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33,
                                                    random_state=42)
#loop through every possible stacked combination and output the best combination
#and its score
best_combo = ''
lowest_score = 5
for n in range(0, len(level0)+1):
    for subset in itertools.combinations(level0, n):
        if subset:
            #XGB
            model = StackingRegressor(estimators=subset, 
                                      final_estimator=XGBRegressor(), 
                                      cv=5)
            scores = evaluate_model(model,X_train, y_train)
            #print('XGB +',subset,(np.mean(scores), np.std(scores)))
            if np.mean(scores)<lowest_score:
                lowest_score = np.mean(scores)
                best_linear = 'XGB'
                best_combo = subset
            #Lasso
            model = StackingRegressor(estimators=subset, 
                                      final_estimator=Lasso(), 
                                      cv=5)
            scores = evaluate_model(model,X_train, y_train)
            #print('Lasso +',subset,(np.mean(scores), np.std(scores)))
            if np.mean(scores)<lowest_score:
                lowest_score = np.mean(scores)
                best_linear = 'Lasso'
                best_combo = subset
            #Ridge
            model = StackingRegressor(estimators=subset, 
                                      final_estimator=Ridge(), 
                                      cv=5)
            scores = evaluate_model(model,X_train, y_train)
            #print('Ridge +',subset,(np.mean(scores), np.std(scores)))
            if np.mean(scores)<lowest_score:
                lowest_score = np.mean(scores)
                best_linear = 'Ridge'
                best_combo = subset
print('The best combination is:', best_linear, '+', best_combo, 
      'with a score of', lowest_score)

The best combination is: Ridge + (('rfr', RandomForestRegressor()), ('knn', KNeighborsRegressor()), ('dtr', DecisionTreeRegressor())) with a score of 0.1455730104345429


In [14]:
#based on the output of the selection loop, evaluate the regression model and 
#compare the predicted values to the actual popularity values
est_lev0 = [('rfr', RandomForestRegressor()),('knn', KNeighborsRegressor()),
            ('dtr', DecisionTreeRegressor())]
model = StackingRegressor(estimators=est_lev0,final_estimator=Ridge(),cv=5)
scores = evaluate_model(model,X_train, y_train)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
result_compare = pd.DataFrame()
result_compare['Predicted'] = y_pred
result_compare = pd.merge(result_compare,y_test,left_index=True,right_index=True)
result_compare['Difference'] = y_pred-y_test


In [15]:
result_compare

Unnamed: 0,Predicted,Popularity,Difference
0,0.736699,1.0,-0.269091
4,0.734974,0.881356,-0.152355
9,0.728362,0.728814,0.004212
10,0.725598,0.949153,-0.220773
11,0.737617,0.932203,-0.203841
12,0.730909,0.915254,-0.184851
15,0.729704,0.932203,-0.197153
18,0.724646,0.898305,-0.171292
22,0.732675,0.898305,-0.164792
26,0.728044,0.79661,-0.064978
