In [41]:
import pandas as pd
from joblib import dump, load
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [42]:
df = pd.read_csv('../../data/processed/spotify_songs_processed.csv', index_col=0)
df.dropna(axis=0, inplace=True)

In [43]:
df.describe()

Unnamed: 0,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
count,32828.0,32828.0,32828.0,32828.0,32828.0,32828.0,32828.0,32828.0,32828.0,32828.0,32828.0,32828.0,32828.0
mean,42.483551,0.65485,0.698603,5.373949,-6.719529,0.565737,0.107053,0.175352,0.08476,0.190175,0.510556,120.883642,225796.829779
std,24.980476,0.145092,0.180916,3.611572,2.988641,0.495667,0.101307,0.219644,0.224245,0.154313,0.233152,26.903632,59836.492346
min,0.0,0.0,0.000175,0.0,-46.448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4000.0
25%,24.0,0.563,0.581,2.0,-8.17125,0.0,0.041,0.0151,0.0,0.0927,0.331,99.961,187804.5
50%,45.0,0.672,0.721,6.0,-6.166,1.0,0.0625,0.0804,1.6e-05,0.127,0.512,121.984,216000.0
75%,62.0,0.761,0.84,9.0,-4.645,1.0,0.132,0.255,0.00483,0.248,0.693,133.91825,253581.25
max,100.0,0.983,1.0,11.0,1.275,1.0,0.918,0.994,0.994,0.996,0.991,239.44,517810.0


In [44]:
y = df['track_popularity']
genre_dummy = pd.get_dummies(df.playlist_genre)
X = pd.concat([genre_dummy, df.loc[:, 'danceability':'duration_ms']], axis=1)

In [45]:
# split the dataset into training and test
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [46]:
# standardise loudness, tempo and duration_ms
X_train['loudness'] = StandardScaler().fit(X_train.loc[:, ['loudness']]).transform(X_train.loc[:, ['loudness']])
X_train['tempo'] = StandardScaler().fit(X_train.loc[:, ['tempo']]).transform(X_train.loc[:, ['tempo']])
X_train['duration_ms'] = StandardScaler().fit(X_train.loc[:, ['duration_ms']]).transform(X_train.loc[:, ['duration_ms']])

In [47]:
# Grid search to fint the optiomal model
grid_param = {
    'n_estimators': [200, 300, 400, 500],
    'max_features': [2, 3, 4, 5],
    'max_depth': [3, 4, 5, 6]
}

model = RandomForestRegressor()

model_grid = GridSearchCV(model, param_grid=grid_param, cv=5)
model_grid.fit(X_train, y_train)

best_model = model_grid.best_estimator_
print(best_model)

RandomForestRegressor(max_depth=6, max_features=5, n_estimators=400)


In [48]:
y_train_pred = best_model.predict(X_train)
mean_squared_error(y_train, y_train_pred, squared=False)

23.196956237726656

In [49]:
# standardize the test data
X_test['loudness'] = StandardScaler().fit(X_test.loc[:, ['loudness']]).transform(X_test.loc[:, ['loudness']])
X_test['tempo'] = StandardScaler().fit(X_test.loc[:, ['tempo']]).transform(X_test.loc[:, ['tempo']])
X_test['duration_ms'] = StandardScaler().fit(X_test.loc[:, ['duration_ms']]).transform(X_test.loc[:, ['duration_ms']])

In [50]:
mean_squared_error(y_test, best_model.predict(X_test), squared=False)

23.804148860974852

In [51]:
# build the model using the whole dataset
X['loudness'] = StandardScaler().fit(X.loc[:, ['loudness']]).transform(X.loc[:, ['loudness']])
X['tempo'] = StandardScaler().fit(X.loc[:, ['tempo']]).transform(X.loc[:, ['tempo']])
X['duration_ms'] = StandardScaler().fit(X.loc[:, ['duration_ms']]).transform(X.loc[:, ['duration_ms']])

In [52]:
full_model = RandomForestRegressor(n_estimators=400, max_features=5, max_depth=7)
full_model.fit(X, y)

In [53]:
dump(full_model, 'spotify_model.joblib')

['spotify_model.joblib']

In [54]:
# save the scalar
loud_scalar = StandardScaler().fit(df.loc[:, ['loudness']])
dump(loud_scalar, 'loud_scalar.joblib')

['loud_scalar.joblib']

In [55]:
duration_scalar = StandardScaler().fit(df.loc[:, ['duration_ms']])
dump(duration_scalar, 'duration_scalar.joblib')

['duration_scalar.joblib']

In [56]:
tempo_scalar = StandardScaler().fit(df.loc[:, ['tempo']])
dump(tempo_scalar, 'tempo_scalar.joblib')

['tempo_scalar.joblib']

In [61]:
X.columns

Index(['edm', 'latin', 'pop', 'r&b', 'rap', 'rock', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms'],
      dtype='object')