In [1]:
import pandas as pd
from joblib import dump, load
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [30]:
df = pd.read_csv('../processed/data/spotify_songs_processed.csv', index_col=0)
df.dropna(axis=0, inplace=True)

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,nominal_popularity
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,-2.634,1,0.0583,0.102000,0.000000,0.0653,0.5180,122.036,194754,medium
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,-4.969,1,0.0373,0.072400,0.004210,0.3570,0.6930,99.972,162600,medium
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,-3.432,0,0.0742,0.079400,0.000023,0.1100,0.6130,124.008,176616,high
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,-3.778,1,0.1020,0.028700,0.000009,0.2040,0.2770,121.956,169093,medium
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,-4.672,1,0.0359,0.080300,0.000000,0.0833,0.7250,123.976,189052,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32828,7bxnKAamR3snQ1VGLuVfC1,City Of Lights - Official Radio Edit,Lush & Simon,42,2azRoBBWEEEYhqV6sb7JrT,City Of Lights (Vocal Mix),2014-04-28,♥ EDM LOVE 2020,6jI1gFr6ANFtT8MmTvA2Ux,edm,...,-1.814,1,0.0936,0.076600,0.000000,0.0668,0.2100,128.170,204375,medium
32829,5Aevni09Em4575077nkWHz,Closer - Sultan & Ned Shepard Remix,Tegan and Sara,20,6kD6KLxj7s8eCE3ABvAyf5,Closer Remixed,2013-03-08,♥ EDM LOVE 2020,6jI1gFr6ANFtT8MmTvA2Ux,edm,...,-4.462,1,0.0420,0.001710,0.004270,0.3750,0.4000,128.041,353120,low
32830,7ImMqPP3Q1yfUHvsdn7wEo,Sweet Surrender - Radio Edit,Starkillers,14,0ltWNSY9JgxoIZO4VzuCa6,Sweet Surrender (Radio Edit),2014-04-21,♥ EDM LOVE 2020,6jI1gFr6ANFtT8MmTvA2Ux,edm,...,-4.899,0,0.0481,0.108000,0.000001,0.1500,0.4360,127.989,210112,low
32831,2m69mhnfQ1Oq6lGtXuYhgX,Only For You - Maor Levi Remix,Mat Zo,15,1fGrOkHnHJcStl14zNx8Jy,Only For You (Remixes),2014-01-01,♥ EDM LOVE 2020,6jI1gFr6ANFtT8MmTvA2Ux,edm,...,-3.361,1,0.1090,0.007920,0.127000,0.3430,0.3080,128.008,367432,low


In [67]:
y = df['track_popularity']
genre_dummy = pd.get_dummies(df.playlist_genre)
X = pd.concat([genre_dummy, df.loc[:, 'loudness':'duration_ms']], axis=1)

In [69]:
# split the dataset into training and test
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [70]:
# standardise loudness, tempo and duration_ms
X_train['loudness'] = StandardScaler().fit(X_train.loc[:, ['loudness']]).transform(X_train.loc[:, ['loudness']])
X_train['tempo'] = StandardScaler().fit(X_train.loc[:, ['tempo']]).transform(X_train.loc[:, ['tempo']])
X_train['duration_ms'] = StandardScaler().fit(X_train.loc[:, ['duration_ms']]).transform(X_train.loc[:, ['duration_ms']])

In [71]:
# Grid search to fint the optiomal model
grid_param = {
    'n_estimators': [200, 300, 400, 500],
    'max_features': [2, 3, 4, 5],
    'max_depth': [3, 4, 5, 6, 7]
}

model = RandomForestRegressor()

model_grid = GridSearchCV(model, param_grid=grid_param, cv=5)
model_grid.fit(X_train, y_train)

best_model = model_grid.best_estimator_
print(best_model)

RandomForestRegressor(max_depth=7, max_features=5, n_estimators=400)


In [82]:
y_train_pred = best_model.predict(X_train)
mean_squared_error(y_train, y_train_pred, squared=False)

22.93843252738294

In [86]:
# standardize the test data
X_test['loudness'] = StandardScaler().fit(X_test.loc[:, ['loudness']]).transform(X_test.loc[:, ['loudness']])
X_test['tempo'] = StandardScaler().fit(X_test.loc[:, ['tempo']]).transform(X_test.loc[:, ['tempo']])
X_test['duration_ms'] = StandardScaler().fit(X_test.loc[:, ['duration_ms']]).transform(X_test.loc[:, ['duration_ms']])

In [87]:
mean_squared_error(y_test, best_model.predict(X_test), squared=False)

23.828550580981265

In [None]:
# build the model using the whole dataset
X['loudness'] = StandardScaler().fit(X.loc[:, ['loudness']]).transform(X.loc[:, ['loudness']])
X['tempo'] = StandardScaler().fit(X.loc[:, ['tempo']]).transform(X.loc[:, ['tempo']])
X['duration_ms'] = StandardScaler().fit(X.loc[:, ['duration_ms']]).transform(X.loc[:, ['duration_ms']])

In [88]:
full_model = RandomForestRegressor(n_estimators=400, max_features=5, max_depth=7)
full_model.fit(X, y)

In [89]:
dump(full_model, 'spotify_model.joblib')

['spotify_model.joblib']