In [545]:
import pandas as pd
import numpy as np
# Load the track table from the CSV that the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer='spotify_cleaned.csv')

In [546]:
# 'track_name' is not used as a predictor, so it is removed here.
# errors='ignore' keeps this cell re-runnable: once the column is gone,
# running the cell again is a no-op instead of raising KeyError.
df = df.drop(columns=['track_name'], errors='ignore')

In [547]:
# One sub-frame per genre modelled below, selected by exact match on `track_genre`.
df_popfilm = df.loc[df['track_genre'].eq('pop-film')]
df_sad = df.loc[df['track_genre'].eq('sad')]
df_electronic = df.loc[df['track_genre'].eq('electronic')]
df_metal = df.loc[df['track_genre'].eq('metal')]
df_acoustic = df.loc[df['track_genre'].eq('acoustic')]

## Random Forest

In [548]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso, LinearRegression


In [549]:
def random_forest(df_genre):
    """Fit a random-forest regressor on one genre and score it on a hold-out set.

    Parameters
    ----------
    df_genre : pandas.DataFrame
        Frame with a ``popularity`` column (the target). Every other column
        is used as a predictor after ``pd.get_dummies(..., drop_first=True)``.

    Returns
    -------
    dict
        ``"MSE"`` and ``"R2"`` measured on the 20% test split, and
        ``"Feature Rankings"``: a DataFrame of the forest's
        ``feature_importances_`` sorted from largest to smallest.
    """
    target = df_genre['popularity']
    predictors = pd.get_dummies(df_genre.drop(columns=['popularity']), drop_first=True)

    # The same fixed seed is used for the split and for the forest itself.
    train_X, test_X, train_y, test_y = train_test_split(
        predictors, target, test_size=0.2, random_state=42
    )

    forest = RandomForestRegressor(random_state=42, n_estimators=100)
    forest.fit(train_X, train_y)
    predicted = forest.predict(test_X)

    rankings = pd.DataFrame(
        {'Feature': predictors.columns, 'Importance': forest.feature_importances_}
    )
    rankings = rankings.sort_values(by='Importance', ascending=False)

    return {
        "MSE": mean_squared_error(test_y, predicted),
        "R2": r2_score(test_y, predicted),
        "Feature Rankings": rankings,
    }

In [550]:
# Random-forest metrics for the pop-film subset.
metrics_popfilm = random_forest(df_genre=df_popfilm)

In [551]:
# Random-forest metrics for the sad subset.
metrics_sad = random_forest(df_genre=df_sad)

In [552]:
# Random-forest metrics for the electronic subset.
metrics_electronic = random_forest(df_genre=df_electronic)

In [553]:
# Random-forest metrics for the metal subset.
metrics_metal = random_forest(df_genre=df_metal)

In [554]:
# Random-forest metrics for the acoustic subset.
metrics_acoustic = random_forest(df_genre=df_acoustic)

In [555]:
# Dump the random-forest results for all five genres into one text report;
# sep='\n\n' leaves an empty line between consecutive genre sections.
with open('random_forest_report.txt', 'w') as file:
    print(
        f'POPFILM METRICS ==> \n{metrics_popfilm}',
        f'SAD METRICS ==> \n{metrics_sad}',
        f'ELECTRONIC METRICS ==> \n{metrics_electronic}',
        f'METAL METRICS ==> \n{metrics_metal}',
        f'ACOUSTIC METRICS ==> \n{metrics_acoustic}',
        sep='\n\n',
        file=file,
    )


## Ridge Regression

In [556]:
def ridge_regression(df_genre, alpha_parameter, n_splits=1000):
    """Average ridge-regression performance for one genre over repeated 80/20 splits.

    Every column of ``df_genre`` other than ``popularity`` is used as a
    predictor (one-hot encoded with ``pd.get_dummies``). For each seed in
    ``1..n_splits`` the data is split, a ``Ridge`` model is fitted on the
    training part, and test MSE, test R^2, training R^2 and the fitted
    coefficients are accumulated; their averages are returned.

    Parameters
    ----------
    df_genre : pandas.DataFrame
        Rows for a single genre; must contain a ``popularity`` column.
    alpha_parameter : float
        Regularisation strength passed to ``Ridge(alpha=...)``.
    n_splits : int, default 1000
        Number of seeded train/test splits to average over.

    Returns
    -------
    dict
        ``"Test MSE"``, ``"Test R2"``, ``"Training R2"`` (averages over the
        splits) and ``"Feature Rankings"``: a DataFrame of mean coefficients
        sorted by absolute value, largest first.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.

    Notes
    -----
    Features are not standardised here, so coefficient magnitudes are on
    each feature's raw scale and the ranking is not a scale-free importance.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    # separate features / target
    X = pd.get_dummies(df_genre.drop(columns=['popularity']), drop_first=True)
    y = df_genre['popularity']

    importance = np.zeros(X.columns.shape)
    r2 = 0
    r2_train = 0
    mse = 0

    for random_state in range(1, n_splits + 1):
        # Seed the split itself: without random_state here every call draws
        # different partitions and the averages cannot be reproduced.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_state
        )
        model = Ridge(alpha=alpha_parameter, random_state=random_state)

        # train the model
        model.fit(X_train, y_train)

        # held-out performance
        y_pred = model.predict(X_test)
        mse += mean_squared_error(y_test, y_pred)
        r2 += r2_score(y_test, y_pred)

        # in-sample performance, to compare against the test score
        train_pred = model.predict(X_train)
        r2_train += r2_score(y_train, train_pred)

        importance = np.add(importance, model.coef_)

    # take averages over all splits
    mse = mse / n_splits
    r2 = r2 / n_splits
    r2_train = r2_train / n_splits
    importance = importance / n_splits

    feature_importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': importance
    }).sort_values(by='Importance', key=lambda x: x.abs(), ascending=False)

    return {"Test MSE": mse, "Test R2": r2, "Training R2": r2_train, "Feature Rankings": feature_importances}

In [557]:
# Sweep the ridge penalty for the pop-film subset and print every result.
alphas = [0, 1, 10, 50, 100]
ridge_sweep_popfilm = {}
for alpha in alphas:
    ridge_sweep_popfilm[alpha] = ridge_regression(df_popfilm, alpha)
    print(ridge_sweep_popfilm[alpha])

# alpha = 0 (no penalty) was numerically unstable in the recorded run (huge
# test MSE and `explicit` coefficient); the remaining alphas were practically
# the same, so alpha = 10 is reported. The sweep result is reused rather than
# refitted, so the reported numbers are exactly the ones printed above.
ridge_metrics_popfilm = ridge_sweep_popfilm[10]

{'Test MSE': np.float64(6.448526068032001e+27), 'Test R2': -5.814339517888482e+25, 'Training R2': -0.1152036832644351, 'Feature Rankings':              Feature    Importance
1           explicit -1.097924e+13
10          liveness -1.270761e+01
2       danceability  8.243631e+00
7        speechiness -7.544795e+00
11           valence -4.565213e+00
8       acousticness -3.892657e+00
3             energy -2.860393e+00
9   instrumentalness  2.232615e+00
13    time_signature  1.857020e+00
6               mode  7.859248e-01
5           loudness  1.618406e-01
4                key  1.521964e-01
12             tempo  1.482454e-02
0        duration_ms  4.213510e-06}
{'Test MSE': np.float64(101.62874566748073), 'Test R2': 0.01044227352833017, 'Training R2': 0.06909140268865405, 'Feature Rankings':              Feature  Importance
10          liveness  -11.729262
2       danceability    7.650338
7        speechiness   -5.533745
11           valence   -4.448619
1           explicit    4.096713
8   

In [558]:
# Sweep the ridge penalty for the sad subset and print every result.
alphas = [0, 1, 10, 50, 100]
ridge_sweep_sad = {}
for alpha in alphas:
    ridge_sweep_sad[alpha] = ridge_regression(df_sad, alpha)
    print(ridge_sweep_sad[alpha])

# all alphas were practically even, will use alpha = 1 for reporting data.
# The sweep result is reused rather than refitted, so the reported numbers
# are exactly the ones printed above.
ridge_metrics_sad = ridge_sweep_sad[1]

{'Test MSE': np.float64(105.76263326424788), 'Test R2': 0.045350788531410675, 'Training R2': 0.08840755292807416, 'Feature Rankings':              Feature  Importance
10          liveness    7.175468
2       danceability    6.275140
7        speechiness    5.582638
13    time_signature    2.356665
1           explicit    1.641876
3             energy    0.792618
6               mode   -0.662708
9   instrumentalness    0.604811
4                key    0.083980
5           loudness   -0.061375
11           valence    0.029386
8       acousticness   -0.027955
12             tempo   -0.000235
0        duration_ms   -0.000054}
{'Test MSE': np.float64(105.57852443276575), 'Test R2': 0.0451111503131008, 'Training R2': 0.08842400199191887, 'Feature Rankings':              Feature  Importance
10          liveness    6.658594
2       danceability    5.714588
7        speechiness    5.220306
13    time_signature    2.327741
1           explicit    1.672350
3             energy    0.689335
6      

In [559]:
# Sweep the ridge penalty for the electronic subset and print every result.
alphas = [0, 1, 10, 50, 100]
ridge_sweep_electronic = {}
for alpha in alphas:
    ridge_sweep_electronic[alpha] = ridge_regression(df_electronic, alpha)
    print(ridge_sweep_electronic[alpha])

# all alphas were practically the same, using alpha = 50 for reporting.
# The sweep result is reused rather than refitted, so the reported numbers
# are exactly the ones printed above.
ridge_metrics_electronic = ridge_sweep_electronic[50]

{'Test MSE': np.float64(264.9253387513107), 'Test R2': 0.0009314660456345364, 'Training R2': 0.04095880596611768, 'Feature Rankings':              Feature  Importance
11           valence   -8.920037
3             energy   -5.930074
2       danceability    5.413909
13    time_signature    4.465581
7        speechiness   -3.128498
1           explicit    2.985759
9   instrumentalness    2.817865
10          liveness   -1.287732
6               mode   -0.911733
8       acousticness   -0.835113
5           loudness    0.475720
4                key   -0.093791
12             tempo    0.003142
0        duration_ms   -0.000018}
{'Test MSE': np.float64(264.8244447698206), 'Test R2': 0.0028764063495388607, 'Training R2': 0.04088224312802725, 'Feature Rankings':              Feature  Importance
11           valence   -8.654998
3             energy   -5.594482
2       danceability    4.926887
13    time_signature    4.429260
1           explicit    2.957665
9   instrumentalness    2.801255
7    

In [560]:
# Sweep the ridge penalty for the metal subset and print every result.
alphas = [0, 1, 10, 50, 100]
ridge_sweep_metal = {}
for alpha in alphas:
    ridge_sweep_metal[alpha] = ridge_regression(df_metal, alpha)
    print(ridge_sweep_metal[alpha])

# alpha = 10 was the best parameter for ridge for this data.
# The sweep result is reused rather than refitted, so the reported numbers
# are exactly the ones printed above.
ridge_metrics_metal = ridge_sweep_metal[10]

{'Test MSE': np.float64(795.6373369546513), 'Test R2': 0.007261454519886972, 'Training R2': 0.04832198336753132, 'Feature Rankings':              Feature  Importance
3             energy  -29.890703
9   instrumentalness  -10.977238
1           explicit   10.539236
7        speechiness   -6.003407
10          liveness   -5.217461
2       danceability    5.016344
6               mode   -3.654435
11           valence    1.796450
5           loudness    1.655363
13    time_signature    1.375796
8       acousticness   -0.089494
4                key   -0.058813
12             tempo    0.014372
0        duration_ms    0.000024}
{'Test MSE': np.float64(793.3992304573969), 'Test R2': 0.008269640115220444, 'Training R2': 0.048291847316043766, 'Feature Rankings':              Feature  Importance
3             energy  -26.368625
9   instrumentalness  -10.635217
1           explicit   10.443211
7        speechiness   -6.085493
2       danceability    5.100735
10          liveness   -5.078041
6     

In [561]:
# Sweep the ridge penalty for the acoustic subset and print every result.
alphas = [0, 1, 10, 50, 100]
ridge_sweep_acoustic = {}
for alpha in alphas:
    ridge_sweep_acoustic[alpha] = ridge_regression(df_acoustic, alpha)
    print(ridge_sweep_acoustic[alpha])

# all alphas were very similar, choosing alpha = 10 for reporting.
# The sweep result is reused rather than refitted, so the reported numbers
# are exactly the ones printed above.
ridge_metrics_acoustic = ridge_sweep_acoustic[10]


{'Test MSE': np.float64(211.3944624816584), 'Test R2': 0.07459630659676923, 'Training R2': 0.10804046156926471, 'Feature Rankings':              Feature  Importance
7        speechiness  -31.324022
1           explicit   -9.895815
11           valence   -8.170422
2       danceability    7.186755
3             energy   -5.698833
9   instrumentalness   -4.659482
10          liveness   -3.563320
6               mode    1.703474
13    time_signature    0.682379
8       acousticness    0.639326
5           loudness   -0.225266
4                key   -0.057916
12             tempo    0.015888
0        duration_ms   -0.000016}
{'Test MSE': np.float64(212.22549253927102), 'Test R2': 0.07601153806239763, 'Training R2': 0.10695654416155373, 'Feature Rankings':              Feature  Importance
7        speechiness  -10.038304
1           explicit   -9.888173
11           valence   -8.290468
2       danceability    6.490575
3             energy   -6.288204
9   instrumentalness   -4.499399
10      

In [562]:
# Dump the chosen ridge results for all five genres into one text report;
# sep='\n\n' leaves an empty line between consecutive genre sections.
with open('ridge_report.txt', 'w') as file:
    print(
        f'POPFILM METRICS ==> \n{ridge_metrics_popfilm}',
        f'SAD METRICS ==> \n{ridge_metrics_sad}',
        f'ELECTRONIC METRICS ==> \n{ridge_metrics_electronic}',
        f'METAL METRICS ==> \n{ridge_metrics_metal}',
        f'ACOUSTIC METRICS ==> \n{ridge_metrics_acoustic}',
        sep='\n\n',
        file=file,
    )

## Lasso Regression

In [563]:
def lasso_regression(df_genre, alpha_parameter, n_splits=1000):
    """Average lasso-regression performance for one genre over repeated 80/20 splits.

    Every column of ``df_genre`` other than ``popularity`` is used as a
    predictor (one-hot encoded with ``pd.get_dummies``). For each seed in
    ``1..n_splits`` the data is split, a ``Lasso`` model is fitted on the
    training part, and test MSE, test R^2, training R^2 and the fitted
    coefficients are accumulated; their averages are returned.

    Parameters
    ----------
    df_genre : pandas.DataFrame
        Rows for a single genre; must contain a ``popularity`` column.
    alpha_parameter : float
        Regularisation strength passed to ``Lasso(alpha=...)``.
    n_splits : int, default 1000
        Number of seeded train/test splits to average over.

    Returns
    -------
    dict
        ``"MSE Test"``, ``"R2 test"``, ``"R2 Train"`` (averages over the
        splits) and ``"Feature Rankings"``: a DataFrame of mean coefficients
        sorted by absolute value, largest first.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.

    Notes
    -----
    Features are not standardised here, so coefficient magnitudes are on
    each feature's raw scale and the ranking is not a scale-free importance.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    # separate features / target
    X = pd.get_dummies(df_genre.drop(columns=['popularity']), drop_first=True)
    y = df_genre['popularity']

    importance = np.zeros(X.columns.shape)
    r2 = 0
    r2_train = 0
    mse = 0

    for random_state in range(1, n_splits + 1):
        # Seed the split itself: without random_state here every call draws
        # different partitions and the averages cannot be reproduced.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_state
        )
        model = Lasso(alpha=alpha_parameter, random_state=random_state)

        # train the model
        model.fit(X_train, y_train)

        # held-out performance
        y_pred = model.predict(X_test)
        mse += mean_squared_error(y_test, y_pred)
        r2 += r2_score(y_test, y_pred)

        # in-sample performance, to compare against the test score
        train_pred = model.predict(X_train)
        r2_train += r2_score(y_train, train_pred)

        importance = np.add(importance, model.coef_)

    # take averages over all splits
    mse = mse / n_splits
    r2 = r2 / n_splits
    r2_train = r2_train / n_splits
    importance = importance / n_splits

    feature_importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': importance
    }).sort_values(by='Importance', key=lambda x: x.abs(), ascending=False)

    return {"MSE Test": mse, "R2 test": r2, "R2 Train": r2_train, "Feature Rankings": feature_importances}

In [564]:
# Sweep the lasso penalty for the pop-film subset and print every result.
alphas = [0.001, 0.01, 0.1, 1]
lasso_sweep_popfilm = {}
for alpha in alphas:
    lasso_sweep_popfilm[alpha] = lasso_regression(df_popfilm, alpha)
    print(lasso_sweep_popfilm[alpha])

# all alphas were practically the same, choosing alpha = 0.001 for reporting.
# The sweep result is reused rather than refitted, so the reported numbers
# are exactly the ones printed above.
lasso_metrics_popfilm = lasso_sweep_popfilm[0.001]

{'MSE Test': np.float64(102.16098658304018), 'R2 test': 0.011016504717624553, 'R2 Train': 0.0695083156146159, 'Feature Rankings':              Feature  Importance
10          liveness  -12.716141
2       danceability    8.150820
1           explicit    7.553760
7        speechiness   -7.317726
11           valence   -4.540149
8       acousticness   -3.928910
3             energy   -2.929808
13    time_signature    1.901054
9   instrumentalness    1.813069
6               mode    0.787675
5           loudness    0.155677
4                key    0.148968
12             tempo    0.013091
0        duration_ms    0.000004}
{'MSE Test': np.float64(102.40148463275035), 'R2 test': 0.009337224887650736, 'R2 Train': 0.06809022330038302, 'Feature Rankings':              Feature  Importance
10          liveness  -12.021734
2       danceability    7.507641
7        speechiness   -4.966207
11           valence   -4.402186
8       acousticness   -3.583546
3             energy   -2.334757
13    time_s

In [565]:
# Sweep the lasso penalty for the sad subset and print every result.
alphas = [0.001, 0.01, 0.1, 1]
lasso_sweep_sad = {}
for alpha in alphas:
    lasso_sweep_sad[alpha] = lasso_regression(df_sad, alpha)
    print(lasso_sweep_sad[alpha])

# all alphas were practically the same, choosing alpha = 0.001 for reporting.
# The sweep result is reused rather than refitted, so the reported numbers
# are exactly the ones printed above.
lasso_metrics_sad = lasso_sweep_sad[0.001]


{'MSE Test': np.float64(105.16217712415458), 'R2 test': 0.04409313718577387, 'R2 Train': 0.0886647992455869, 'Feature Rankings':              Feature  Importance
10          liveness    7.076156
2       danceability    6.241312
7        speechiness    5.541786
13    time_signature    2.339000
1           explicit    1.644246
3             energy    0.803420
6               mode   -0.629735
9   instrumentalness    0.588240
4                key    0.087015
11           valence    0.073195
5           loudness   -0.066420
8       acousticness   -0.002958
12             tempo   -0.000395
0        duration_ms   -0.000054}
{'MSE Test': np.float64(104.65941233162474), 'R2 test': 0.0472418220927626, 'R2 Train': 0.0881945887824878, 'Feature Rankings':              Feature  Importance
10          liveness    6.535346
2       danceability    5.537592
7        speechiness    4.985828
13    time_signature    2.263276
1           explicit    1.628532
6               mode   -0.647056
9   instrumental

In [566]:
# Sweep the lasso penalty for the electronic subset and print every result.
alphas = [0.001, 0.01, 0.1, 1]
lasso_sweep_electronic = {}
for alpha in alphas:
    lasso_sweep_electronic[alpha] = lasso_regression(df_electronic, alpha)
    print(lasso_sweep_electronic[alpha])

# all alphas were practically the same, choosing alpha = 0.001 for reporting.
# The sweep result is reused rather than refitted, so the reported numbers
# are exactly the ones printed above.
lasso_metrics_electronic = lasso_sweep_electronic[0.001]

{'MSE Test': np.float64(266.0275593230619), 'R2 test': -0.00025403741016709656, 'R2 Train': 0.04108219699012365, 'Feature Rankings':              Feature  Importance
11           valence   -8.903116
3             energy   -5.969997
2       danceability    5.362305
13    time_signature    4.456708
7        speechiness   -3.055319
1           explicit    2.946665
9   instrumentalness    2.827536
10          liveness   -1.193351
6               mode   -0.905163
8       acousticness   -0.901712
5           loudness    0.476455
4                key   -0.096487
12             tempo    0.002675
0        duration_ms   -0.000018}
{'MSE Test': np.float64(264.2922135696547), 'R2 test': 0.0014938374237249785, 'R2 Train': 0.040920045169034754, 'Feature Rankings':              Feature  Importance
11           valence   -8.696703
3             energy   -5.267277
2       danceability    4.765087
13    time_signature    4.384081
1           explicit    2.852612
9   instrumentalness    2.716644
7       

In [567]:
# Sweep the lasso penalty for the metal subset and print every result.
alphas = [0.001, 0.01, 0.1, 1]
lasso_sweep_metal = {}
for alpha in alphas:
    lasso_sweep_metal[alpha] = lasso_regression(df_metal, alpha)
    print(lasso_sweep_metal[alpha])

# all alphas were practically the same, choosing alpha = 0.01 for reporting.
# The sweep result is reused rather than refitted, so the reported numbers
# are exactly the ones printed above.
lasso_metrics_metal = lasso_sweep_metal[0.01]

{'MSE Test': np.float64(794.411271702152), 'R2 test': 0.0069272492627683886, 'R2 Train': 0.048371849610917804, 'Feature Rankings':              Feature  Importance
3             energy  -29.450797
9   instrumentalness  -11.076611
1           explicit   10.580134
7        speechiness   -6.647940
10          liveness   -5.410234
2       danceability    5.043641
6               mode   -3.632942
5           loudness    1.643652
11           valence    1.609745
13    time_signature    1.409379
4                key   -0.056386
12             tempo    0.015201
8       acousticness   -0.004841
0        duration_ms    0.000024}
{'MSE Test': np.float64(792.6676815615277), 'R2 test': 0.00943040915192619, 'R2 Train': 0.04820231069345525, 'Feature Rankings':              Feature  Importance
3             energy  -28.839473
9   instrumentalness  -10.889497
1           explicit   10.415428
10          liveness   -4.984078
7        speechiness   -4.846774
2       danceability    4.596769
6            

In [568]:
# Sweep the lasso penalty for the acoustic subset and print every result.
alphas = [0.001, 0.01, 0.1, 1]
lasso_sweep_acoustic = {}
for alpha in alphas:
    lasso_sweep_acoustic[alpha] = lasso_regression(df_acoustic, alpha)
    print(lasso_sweep_acoustic[alpha])

# all alphas were practically the same, choosing alpha = 0.1 for reporting.
# The sweep result is reused rather than refitted, so the reported numbers
# are exactly the ones printed above.
lasso_metrics_acoustic = lasso_sweep_acoustic[0.1]

{'MSE Test': np.float64(212.40821966880875), 'R2 test': 0.07421479505076094, 'R2 Train': 0.10805974146988338, 'Feature Rankings':              Feature  Importance
7        speechiness  -30.141088
1           explicit   -9.880420
11           valence   -8.182318
2       danceability    7.090260
3             energy   -5.835054
9   instrumentalness   -4.638752
10          liveness   -3.395895
6               mode    1.702272
13    time_signature    0.646332
8       acousticness    0.624736
5           loudness   -0.216442
4                key   -0.053887
12             tempo    0.015887
0        duration_ms   -0.000015}
{'MSE Test': np.float64(212.75544887229927), 'R2 test': 0.07630430359162385, 'R2 Train': 0.1070682680634096, 'Feature Rankings':              Feature  Importance
7        speechiness  -15.411877
1           explicit   -9.920013
11           valence   -8.178194
2       danceability    6.552033
3             energy   -6.091219
9   instrumentalness   -4.333491
10          li

In [569]:
# Dump the chosen lasso results for all five genres into one text report;
# sep='\n\n' leaves an empty line between consecutive genre sections.
with open('lasso_report.txt', 'w') as file:
    print(
        f'POPFILM METRICS ==> \n{lasso_metrics_popfilm}',
        f'SAD METRICS ==> \n{lasso_metrics_sad}',
        f'ELECTRONIC METRICS ==> \n{lasso_metrics_electronic}',
        f'METAL METRICS ==> \n{lasso_metrics_metal}',
        f'ACOUSTIC METRICS ==> \n{lasso_metrics_acoustic}',
        sep='\n\n',
        file=file,
    )

## Multiple Linear Regression

In [570]:
def mlr(df_genre, n_splits=1000):
    """Average ordinary-least-squares performance for one genre over repeated 80/20 splits.

    Every column of ``df_genre`` other than ``popularity`` is used as a
    predictor (one-hot encoded with ``pd.get_dummies``). For each seed in
    ``1..n_splits`` the data is split with that seed, a ``LinearRegression``
    is fitted on the training part, and test MSE, test R^2, training R^2 and
    the fitted coefficients are accumulated; their averages are returned.

    Parameters
    ----------
    df_genre : pandas.DataFrame
        Rows for a single genre; must contain a ``popularity`` column.
    n_splits : int, default 1000
        Number of seeded train/test splits to average over.

    Returns
    -------
    dict
        ``"MSE"`` and ``"R2"`` (test-set averages), ``"Training R2"``
        (in-sample average, previously computed but never returned) and
        ``"Feature Rankings"``: a DataFrame of mean coefficients sorted by
        absolute value, largest first.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.

    Notes
    -----
    Features are not standardised here, so coefficient magnitudes are on
    each feature's raw scale and the ranking is not a scale-free importance.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    # separate features / target
    X = pd.get_dummies(df_genre.drop(columns=['popularity']), drop_first=True)
    y = df_genre['popularity']

    importance = np.zeros(X.columns.shape)
    r2 = 0
    r2_train = 0
    mse = 0

    for random_state in range(1, n_splits + 1):
        # split the data (seeded, so the averages are reproducible)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_state
        )
        model = LinearRegression()

        # train the model
        model.fit(X_train, y_train)

        # held-out performance
        y_pred = model.predict(X_test)
        mse += mean_squared_error(y_test, y_pred)
        r2 += r2_score(y_test, y_pred)

        # in-sample performance, to compare against the test score
        train_pred = model.predict(X_train)
        r2_train += r2_score(y_train, train_pred)

        importance = np.add(importance, model.coef_)

    # take averages over all splits
    mse = mse / n_splits
    r2 = r2 / n_splits
    r2_train = r2_train / n_splits
    importance = importance / n_splits

    feature_importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': importance
    }).sort_values(by='Importance', key=lambda x: x.abs(), ascending=False)

    return {"MSE": mse, "R2": r2, "Training R2": r2_train, "Feature Rankings": feature_importances}

In [571]:
# Linear-regression metrics for the pop-film subset.
mlr_metrics_popfilm = mlr(df_genre=df_popfilm)

In [572]:
# Linear-regression metrics for the sad subset.
mlr_metrics_sad = mlr(df_genre=df_sad)

In [573]:
# Linear-regression metrics for the electronic subset.
mlr_metrics_electronic = mlr(df_genre=df_electronic)

In [574]:
# Linear-regression metrics for the metal subset.
mlr_metrics_metal = mlr(df_genre=df_metal)

In [575]:
# Linear-regression metrics for the acoustic subset.
mlr_metrics_acoustic = mlr(df_genre=df_acoustic)

In [576]:
# Dump the linear-regression results for all five genres into one text report;
# sep='\n\n' leaves an empty line between consecutive genre sections.
with open('mlr_report.txt', 'w') as file:
    print(
        f'POPFILM METRICS ==> \n{mlr_metrics_popfilm}',
        f'SAD METRICS ==> \n{mlr_metrics_sad}',
        f'ELECTRONIC METRICS ==> \n{mlr_metrics_electronic}',
        f'METAL METRICS ==> \n{mlr_metrics_metal}',
        f'ACOUSTIC METRICS ==> \n{mlr_metrics_acoustic}',
        sep='\n\n',
        file=file,
    )

## Dummy Model (Always Predicting Mean Popularity)

In [577]:
def dummy(df_genre, n_splits=1000):
    """Baseline that always predicts the training-set mean of ``popularity``.

    For each seed in ``1..n_splits`` the target is split 80/20 with that
    seed, the mean of the training part is used as the prediction for every
    row, and test MSE, test R^2 and training R^2 are accumulated; their
    averages are returned.

    Parameters
    ----------
    df_genre : pandas.DataFrame
        Rows for a single genre; must contain a ``popularity`` column.
    n_splits : int, default 1000
        Number of seeded train/test splits to average over.

    Returns
    -------
    dict
        ``"MSE Test"``, ``"R2 Test"`` and ``"R2 Train"``, each averaged over
        the splits.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    y = df_genre['popularity']
    r2 = 0
    r2_train = 0
    mse = 0

    for random_state in range(1, n_splits + 1):
        # Only the target is split: the baseline never looks at the features,
        # and the partition is determined by the row count and the seed, so
        # the rows selected match a split that also carries the features.
        y_train, y_test = train_test_split(y, test_size=0.2, random_state=random_state)

        # find mean of training data
        mean = np.mean(y_train)

        # always predict the mean
        y_pred = np.full(y_test.shape, mean)

        # update MSE and R^2 results
        mse += mean_squared_error(y_test, y_pred)
        r2 += r2_score(y_test, y_pred)
        train_pred = np.full(y_train.shape, mean)
        r2_train += r2_score(y_train, train_pred)

    # take average MSE and R^2 results
    mse = mse / n_splits
    r2 = r2 / n_splits
    r2_train = r2_train / n_splits
    return {"MSE Test": mse, "R2 Test": r2, "R2 Train": r2_train}

In [578]:
# Print the mean-predictor baseline for each genre, one line per genre.
for label, genre_frame in [
    ("Pop-film dummy results: ", df_popfilm),
    ("Acoustic dummy results: ", df_acoustic),
    ("metal dummy results : ", df_metal),
    ("Sad dummy results: ", df_sad),
    ("Electronic dummy results: ", df_electronic),
]:
    print(label, dummy(genre_frame))

Pop-film dummy results:  {'MSE Test': np.float64(105.4617986493989), 'R2 Test': -0.006622696856615832, 'R2 Train': 0.0}
Acoustic dummy results:  {'MSE Test': np.float64(230.004400656637), 'R2 Test': -0.006741684025135964, 'R2 Train': 0.0}
metal dummy results :  {'MSE Test': np.float64(805.3145575747284), 'R2 Test': -0.007177148780794016, 'R2 Train': 0.0}
Sad dummy results:  {'MSE Test': np.float64(110.42494542922232), 'R2 Test': -0.006272618719272294, 'R2 Train': 0.0}
Electronic dummy results:  {'MSE Test': np.float64(267.39590949438326), 'R2 Test': -0.006503677288971889, 'R2 Train': 0.0}
