In [380]:
import pandas as pd
# Read the track table from CSV; the path is relative to the current working directory.
df = pd.read_csv('spotify_cleaned.csv')

In [381]:
# Drop track_name before the per-genre modelling below.
# errors='ignore' keeps this cell idempotent: because it rebinds `df`, re-running
# it after the column is already gone used to raise KeyError.
df = df.drop(columns=['track_name'], errors='ignore')

In [382]:
# One sub-frame per genre of interest, selected by exact match on track_genre.
# The unpacking order follows the order of the genre labels in the tuple.
df_popfilm, df_sad, df_electronic, df_metal, df_acoustic = (
    df[df['track_genre'] == genre_label]
    for genre_label in ('pop-film', 'sad', 'electronic', 'metal', 'acoustic')
)

## Random Forest

In [383]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso, LinearRegression


In [384]:
def random_forest(df_genre):
    """Fit a random-forest regressor for 'popularity' and score it on a hold-out split.

    Parameters
    ----------
    df_genre : pandas.DataFrame
        Frame to model (the notebook passes one genre's rows). It must contain
        a 'popularity' column, used as the target; every other column is
        treated as a feature after one-hot encoding.

    Returns
    -------
    dict
        'MSE' and 'R2' computed on the 20% test split, and 'Feature Rankings',
        a DataFrame of feature names and forest importances sorted from most
        to least important.
    """
    target = df_genre['popularity']

    # One-hot encode the remaining columns, dropping the first level of each
    # encoded column.
    features = pd.get_dummies(df_genre.drop(columns=['popularity']), drop_first=True)

    # Fixed seed so the 80/20 split is the same on every run.
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(features_train, target_train)

    predicted = forest.predict(features_test)

    ranking = pd.DataFrame(
        {'Feature': features.columns, 'Importance': forest.feature_importances_}
    ).sort_values(by='Importance', ascending=False)

    return {
        "MSE": mean_squared_error(target_test, predicted),
        "R2": r2_score(target_test, predicted),
        "Feature Rankings": ranking,
    }

In [385]:
# Random-forest results for the pop-film subset (dict with MSE, R2 and feature rankings).
metrics_popfilm = random_forest(df_popfilm)

In [386]:
# Random-forest results for the sad subset (dict with MSE, R2 and feature rankings).
metrics_sad = random_forest(df_sad)

In [387]:
# Random-forest results for the electronic subset (dict with MSE, R2 and feature rankings).
metrics_electronic = random_forest(df_electronic)

In [388]:
# Random-forest results for the metal subset (dict with MSE, R2 and feature rankings).
metrics_metal = random_forest(df_metal)

In [389]:
# Random-forest results for the acoustic subset (dict with MSE, R2 and feature rankings).
metrics_acoustic = random_forest(df_acoustic)

In [390]:
# Write the five per-genre random-forest result dicts to a plain-text report:
# one headed section per genre, with a blank line between sections.
with open('random_forest_report.txt', 'w') as file:
    print(
        f'POPFILM METRICS ==> \n{metrics_popfilm}',
        f'SAD METRICS ==> \n{metrics_sad}',
        f'ELECTRONIC METRICS ==> \n{metrics_electronic}',
        f'METAL METRICS ==> \n{metrics_metal}',
        f'ACOUSTIC METRICS ==> \n{metrics_acoustic}',
        sep='\n\n',  # newline ending a section + one empty line before the next
        file=file,
    )


## Ridge Regression

In [391]:
def ridge_regression(df_genre, alpha_parameter, standardize=False):
    """Fit a ridge regression for 'popularity' and score it on a hold-out split.

    Parameters
    ----------
    df_genre : pandas.DataFrame
        Frame to model (the notebook passes one genre's rows). It must contain
        a 'popularity' column, used as the target; every other column is
        treated as a feature after one-hot encoding.
    alpha_parameter : float
        Regularisation strength passed to ``Ridge(alpha=...)``.
    standardize : bool, default False
        When True, features are z-scored (scaler fitted on the training split
        only) before fitting. The default False reproduces the original
        behaviour exactly. Ridge's penalty acts on the raw coefficient values,
        so with unscaled inputs a feature measured in large units (for example
        duration_ms) gets a tiny coefficient, is barely penalised and always
        lands at the bottom of the ranking. Set this to True when the
        coefficients are meant to be compared with each other.

    Returns
    -------
    dict
        'MSE' and 'R2' computed on the 20% test split, and 'Feature Rankings',
        a DataFrame of feature names and fitted coefficients sorted by
        absolute coefficient, largest first. With ``standardize=True`` the
        coefficients are per standard deviation of each feature.
    """
    # separate features / target
    X = pd.get_dummies(df_genre.drop(columns=['popularity']), drop_first=True)
    y = df_genre['popularity']

    # split the data (fixed seed so every run sees the same 80/20 split)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    if standardize:
        # Imported here so the function does not depend on the notebook's
        # import cell having been extended.
        from sklearn.preprocessing import StandardScaler

        # Cast first so boolean dummy columns are scaled like numeric ones.
        X_train = X_train.astype('float64')
        X_test = X_test.astype('float64')
        # Fit on the training rows only so no test-split statistics leak in.
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    # train the model
    model = Ridge(alpha=alpha_parameter)
    model.fit(X_train, y_train)

    # predictions
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Rank by magnitude: the sign only tells the direction of the effect.
    feature_importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.coef_
    }).sort_values(by='Importance', key=lambda x: x.abs(), ascending=False)

    return {"MSE": mse, "R2": r2, "Feature Rankings": feature_importances}

In [392]:
# Fit ridge on the pop-film subset once per candidate alpha and print each full result dict.
alphas = [1, 10, 50, 100]
for alpha in alphas:
    print(ridge_regression(df_popfilm, alpha))

# The author recorded alpha = 10 as the best ridge setting for this data; keep that fit.
# NOTE(review): the choice is made on the same hold-out split the metrics are reported on.
ridge_metrics_popfilm = ridge_regression(df_popfilm, alpha_parameter=10)

{'MSE': np.float64(74.87049189788756), 'R2': 0.03504474958128978, 'Feature Rankings':              Feature  Importance
10          liveness  -12.937415
2       danceability    7.519886
7        speechiness   -6.299753
11           valence   -4.442339
8       acousticness   -3.926994
3             energy   -3.352935
13    time_signature    2.119288
9   instrumentalness    2.008233
6               mode    0.632665
5           loudness    0.111772
4                key    0.099261
12             tempo    0.019158
0        duration_ms    0.000005
1           explicit    0.000000}
{'MSE': np.float64(74.54340871011487), 'R2': 0.039260304085665165, 'Feature Rankings':              Feature  Importance
10          liveness   -7.740536
2       danceability    4.389593
11           valence   -3.524599
8       acousticness   -3.274766
13    time_signature    2.175748
3             energy   -2.113424
7        speechiness   -1.704145
6               mode    0.659032
9   instrumentalness    0.375366
4

In [393]:
# Fit ridge on the sad subset once per candidate alpha and print each full result dict.
alphas = [1, 10, 50, 100]
for alpha in alphas:
    print(ridge_regression(df_sad, alpha))

# The author recorded alpha = 1 as the best ridge setting for this data; keep that fit.
# NOTE(review): the choice is made on the same hold-out split the metrics are reported on.
ridge_metrics_sad = ridge_regression(df_sad, alpha_parameter=1)

{'MSE': np.float64(84.68114657080034), 'R2': 0.16621794483760666, 'Feature Rankings':              Feature  Importance
10          liveness    6.432078
7        speechiness    5.092823
2       danceability    4.854923
13    time_signature    1.950326
1           explicit    1.127747
3             energy    0.848492
6               mode   -0.781926
11           valence    0.662607
8       acousticness    0.314775
9   instrumentalness   -0.098430
5           loudness   -0.098079
4                key    0.078060
12             tempo   -0.002580
0        duration_ms   -0.000047}
{'MSE': np.float64(85.49943731965675), 'R2': 0.15816094313261153, 'Feature Rankings':              Feature  Importance
10          liveness    3.801057
7        speechiness    3.165662
2       danceability    2.704601
13    time_signature    1.768608
1           explicit    1.151061
11           valence    1.047006
6               mode   -0.804795
3             energy    0.506389
8       acousticness    0.207482
5 

In [394]:
# Fit ridge on the electronic subset once per candidate alpha and print each full result dict.
alphas = [1, 10, 50, 100]
for alpha in alphas:
    print(ridge_regression(df_electronic, alpha))

# The author recorded alpha = 1 as the best ridge setting for this data; keep that fit.
# NOTE(review): the choice is made on the same hold-out split the metrics are reported on.
ridge_metrics_electronic = ridge_regression(df_electronic, alpha_parameter=1)

{'MSE': np.float64(257.4884431563126), 'R2': -0.004956292325423606, 'Feature Rankings':              Feature  Importance
2       danceability    6.415096
11           valence   -5.402970
9   instrumentalness    4.866043
13    time_signature    4.734888
7        speechiness   -2.658372
3             energy   -2.526052
1           explicit    2.414387
10          liveness   -1.680303
6               mode   -1.083148
8       acousticness   -0.530826
5           loudness    0.361995
4                key   -0.119542
12             tempo   -0.004300
0        duration_ms   -0.000016}
{'MSE': np.float64(258.20569695886957), 'R2': -0.0077556751374763255, 'Feature Rankings':              Feature  Importance
9   instrumentalness    4.504116
13    time_signature    4.241446
11           valence   -4.190305
2       danceability    3.670144
1           explicit    2.167991
3             energy   -1.553853
10          liveness   -1.537229
6               mode   -1.082887
7        speechiness   -0.896

In [395]:
# Fit ridge on the metal subset once per candidate alpha and print each full result dict.
alphas = [1, 10, 50, 100]
for alpha in alphas:
    print(ridge_regression(df_metal, alpha))

# The author recorded alpha = 10 as the best ridge setting for this data; keep that fit.
# NOTE(review): the choice is made on the same hold-out split the metrics are reported on.
ridge_metrics_metal = ridge_regression(df_metal, alpha_parameter=10)

{'MSE': np.float64(766.7001760295003), 'R2': 0.019484256022474167, 'Feature Rankings':              Feature  Importance
3             energy  -28.307266
9   instrumentalness  -12.548890
7        speechiness  -10.017717
1           explicit    9.088796
10          liveness   -4.397217
6               mode   -3.489230
2       danceability    1.801218
11           valence    1.707164
5           loudness    1.511889
13    time_signature    0.366127
4                key   -0.248004
8       acousticness   -0.225350
12             tempo    0.036431
0        duration_ms    0.000027}
{'MSE': np.float64(766.1013141422352), 'R2': 0.02025012712475094, 'Feature Rankings':              Feature  Importance
3             energy  -15.221705
9   instrumentalness   -9.647642
1           explicit    7.817285
7        speechiness   -3.751083
10          liveness   -3.700901
6               mode   -3.095329
8       acousticness    2.517037
2       danceability    2.464320
5           loudness    1.083274
1

In [396]:
# Fit ridge on the acoustic subset once per candidate alpha and print each full result dict.
alphas = [1, 10, 50, 100]
for alpha in alphas:
    print(ridge_regression(df_acoustic, alpha))

# The author recorded alpha = 10 as the best ridge setting for this data; keep that fit.
# NOTE(review): the choice is made on the same hold-out split the metrics are reported on.
ridge_metrics_acoustic = ridge_regression(df_acoustic, alpha_parameter=10)


{'MSE': np.float64(217.0587660603769), 'R2': 0.05218659724102659, 'Feature Rankings':              Feature  Importance
1           explicit   -9.921522
11           valence   -9.081233
3             energy   -7.289462
7        speechiness   -6.180774
9   instrumentalness   -5.989467
2       danceability    5.767073
10          liveness   -2.887156
6               mode    2.195162
13    time_signature    1.244514
8       acousticness    0.323252
5           loudness   -0.228441
4                key   -0.095230
12             tempo    0.012283
0        duration_ms   -0.000012}
{'MSE': np.float64(216.55666376455736), 'R2': 0.05437908775486955, 'Feature Rankings':              Feature  Importance
1           explicit   -8.525650
11           valence   -7.329106
3             energy   -5.677203
9   instrumentalness   -4.248166
2       danceability    3.323049
6               mode    2.331812
10          liveness   -2.225460
7        speechiness   -1.227657
8       acousticness    1.144820
1

In [397]:
# Write the five per-genre ridge result dicts to a plain-text report:
# one headed section per genre, with a blank line between sections.
with open('ridge_report.txt', 'w') as file:
    print(
        f'POPFILM METRICS ==> \n{ridge_metrics_popfilm}',
        f'SAD METRICS ==> \n{ridge_metrics_sad}',
        f'ELECTRONIC METRICS ==> \n{ridge_metrics_electronic}',
        f'METAL METRICS ==> \n{ridge_metrics_metal}',
        f'ACOUSTIC METRICS ==> \n{ridge_metrics_acoustic}',
        sep='\n\n',  # newline ending a section + one empty line before the next
        file=file,
    )

## Lasso Regression

In [398]:
def lasso_regression(df_genre, alpha_parameter, standardize=False):
    """Fit a lasso regression for 'popularity' and score it on a hold-out split.

    Parameters
    ----------
    df_genre : pandas.DataFrame
        Frame to model (the notebook passes one genre's rows). It must contain
        a 'popularity' column, used as the target; every other column is
        treated as a feature after one-hot encoding.
    alpha_parameter : float
        Regularisation strength passed to ``Lasso(alpha=...)``.
    standardize : bool, default False
        When True, features are z-scored (scaler fitted on the training split
        only) before fitting. The default False reproduces the original
        behaviour exactly. Lasso's penalty acts on the raw coefficient values,
        so with unscaled inputs the amount of shrinkage each feature receives,
        and its position in the ranking, depends on the units it is measured
        in (for example duration_ms gets a tiny coefficient). Set this to True
        when the coefficients are meant to be compared with each other.

    Returns
    -------
    dict
        'MSE' and 'R2' computed on the 20% test split, and 'Feature Rankings',
        a DataFrame of feature names and fitted coefficients sorted by
        absolute coefficient, largest first. With ``standardize=True`` the
        coefficients are per standard deviation of each feature.
    """
    # separate features / target
    X = pd.get_dummies(df_genre.drop(columns=['popularity']), drop_first=True)
    y = df_genre['popularity']

    # split the data (fixed seed so every run sees the same 80/20 split)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    if standardize:
        # Imported here so the function does not depend on the notebook's
        # import cell having been extended.
        from sklearn.preprocessing import StandardScaler

        # Cast first so boolean dummy columns are scaled like numeric ones.
        X_train = X_train.astype('float64')
        X_test = X_test.astype('float64')
        # Fit on the training rows only so no test-split statistics leak in.
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    # train the model
    model = Lasso(alpha=alpha_parameter)
    model.fit(X_train, y_train)

    # predictions
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Rank by magnitude: the sign only tells the direction of the effect.
    feature_importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.coef_
    }).sort_values(by='Importance', key=lambda x: x.abs(), ascending=False)

    return {"MSE": mse, "R2": r2, "Feature Rankings": feature_importances}

In [399]:
# Fit lasso on the pop-film subset once per candidate alpha and print each full result dict.
alphas = [0.001, 0.01, 0.1, 1]
for alpha in alphas:
    print(lasso_regression(df_popfilm, alpha))

# The author recorded alpha = 0.01 as the best setting for this data; keep that fit.
# NOTE(review): the choice is made on the same hold-out split the metrics are reported on.
lasso_metrics_popfilm = lasso_regression(df_popfilm, alpha_parameter=0.01)

{'MSE': np.float64(75.15533369472497), 'R2': 0.0313736158618555, 'Feature Rankings':              Feature  Importance
10          liveness  -14.004759
7        speechiness   -8.878961
2       danceability    8.070172
11           valence   -4.498589
8       acousticness   -4.023086
3             energy   -3.284210
9   instrumentalness    2.630336
13    time_signature    2.069146
6               mode    0.614336
5           loudness    0.115260
4                key    0.099705
12             tempo    0.019574
0        duration_ms    0.000005
1           explicit    0.000000}
{'MSE': np.float64(74.88806608684777), 'R2': 0.034818247718150785, 'Feature Rankings':              Feature  Importance
10          liveness  -13.385158
2       danceability    7.433678
7        speechiness   -5.933218
11           valence   -4.387774
8       acousticness   -3.640065
3             energy   -2.745106
13    time_signature    2.091741
6               mode    0.607219
4                key    0.096921
5 

In [400]:
# Fit lasso on the sad subset once per candidate alpha and print each full result dict.
alphas = [0.001, 0.01, 0.1, 1]
for alpha in alphas:
    print(lasso_regression(df_sad, alpha))

# The author recorded alpha = 0.001 as the best performing setting for this data; keep that fit.
# NOTE(review): the choice is made on the same hold-out split the metrics are reported on.
lasso_metrics_sad = lasso_regression(df_sad, alpha_parameter=0.001)


{'MSE': np.float64(84.5764732748345), 'R2': 0.1672485723074184, 'Feature Rankings':              Feature  Importance
10          liveness    6.907460
7        speechiness    5.430647
2       danceability    5.277527
13    time_signature    1.958261
1           explicit    1.117883
3             energy    0.844902
6               mode   -0.772749
11           valence    0.537565
8       acousticness    0.308635
5           loudness   -0.095478
9   instrumentalness   -0.087238
4                key    0.079126
12             tempo   -0.002470
0        duration_ms   -0.000047}
{'MSE': np.float64(84.69998223361668), 'R2': 0.16603248634667533, 'Feature Rankings':              Feature  Importance
10          liveness    6.337774
7        speechiness    5.094362
2       danceability    4.504696
13    time_signature    1.880680
1           explicit    1.110298
6               mode   -0.763699
11           valence    0.598381
4                key    0.078474
5           loudness   -0.075554
8   

In [401]:
# Fit lasso on the electronic subset once per candidate alpha and print each full result dict.
alphas = [0.001, 0.01, 0.1, 1]
for alpha in alphas:
    print(lasso_regression(df_electronic, alpha))

# The author recorded alpha = 0.001 as the best setting for this data; keep that fit.
# NOTE(review): the choice is made on the same hold-out split the metrics are reported on.
lasso_metrics_electronic = lasso_regression(df_electronic, alpha_parameter=0.001)

{'MSE': np.float64(257.4234618416744), 'R2': -0.004702675579643767, 'Feature Rankings':              Feature  Importance
2       danceability    6.974725
11           valence   -5.610262
9   instrumentalness    4.878076
13    time_signature    4.769109
7        speechiness   -3.156981
3             energy   -2.604761
1           explicit    2.441608
10          liveness   -1.628693
6               mode   -1.077942
8       acousticness   -0.478820
5           loudness    0.369510
4                key   -0.118186
12             tempo   -0.003580
0        duration_ms   -0.000016}
{'MSE': np.float64(257.78968399108726), 'R2': -0.006132010616703987, 'Feature Rankings':              Feature  Importance
2       danceability    6.352285
11           valence   -5.369759
9   instrumentalness    4.819840
13    time_signature    4.757888
1           explicit    2.304887
3             energy   -1.929768
7        speechiness   -1.582368
10          liveness   -1.423807
6               mode   -1.0468

In [402]:
# Fit lasso on the metal subset once per candidate alpha and print each full result dict.
alphas = [0.001, 0.01, 0.1, 1]
for alpha in alphas:
    print(lasso_regression(df_metal, alpha))

# The author recorded alpha = 0.01 as the best setting for this data; keep that fit.
# NOTE(review): the choice is made on the same hold-out split the metrics are reported on.
lasso_metrics_metal = lasso_regression(df_metal, alpha_parameter=0.01)

{'MSE': np.float64(768.2320652882538), 'R2': 0.017525157038023043, 'Feature Rankings':              Feature  Importance
3             energy  -31.492067
7        speechiness  -13.158363
9   instrumentalness  -12.886571
1           explicit    9.329045
10          liveness   -4.363607
6               mode   -3.559143
11           valence    2.093803
8       acousticness   -2.052757
5           loudness    1.615100
2       danceability    1.088714
13    time_signature    0.458437
4                key   -0.236406
12             tempo    0.037154
0        duration_ms    0.000027}
{'MSE': np.float64(767.6313257280727), 'R2': 0.01829342945428636, 'Feature Rankings':              Feature  Importance
3             energy  -30.778160
9   instrumentalness  -12.629118
7        speechiness   -9.671572
1           explicit    9.158443
10          liveness   -4.101111
6               mode   -3.517304
11           valence    2.068918
5           loudness    1.596679
2       danceability    0.583025
1

In [403]:
# Fit lasso on the acoustic subset once per candidate alpha and print each full result dict.
alphas = [0.001, 0.01, 0.1, 1]
for alpha in alphas:
    print(lasso_regression(df_acoustic, alpha))

# The author recorded alpha = 0.1 as the best setting for this data; keep that fit.
# NOTE(review): the choice is made on the same hold-out split the metrics are reported on.
lasso_metrics_acoustic = lasso_regression(df_acoustic, alpha_parameter=0.1)

{'MSE': np.float64(216.35113536436808), 'R2': 0.0552765524179315, 'Feature Rankings':              Feature  Importance
7        speechiness  -17.519816
1           explicit   -9.966543
11           valence   -9.157951
3             energy   -7.256143
2       danceability    6.183305
9   instrumentalness   -6.162985
10          liveness   -2.729455
6               mode    2.119447
13    time_signature    1.247088
5           loudness   -0.235661
8       acousticness    0.198656
4                key   -0.093393
12             tempo    0.012942
0        duration_ms   -0.000013}
{'MSE': np.float64(217.38255148895692), 'R2': 0.05077274893443107, 'Feature Rankings':              Feature  Importance
1           explicit  -10.014199
11           valence   -9.185877
3             energy   -7.726312
9   instrumentalness   -5.887246
2       danceability    5.677233
7        speechiness   -2.614721
10          liveness   -2.297981
6               mode    2.149135
13    time_signature    1.200322
5

In [404]:
# Write the five per-genre lasso result dicts to a plain-text report:
# one headed section per genre, with a blank line between sections.
with open('lasso_report.txt', 'w') as file:
    print(
        f'POPFILM METRICS ==> \n{lasso_metrics_popfilm}',
        f'SAD METRICS ==> \n{lasso_metrics_sad}',
        f'ELECTRONIC METRICS ==> \n{lasso_metrics_electronic}',
        f'METAL METRICS ==> \n{lasso_metrics_metal}',
        f'ACOUSTIC METRICS ==> \n{lasso_metrics_acoustic}',
        sep='\n\n',  # newline ending a section + one empty line before the next
        file=file,
    )

## Multiple Linear Regression

In [405]:
def mlr(df_genre, standardize=False):
    """Fit an ordinary least-squares regression for 'popularity' and score it on a hold-out split.

    Parameters
    ----------
    df_genre : pandas.DataFrame
        Frame to model (the notebook passes one genre's rows). It must contain
        a 'popularity' column, used as the target; every other column is
        treated as a feature after one-hot encoding.
    standardize : bool, default False
        When True, features are z-scored (scaler fitted on the training split
        only) before fitting. The default False reproduces the original
        behaviour exactly. Raw coefficients are expressed per unit of each
        feature, so ranking them by magnitude mostly reflects the units (for
        example duration_ms gets a tiny coefficient). Set this to True when
        the coefficients are meant to be compared with each other.

    Returns
    -------
    dict
        'MSE' and 'R2' computed on the 20% test split, and 'Feature Rankings',
        a DataFrame of feature names and fitted coefficients sorted by
        absolute coefficient, largest first. With ``standardize=True`` the
        coefficients are per standard deviation of each feature.
    """
    # separate features / target
    X = pd.get_dummies(df_genre.drop(columns=['popularity']), drop_first=True)
    y = df_genre['popularity']

    # split the data (fixed seed so every run sees the same 80/20 split)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    if standardize:
        # Imported here so the function does not depend on the notebook's
        # import cell having been extended.
        from sklearn.preprocessing import StandardScaler

        # Cast first so boolean dummy columns are scaled like numeric ones.
        X_train = X_train.astype('float64')
        X_test = X_test.astype('float64')
        # Fit on the training rows only so no test-split statistics leak in.
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    # train the model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # predictions
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Rank by magnitude: the sign only tells the direction of the effect.
    feature_importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.coef_
    }).sort_values(by='Importance', key=lambda x: x.abs(), ascending=False)

    return {"MSE": mse, "R2": r2, "Feature Rankings": feature_importances}

In [406]:
# Linear-regression results for the pop-film subset (dict with MSE, R2 and feature rankings).
mlr_metrics_popfilm = mlr(df_popfilm)

In [407]:
# Linear-regression results for the sad subset (dict with MSE, R2 and feature rankings).
mlr_metrics_sad = mlr(df_sad)

In [408]:
# Linear-regression results for the electronic subset (dict with MSE, R2 and feature rankings).
mlr_metrics_electronic = mlr(df_electronic)

In [409]:
# Linear-regression results for the metal subset (dict with MSE, R2 and feature rankings).
mlr_metrics_metal = mlr(df_metal)

In [410]:
# Linear-regression results for the acoustic subset (dict with MSE, R2 and feature rankings).
mlr_metrics_acoustic = mlr(df_acoustic)

In [411]:
# Write the five per-genre linear-regression result dicts to a plain-text report:
# one headed section per genre, with a blank line between sections.
with open('mlr_report.txt', 'w') as file:
    print(
        f'POPFILM METRICS ==> \n{mlr_metrics_popfilm}',
        f'SAD METRICS ==> \n{mlr_metrics_sad}',
        f'ELECTRONIC METRICS ==> \n{mlr_metrics_electronic}',
        f'METAL METRICS ==> \n{mlr_metrics_metal}',
        f'ACOUSTIC METRICS ==> \n{mlr_metrics_acoustic}',
        sep='\n\n',  # newline ending a section + one empty line before the next
        file=file,
    )