# Project Check-In Week 3: Linear Regression

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklego.linear_model import LADRegression
# from sklearn.feature_selection import SequentialFeatureSelector

In [3]:
spotify_cleaned = pd.read_excel('../Data/clean_data.xlsx')
spotify_cleaned.columns

Index(['track_id', 'artists', 'album_name', 'track_name', 'popularity',
       'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'track_genre'],
      dtype='object')

In [49]:
# Print correlations between numeric columns -> only print those that have correlations above 0.2
correlations = spotify_cleaned.corr(numeric_only=True)
correlations[correlations.abs() > 0.2].replace(1, np.nan).dropna(how='all', axis=1).dropna(how='all', axis=0)

Unnamed: 0,explicit,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
explicit,,,,,0.325346,,,,,
danceability,,,,0.254835,,,,,0.484892,
energy,,,,0.759726,,-0.733844,,,0.249962,0.247234
loudness,,0.254835,0.759726,,,-0.582554,-0.430668,,0.279826,0.213648
speechiness,0.325346,,,,,,,0.220905,,
acousticness,,,-0.733844,-0.582554,,,,,,-0.217672
instrumentalness,,,,-0.430668,,,,,-0.325794,
liveness,,,,,0.220905,,,,,
valence,,0.484892,0.249962,0.279826,,,-0.325794,,,
tempo,,,0.247234,0.213648,,-0.217672,,,,


We chose loudness as our response variable because the correlation matrix shows that it has the highest correlation with the highest number of other variables. We did not use a function to select oit predictor variables for this check in

In [50]:
# Split data into testing, validation, and training sets
# 60% training, 20% validation, 20% testing

training_data = spotify_cleaned.sample(frac=0.6, random_state=47)
validation_data = spotify_cleaned.drop(training_data.index).sample(frac=0.5, random_state=47)
testing_data = spotify_cleaned.drop(training_data.index).drop(validation_data.index)

In [51]:
# Fit a LAD regression model to the training data using the following variables: instrumentalness, speechiness, energy, valence, danceability, acousticness
# We chose these variables because they have the highest correlation with loudness, our response variable

vars = ["instrumentalness", "energy", "valence", "danceability", "acousticness"]
response = "loudness"
lad_fit = LADRegression()
lad_fit.fit(training_data[vars], y=training_data[response])


In [52]:
# For single predictor variable only
# fig = px.scatter(training_data, x=vars[0], y=response)
# fig.add_trace(go.Scatter(x=training_data[vars[0]], y=lad_fit.intercept_ + lad_fit.coef_[0] * training_data[vars[0]], mode='lines', name='LAD Fit'))

In [53]:
# Fit an LS regression model to the training data using the following variables: instrumentalness, speechiness, energy, valence, danceability, acousticness
ls_fit = LinearRegression()
ls_fit.fit(X=training_data[vars], y=training_data[response])

In [54]:
# Scatter plot for single predictor variable only
# fig = px.scatter(training_data, x=vars[0], y=response)
# fig.add_trace(go.Scatter(x=training_data[vars[0]], y=ls_fit.intercept_ + ls_fit.coef_[0] * training_data[vars[0]], mode='lines', name='LS Fit'))

In [55]:
# # Select predictor variables
# selector = SequentialFeatureSelector(ls_fit, n_features_to_select=2, director="forward", scoring="neg_mean_squared_error", cv=5)
# selector.fit(training_data[vars], training_data[response])
# selector.get_feature_names_out()

In [56]:
# Find the residuals between the true values and the predicted values for the training and validation sets

pred_train_df = pd.DataFrame({'true': training_data[response], 'ls_pred': ls_fit.predict(training_data[vars]), 'lad_pred': lad_fit.predict(training_data[vars])})

pred_val_df = pd.DataFrame({'true': validation_data[response], 'ls_pred': ls_fit.predict(validation_data[vars]), 'lad_pred': lad_fit.predict(validation_data[vars])})

In [57]:
# calculate the rMSE, MAE, MAD, correlation, and R2 of the true price with the LS and LAD predictions
print('Training LS rMSE:', np.sqrt(mean_squared_error(pred_train_df['true'], pred_train_df['ls_pred'])))
print('Training LS MAE:', mean_absolute_error(pred_train_df['true'], pred_train_df['ls_pred']))
print('Training LS MAD:', np.median(np.abs(pred_train_df['true'] - pred_train_df['ls_pred'])))
print('Training LS correlation:', np.corrcoef(pred_train_df['true'], pred_train_df['ls_pred'])[0, 1])
print('Training LS R2:', r2_score(pred_train_df['true'], pred_train_df['ls_pred']))

print('Training LAD rMSE:', np.sqrt(mean_squared_error(pred_train_df['true'], pred_train_df['lad_pred'])))
print('Training LAD MAE:', mean_absolute_error(pred_train_df['true'], pred_train_df['lad_pred']))
print('Training LAD MAD:', np.median(np.abs(pred_train_df['true'] - pred_train_df['lad_pred'])))
print('Training LAD correlation:', np.corrcoef(pred_train_df['true'], pred_train_df['lad_pred'])[0, 1])
print('Training LAD R2:', r2_score(pred_train_df['true'], pred_train_df['lad_pred']))

Training LS rMSE: 2.85345807046718
Training LS MAE: 2.0371157733754037
Training LS MAD: 1.5292847725892287
Training LS correlation: 0.827323340814822
Training LS R2: 0.6844639102569973
Training LAD rMSE: 2.910222629968196
Training LAD MAE: 2.0009179671003765
Training LAD MAD: 1.4513305944908632
Training LAD correlation: 0.8265605160647699
Training LAD R2: 0.6717849618606204


In [58]:
# calculate the rMSE, MAE, MAD, correlation, and R2 of the true price with the LS and LAD predictions
print('Training LS rMSE:', np.sqrt(mean_squared_error(pred_val_df['true'], pred_val_df['ls_pred'])))
print('Training LS MAE:', mean_absolute_error(pred_val_df['true'], pred_val_df['ls_pred']))
print('Training LS MAD:', np.median(np.abs(pred_val_df['true'] - pred_val_df['ls_pred'])))
print('Training LS correlation:', np.corrcoef(pred_val_df['true'], pred_val_df['ls_pred'])[0, 1])
print('Training LS R2:', r2_score(pred_val_df['true'], pred_val_df['ls_pred']))

print('Training LAD rMSE:', np.sqrt(mean_squared_error(pred_val_df['true'], pred_val_df['lad_pred'])))
print('Training LAD MAE:', mean_absolute_error(pred_val_df['true'], pred_val_df['lad_pred']))
print('Training LAD MAD:', np.median(np.abs(pred_val_df['true'] - pred_val_df['lad_pred'])))
print('Training LAD correlation:', np.corrcoef(pred_val_df['true'], pred_val_df['lad_pred'])[0, 1])
print('Training LAD R2:', r2_score(pred_val_df['true'], pred_val_df['lad_pred']))

Training LS rMSE: 2.9429841316347956
Training LS MAE: 2.0876877302647805
Training LS MAD: 1.5589476998098695
Training LS correlation: 0.8241734868078802
Training LS R2: 0.6791162535885883
Training LAD rMSE: 3.0112265207327624
Training LAD MAE: 2.0492301052386312
Training LAD MAD: 1.4735990950433528
Training LAD correlation: 0.8233932289203515
Training LAD R2: 0.664062309569839


Based on the above values, the LS model is better than the LAD model for predicting loudness based on our manually selected predictor variables since the rMSE is lower, and the R^2 value is higher. This occurs in both the training and validation data set. Since the R^2 values for both the training and validation data sets are very similar, it is unlikely that the model is overfitting. However, since the R^2 values are relatively low, it is likely that the model is not a good fit for the data, and is thus underfitting.

In [59]:
from sklearn.linear_model import  Ridge, Lasso
from sklearn.model_selection import cross_val_score, cross_validate


In [60]:
X = training_data.drop(columns=['loudness'])[vars]
# scale the predictors
X_std = (X - X.mean()) / X.std()
y = training_data['loudness']

In [61]:
alphas = np.logspace(-1, 6, 100)
ridge_cv_scores = []
# create a for loop to compute the cross-validation score for each alpha value
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge_cv = cross_validate(estimator=ridge,
                              X=X_std,
                              y=y,
                              cv=10,
                              scoring='neg_root_mean_squared_error')
    ridge_cv_scores.append({'alpha': alpha,
                            'log_alpha': np.log(alpha),
                            'test_mse': -np.mean(ridge_cv['test_score'])})

# convert the cross-validation scores into a data frame
ridge_cv_scores_df = pd.DataFrame(ridge_cv_scores)

In [62]:
ridge_cv_scores_df 

Unnamed: 0,alpha,log_alpha,test_mse
0,0.100000,-2.302585,2.853698
1,0.117681,-2.139776,2.853698
2,0.138489,-1.976967,2.853698
3,0.162975,-1.814158,2.853698
4,0.191791,-1.651349,2.853698
...,...,...,...
95,521400.828800,13.164274,4.604231
96,613590.727341,13.327083,4.664212
97,722080.901839,13.489892,4.717943
98,849753.435909,13.652702,4.765737


In [63]:
# define the alpha values to test
alphas = np.logspace(-1, 4, 100)

# create an empty list to store the cross-validation scores
lasso_cv_scores = []

# create a for loop to compute the cross-validation score for each alpha value
for alpha in alphas:
    lasso = Lasso(alpha=alpha)
    lasso_cv = cross_validate(estimator=lasso,
                              X=X_std,
                              y=y,
                              cv=10,
                              scoring='neg_root_mean_squared_error')
    lasso_cv_scores.append({'alpha': alpha,
                            'log_alpha': np.log(alpha),
                            'test_mse': -np.mean(lasso_cv['test_score'])})

# convert the cross-validation scores into a data frame
lasso_cv_scores_df = pd.DataFrame(lasso_cv_scores)

In [64]:

# identify the value of alpha that minimizes the cross-validation score for ridge
ridge_alpha_min = ridge_cv_scores_df.sort_values(by='test_mse').head(1).alpha.values[0]
# compute the min MSE and the SE of the MSE
mse_se_ridge = ridge_cv_scores_df['test_mse'].std() / np.sqrt(10)
mse_min_ridge = ridge_cv_scores_df['test_mse'].min()


# identify the value of alpha that minimizes the cross-validation score for ridge within 1SE
ridge_alpha_1se = ridge_cv_scores_df[(ridge_cv_scores_df['test_mse'] <= mse_min_ridge + mse_se_ridge) &
                                     (ridge_cv_scores_df['test_mse'] >= mse_min_ridge - mse_se_ridge)].sort_values(by='alpha', ascending=False).head(1).alpha.values[0]


# identify the value of alpha that minimizes the cross-validation score for lasso
lasso_alpha_min = lasso_cv_scores_df.sort_values(by='test_mse').head(1).alpha.values[0]
# compute the min MSE and the SE of the MSE
mse_se_lasso = lasso_cv_scores_df['test_mse'].std() / np.sqrt(10)
mse_min_lasso = lasso_cv_scores_df['test_mse'].min()

# identify the value of alpha that minimizes the cross-validation score for lasso within 1SE
lasso_alpha_1se = lasso_cv_scores_df[(lasso_cv_scores_df['test_mse'] <= mse_min_lasso + mse_se_lasso) &
                                     (lasso_cv_scores_df['test_mse'] >= mse_min_lasso - mse_se_lasso)].sort_values(by='alpha', ascending=False).head(1).alpha.values[0]


In [65]:
print('Ridge (min): ', ridge_alpha_min)
print('Ridge (1SE): ', ridge_alpha_1se)
print('Lasso (min): ', lasso_alpha_min)
print('Lasso (1SE): ', lasso_alpha_1se)

Ridge (min):  0.1
Ridge (1SE):  17073.526474706887
Lasso (min):  0.1
Lasso (1SE):  0.8111308307896871


In [66]:
# use ridge_alpha_min to fit the ridge regression model
ridge_min_fit = Ridge(alpha=ridge_alpha_min).fit(X=X_std, y=y)
ridge_1se_fit = Ridge(alpha=ridge_alpha_1se).fit(X=X_std, y=y)

# use lasso_alpha_min to fit the lasso regression model
lasso_min_fit = Lasso(alpha=lasso_alpha_min).fit(X=X_std, y=y)
lasso_1se_fit = Lasso(alpha=lasso_alpha_1se).fit(X=X_std, y=y)

print(ridge_min_fit.intercept_, ridge_min_fit.coef_[0], ridge_min_fit.coef_[1], ridge_min_fit.coef_[2], ridge_min_fit.coef_[3], ridge_min_fit.coef_[4])

-8.397920565750672 -1.537174344317504 3.413573565203451 -0.26833982207154156 0.6727443499045818 -0.213977635587475


In [75]:
pred_train_df_ridge = pd.DataFrame({'true': training_data[response], 'ridge_min_pred': ridge_min_fit.predict(X_std), 'ridge_1se_pred': ridge_1se_fit.predict(X_std)})

X_val = validation_data.drop(columns=['loudness'])[vars]
X_val_std = (X_val - X_val.mean()) / X_val.std()
pred_val_df_ridge = pd.DataFrame({'true': validation_data[response], 'ridge_min_pred': ridge_min_fit.predict(X_val_std), 'ridge_1se_pred': ridge_1se_fit.predict(X_val_std)})


In [76]:
# calculate the rMSE, MAE, MAD, correlation, and R2 of the true price with the LS and LAD predictions
print('Training Ridge (min) rMSE:', np.sqrt(mean_squared_error(pred_train_df_ridge['true'], pred_train_df_ridge['ridge_min_pred'])))
print('Training Ridge (min) MAE:', mean_absolute_error(pred_train_df_ridge['true'], pred_train_df_ridge['ridge_min_pred']))
print('Training Ridge (min) MAD:', np.median(np.abs(pred_train_df_ridge['true'] - pred_train_df_ridge['ridge_min_pred'])))
print('Training Ridge (min) correlation:', np.corrcoef(pred_train_df_ridge['true'], pred_train_df_ridge['ridge_min_pred'])[0, 1])
print('Training Ridge (min) R2:', r2_score(pred_train_df_ridge['true'], pred_train_df_ridge['ridge_min_pred']))

print('Training Ridge (1SE) rMSE:', np.sqrt(mean_squared_error(pred_train_df_ridge['true'], pred_train_df_ridge['ridge_1se_pred'])))
print('Training Ridge (1SE) MAE:', mean_absolute_error(pred_train_df_ridge['true'], pred_train_df_ridge['ridge_1se_pred']))
print('Training Ridge (1SE) MAD:', np.median(np.abs(pred_train_df_ridge['true'] - pred_train_df_ridge['ridge_1se_pred'])))
print('Training Ridge (1SE) correlation:', np.corrcoef(pred_train_df_ridge['true'], pred_train_df_ridge['ridge_1se_pred'])[0, 1])
print('Training Ridge (1SE) R2:', r2_score(pred_train_df_ridge['true'], pred_train_df_ridge['ridge_1se_pred']))

Training Ridge (min) rMSE: 2.853458070485421
Training Ridge (min) MAE: 2.0371152411872884
Training Ridge (min) MAD: 1.5292989086282294
Training Ridge (min) correlation: 0.8273233408133793
Training Ridge (min) R2: 0.684463910252963
Training Ridge (1SE) rMSE: 2.993667700914229
Training Ridge (1SE) MAE: 2.100063550259546
Training Ridge (1SE) MAD: 1.5639800023892754
Training Ridge (1SE) correlation: 0.8187727771758405
Training Ridge (1SE) R2: 0.6526932435253561


In [77]:
print('Validation Ridge (min) rMSE:', np.sqrt(mean_squared_error(pred_val_df_ridge['true'], pred_val_df_ridge['ridge_min_pred'])))
print('Validation Ridge (min) MAE:', mean_absolute_error(pred_val_df_ridge['true'], pred_val_df_ridge['ridge_min_pred']))
print('Validation Ridge (min) MAD:', np.median(np.abs(pred_val_df_ridge['true'] - pred_val_df_ridge['ridge_min_pred'])))
print('Validation Ridge (min) correlation:', np.corrcoef(pred_val_df_ridge['true'], pred_val_df_ridge['ridge_min_pred'])[0, 1])
print('Validation Ridge (min) R2:', r2_score(pred_val_df_ridge['true'], pred_val_df_ridge['ridge_min_pred']))

print('Validation Ridge (1SE) rMSE:', np.sqrt(mean_squared_error(pred_val_df_ridge['true'], pred_val_df_ridge['ridge_1se_pred'])))
print('Validation Ridge (1SE) MAE:', mean_absolute_error(pred_val_df_ridge['true'], pred_val_df_ridge['ridge_1se_pred']))
print('Validation Ridge (1SE) MAD:', np.median(np.abs(pred_val_df_ridge['true'] - pred_val_df_ridge['ridge_1se_pred'])))
print('Validation Ridge (1SE) correlation:', np.corrcoef(pred_val_df_ridge['true'], pred_val_df_ridge['ridge_1se_pred'])[0, 1])
print('Validation Ridge (1SE) R2:', r2_score(pred_val_df_ridge['true'], pred_val_df_ridge['ridge_1se_pred']))


Validation Ridge (min) rMSE: 2.945051794051611
Validation Ridge (min) MAE: 2.0815143077734044
Validation Ridge (min) MAD: 1.5475140156095613
Validation Ridge (min) correlation: 0.8241655484217629
Validation Ridge (min) R2: 0.6786652064171586
Validation Ridge (1SE) rMSE: 3.09385060097544
Validation Ridge (1SE) MAE: 2.1443770208607713
Validation Ridge (1SE) MAD: 1.5565513009987715
Validation Ridge (1SE) correlation: 0.8165944489258675
Validation Ridge (1SE) R2: 0.6453740149893858


The Ridge model with an optimized alpha value is slightly better than the base LS model for predicting loudness based on our manually selected predictor variables since the rMSE is lower, and the R^2 value is higher. This occurs in both the training and validation data set. Since the R^2 values for both the training and validation data sets are very similar, it is unlikely that the model is overfitting. However, since the R^2 values are relatively low, it is likely that the model is not a good fit for the data, and is thus underfitting.

In [78]:
pred_train_df_lasso = pd.DataFrame({'true': training_data[response], 'lasso_min_pred': lasso_min_fit.predict(X_std), 'lasso_1se_pred': lasso_1se_fit.predict(X_std)})
pred_val_df_lasso = pd.DataFrame({'true': validation_data[response], 'lasso_min_pred': lasso_min_fit.predict(X_val_std), 'lasso_1se_pred': lasso_1se_fit.predict(X_val_std)})

In [79]:
print('Training Lasso (min) rMSE:', np.sqrt(mean_squared_error(pred_train_df_lasso['true'], pred_train_df_lasso['lasso_min_pred'])))
print('Training Lasso (min) MAE:', mean_absolute_error(pred_train_df_lasso['true'], pred_train_df_lasso['lasso_min_pred']))
print('Training Lasso (min) MAD:', np.median(np.abs(pred_train_df_lasso['true'] - pred_train_df_lasso['lasso_min_pred'])))
print('Training Lasso (min) correlation:', np.corrcoef(pred_train_df_lasso['true'], pred_train_df_lasso['lasso_min_pred'])[0, 1])
print('Training Lasso (min) R2:', r2_score(pred_train_df_lasso['true'], pred_train_df_lasso['lasso_min_pred']))

print('Training Lasso (1SE) rMSE:', np.sqrt(mean_squared_error(pred_train_df_lasso['true'], pred_train_df_lasso['lasso_1se_pred'])))
print('Training Lasso (1SE) MAE:', mean_absolute_error(pred_train_df_lasso['true'], pred_train_df_lasso['lasso_1se_pred']))
print('Training Lasso (1SE) MAD:', np.median(np.abs(pred_train_df_lasso['true'] - pred_train_df_lasso['lasso_1se_pred'])))
print('Training Lasso (1SE) correlation:', np.corrcoef(pred_train_df_lasso['true'], pred_train_df_lasso['lasso_1se_pred'])[0, 1])
print('Training Lasso (1SE) R2:', r2_score(pred_train_df_lasso['true'], pred_train_df_lasso['lasso_1se_pred']))

Training Lasso (min) rMSE: 2.8656925690106974
Training Lasso (min) MAE: 2.0340267924852258
Training Lasso (min) MAD: 1.5249533269599533
Training Lasso (min) correlation: 0.8260972861933737
Training Lasso (min) R2: 0.6817523219149417
Training Lasso (1SE) rMSE: 3.1086765115129777
Training Lasso (1SE) MAE: 2.1261891506453634
Training Lasso (1SE) MAD: 1.5808256321830552
Training Lasso (1SE) correlation: 0.8133587571764183
Training Lasso (1SE) R2: 0.6254954381402555


In [80]:
print('Validation Lasso (min) rMSE:', np.sqrt(mean_squared_error(pred_val_df_lasso['true'], pred_val_df_lasso['lasso_min_pred'])))
print('Validation Lasso (min) MAE:', mean_absolute_error(pred_val_df_lasso['true'], pred_val_df_lasso['lasso_min_pred']))
print('Validation Lasso (min) MAD:', np.median(np.abs(pred_val_df_lasso['true'] - pred_val_df_lasso['lasso_min_pred'])))
print('Validation Lasso (min) correlation:', np.corrcoef(pred_val_df_lasso['true'], pred_val_df_lasso['lasso_min_pred'])[0, 1])
print('Validation Lasso (min) R2:', r2_score(pred_val_df_lasso['true'], pred_val_df_lasso['lasso_min_pred']))

print('Validation Lasso (1SE) rMSE:', np.sqrt(mean_squared_error(pred_val_df_lasso['true'], pred_val_df_lasso['lasso_1se_pred'])))
print('Validation Lasso (1SE) MAE:', mean_absolute_error(pred_val_df_lasso['true'], pred_val_df_lasso['lasso_1se_pred']))
print('Validation Lasso (1SE) MAD:', np.median(np.abs(pred_val_df_lasso['true'] - pred_val_df_lasso['lasso_1se_pred'])))
print('Validation Lasso (1SE) correlation:', np.corrcoef(pred_val_df_lasso['true'], pred_val_df_lasso['lasso_1se_pred'])[0, 1])
print('Validation Lasso (1SE) R2:', r2_score(pred_val_df_lasso['true'], pred_val_df_lasso['lasso_1se_pred']))

Validation Lasso (min) rMSE: 2.9632585751877585
Validation Lasso (min) MAE: 2.080455087574899
Validation Lasso (min) MAD: 1.5389957184135739
Validation Lasso (min) correlation: 0.8226013758930494
Validation Lasso (min) R2: 0.6746798391325683
Validation Lasso (1SE) rMSE: 3.222475753516499
Validation Lasso (1SE) MAE: 2.1712137814934334
Validation Lasso (1SE) MAD: 1.5914931718047818
Validation Lasso (1SE) correlation: 0.8101264427220889
Validation Lasso (1SE) R2: 0.6152743028152201


The Lasso model performed slightly worse than the Ridge model, but performed better than the base LS model for the training data set. However, the Lasso model performed worse than the base LS model for the validation data set. This suggests that the Lasso model is overfitting the training data set, and is not a good fit for the data.