# Week 3 Check-In
## Team Spotiflies: Joanna, Aaron, Aubrey, Kennedy, Aster, Ethan
GitHub Link: https://github.com/ketexon/csm148-spotiflies 

In [44]:
%pip install pandas numpy matplotlib seaborn scikit-learn mlxtend

Note: you may need to restart the kernel to use updated packages.


### Reading in the Data
We use the cleaned version of our dataset from the Week 2 check-in.

In [45]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib.image as mpimg

# Load the cleaned dataset produced by the previous week's check-in.
spotify = pd.read_csv("csv_outputs/cleaned_spotify.csv")

# Keep only the numeric columns, then remove mode / key / time_signature.
# NOTE(review): presumably these three are excluded because they are
# categorical codes rather than continuous measurements - confirm.
# NOTE(review): the rendered table below shows an `Unnamed: 0` header; if that
# is a real column (e.g. an index saved into the CSV) it is numeric and will be
# carried along as a feature - confirm.
excluded_columns = ["mode", "key", "time_signature"]
numeric_only = spotify.select_dtypes(include=[np.number])
keep_mask = ~numeric_only.columns.isin(excluded_columns)
numeric_spotify = numeric_only.loc[:, keep_mask]
numeric_spotify

Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,73,230666,0.676,0.4610,-6.746,0.1430,0.0322,0.000001,0.3580,0.7150,87.917
1,55,149610,0.420,0.1660,-17.235,0.0763,0.9240,0.000006,0.1010,0.2670,77.489
2,57,210826,0.438,0.3590,-9.734,0.0557,0.2100,0.000000,0.1170,0.1200,76.332
3,71,201933,0.266,0.0596,-18.515,0.0363,0.9050,0.000071,0.1320,0.1430,181.740
4,82,198853,0.618,0.4430,-9.681,0.0526,0.4690,0.000000,0.0829,0.1670,119.949
...,...,...,...,...,...,...,...,...,...,...,...
113994,21,384999,0.172,0.2350,-16.393,0.0422,0.6400,0.928000,0.0863,0.0339,125.995
113995,22,385000,0.174,0.1170,-18.318,0.0401,0.9940,0.976000,0.1050,0.0350,85.239
113996,22,271466,0.629,0.3290,-10.895,0.0420,0.8670,0.000000,0.0839,0.7430,132.378
113997,41,283893,0.587,0.5060,-10.889,0.0297,0.3810,0.000000,0.2700,0.4130,135.960


### Regression

Let's use the `popularity` variable as our response variable, as an accurate analysis of this variable can be useful for predicting the performance of upcoming songs in the real world.

Let's compare this against the `energy` variable as our predictor, since the relationship between a song's energy level and its popularity is something we have always been curious about.

In [46]:
# Scatterplot of energy vs popularity with polynomial trend lines overlaid
# (degree-1 fit in red, degree-2 fit in orange).
# NOTE: every line of this cell is commented out, so running it produces no output.
# correlated_features = [
#     ("energy", "popularity"),
# ]
# for (i, j) in correlated_features:
#     plt.plot(np.unique(numeric_spotify[i]), np.poly1d(np.polyfit(numeric_spotify[i], numeric_spotify[j], 1))(np.unique(numeric_spotify[i])), color='red')
#     plt.plot(np.unique(numeric_spotify[i]), np.poly1d(np.polyfit(numeric_spotify[i], numeric_spotify[j], 2))(np.unique(numeric_spotify[i])), color='orange')
#     plt.scatter(numeric_spotify[i], numeric_spotify[j], s=5)
#     plt.xlabel(i)
#     plt.ylabel(j)
#     plt.show()

In [47]:
# Modeling a multiple linear regression for the column named in `response`.
# Predictors are chosen by forward sequential feature selection over the remaining numeric columns.

# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector

random_seed = 42
# NOTE(review): the section text above names `popularity` as the response and
# `energy` as the predictor, but this cell models `energy` - confirm which is intended.
response = "energy"

# Splitting the data into 60% train / 20% validation / 20% test.
# First split: hold out 20% of all rows as the test set
spotify_train_val, spotify_test = train_test_split(numeric_spotify, test_size=0.2, random_state=random_seed)

# Second split: 25% of the remaining 80% becomes the validation set
# (0.25 * 0.8 = 0.2 of all rows), leaving 60% of all rows for training
spotify_train, spotify_val = train_test_split(spotify_train_val, test_size=0.25, random_state=random_seed)

# Every row must land in exactly one of the three splits.
assert len(spotify_train) + len(spotify_val) + len(spotify_test) == len(numeric_spotify)

# Separate predictors and response for the training split.
# The full predictor frame keeps its own name so that `X` is only ever the
# selected-feature array; re-running the lines below cannot transform it twice.
X_train_all = spotify_train.drop(columns=response)
y = spotify_train[response]

linear_reg = LinearRegression()

# Forward selection scored by 5-fold cross-validated R^2.
# With n_features_to_select='auto' and no `tol`, scikit-learn keeps half of the
# candidate features.
selector = SequentialFeatureSelector(
    linear_reg,
    n_features_to_select='auto',
    direction='forward',
    scoring='r2',
    cv=5
)

selector.fit(X_train_all, y)
print("Selected Features: ", selector.get_feature_names_out(X_train_all.columns))

# Reduce every split to the selected features (the selector is fit on the
# training split only, so validation/test rows do not influence the choice).
X = selector.transform(X_train_all)

X_val = selector.transform(spotify_val.drop(columns=response))
y_val = spotify_val[response]

X_test = selector.transform(spotify_test.drop(columns=response))
y_test = spotify_test[response]

# Fit the Linear Regression model on the selected training features
linear_reg.fit(X, y)

# In-sample predictions, used for the training metrics
y_pred = linear_reg.predict(X)

# NOTE: the plotting code below is disabled (commented out). It references
# `features`, which is not defined in the visible cells, and hard-codes
# 'popularity' on the y-axis rather than using `response`, so it would need
# updating before it could be re-enabled.
# Plotting the scatter plot of the original data
# plt.scatter(spotify_train[features], spotify_train['popularity'], color='blue', s=5, label="Data Points")

# # Plotting the regression line
# plt.plot(spotify_train[features], y_pred, color='red', label="Regression Line")

# # Adding labels and title
# plt.xlabel(features)
# plt.ylabel('Popularity')
# plt.title('Energy vs Popularity Linear Regression')
# plt.legend()

# # Show the plot
# plt.show()

Selected Features:  ['loudness' 'acousticness' 'instrumentalness' 'liveness' 'valence']


### Evaluation Metrics

In [48]:
import numpy as np
from sklearn.base import clone
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from mlxtend.evaluate import bias_variance_decomp


def _regression_metrics(y_true, y_hat):
    """Return (MSE, RMSE, MAE, MAD, R^2) for one set of predictions."""
    mse = mean_squared_error(y_true, y_hat)  # Mean squared error
    rmse = np.sqrt(mse)  # Root mean squared error
    mae = mean_absolute_error(y_true, y_hat)  # Mean absolute error
    # "MAD" is computed as the mean absolute residual, which is the same
    # quantity as MAE; it is kept so the printed report is unchanged.
    mad = np.mean(np.abs(y_true - y_hat))
    r2 = r2_score(y_true, y_hat)  # R^2 (coefficient of determination)
    return mse, rmse, mae, mad, r2


# Calculating Evaluation Metrics:
y_val_pred = linear_reg.predict(X_val)  # Predict on validation set

# Same metrics for the training set and for the validation set
train_mse, train_rmse, train_mae, train_mad, train_r2 = _regression_metrics(y, y_pred)
val_mse, val_rmse, val_mae, val_mad, val_r2 = _regression_metrics(y_val, y_val_pred)

# Print Results:
print(f"Training MSE: {train_mse}, rMSE: {train_rmse}, MAE: {train_mae}, MAD: {train_mad}, R²: {train_r2}")
print(f"Validation MSE: {val_mse}, rMSE: {val_rmse}, MAE: {val_mae}, MAD: {val_mad}, R²: {val_r2}")

# Bias-variance decomposition.
# bias_variance_decomp refits the estimator it is given on bootstrap resamples,
# so it receives a clone; `linear_reg` stays fit on the full training split.
# NOTE(review): this evaluates on the held-out test split; consider the
# validation split here so the test set stays untouched - confirm intent.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    clone(linear_reg),
    X, y.values,
    X_test, y_test.values,
    loss='mse',
    random_seed=123
)

# Label order matches the value order: loss, bias, variance.
print(f"Loss, bias, and variance: {avg_expected_loss}, {avg_bias}, {avg_var}")

Training MSE: 0.01627171198159243, rMSE: 0.12756062081062647, MAE: 0.0993956250811578, MAD: 0.0993956250811578, R²: 0.7428611885163132
Validation MSE: 0.016148446469073437, rMSE: 0.12707653783871134, MAE: 0.09929688057551381, MAD: 0.09929688057551381, R²: 0.7455912411058057
Loss, variance, and bias: 0.01626304898193079, 0.016261046075874076, 2.002906056709998e-06


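Based on the evaluation metrics printed above, our model shows no sign of overfitting; whatever error remains is on the **underfitting** (bias) side.

The MSE is almost identical on the training and validation sets (about 0.0163 and 0.0161 respectively, i.e. an RMSE of roughly 0.13), so the model generalizes as well as it fits. In the bias-variance decomposition, whose values are returned in the order loss, bias, variance, nearly all of the expected loss (about 0.0163) is bias (about 0.0163), while the variance is negligible (about 2.0e-06).

The R^2 values (about 0.743 on the training set and 0.746 on the validation set) indicate that the model explains roughly 74% of the variance in the dependent variable. Note that the dependent variable in the code above is `energy`, not `popularity`. The figures previously quoted here (MSE of 497.32 and 499.41, R^2 of 1.11e-07 and -7.89e-06) do not match the printed output and appear to come from an earlier model, so they should be recomputed if `popularity` is restored as the response.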

### Regularization