In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from spotify_cleaner import clean_data, remove_duplicates

In [None]:
# Load the raw dataset (the first CSV column becomes the index) and pass it
# through the project's cleaning helpers.
df = pd.read_csv('data/dataset.csv', index_col=0)
df = clean_data(df)
df = remove_duplicates(df)

# Treat the 'sleep' genre as a "negative control group": every track of that
# genre gets its danceability overwritten with 0.
df.loc[df['track_genre'] == 'sleep', 'danceability'] = 0

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
df.head()

In [None]:
# Modelling inputs: seven audio descriptors as predictors and danceability as
# the response. The target is selected with a one-element list, so it is kept
# as a single-column DataFrame rather than a Series.
features_cols = [
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'valence',
    'tempo',
]
target_col = ['danceability']

features, target = df[features_cols], df[target_col]

# Hold out 10 % of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
(features_train, features_test,
 target_train, target_test) = train_test_split(
    features,
    target,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
features_train_scaled = scaler.fit_transform(features_train)
features_test_scaled = scaler.transform(features_test)

# Expand the scaled features with degree-2 interaction terms only
# (interaction_only=True) and without a bias column.
poly = PolynomialFeatures(
    include_bias=False,
    interaction_only=True,
    degree=2,
)
features_train_poly = poly.fit_transform(features_train_scaled)
features_test_poly = poly.transform(features_test_scaled)

# Ordinary least squares on the expanded feature matrix; fit() returns the
# estimator itself, so construction and fitting can be chained.
model_poly = LinearRegression().fit(features_train_poly, target_train)

# Predict on the held-out split and score the predictions.
target_pred_poly = model_poly.predict(features_test_poly)

r2_poly = r2_score(target_test, target_pred_poly)
rmse_poly = np.sqrt(mean_squared_error(target_test, target_pred_poly))

for template, score in (('RMSE: {:.4f}', rmse_poly), ('R² Score: {:.4f}', r2_poly)):
    print(template.format(score))

In [None]:
# Actual vs. predicted danceability for the polynomial model. The dashed red
# diagonal from (0, 0) to (1, 1) marks where prediction equals the actual value.
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(
    x=target_test.values.flatten(),
    y=target_pred_poly.flatten(),
    alpha=0.25,
    ax=ax,
)
ax.plot([0, 1], [0, 1], '--', color='red')
ax.set_xlabel('Actual Danceability')
ax.set_ylabel('Predicted Danceability')
ax.set_title('Actual vs. Predicted Danceability')

## Verdict
Better than pure linear regression, but still not good. Best scores: RMSE: 0.1408, R² Score: 0.3999

In [None]:
# Base random-forest regressor with a fixed seed and explicit minimum
# split / leaf sizes; n_estimators and max_depth are left unset here.
rf_model = RandomForestRegressor(
    min_samples_leaf=2,
    min_samples_split=5,
    random_state=42,
)

In [None]:
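# GridSearchCV (left commented out; the best parameters it found are recorded in the note at the end of this cell)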
# search_space = {
#     'n_estimators': np.geomspace(250, 400, num=5, dtype='int'),
#     'max_depth': np.geomspace(18, 25, num=5, dtype='int')
#     }

# grid_search = GridSearchCV(estimator=rf_model, param_grid=search_space, cv=3, scoring='r2', n_jobs=-1)
# grid_search.fit(features_train, target_train.values.ravel()) #flattens df!

# print('Best parameters:', grid_search.best_params_)

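# note: Best parameters: {'max_depth': np.int64(25), 'n_estimators': np.int64(400)}
# (both values are the upper bounds of the search space above, so the optimum may lie outside the grid)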

In [None]:
# Prediction and evaluation (originally "best estimator").
# The estimator is rebuilt by hand with the best parameters noted for the grid
# search, so it must be fitted explicitly: calling predict() on an unfitted
# forest raises NotFittedError.
best_rf_model = RandomForestRegressor(random_state=42, max_depth=25, n_estimators=400, min_samples_split=5, min_samples_leaf=2)
# ravel() flattens the single-column target frame into a 1-D array, matching
# the other fit calls in this notebook.
best_rf_model.fit(features_train, target_train.values.ravel())
target_pred_rf = best_rf_model.predict(features_test)

rmse_rf = np.sqrt(mean_squared_error(target_test, target_pred_rf))
r2_rf = r2_score(target_test, target_pred_rf)

print('Optimised Random Forest RMSE: {:.4f}'.format(rmse_rf))
print('Optimised Random Forest R² Score: {:.4f}'.format(r2_rf))

In [None]:
# Actual vs. predicted danceability for the random forest. The dashed red
# diagonal from (0, 0) to (1, 1) marks where prediction equals the actual value.
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(
    x=target_test.values.flatten(),
    y=target_pred_rf.flatten(),
    alpha=0.25,
    ax=ax,
)
ax.plot([0, 1], [0, 1], '--', color='red')
ax.set_xlabel('Actual Danceability')
ax.set_ylabel('Predicted Danceability')
ax.set_title('Random Forest: Actual vs. Predicted Danceability')

In [None]:
# Feature importance: pair each predictor name with the importance reported by
# the fitted forest and print the table, most important feature first.
feature_importance = pd.DataFrame(
    {
        'Feature': features_cols,
        'Importance': best_rf_model.feature_importances_,
    }
)
ranked_importance = feature_importance.sort_values(by='Importance', ascending=False)
print(ranked_importance)

In [None]:
# Overfitting Check I - fit a forest with fewer trees, a lower depth limit and
# larger minimum split / leaf sizes than the model evaluated above.
rf_params = dict(
    random_state=42,
    n_estimators=250,
    max_depth=18,
    min_samples_split=15,
    min_samples_leaf=6,
)
rf_model = RandomForestRegressor(**rf_params)
# The target frame has a single column; ravel() hands it over as a 1-D array.
rf_model.fit(features_train, target_train.values.ravel())

In [None]:
# Overfitting Check II - score the fitted forest on both splits and compare.
target_pred_train = rf_model.predict(features_train)
target_pred_rf = rf_model.predict(features_test)

r2_train = r2_score(target_train, target_pred_train)
rmse_train = np.sqrt(mean_squared_error(target_train, target_pred_train))

r2_rf = r2_score(target_test, target_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(target_test, target_pred_rf))

# Training vs. test performance, reported as training first, then test.
report = (
    ('Training RMSE: {:.4f}', rmse_train),
    ('Training R² Score: {:.4f}', r2_train),
    ('Test RMSE: {:.4f}', rmse_rf),
    ('Test R² Score: {:.4f}', r2_rf),
)
for template, score in report:
    print(template.format(score))

# A training R² that exceeds the test R² by more than 0.1 is flagged as
# possible overfitting.
r2_gap = r2_train - r2_rf
overfit_msg = (
    'The model may be overfitting! Consider tuning hyperparameters or using a different model.'
    if r2_gap > 0.1
    else 'No major overfitting detected.'
)
print(overfit_msg)


## Verdict
Decent, with R² up to ~61%, but prone to serious overfitting, even with the `min_samples_split` / `min_samples_leaf` adjustment. Use classification instead.