In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from spotify_cleaner import clean_data, remove_duplicates

In [None]:
# Load the track table and run it through the helpers imported from
# spotify_cleaner (their exact behaviour is defined in that module).
df = (
    pd.read_csv('data/dataset.csv', index_col=0)  # first CSV column becomes the row index
    .pipe(clean_data)
    .pipe(remove_duplicates)
)

# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
df.info()
df.head()

In [None]:
# Modelling inputs: pick the predictor columns and the target, then hold out a test set.
features_cols = [
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'valence',
    'tempo',
]
target_col = ['danceability']  # one-item list, so `target` is a single-column DataFrame

features = df.loc[:, features_cols]
target = df.loc[:, target_col]

# 10% of the rows form the test split; the fixed random_state makes the split repeatable.
(features_train, features_test,
 target_train, target_test) = train_test_split(
    features,
    target,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to both the training and the test split.
scaler = StandardScaler()
scaler.fit(features_train)
features_train_scaled = scaler.transform(features_train)
features_test_scaled = scaler.transform(features_test)

# Baseline model: linear regression on the standardised features.
model = LinearRegression()
model.fit(X=features_train_scaled, y=target_train)

In [None]:
# Score the baseline model on the held-out split.
target_pred = model.predict(features_test_scaled)

# RMSE is the square root of the mean squared error; R² comes straight from r2_score.
rmse = np.sqrt(mean_squared_error(y_true=target_test, y_pred=target_pred))
r2 = r2_score(y_true=target_test, y_pred=target_pred)

print(
    'RMSE: {:.4f}'.format(rmse),
    'R² Score: {:.4f}'.format(r2),
    sep='\n',
)

In [None]:
# Scatter of actual vs. predicted danceability for the baseline model.
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(
    x=target_test.to_numpy().ravel(),
    y=target_pred.ravel(),
    alpha=0.25,
    ax=ax,
)
# Dashed y = x reference line: a point on it was predicted exactly.
ax.plot([0, 1], [0, 1], '--', color='red')
ax.set_xlabel('Actual Danceability')
ax.set_ylabel('Predicted Danceability')
ax.set_title('Actual vs. Predicted Danceability')
plt.show()

In [None]:
# Pair each input feature with its fitted coefficient from the baseline model.
# The model was fitted on standardised features, so the coefficients share a common scale.
coefficients = pd.DataFrame(
    {
        'Feature': features_cols,
        'Coefficient': model.coef_.ravel(),
    }
)

# Most positive coefficient first, most negative last.
print(coefficients.sort_values('Coefficient', ascending=False))

## Verdict
Plain linear regression fits this data poorly: it explains only about a third of the variance in danceability, most likely because several features relate to the target non-linearly.
RMSE: 0.1443, R² Score: 0.3393

In [None]:
# Expand the scaled features with degree-2 interaction terms.
# interaction_only=True adds pairwise products only (no squared terms);
# include_bias=False leaves out the constant column.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly.fit(features_train_scaled)
features_train_poly = poly.transform(features_train_scaled)
features_test_poly = poly.transform(features_test_scaled)

# Fit a second linear regression on the expanded features, then predict the test split.
model_poly = LinearRegression().fit(features_train_poly, target_train)
target_pred_poly = model_poly.predict(features_test_poly)


In [None]:
# Evaluate the interaction-feature model on the same held-out split as the baseline.
rmse_poly = np.sqrt(mean_squared_error(y_true=target_test, y_pred=target_pred_poly))
r2_poly = r2_score(y_true=target_test, y_pred=target_pred_poly)

print(
    'RMSE: {:.4f}'.format(rmse_poly),
    'R² Score: {:.4f}'.format(r2_poly),
    sep='\n',
)

In [None]:
# Visualise actual vs predicted danceability for the interaction-feature model.
# The y-values must be target_pred_poly: this cell previously re-plotted
# target_pred, i.e. the baseline model's predictions, so the figure did not
# show the model being evaluated here.
plt.figure(figsize=(6, 6))
sns.scatterplot(x=target_test.values.flatten(), y=target_pred_poly.flatten(), alpha=0.25)
# Dashed y = x reference line: a point on it was predicted exactly.
plt.plot([0, 1], [0, 1], '--', color='red')
plt.xlabel('Actual Danceability')
plt.ylabel('Predicted Danceability')
plt.title('Actual vs. Predicted Danceability')
plt.show()

## Verdict
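Adding pairwise interaction terms improves the fit slightly over the plain linear model (R² rises from 0.3393 to 0.3921), but the result is still weak; this may be close to the best a linear regression can do on these features.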
RMSE: 0.1384
R² Score: 0.3921