In [None]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from spotify_cleaner import clean_data, remove_duplicates

In [None]:
# Load Data
# Read the raw CSV (its first column becomes the index), then pass the frame
# through the project's cleaning and de-duplication helpers.
df = pd.read_csv('data/dataset.csv', index_col=0)
df = clean_data(df)
df = remove_duplicates(df)

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df.info()
df.head()

In [None]:

# Store log1p(popularity) as a new column; it is the regression target below
df['popularity_log'] = np.log1p(df['popularity'])

# Undersample zero-popularity tracks: keep every row with popularity > 0 and
# draw at most that many popularity == 0 rows (seeded, so the draw repeats).
pop_nonzero = df[df['popularity'] > 0]
zero_rows = df[df['popularity'] == 0]
pop_zero = zero_rows.sample(
    n=min(len(pop_nonzero), len(zero_rows)),
    random_state=42,
)
df_balanced = pd.concat([pop_nonzero, pop_zero])

# Feature and target definition for modelling
target_col = 'popularity_log'
features_cols = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
]

features = df_balanced[features_cols]
target = df_balanced[target_col]

# Hold out 10% of the balanced rows as the test split
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.1,
    random_state=42,
)

In [None]:

# Standardise the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(features_train)
features_train_scaled = scaler.transform(features_train)
features_test_scaled = scaler.transform(features_test)

# Plain linear regression on the standardised training features
model_linear = LinearRegression().fit(features_train_scaled, target_train)

# Predict the held-out split and score it; both metrics are computed on the
# log-popularity target, not on raw popularity.
target_pred_linear = model_linear.predict(features_test_scaled)

mse_linear = mean_squared_error(target_test, target_pred_linear)
rmse_linear = np.sqrt(mse_linear)
r2_linear = r2_score(target_test, target_pred_linear)

print('Linear Regression RMSE: {:.4f}'.format(rmse_linear))
print('Linear Regression R² Score: {:.4f}'.format(r2_linear))

In [None]:
# Linear regression: scatter of held-out actual vs predicted log popularity
fig_linear, ax_linear = plt.subplots(figsize=(6, 6))
sns.scatterplot(
    x=target_test.to_numpy().ravel(),
    y=target_pred_linear.ravel(),
    alpha=0.25,  # semi-transparent markers so dense regions remain readable
    ax=ax_linear,
)
ax_linear.set_xlabel("Actual Log Popularity")
ax_linear.set_ylabel("Predicted Log Popularity")
ax_linear.set_title("Actual vs Predicted Log Popularity")

In [None]:
# Random Forest Regression
# Hyperparameters are fixed by hand. The forest is fitted on the same
# standardised arrays as the linear model so both are scored on identical inputs.
# n_jobs=-1 builds the trees on all available cores; with random_state fixed,
# the same trees are grown regardless of how many workers are used.
rf_model = RandomForestRegressor(
    n_estimators=250,
    max_depth=18,
    min_samples_split=15,
    min_samples_leaf=6,
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(features_train_scaled, target_train)

# Predict the held-out split and report R2 / RMSE on the log-popularity target
target_pred_rf = rf_model.predict(features_test_scaled)
print("Random Forest R2:", r2_score(target_test, target_pred_rf))
print("Random Forest RMSE:", np.sqrt(mean_squared_error(target_test, target_pred_rf)))

In [None]:
# Random forest: scatter of held-out actual vs predicted log popularity
fig_rf, ax_rf = plt.subplots(figsize=(10, 5))
sns.scatterplot(
    x=target_test,
    y=target_pred_rf,
    alpha=0.25,  # semi-transparent markers so dense regions remain readable
    ax=ax_rf,
)
ax_rf.set_xlabel("Actual Log Popularity")
ax_rf.set_ylabel("Predicted Log Popularity")
ax_rf.set_title("Random Forest: Actual vs Predicted Log Popularity")