In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from helper import evaluate_model, target_encode
import xgboost as xgb
from lightgbm import LGBMRegressor

In [42]:
# Load the track table that every later cell works on from the CSV on disk.
df = pd.read_csv('data/tracks_spectral_reduced.csv')

### Data Preparation

Since we will apply regression models in this part, we have to handle categorical columns. We will either encode them or drop them if they have high cardinality.

In [43]:
# Object-dtype (text) columns cannot be passed to the regressors directly.
# Four of them are deliberately left alone at this stage: 'genres' and
# 'genres_all' are removed when X is built, while 'artist_name' and
# 'album_title' are target-encoded after the train/test split.
kept_text_cols = ('genres', 'genres_all', 'artist_name', 'album_title')
categorical_cols = list(df.select_dtypes(include=['object']).columns)
cols_to_handle = [name for name in categorical_cols if name not in kept_text_cols]
print(cols_to_handle)  # the saved run printed ['title']

# Whatever remains is dropped rather than encoded: the notebook treats these
# columns as too high-cardinality to be useful for regression.
cols_to_drop = cols_to_handle
df = df.drop(columns=cols_to_drop, errors='ignore')

['title']


Now, we will analyse the target feature : duration.

In [44]:
# Summary statistics of the raw target, used to choose the clipping threshold.
print(df['duration'].describe().round(2).T)

# The raw durations have a long right tail (the saved run reports a maximum of
# 11030 seconds against a 75th percentile of 306 seconds), so every value above
# MAX_DURATION is capped at MAX_DURATION before modelling.
MAX_DURATION = 600.0  # upper cap, in seconds
df['duration_clipped'] = df['duration'].clip(upper=MAX_DURATION)

# log1p reduces the remaining skew for regression and is defined at 0
# (the saved run reports a minimum duration of 0).
df['y_duration_log'] = np.log1p(df['duration_clipped'])

count    97288.00
mean       274.82
std        283.36
min          0.00
25%        151.00
50%        218.00
75%        306.00
max      11030.00
Name: duration, dtype: float64


Now, we define the input set and the target 'y_duration_log'.  
For the input, it will exclude the target feature (and its origins, 'duration' and 'duration_clipped'). It will exclude genres and genres_all too because they are a list of ids. Finally, it will exclude the columns 'Unnamed: 0' and the column 'track_id'.

In [45]:
# Columns that must not reach the model as inputs.
cols_to_exclude = [
    'duration', 'duration_clipped', 'y_duration_log',  # the target and the columns it is derived from
    'genres', 'genres_all',                            # lists of genre ids
    'Unnamed: 0', 'track_id',                          # excluded per the notebook text (presumably index/identifier columns)
]

# Target: log-transformed, clipped duration.
Y = df['y_duration_log']
# errors='ignore' skips any listed column that is absent from df.
X = df.drop(columns=cols_to_exclude, errors='ignore')

print(f"Shape of X (Input Features): {X.shape}")
print(f"Shape of Y (Target): {Y.shape}")

Shape of X (Input Features): (97288, 21)
Shape of Y (Target): (97288,)


Here, we split the data into training and test sets. The features are standardized afterwards, using statistics fitted on X_train only.

In [46]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

Shape of X_train: (77830, 21)
Shape of X_test: (19458, 21)


We then need to transform the artist names and the album titles. We can't apply one-hot encoding to these features because of their high cardinality, but we can represent each artist/album by its average duration. To do so, we apply target encoding, which turns a high-cardinality categorical feature into a single, powerful numerical feature.

In [33]:
# Smoothing strength passed to target_encode (as `m`) for each text column.
smoothing_by_col = {'artist_name': 10, 'album_title': 5}

# Only encode the columns that are actually present in the training frame.
cols_to_encode = [name for name in smoothing_by_col if name in X_train.columns]

# target_encode comes from helper.py; it receives the training target and
# returns the updated train/test frames.
# NOTE(review): presumably the encoding is learned from the training rows only — confirm in helper.py.
for col in cols_to_encode:
    X_train, X_test = target_encode(X_train, X_test, Y_train, col, m=smoothing_by_col[col])

In [34]:
# Fit the scaler on the training features only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)

# transform() returns plain arrays, so wrap them back into DataFrames that keep
# the original column names and row index.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

## Baseline : KNN Regressor

In [35]:
# Baseline: k-nearest-neighbours regression (k = 5) on the standardized features.
model_knn = KNeighborsRegressor(n_neighbors=5).fit(X_train_scaled, Y_train)

print("Performance of KNN Regressor")
# Score the fitted model on the held-out test split (evaluate_model is from helper.py).
evaluate_model(model_knn, X_test_scaled, Y_test)

Performance of KNN Regressor
R² Score: 0.4470
RMSE (Log-Seconds): 0.5009
MAE (Log-Seconds): 0.3624
MAE (Seconds): 80.95 seconds


## Random Forest Regressor

In [36]:
# Random forest with 100 trees on the full dataset; seeded for reproducibility,
# n_jobs=-1 uses all available cores.
rf_params = dict(n_estimators=100, random_state=42, n_jobs=-1)
model_rf = RandomForestRegressor(**rf_params).fit(X_train_scaled, Y_train)

print("Performance of Random Forest Regressor")
# Score the fitted model on the held-out test split.
evaluate_model(model_rf, X_test_scaled, Y_test)

Performance of Random Forest Regressor
R² Score: 0.5044
RMSE (Log-Seconds): 0.4741
MAE (Log-Seconds): 0.3367
MAE (Seconds): 74.58 seconds


## Merge with Echonest

In this part, we will add features from `echonest_features.tsv`. Because only tracks present in both tables are kept, this merge shrinks the dataset to about 11% of its original size (10,552 of 97,288 rows).  
We will compare the performance of a random forest regressor on a small dataset with more features and on a large dataset with fewer features.

In [37]:
# Load the tab-separated Echonest feature table.
echonest = pd.read_csv('data/echonest_features.tsv', sep='\t')

# Inner join on 'track_id': only tracks present in both tables are kept.
df_complet = pd.merge(df, echonest, how='inner', on='track_id')

print(f"Old shape : {df.shape}")
print(f"New shape (after merge) : {df_complet.shape}")

Old shape : (97288, 28)
New shape (after merge) : (10552, 36)


In [38]:
# Repeat the object-dtype check on the merged frame, setting aside the same
# four text columns that are handled separately.
kept_text_cols_after = ('genres', 'genres_all', 'artist_name', 'album_title')
categorical_cols_after = list(df_complet.select_dtypes(include=['object']).columns)
cols_to_handle_after = [
    name for name in categorical_cols_after if name not in kept_text_cols_after
]
print(cols_to_handle_after)
# The saved run printed [], i.e. the merge introduced no new categorical columns.

[]


In [39]:
# Percentage of missing values per column in the merged frame.
missing_pct = df_complet.isnull().sum().div(len(df_complet)).mul(100)
missing_pct

Unnamed: 0                   0.000000
track_id                     0.000000
album_title                  0.000000
album_tracks                 0.000000
artist_latitude              0.000000
artist_longitude             0.000000
artist_name                  0.000000
duration                     0.000000
genre_top                    0.000000
genres                       0.000000
genres_all                   0.000000
spectral_bandwidth_max_01    0.000000
spectral_bandwidth_min_01    0.000000
spectral_bandwidth_std_01    0.000000
spectral_centroid_max_01     0.000000
spectral_centroid_min_01     0.000000
spectral_centroid_std_01     0.000000
spectral_rolloff_max_01      0.000000
spectral_rolloff_min_01      0.000000
spectral_rolloff_std_01      0.000000
artist_location_unknown      0.000000
g4_pc1                       0.000000
g5_pc1                       0.000000
g6_pc1                       0.000000
g6_pc2                       0.000000
g6_pc3                       0.000000
duration_cli

In [40]:
# Fill the gaps in each listed column with that column's own median.
# NOTE(review): the medians are computed over all of df_complet, i.e. before the
# train/test split performed in a later cell — confirm this is acceptable.
cols_to_impute = ['speechiness', 'valence', 'danceability']

for name in cols_to_impute:
    df_complet[name] = df_complet[name].fillna(df_complet[name].median())

In [47]:
# Prepare the Echonest-merged dataset: build X1/Y1, split, target-encode the
# text columns and standardize. The steps mirror the preparation applied to the
# full dataset, with every result stored under a '1'-suffixed name.

# Columns that must not be used as inputs: identifiers, the target and the
# columns it is derived from, and the genre id lists.
X_EXCLUDE = [
    'Unnamed: 0',
    'track_id',
    'duration',
    'duration_clipped',
    'y_duration_log',
    'genres',
    'genres_all'
]

Y1 = df_complet['y_duration_log']
# errors='ignore' skips any listed column that is absent from df_complet.
X1 = df_complet.drop(columns=X_EXCLUDE, errors='ignore')

# Same 80/20 split and seed as for the full dataset.
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1, Y1, test_size=0.2, random_state=42)

# Filter this cell's own candidate list so the cell does not depend on a
# variable that is only defined in another cell.
cols_to_encode1 = ['artist_name', 'album_title']
cols_to_encode1 = [c for c in cols_to_encode1 if c in X_train1.columns]

# target_encode comes from helper.py; `m` is the smoothing strength
# (5 for albums, 10 for artists).
for col in cols_to_encode1:
    smoothing = 5 if col == 'album_title' else 10
    X_train1, X_test1 = target_encode(X_train1, X_test1, Y_train1, col, m=smoothing)

# Fit the scaler on the training rows only, then apply it to both splits.
# Note: this rebinds the notebook-level name `scaler`, replacing the scaler
# that was fitted on the full dataset.
scaler = StandardScaler()
X_train_scaled1 = scaler.fit_transform(X_train1)
X_test_scaled1 = scaler.transform(X_test1)

# Wrap the scaled arrays back into DataFrames with the original labels.
X_train_scaled1 = pd.DataFrame(X_train_scaled1, columns=X_train1.columns, index=X_train1.index)
X_test_scaled1 = pd.DataFrame(X_test_scaled1, columns=X_test1.columns, index=X_test1.index)


In [49]:
# Same random-forest configuration as for the full dataset, now trained on the
# Echonest-merged training split.
rf1_params = dict(n_estimators=100, random_state=42, n_jobs=-1)
model_rf1 = RandomForestRegressor(**rf1_params).fit(X_train_scaled1, Y_train1)

print("\nPerformance Random Forest Regressor after merging with Echonest features:")

# Score the fitted model on the merged dataset's held-out split.
evaluate_model(model_rf1, X_test_scaled1, Y_test1)


Performance Random Forest Regressor after merging with Echonest features:
R² Score: 0.3853
RMSE (Log-Seconds): 0.3824
MAE (Log-Seconds): 0.2801
MAE (Seconds): 61.16 seconds


## Gradient Boosting

In [50]:
# Gradient boosting on the Echonest-merged dataset: 100 trees of depth 5,
# learning rate 0.1, seeded for reproducibility.
gbr_params = dict(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model_gbr = GradientBoostingRegressor(**gbr_params).fit(X_train_scaled1, Y_train1)

print("\nPerformance of Gradient Boosting Regressor on the merged dataset:")
# Score the fitted model on the merged dataset's held-out split.
evaluate_model(model_gbr, X_test_scaled1, Y_test1)


Performance of Gradient Boosting Regressor on the merged dataset:
R² Score: 0.3936
RMSE (Log-Seconds): 0.3798
MAE (Log-Seconds): 0.2780
MAE (Seconds): 60.53 seconds


### XGBoost Regressor

In [66]:
# XGBoost regressor trained on the full (non-merged) dataset.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=10,  # deeper trees to capture interactions
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
model_xgb = xgb.XGBRegressor(**xgb_params)

print(" XGBoost Regressor")

# Fit on the standardized training features, then score on the held-out split.
model_xgb.fit(X_train_scaled, Y_train)
evaluate_model(model_xgb, X_test_scaled, Y_test)


 XGBoost Regressor
R² Score: 0.5247
RMSE (Log-Seconds): 0.4644
MAE (Log-Seconds): 0.3328
MAE (Seconds): 73.87 seconds


In [67]:
# LightGBM regressor trained on the full (non-merged) dataset.
lgb_params = dict(
    n_estimators=2000,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
model_lgb = LGBMRegressor(**lgb_params)

# Fit on the standardized training features, then score on the held-out split.
model_lgb.fit(X_train_scaled, Y_train)
evaluate_model(model_lgb, X_test_scaled, Y_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036971 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4659
[LightGBM] [Info] Number of data points in the train set: 77830, number of used features: 21
[LightGBM] [Info] Start training from score 5.325285
R² Score: 0.5245
RMSE (Log-Seconds): 0.4644
MAE (Log-Seconds): 0.3336
MAE (Seconds): 74.05 seconds
