In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler

In [5]:
# Load the dataset from its path relative to the notebook's working directory
# and preview the first five rows as the cell output.
csv_path = "analysis_dataset/music_running_weather.csv"
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,Sr. no.,start_date_local,type,distance,moving_time,elapsed_time,total_elevation_gain,start_latlng,end_latlng,sport_type,...,playlist_id,intensity,mood,lyrics,music_condition,temp,humidity,windspeed,precip,conditions
0,1,2023-12-09T09:09:19Z,Run,10879.7,4023,4617,91.4,"[40.70327935740352, -73.99619171395898]","[40.69340907968581, -73.97922154515982]",Run,...,5ymkuBk3C1Iu4KTdJBK9vy,5,4,1,fast,8.0,84.71,7.3,0.0,Overcast
1,2,2023-12-07T17:31:50Z,Run,1304.4,722,62993,0.0,"[40.7220459356904, -74.03641730546951]","[40.719722136855125, -74.03257705271244]",Run,...,5Egmw4o1GAN8QkhXGaxao6,3,3,1,medium,3.8,68.71,14.2,0.0,Overcast
2,3,2023-12-03T09:18:13Z,Run,17503.0,7370,7462,68.4,"[40.74991073459387, -73.98751585744321]","[40.73538766242564, -73.9793517999351]",Run,...,7pWNfjCrJmS4ILho1S4CvK,1,2,0,slow,11.5,83.85,9.0,0.759,"Rain, Overcast"
3,4,2023-12-02T09:41:14Z,Run,3457.8,1791,2170,3.9,"[40.72203076444566, -74.03641068376601]","[40.71398631669581, -74.03902692720294]",Run,...,5Egmw4o1GAN8QkhXGaxao6,3,3,1,medium,9.5,87.37,9.3,0.0,Overcast
4,5,2023-12-01T17:06:05Z,Run,10108.2,4128,4221,6.3,"[40.72190687991679, -74.03624514117837]","[40.71669920347631, -74.03264125809073]",Run,...,7pWNfjCrJmS4ILho1S4CvK,1,2,0,slow,7.8,78.2,6.4,0.472,"Rain, Overcast"


In [7]:
# Three nested predictor sets: each list starts with every column of the
# previous one and appends its own extra columns, so the sets grow
# baseline -> music -> full.
baseline_features = [
    "distance_km", "duration_min", "total_elevation_gain", "hour", "weekday",
]

# Baseline columns plus the playlist descriptors.
music_features = [
    *baseline_features,
    "intensity", "mood", "lyrics", "music_condition",
]

# Music columns plus temp / humidity / windspeed / precip / conditions.
full_features = [
    *music_features,
    "temp", "humidity", "windspeed", "precip", "conditions",
]

In [10]:
def extract_features(df, features, target="pace_min_per_km"):
    """Build a design matrix and target vector from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``features`` plus ``target``.
        It is not modified; the function works on a copy.
    features : sequence of str
        Names of the predictor columns (a list, tuple, or other sequence).
    target : str, default "pace_min_per_km"
        Name of the response column.

    Returns
    -------
    X : pandas.DataFrame
        The predictor columns, with every text-like column (object, string
        or categorical dtype) one-hot encoded and its first level dropped.
        Numeric and boolean columns pass through unchanged.
    y : numpy.ndarray
        Values of ``target``, in the row order of ``df``.

    Raises
    ------
    KeyError
        If any requested column is missing from ``df`` (raised by pandas
        during column selection).
    """
    # Accept tuples / other sequences, not only lists.
    features = list(features)
    data = df[features + [target]].copy()

    def _needs_encoding(dtype):
        # Comparing against "object" alone misses `category` columns and the
        # dedicated pandas string dtype, which would otherwise reach the
        # model as raw text.
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    categoricals = [c for c in features if _needs_encoding(data[c].dtype)]

    # drop_first=True removes one level per encoded column so the dummies
    # are not perfectly collinear.
    data = pd.get_dummies(data, columns=categoricals, drop_first=True)
    X = data.drop(columns=[target])
    y = data[target].values
    return X, y

# Build one design matrix per feature set. Every call reads the target from
# the same frame, so only the first returned target is kept as `y`; the other
# two are discarded into `_`.
(X_base, y), (X_music, _), (X_full, _) = (
    extract_features(df, cols)
    for cols in (baseline_features, music_features, full_features)
)

# Report (rows, columns) for each matrix.
print("Baseline X:", X_base.shape)
print("Music X:", X_music.shape)
print("Full X:", X_full.shape)

Baseline X: (98, 5)
Music X: (98, 10)
Full X: (98, 18)
