# Import Libraries & Load Data

In [7]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import PartialDependenceDisplay

warnings.filterwarnings("ignore")

# --- Configuration -----------------------------------------------------------
# Seed shared by every stochastic step further down the notebook.
RANDOM_STATE = 42

# Both CSV files live in the same data directory, relative to the notebook.
DATA_DIR = "./data"
EXERCISE_PATH, CALORIES_PATH = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ("exercise.csv", "calories.csv")
)

# --- Report where the data is read from --------------------------------------
print("Dataset Paths")
print("Exercise CSV path:", EXERCISE_PATH)
print("Calories CSV path:", CALORIES_PATH)

# --- Load the two raw tables --------------------------------------------------
exercise, calories = (
    pd.read_csv(csv_path) for csv_path in (EXERCISE_PATH, CALORIES_PATH)
)

# --- Quick sanity check: (rows, columns) of each table ------------------------
print("\nDataset Dimensions")
print("Exercise Shape:", exercise.shape)
print("Calories Shape:", calories.shape)

# --- Preview the first five rows of each table --------------------------------
print("\n \nDataset First 5 rows!")
display(exercise.head(5), calories.head(5))

Dataset Paths
Exercise CSV path: ./data/exercise.csv
Calories CSV path: ./data/calories.csv

Dataset Dimensions
Exercise Shape: (15000, 8)
Calories Shape: (15000, 2)

 
Dataset First 5 rows!


Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,14733363,male,68,190.0,94.0,29.0,105.0,40.8
1,14861698,female,20,166.0,60.0,14.0,94.0,40.3
2,11179863,male,69,179.0,79.0,5.0,88.0,38.7
3,16180408,female,34,179.0,71.0,13.0,100.0,40.5
4,17771927,female,27,154.0,58.0,10.0,81.0,39.8


Unnamed: 0,User_ID,Calories
0,14733363,231.0
1,14861698,66.0
2,11179863,26.0
3,16180408,71.0
4,17771927,35.0


# Data Preprocessing & Feature Engineering

In [14]:
def _normalize_columns(frame):
    """Return `frame` with column names stripped and spaces replaced by underscores.

    `rename` returns a new DataFrame, so the frame passed in is not modified.
    """
    return frame.rename(columns=lambda col: col.strip().replace(" ", "_"))


# Merge on User_ID.
# Column names are cleaned BEFORE the merge so that a header with stray
# whitespace (e.g. " User_ID") cannot break the join-key lookup.
# validate="one_to_one" makes pandas raise MergeError if a User_ID repeats in
# either table, instead of silently multiplying rows in the merged frame.
df = pd.merge(
    _normalize_columns(exercise),
    _normalize_columns(calories),
    on="User_ID",
    how="inner",
    validate="one_to_one",
)
print("Merged shape:", df.shape)

# Key numeric columns
numeric_cols = ["Age", "Height", "Weight", "Duration", "Heart_Rate", "Body_Temp", "Calories"]

# Coerce to numeric; values that cannot be parsed become NaN and are dropped below.
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Drop rows with missing key info
df = df.dropna(subset=numeric_cols + ["Gender"]).reset_index(drop=True)

# Normalize gender text and encode: male -> 1, female -> 0
df["Gender"] = df["Gender"].str.lower().str.strip()
gender_map = {"male": 1, "female": 0}
df["Gender_Code"] = df["Gender"].map(gender_map)

# Drop any unexpected genders (values missing from gender_map map to NaN)
df = df.dropna(subset=["Gender_Code"]).reset_index(drop=True)
df["Gender_Code"] = df["Gender_Code"].astype(int)

# Copy numeric gender to 'Gender' column as well
df["Gender"] = df["Gender_Code"]

# BMI is undefined for a non-positive height: dividing by zero would put inf
# into the feature matrix, so such rows are removed before computing it.
df = df[df["Height"] > 0].reset_index(drop=True)

# BMI feature: kg / m^2 (Height is divided by 100 to convert to metres)
df["Height_m"] = df["Height"] / 100.0
df["BMI"] = df["Weight"] / (df["Height_m"] ** 2)

# Final feature set and target
feature_cols = ["Gender_Code", "Age", "Height", "Weight",
                "Duration", "Heart_Rate", "Body_Temp", "BMI"]
target_col = "Calories"

print("Final columns used for modeling:")
print("Features:", feature_cols)
print("Target  :", target_col)
display(df.head())

Merged shape: (15000, 9)
Final columns used for modeling:
Features: ['Gender_Code', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'BMI']
Target  : Calories


Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories,Gender_Code,Height_m,BMI
0,14733363,1,68,190.0,94.0,29.0,105.0,40.8,231.0,1,1.9,26.038781
1,14861698,0,20,166.0,60.0,14.0,94.0,40.3,66.0,0,1.66,21.773842
2,11179863,1,69,179.0,79.0,5.0,88.0,38.7,26.0,1,1.79,24.65591
3,16180408,0,34,179.0,71.0,13.0,100.0,40.5,71.0,0,1.79,22.159109
4,17771927,0,27,154.0,58.0,10.0,81.0,39.8,35.0,0,1.54,24.456063
