In [None]:
import pandas as pd

# Relative location of the raw synthetic gym-members CSV.
DATA_PATH = "../../Data/raw/gym_members_exercise_tracking_synthetic_data.csv"

# Read the file into a DataFrame, report its (rows, columns) shape,
# and preview the first ten records.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print("✅ Dataset loaded:", df.shape)
df.head(n=10)


✅ Dataset loaded: (1800, 15)


Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,34.0,Female,86.7,1.86,174,152.0,74.0,1.12,712.0,Strength,12.8,2.4,5.0,2.0,14.31
1,26.0,Female,84.7,1.83,166,156.0,73.0,1.0,833.0,Strength,27.9,2.8,5.0,2.0,33.49
2,22.0,Male,64.8,1.85,187,166.0,64.0,1.24,1678.0,Cardio,28.7,1.9,3.0,2.0,12.73
3,54.0,Female,75.3,1.82,187,169.0,58.0,1.45,628.0,Cardio,31.8,2.4,4.0,1.0,20.37
4,34.0,Female,52.8,1.74,177,169.0,66.0,1.6,1286.0,Strength,26.4,3.2,4.0,2.0,20.83
5,38.0,Female,53.0,1.58,161,128.0,74.0,1.62,953.0,HIIT,23.4,2.5,2.0,2.0,13.02
6,44.0,Female,46.5,1.81,191,142.0,74.0,1.46,1238.0,Cardio,11.9,3.7,2.0,2.0,18.12
7,50.0,Female,88.5,1.63,181,136.0,63.0,1.63,829.0,Strength,11.6,1.8,4.0,2.0,19.16
8,18.0,Female,82.9,1.54,174,169.0,64.0,0.77,802.0,HIIT,27.8,2.2,3.0,1.0,49.84
9,34.0,Female,65.9,1.74,195,169.0,73.0,2.0,1231.0,Yoga,30.5,3.2,5.0,2.0,12.32


In [5]:
# BMI = weight (kg) divided by height (m) squared, kept to one decimal place.
# This overwrites whatever "BMI" column the loaded frame already had.
weight_kg = df["Weight (kg)"]
height_m = df["Height (m)"]
df["BMI"] = (weight_kg / height_m.pow(2)).round(1)

# Summary statistics of the recomputed column
df[["BMI"]].describe()

Unnamed: 0,BMI
count,1753.0
mean,22.719966
std,7.665842
min,10.0
25%,16.9
50%,21.6
75%,27.2
max,57.5


In [None]:
import numpy as np


# Derive a training goal only when the dataset does not already provide one.
if "Goal" not in df.columns:

    if "Workout_Type" in df.columns:
        cardio_like = {"Cardio","HIIT","Cycling","Running","Jogging","Rowing","Swimming","Elliptical","Treadmill"}
        strength_like = {"Strength","Weightlifting","Powerlifting","Bodyweight","Resistance","Machines"}
        # Match case-insensitively via lower-cased lookup sets. Title-casing the
        # input instead would turn "HIIT" into "Hiit", which can never equal the
        # "HIIT" entry above.
        cardio_keys = {name.lower() for name in cardio_like}
        strength_keys = {name.lower() for name in strength_like}

        def infer_goal(wt):
            """Return "FatLoss" or "MuscleGain" for a workout-type label.

            The label is stringified, stripped and lower-cased before lookup.
            Labels that match neither set (including missing values, which
            stringify to "nan") fall back to "FatLoss".
            """
            key = str(wt).strip().lower()
            if key in cardio_keys:
                return "FatLoss"
            if key in strength_keys:
                return "MuscleGain"
            return "FatLoss"   # safe default if unknown
        df["Goal"] = df["Workout_Type"].apply(infer_goal)
    else:
        # No column to infer from → set a default and we can edit later in UI
        df["Goal"] = "FatLoss"

# If the column exists but has blanks, fill them with the default goal
df["Goal"] = df["Goal"].fillna("FatLoss")

# Encode Gender & Goal as integers.
# NOTE(review): any Gender value other than "Female"/"Male" (including missing)
# is encoded as 0, i.e. the same code as "Female"; likewise any Goal other than
# the two mapped labels becomes 0. Confirm this fallback is intended.
df["gender_enc"] = df["Gender"].map({"Female": 0, "Male": 1}).fillna(0).astype(int)
df["goal_enc"]   = df["Goal"].map({"FatLoss": 0, "MuscleGain": 1}).fillna(0).astype(int)

# Quick check
print(df[["Gender", "gender_enc", "Goal", "goal_enc"]].head())
print("Unique goal_enc:", df["goal_enc"].unique())


   Gender  gender_enc        Goal  goal_enc
0  Female           0  MuscleGain         1
1  Female           0  MuscleGain         1
2    Male           1     FatLoss         0
3  Female           0     FatLoss         0
4  Female           0  MuscleGain         1
Unique goal_enc: [1 0]


In [8]:
# Narrow the frame to the columns used for modelling: age, encoded gender,
# BMI and the encoded goal. The copy keeps later edits from touching df.
model_columns = ["Age", "gender_enc", "BMI", "goal_enc"]
df_model = df.loc[:, model_columns].copy()

print("✅ Clean dataset ready. Shape:", df_model.shape)
df_model.head()

✅ Clean dataset ready. Shape: (1800, 4)


Unnamed: 0,Age,gender_enc,BMI,goal_enc
0,34.0,0,25.1,1
1,26.0,0,25.3,1
2,22.0,1,18.9,0
3,54.0,0,22.7,0
4,34.0,0,17.4,1


In [17]:
import numpy as np

def assign_plan_key(age, bmi, goal):
    """Map a member profile to a workout-plan key.

    Parameters
    ----------
    age : float
        Age in years (may be fractional).
    bmi : float
        Body-mass index.
    goal : int
        Encoded goal: 0 selects the fat-loss ("C_...") plans, any other value
        selects the muscle-gain ("S_...") plans.

    Returns
    -------
    str
        The conservative plan when ``bmi >= 30`` or ``age >= 50``; otherwise
        the moderate plan when ``25 <= bmi < 30`` or ``30 <= age < 50``;
        otherwise the standard plan.

    Notes
    -----
    A NaN ``age`` or ``bmi`` fails every threshold comparison, so a profile
    with both values missing ends up in the standard tier. Drop or impute
    missing values if that label should not be trusted.
    """
    if goal == 0:  # Fat loss
        conservative, moderate, standard = (
            "C_CONS_3D_20_30", "C_MOD_3_4D_30_40", "C_STD_4D_35_45"
        )
    else:  # Muscle gain
        conservative, moderate, standard = (
            "S_CONS_2_3D_30_40", "S_MOD_3D_35_50", "S_STD_3D_45_60"
        )

    if (bmi >= 30) or (age >= 50):
        return conservative
    # Half-open age band [30, 50): a fractional age such as 49.5 belongs here
    # instead of slipping through a gap between 49 and 50 into the standard tier.
    if (25 <= bmi < 30) or (30 <= age < 50):
        return moderate
    return standard

# Wrap the scalar rule so it can be applied element-wise to whole columns,
# then label every row with its plan key.
plan_for_rows = np.vectorize(assign_plan_key)
df_model["plan_key"] = plan_for_rows(
    df_model["Age"],
    df_model["BMI"],
    df_model["goal_enc"],
)

# How many rows landed in each plan
df_model["plan_key"].value_counts()


plan_key
C_MOD_3_4D_30_40     569
C_CONS_3D_20_30      372
C_STD_4D_35_45       357
S_MOD_3D_35_50       183
S_CONS_2_3D_30_40    131
S_STD_3D_45_60       131
Name: count, dtype: int64

In [23]:
import os

# DataFrame.to_csv does not create missing parent folders, so make sure the
# target directory exists before writing (a fresh checkout may not have it).
os.makedirs("../../Data/clean", exist_ok=True)
df_model.to_csv("../../Data/clean/fitness_profiles_clean.csv", index=False)

In [24]:
from sklearn.model_selection import train_test_split
import os

# to_csv does not create missing parent folders; create the output directory
# first so this cell also works on a fresh checkout.
os.makedirs("../../Data/processed", exist_ok=True)

# 80/20 split, stratified on the plan label so both parts keep the class mix.
# NOTE(review): in top-to-bottom cell order df_model has not yet gone through
# dropna() here, so these files may contain rows with missing features — confirm.
train_df, valid_df = train_test_split(
    df_model, test_size=0.2, random_state=42, stratify=df_model["plan_key"]
)
train_df.to_csv("../../Data/processed/train.csv", index=False)
valid_df.to_csv("../../Data/processed/valid.csv", index=False)


In [25]:
import os

# Create the output folders up front; exist_ok makes re-running harmless.
for out_dir in ("../../Data/clean", "../../Data/processed"):
    os.makedirs(out_dir, exist_ok=True)

# Restrict to the features plus the label, then discard every row that has
# a missing value in any of those columns.
cols = ["Age", "gender_enc", "BMI", "goal_enc", "plan_key"]
df_model = df_model.loc[:, cols].dropna(axis=0, how="any").copy()

# Persist the full cleaned and labelled dataset.
clean_path = "../../Data/clean/fitness_profiles_clean.csv"
df_model.to_csv(clean_path, index=False)
print("✅ Saved cleaned dataset:", clean_path, "→", df_model.shape)


✅ Saved cleaned dataset: ../../Data/clean/fitness_profiles_clean.csv → (1743, 5)


In [26]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed, stratified on the plan label so the class
# proportions are kept in both parts.
train_df, valid_df = train_test_split(
    df_model,
    test_size=0.20,
    random_state=42,
    stratify=df_model["plan_key"],
)

# Output file for each split
splits = {
    "../../Data/processed/train.csv": train_df,
    "../../Data/processed/valid.csv": valid_df,
}
for out_path, frame in splits.items():
    frame.to_csv(out_path, index=False)

print("✅ Wrote:")
for out_path, frame in splits.items():
    print(" -", out_path, frame.shape)

# Optional: sanity check class balance
print("\nTrain class counts:\n", train_df["plan_key"].value_counts())
print("\nValid class counts:\n", valid_df["plan_key"].value_counts())


✅ Wrote:
 - ../../Data/processed/train.csv (1394, 5)
 - ../../Data/processed/valid.csv (349, 5)

Train class counts:
 plan_key
C_MOD_3_4D_30_40     455
C_CONS_3D_20_30      297
C_STD_4D_35_45       286
S_MOD_3D_35_50       146
S_CONS_2_3D_30_40    105
S_STD_3D_45_60       105
Name: count, dtype: int64

Valid class counts:
 plan_key
C_MOD_3_4D_30_40     114
C_CONS_3D_20_30       75
C_STD_4D_35_45        71
S_MOD_3D_35_50        37
S_CONS_2_3D_30_40     26
S_STD_3D_45_60        26
Name: count, dtype: int64


In [None]:
# Compare row counts between the loaded frame and the modelling frame
print("Original df rows:", len(df))
print("df_model rows:   ", len(df_model))

# Report missing values per column. A bare expression in the middle of a cell
# is evaluated but never displayed, so the summary is printed explicitly.
print("Missing values per column:")
print(df_model.isna().sum())

# Drop any rows with missing values (safe before saving)
df_model = df_model.dropna().copy()
print("After dropna rows:", len(df_model))


Original df rows: 1800
df_model rows:    1800
After dropna rows: 1743


In [10]:
import numpy as np

def assign_plan_key(age, bmi, goal):
    """Map a member profile to a workout-plan key.

    Parameters
    ----------
    age : float
        Age in years (may be fractional).
    bmi : float
        Body-mass index.
    goal : int
        Encoded goal: 0 selects the fat-loss ("C_...") plans, any other value
        selects the muscle-gain ("S_...") plans.

    Returns
    -------
    str
        The conservative plan when ``bmi >= 30`` or ``age >= 50``; otherwise
        the moderate plan when ``25 <= bmi < 30`` or ``30 <= age < 50``;
        otherwise the standard plan.

    Notes
    -----
    A NaN ``age`` or ``bmi`` fails every threshold comparison, so a profile
    with both values missing ends up in the standard tier. Drop or impute
    missing values if that label should not be trusted.
    """
    if goal == 0:  # Fat loss
        conservative, moderate, standard = (
            "C_CONS_3D_20_30", "C_MOD_3_4D_30_40", "C_STD_4D_35_45"
        )
    else:  # Muscle gain
        conservative, moderate, standard = (
            "S_CONS_2_3D_30_40", "S_MOD_3D_35_50", "S_STD_3D_45_60"
        )

    if (bmi >= 30) or (age >= 50):
        return conservative
    # Half-open age band [30, 50): a fractional age such as 49.5 belongs here
    # instead of slipping through a gap between 49 and 50 into the standard tier.
    if (25 <= bmi < 30) or (30 <= age < 50):
        return moderate
    return standard

# Wrap the scalar rule so it can be applied element-wise to whole columns,
# then label every row with its plan key.
plan_for_rows = np.vectorize(assign_plan_key)
df_model["plan_key"] = plan_for_rows(
    df_model["Age"],
    df_model["BMI"],
    df_model["goal_enc"],
)

# How many rows landed in each plan
df_model["plan_key"].value_counts()


plan_key
C_MOD_3_4D_30_40     569
C_CONS_3D_20_30      372
C_STD_4D_35_45       357
S_MOD_3D_35_50       183
S_CONS_2_3D_30_40    131
S_STD_3D_45_60       131
Name: count, dtype: int64

In [14]:
# Write the labelled modelling frame (without the index) and confirm the path.
clean_csv_path = "../../Data/clean/fitness_profiles_clean.csv"
df_model.to_csv(clean_csv_path, index=False)
print(f"✅ Saved to {clean_csv_path}")


✅ Saved to ../../Data/clean/fitness_profiles_clean.csv


In [15]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed, stratified on the plan label.
train_df, valid_df = train_test_split(
    df_model,
    test_size=0.2,
    random_state=42,
    stratify=df_model["plan_key"],
)

# Write each split to its CSV without the index column.
for frame, out_path in (
    (train_df, "../../Data/processed/train.csv"),
    (valid_df, "../../Data/processed/valid.csv"),
):
    frame.to_csv(out_path, index=False)

print("✅ Wrote train.csv and valid.csv to Data/processed/")


✅ Wrote train.csv and valid.csv to Data/processed/
