In [17]:
import pandas as pd
import numpy as np

# 1. Read the synthetic fitness & nutrition CSV into a DataFrame.
#    The path is relative, so the file must sit in the notebook's working directory.
file_name = "kaggle - synthetic fitness and nutrition data.csv"
df = pd.read_csv(filepath_or_buffer=file_name)

# 2. First look at the raw frame (sanity check against the previous analysis):
#    the leading rows, followed by the summary printed by DataFrame.info().
print("Data Head:", df.head(), sep="\n")
print("\nData Info:")
df.info()

Data Head:
     Age  Gender  Weight (kg)  Height (m)  Max_BPM  Avg_BPM  Resting_BPM  \
0  34.91    Male        65.27        1.62   188.58   157.65        69.05   
1  23.37  Female        56.41        1.55   179.43   131.75        73.18   
2  33.20  Female        58.98        1.67   175.04   123.95        54.96   
3  38.69  Female        93.78        1.70   191.21   155.10        50.07   
4  45.09    Male        52.42        1.88   193.58   152.88        70.84   

   Session_Duration (hours)  Calories_Burned Workout_Type  ...  \
0                      1.00          1080.90     Strength  ...   
1                      1.37          1809.91         HIIT  ...   
2                      0.91           802.26       Cardio  ...   
3                      1.10          1450.79         HIIT  ...   
4                      1.08          1166.40     Strength  ...   

   cal_from_macros  pct_carbs  protein_per_kg   pct_HRR  pct_maxHR  \
0          2139.59   0.500432        1.624789  0.741237   0.83598

In [18]:
# Columns removed from the raw frame; the author classifies these as
# high-cardinality, descriptive, or otherwise "useless".
cols_to_drop = [
    'meal_name',
    'Physical exercise',
    'BMI_calc',
    'expected_burn',
    'Burns Calories (per 30 min)_bc',
    'Carbs',
    'diet_type',
    'Proteins',
    'cook_time_min',
    'cooking_method',
    'Fats',
    'meal_type',
    'serving_size_g',
    'sodium_mg',
    'sugar_g',
    'cholesterol_mg',
    'prep_time_min',
    'rating',
    'Target Muscle Group',
    'Equipment Needed',
    'Name of Exercise',
    'Type of Muscle',
    'Workout',
    'cal_balance',
    'Benefit',
]

# drop() returns a new frame, so the raw `df` is left untouched.
# A label missing from `df` raises KeyError (pandas' default errors='raise').
df_cleaned = df.drop(labels=cols_to_drop, axis='columns')
print(f"Dropped {len(cols_to_drop)} columns. New shape: {df_cleaned.shape}")
df_cleaned

Dropped 26 columns. New shape: (20000, 29)


Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,...,Burns Calories (per 30 min),Difficulty Level,Body Part,cal_from_macros,pct_carbs,protein_per_kg,pct_HRR,pct_maxHR,lean_mass_kg,Burns_Calories_Bin
0,34.91,Male,65.27,1.62,188.58,157.65,69.05,1.00,1080.90,Strength,...,342.58,Advanced,Legs,2139.59,0.500432,1.624789,0.741237,0.835985,47.777394,Medium
1,23.37,Female,56.41,1.55,179.43,131.75,73.18,1.37,1809.91,HIIT,...,357.16,Intermediate,Chest,1711.65,0.500850,1.514093,0.551247,0.734270,40.809803,High
2,33.20,Female,58.98,1.67,175.04,123.95,54.96,0.91,802.26,Cardio,...,359.63,Intermediate,Arms,1965.92,0.500610,1.663445,0.574534,0.708124,44.635580,High
3,38.69,Female,93.78,1.70,191.21,155.10,50.07,1.10,1450.79,HIIT,...,351.65,Advanced,Shoulders,1627.28,0.499533,0.862017,0.744155,0.811150,63.007432,High
4,45.09,Male,52.42,1.88,193.58,152.88,70.84,1.08,1166.40,Strength,...,329.36,Advanced,Abs,2659.23,0.500581,2.538153,0.668405,0.789751,43.347504,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,46.77,Female,98.31,1.90,199.20,148.18,63.72,0.77,761.61,Strength,...,346.27,Advanced,Arms,1882.80,0.500000,0.954837,0.623413,0.743876,71.269345,Medium
19996,40.38,Female,88.12,1.87,196.18,134.18,54.04,1.97,2303.13,Strength,...,332.64,Intermediate,Arms,1205.47,0.496968,0.689060,0.563810,0.683964,65.049689,Low
19997,50.31,Male,46.20,1.67,163.34,157.92,61.65,1.36,1468.80,Strength,...,352.19,Beginner,Forearms,1947.79,0.500280,2.105844,0.946701,0.966818,35.420708,High
19998,52.36,Male,44.30,1.62,179.27,121.23,60.88,1.41,929.75,Yoga,...,329.22,Intermediate,Shoulders,1921.51,0.499940,2.190745,0.509756,0.676243,35.889260,Low


In [19]:
# Manual mapping for ordinal features: each label is replaced by its rank.
difficulty_mapping = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
intensity_mapping = {'Low': 0, 'Medium': 1, 'High': 2, 'Very High': 3}

# source column -> (name of the encoded column, label-to-rank mapping)
ordinal_encodings = {
    'Difficulty Level': ('difficulty_level_enc', difficulty_mapping),
    'Burns_Calories_Bin': ('intensity_of_burn_enc', intensity_mapping),
}

encoded_columns = {}
for source_col, (encoded_col, mapping) in ordinal_encodings.items():
    # Series.map(dict) silently turns any label that is missing from the dict
    # into NaN. Fail loudly instead, so a typo or an unexpected category cannot
    # leak into the exported data as a missing value. Genuine NaNs in the
    # source column are ignored here and stay NaN after mapping, as before.
    unmapped = set(df_cleaned[source_col].dropna().unique()) - set(mapping)
    if unmapped:
        raise ValueError(
            f"Unmapped labels in {source_col!r}: {sorted(map(str, unmapped))}"
        )
    encoded_columns[encoded_col] = df_cleaned[source_col].map(mapping)

# Build the result in one step: the encoded columns are appended (in the order
# declared above) and the original label columns are removed. Because nothing
# is written to `df_cleaned` before this line, a validation error above leaves
# the frame unchanged.
# NOTE: this rebinds `df_cleaned` without the source columns, so re-running
# this cell requires re-running the cell that creates `df_cleaned` first.
df_cleaned = df_cleaned.assign(**encoded_columns).drop(columns=list(ordinal_encodings))
df_cleaned

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,...,Burns Calories (per 30 min),Body Part,cal_from_macros,pct_carbs,protein_per_kg,pct_HRR,pct_maxHR,lean_mass_kg,difficulty_level_enc,intensity_of_burn_enc
0,34.91,Male,65.27,1.62,188.58,157.65,69.05,1.00,1080.90,Strength,...,342.58,Legs,2139.59,0.500432,1.624789,0.741237,0.835985,47.777394,2,1
1,23.37,Female,56.41,1.55,179.43,131.75,73.18,1.37,1809.91,HIIT,...,357.16,Chest,1711.65,0.500850,1.514093,0.551247,0.734270,40.809803,1,2
2,33.20,Female,58.98,1.67,175.04,123.95,54.96,0.91,802.26,Cardio,...,359.63,Arms,1965.92,0.500610,1.663445,0.574534,0.708124,44.635580,1,2
3,38.69,Female,93.78,1.70,191.21,155.10,50.07,1.10,1450.79,HIIT,...,351.65,Shoulders,1627.28,0.499533,0.862017,0.744155,0.811150,63.007432,2,2
4,45.09,Male,52.42,1.88,193.58,152.88,70.84,1.08,1166.40,Strength,...,329.36,Abs,2659.23,0.500581,2.538153,0.668405,0.789751,43.347504,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,46.77,Female,98.31,1.90,199.20,148.18,63.72,0.77,761.61,Strength,...,346.27,Arms,1882.80,0.500000,0.954837,0.623413,0.743876,71.269345,2,1
19996,40.38,Female,88.12,1.87,196.18,134.18,54.04,1.97,2303.13,Strength,...,332.64,Arms,1205.47,0.496968,0.689060,0.563810,0.683964,65.049689,1,0
19997,50.31,Male,46.20,1.67,163.34,157.92,61.65,1.36,1468.80,Strength,...,352.19,Forearms,1947.79,0.500280,2.105844,0.946701,0.966818,35.420708,0,2
19998,52.36,Male,44.30,1.62,179.27,121.23,60.88,1.41,929.75,Yoga,...,329.22,Shoulders,1921.51,0.499940,2.190745,0.509756,0.676243,35.889260,1,0


In [20]:
# Nominal features = every column that is still stored with object dtype.
object_frame = df_cleaned.select_dtypes(include=['object'])
nominal_cols = list(object_frame.columns)
print("Nominal columns to encode:", nominal_cols)

# One-hot encode the nominal columns; drop_first=True omits the first level of
# each feature, so k categories become k-1 indicator columns.
df_processed = pd.get_dummies(data=df_cleaned, columns=nominal_cols, drop_first=True)
print(f"\nFinal processed shape after encoding: {df_processed.shape}")

# Cast Age to int; the cast truncates the fractional part.
df_processed = df_processed.astype({'Age': int})
print(f"\nRounding down ages")

df_processed

Nominal columns to encode: ['Gender', 'Workout_Type', 'Body Part']

Final processed shape after encoding: (20000, 36)

Rounding down ages


Unnamed: 0,Age,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Fat_Percentage,Water_Intake (liters),...,Gender_Male,Workout_Type_HIIT,Workout_Type_Strength,Workout_Type_Yoga,Body Part_Arms,Body Part_Back,Body Part_Chest,Body Part_Forearms,Body Part_Legs,Body Part_Shoulders
0,34,65.27,1.62,188.58,157.65,69.05,1.00,1080.90,26.800377,1.50,...,True,False,True,False,False,False,False,False,True,False
1,23,56.41,1.55,179.43,131.75,73.18,1.37,1809.91,27.655021,1.90,...,False,True,False,False,False,False,True,False,False,False
2,33,58.98,1.67,175.04,123.95,54.96,0.91,802.26,24.320821,1.88,...,False,False,False,False,True,False,False,False,False,False
3,38,93.78,1.70,191.21,155.10,50.07,1.10,1450.79,32.813572,2.50,...,False,True,False,False,False,False,False,False,False,True
4,45,52.42,1.88,193.58,152.88,70.84,1.08,1166.40,17.307319,2.91,...,True,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,46,98.31,1.90,199.20,148.18,63.72,0.77,761.61,27.505498,2.99,...,False,False,True,False,True,False,False,False,False,False
19996,40,88.12,1.87,196.18,134.18,54.04,1.97,2303.13,26.180562,3.52,...,False,False,True,False,True,False,False,False,False,False
19997,50,46.20,1.67,163.34,157.92,61.65,1.36,1468.80,23.331802,2.81,...,True,False,True,False,False,False,False,True,False,False
19998,52,44.30,1.62,179.27,121.23,60.88,1.41,929.75,18.985868,1.59,...,True,False,False,True,False,False,False,False,False,True


In [21]:
import re

def clean_col_name(col):
    """Return a lowercase, underscore-separated version of a column label.

    Steps, in order:
      * lowercase the label;
      * replace every run of whitespace, parentheses and hyphens with a
        single underscore;
      * strip leading/trailing underscores;
      * collapse any remaining run of underscores into one.

    Other characters are left as they are.

    Parameters
    ----------
    col : str
        Original column label.

    Returns
    -------
    str
        The cleaned label, e.g. ``"Weight (kg)"`` -> ``"weight_kg"``.
    """
    lowered = col.lower()
    # Separators become underscores; the edges are trimmed before collapsing.
    underscored = re.sub(r'[()\s-]+', '_', lowered).strip('_')
    # A pre-existing underscore next to a replaced separator leaves a double
    # underscore (e.g. "a_ (b)" -> "a__b"), hence the final collapse.
    return re.sub(r'__+', '_', underscored)

# Normalise every column label with clean_col_name. The labels are reassigned
# on the existing frame, so `df_processed` itself changes here.
df_processed.columns = list(map(clean_col_name, df_processed.columns))
df_processed

Unnamed: 0,age,weight_kg,height_m,max_bpm,avg_bpm,resting_bpm,session_duration_hours,calories_burned,fat_percentage,water_intake_liters,...,gender_male,workout_type_hiit,workout_type_strength,workout_type_yoga,body_part_arms,body_part_back,body_part_chest,body_part_forearms,body_part_legs,body_part_shoulders
0,34,65.27,1.62,188.58,157.65,69.05,1.00,1080.90,26.800377,1.50,...,True,False,True,False,False,False,False,False,True,False
1,23,56.41,1.55,179.43,131.75,73.18,1.37,1809.91,27.655021,1.90,...,False,True,False,False,False,False,True,False,False,False
2,33,58.98,1.67,175.04,123.95,54.96,0.91,802.26,24.320821,1.88,...,False,False,False,False,True,False,False,False,False,False
3,38,93.78,1.70,191.21,155.10,50.07,1.10,1450.79,32.813572,2.50,...,False,True,False,False,False,False,False,False,False,True
4,45,52.42,1.88,193.58,152.88,70.84,1.08,1166.40,17.307319,2.91,...,True,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,46,98.31,1.90,199.20,148.18,63.72,0.77,761.61,27.505498,2.99,...,False,False,True,False,True,False,False,False,False,False
19996,40,88.12,1.87,196.18,134.18,54.04,1.97,2303.13,26.180562,3.52,...,False,False,True,False,True,False,False,False,False,False
19997,50,46.20,1.67,163.34,157.92,61.65,1.36,1468.80,23.331802,2.81,...,True,False,True,False,False,False,False,True,False,False
19998,52,44.30,1.62,179.27,121.23,60.88,1.41,929.75,18.985868,1.59,...,True,False,False,True,False,False,False,False,False,True


In [22]:
# Write the processed frame to a CSV in the working directory;
# index=False keeps the row index out of the file.
output_file_name = "cleaned_fitness_data.csv"
df_processed.to_csv(path_or_buf=output_file_name, index=False)