In [1]:
import pandas as pd
import re


# --- 1. SETUP: COLUMN CLEANING FUNCTION ---
def clean_col_name(col):
    """Converts column names to lowercase snake_case."""
    name = col.lower()
    name = re.sub(r'[()\s-]+', '_', name)
    name = name.strip('_')
    name = re.sub(r'__+', '_', name)
    return name


# Load the dataset (or use your currently processed DataFrame)
file_name = "data sources/kaggle - synthetic fitness and nutrition data.csv"
df = pd.read_csv(file_name)
df['Age'] = df['Age'].astype(int)

# Apply the cleaning function to all column names
df.columns = [clean_col_name(col) for col in df.columns.tolist()]

In [2]:
# --- 2. COLUMN DEFINITION ---

# Columns relevant to both domains (demographics, base metrics)
shared_cols = [
    'age', 'gender', 'weight_kg', 'height_m', 'bmi', 'bmi_calc',
    'fat_percentage', 'water_intake_liters', 'lean_mass_kg'
]

# Columns primarily related to nutrition (meal details, macro calculations)
nutrition_specific_cols = [
    'daily_meals_frequency', 'carbs', 'proteins', 'fats', 'calories',
    'meal_type', 'diet_type', 'sugar_g', 'sodium_mg', 'cholesterol_mg',
    'serving_size_g', 'cooking_method', 'prep_time_min', 'cook_time_min', 'cal_from_macros', 'pct_carbs',
    'protein_per_kg',
    # 'physical_exercise' is arguably more fitness, but let's leave it in workout for now.
]
# 'rating'

# Columns primarily related to workout/fitness (session details, heart rate, exercise specifics)
workout_specific_cols = [
    'max_bpm', 'avg_bpm', 'resting_bpm', 'session_duration_hours',
    'calories_burned', 'experience_level', 'workout_type', 'sets', 'reps',
    'difficulty_level', 'body_part',
    'cal_balance', 'burns_calories_bin'
]
# 'pct_hrr', 'pct_maxhr', 'physical_exercise', 'burns_calories_per_30_min', 'type_of_muscle', 'workout', 'expected_burn', 'equipment_needed', 'exercise_name', 'workout_frequency_days_week'
# future use: 'target_muscle_group', 'benefit',

In [3]:
# --- 3. CREATE & EXPORT DATAFRAMES ---

# Create the full list of columns for each new DataFrame
nutrition_cols = shared_cols + nutrition_specific_cols
workout_cols = shared_cols + workout_specific_cols

# Create the two DataFrames
nutrition_df = df[nutrition_cols]
workout_df = df[workout_cols]

# Export the new DataFrames
#####nutrition_df.to_csv('nutrition_data.csv', index=False)
#####workout_df.to_csv('workout_data.csv', index=False)

print("Data successfully split and exported to 'nutrition_data.csv' and 'workout_data.csv'.")
nutrition_df
workout_df
# Manual mapping for ordinal features
difficulty_mapping = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
intensity_mapping = {'Low': 0, 'Medium': 1, 'High': 2, 'Very High': 3}

workout_df['difficulty_level_enc'] = workout_df['difficulty_level'].map(difficulty_mapping)
workout_df['intensity_of_burn_enc'] = workout_df['burns_calories_bin'].map(intensity_mapping)

workout_df = workout_df.drop(columns=['difficulty_level', 'burns_calories_bin'])
workout_df
# Identify all remaining object columns (Nominal Features)
nominal_cols = workout_df.select_dtypes(include=['object']).columns.tolist()
print("Nominal columns to encode:", nominal_cols)

# Apply One-Hot Encoding
workout_df_processed = pd.get_dummies(workout_df, columns=nominal_cols, drop_first=True)
print(f"\nFinal processed shape after encoding: {workout_df_processed.shape}")

workout_df_processed

Data successfully split and exported to 'nutrition_data.csv' and 'workout_data.csv'.
Nominal columns to encode: ['gender', 'workout_type', 'body_part']

Final processed shape after encoding: (20000, 29)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  workout_df['difficulty_level_enc'] = workout_df['difficulty_level'].map(difficulty_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  workout_df['intensity_of_burn_enc'] = workout_df['burns_calories_bin'].map(intensity_mapping)


Unnamed: 0,age,weight_kg,height_m,bmi,bmi_calc,fat_percentage,water_intake_liters,lean_mass_kg,max_bpm,avg_bpm,...,gender_Male,workout_type_HIIT,workout_type_Strength,workout_type_Yoga,body_part_Arms,body_part_Back,body_part_Chest,body_part_Forearms,body_part_Legs,body_part_Shoulders
0,34,65.27,1.62,24.87,24.870447,26.800377,1.50,47.777394,188.58,157.65,...,True,False,True,False,False,False,False,False,True,False
1,23,56.41,1.55,23.48,23.479709,27.655021,1.90,40.809803,179.43,131.75,...,False,True,False,False,False,False,True,False,False,False
2,33,58.98,1.67,21.15,21.148123,24.320821,1.88,44.635580,175.04,123.95,...,False,False,False,False,True,False,False,False,False,False
3,38,93.78,1.70,32.45,32.449827,32.813572,2.50,63.007432,191.21,155.10,...,False,True,False,False,False,False,False,False,False,True
4,45,52.42,1.88,14.83,14.831372,17.307319,2.91,43.347504,193.58,152.88,...,True,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,46,98.31,1.90,27.23,27.232687,27.505498,2.99,71.269345,199.20,148.18,...,False,False,True,False,True,False,False,False,False,False
19996,40,88.12,1.87,25.20,25.199462,26.180562,3.52,65.049689,196.18,134.18,...,False,False,True,False,True,False,False,False,False,False
19997,50,46.20,1.67,16.57,16.565671,23.331802,2.81,35.420708,163.34,157.92,...,True,False,True,False,False,False,False,True,False,False
19998,52,44.30,1.62,16.88,16.880049,18.985868,1.59,35.889260,179.27,121.23,...,True,False,False,True,False,False,False,False,False,True
