In [None]:
import re
import warnings

import pandas as pd


# --- 1. SETUP: COLUMN CLEANING FUNCTION ---
def clean_col_name(col):
    """Convert a column label to lowercase snake_case.

    Every run of parentheses, whitespace, hyphens and underscores is
    collapsed into a single underscore, and leading/trailing underscores
    are removed. Any other character (e.g. '/' or '%') is left untouched.

    Args:
        col: Column label. Non-string labels (for example the integer
            labels pandas assigns to a header-less CSV) are converted with
            ``str()`` first instead of raising ``AttributeError``.

    Returns:
        str: The normalised column name.
    """
    name = str(col).lower()
    # '_' is part of the character class so that a separator sitting next to
    # an existing underscore (e.g. "Session_Duration (hours)") is merged with
    # it in the same pass, leaving no doubled underscores behind.
    name = re.sub(r'[()\s_-]+', '_', name)
    return name.strip('_')


# Source CSV for the analysis (swap the path to start from another export).
file_name = "kaggle - synthetic fitness and nutrition data.csv"

# Read the file, cast the raw 'Age' column to int while it still carries its
# original header, then normalise every header with clean_col_name.
df = (
    pd.read_csv(file_name)
    .astype({'Age': int})
    .rename(columns=clean_col_name)
)

In [None]:
# --- 2. COLUMN DEFINITION ---

# Demographics and base body metrics that belong to both splits.
shared_cols = [
    'age',
    'gender',
    'weight_kg',
    'height_m',
    'bmi',
    'bmi_calc',
    'fat_percentage',
    'water_intake_liters',
    'lean_mass_kg',
]

# Nutrition-only columns: meal details and macro calculations.
nutrition_specific_cols = [
    'daily_meals_frequency',
    'carbs',
    'proteins',
    'fats',
    'calories',
    'meal_type',
    'diet_type',
    'sugar_g',
    'sodium_mg',
    'cholesterol_mg',
    'serving_size_g',
    'cooking_method',
    'prep_time_min',
    'cook_time_min',
    'cal_from_macros',
    'pct_carbs',
    'protein_per_kg',
]
# Not included yet: 'rating'
# Note: 'physical_exercise' is arguably a fitness column; it is currently in
# neither list (it appears among the excluded workout columns below).

# Workout-only columns: session details, heart rate, exercise specifics.
workout_specific_cols = [
    'max_bpm',
    'avg_bpm',
    'resting_bpm',
    'session_duration_hours',
    'calories_burned',
    'experience_level',
    'workout_type',
    'sets',
    'reps',
    'difficulty_level',
    'body_part',
    'cal_balance',
    'burns_calories_bin',
]
# Not included yet: 'pct_hrr', 'pct_maxhr', 'physical_exercise',
#   'burns_calories_per_30_min', 'type_of_muscle', 'workout', 'expected_burn',
#   'equipment_needed', 'exercise_name', 'workout_frequency_days_week'
# Future use: 'target_muscle_group', 'benefit'

In [None]:
# --- 3. CREATE & EXPORT DATAFRAMES ---

# Set to True to write the two splits to disk; the export is off by default.
EXPORT_TO_CSV = False


def _encode_ordinal(series, mapping):
    """Map an ordinal label column to integer codes.

    Labels missing from ``mapping`` become NaN (the behaviour of
    ``Series.map``); a warning names them so the gap is not silent.

    Args:
        series: pandas Series holding the category labels.
        mapping: dict from label to its integer rank.

    Returns:
        pandas Series of mapped codes, aligned with ``series``.
    """
    unmapped = sorted(set(series.dropna().unique()) - set(mapping), key=str)
    if unmapped:
        warnings.warn(
            f"Column '{series.name}' has labels with no ordinal code "
            f"(encoded as NaN): {unmapped}"
        )
    return series.map(mapping)


# Full column list for each split: shared columns first, then the
# domain-specific ones.
nutrition_cols = shared_cols + nutrition_specific_cols
workout_cols = shared_cols + workout_specific_cols

# .copy() makes each split an independent frame, so adding columns below
# does not raise SettingWithCopyWarning on a slice of `df`.
nutrition_df = df[nutrition_cols].copy()
workout_df = df[workout_cols].copy()

# Only report an export when one actually happened.
if EXPORT_TO_CSV:
    nutrition_df.to_csv('nutrition_data.csv', index=False)
    workout_df.to_csv('workout_data.csv', index=False)
    print("Data successfully split and exported to 'nutrition_data.csv' and 'workout_data.csv'.")
else:
    print("Data successfully split into nutrition_df and workout_df (CSV export is disabled).")

# Manual mapping for ordinal features
difficulty_mapping = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
intensity_mapping = {'Low': 0, 'Medium': 1, 'High': 2, 'Very High': 3}

workout_df['difficulty_level_enc'] = _encode_ordinal(
    workout_df['difficulty_level'], difficulty_mapping
)
workout_df['intensity_of_burn_enc'] = _encode_ordinal(
    workout_df['burns_calories_bin'], intensity_mapping
)

# The raw label columns are replaced by their encoded counterparts.
workout_df = workout_df.drop(columns=['difficulty_level', 'burns_calories_bin'])

# Identify all remaining object columns (Nominal Features)
nominal_cols = workout_df.select_dtypes(include=['object']).columns.tolist()
print("Nominal columns to encode:", nominal_cols)

# Apply One-Hot Encoding; drop_first=True omits the first level of each column.
workout_df_processed = pd.get_dummies(workout_df, columns=nominal_cols, drop_first=True)
print(f"\nFinal processed shape after encoding: {workout_df_processed.shape}")

# Preview only the first rows; the full shape is printed above.
workout_df_processed.head()