# In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# --- 1. SETUP AND DATA PREPARATION ---

# Define the column cleaning function
def clean_col_name(col):
    """Return ``col`` rewritten as a lowercase snake_case identifier.

    The name is lower-cased, every run of parentheses, whitespace or hyphens
    becomes a single underscore, leading/trailing underscores are dropped and
    any remaining run of underscores is collapsed to one.
    """
    lowered = col.lower()
    # Separators -> "_", then trim the underscores left at either end
    # (e.g. the one produced by a trailing ")").
    underscored = re.sub(r'[()\s-]+', '_', lowered).strip('_')
    # Underscores already present in the name can sit next to a freshly
    # inserted one, so squeeze any doubled runs last.
    return re.sub(r'__+', '_', underscored)

# Read the raw CSV export into a single DataFrame
file_name = "kaggle - synthetic fitness and nutrition data.csv"
df = pd.read_csv(file_name)

# Normalise every header to snake_case; the lists below use the cleaned names
df.columns = [clean_col_name(header) for header in df.columns]

# Columns included in BOTH the workout and the nutrition frame
shared_cols_clean = [
    'age',
    'gender',
    'weight_kg',
    'height_m',
    'bmi',
    'bmi_calc',
    'fat_percentage',
    'water_intake_liters',
    'lean_mass_kg',
]

# Columns that only go into the workout frame
workout_specific_cols_clean = [
    'max_bpm',
    'avg_bpm',
    'resting_bpm',
    'session_duration_hours',
    'calories_burned',
    'experience_level',
    'workout_type',
    'sets',
    'reps',
    'difficulty_level',
    'body_part',
    'cal_balance',
    'burns_calories_bin',
]

# Columns that only go into the nutrition frame
nutrition_specific_cols_clean = [
    'daily_meals_frequency',
    'carbs',
    'proteins',
    'fats',
    'calories',
    'meal_type',
    'diet_type',
    'sugar_g',
    'sodium_mg',
    'cholesterol_mg',
    'serving_size_g',
    'cooking_method',
    'prep_time_min',
    'cook_time_min',
    'cal_from_macros',
    'pct_carbs',
    'protein_per_kg',
]

# Each analysis frame = shared columns followed by its own specific columns
all_workout_cols = [*shared_cols_clean, *workout_specific_cols_clean]
all_nutrition_cols = [*shared_cols_clean, *nutrition_specific_cols_clean]

# Selecting with a list raises KeyError if any listed column is absent
workout_df = df[all_workout_cols]
nutrition_df = df[all_nutrition_cols]

# --- 2. FEATURE ANALYSIS CODE ---

# --- A. Correlation Analysis (Numerical Features) ---

def plot_correlation_heatmap(df, title, filename):
    """Save a heatmap of the correlations between the numeric columns of ``df``.

    Args:
        df: DataFrame to analyse; only columns selected by
            ``select_dtypes(include=np.number)`` are used.
        title: Label inserted into the figure title
            ("Correlation Matrix for <title> Features").
        filename: Destination passed to ``plt.savefig``.

    Raises:
        ValueError: If ``df`` contains no numeric columns.
    """
    # Select only numerical columns for correlation analysis
    numerical_df = df.select_dtypes(include=np.number)
    if numerical_df.shape[1] == 0:
        # Fail with a clear message instead of an opaque error from the
        # plotting layer when handed an empty correlation matrix.
        raise ValueError(
            "plot_correlation_heatmap requires at least one numeric column"
        )

    # Pairwise correlations using DataFrame.corr() defaults
    corr_matrix = numerical_df.corr()

    fig = plt.figure(figsize=(14, 12))
    try:
        sns.heatmap(corr_matrix,
                    annot=False,  # Too many features for annotation
                    cmap='coolwarm',
                    fmt=".2f",  # only takes effect if annot is enabled
                    linewidths=.5,
                    cbar_kws={"shrink": .75})

        plt.title(f'Correlation Matrix for {title} Features', fontsize=18)
        plt.xticks(rotation=45, ha='right', fontsize=10)
        plt.yticks(rotation=0, fontsize=10)
        plt.tight_layout()
        plt.savefig(filename)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly, so
        # release it even when plotting or saving fails.
        plt.close(fig)

# Write one correlation heatmap per analysis frame, then report the files
plot_correlation_heatmap(
    df=workout_df,
    title='Workout',
    filename='workout_correlation_heatmap.png',
)
plot_correlation_heatmap(
    df=nutrition_df,
    title='Nutrition',
    filename='nutrition_correlation_heatmap.png',
)
print("Generated workout_correlation_heatmap.png and nutrition_correlation_heatmap.png")


# --- B. Univariate Distribution Analysis (Key Features) ---

def plot_distributions(df, numerical_cols, categorical_cols, title, filename):
    """Save a two-column grid of univariate distribution plots.

    Numerical columns are drawn first as 30-bin histograms with a KDE overlay,
    followed by bar plots of the 10 most frequent values of each categorical
    column.

    Args:
        df: DataFrame holding every column named in the two lists.
        numerical_cols: Column names plotted as histograms.
        categorical_cols: Column names plotted as top-10 count bar plots.
        title: Label used in the figure-level heading
            ("Distributions for <title> Features").
        filename: Destination passed to ``savefig``.

    Raises:
        ValueError: If both column lists are empty.
        KeyError: If a requested column is missing from ``df``.
    """
    num_plots = len(numerical_cols) + len(categorical_cols)
    if num_plots == 0:
        # Zero rows would mean a zero-height figure; reject it up front.
        raise ValueError("plot_distributions requires at least one column to plot")
    rows = (num_plots + 1) // 2  # ceiling division: 2 panels per row

    fig = plt.figure(figsize=(15, 5 * rows))
    try:
        # Plot numerical distributions (Histograms)
        for position, col in enumerate(numerical_cols, start=1):
            plt.subplot(rows, 2, position)
            sns.histplot(df[col], kde=True, bins=30)
            plt.title(f'Distribution of {col}', fontsize=14)
            plt.xlabel(col, fontsize=12)
            plt.ylabel('Frequency', fontsize=12)

        # Plot categorical distributions (Bar Plots); panel numbering
        # continues right after the last histogram.
        first_cat_position = len(numerical_cols) + 1
        for position, col in enumerate(categorical_cols, start=first_cat_position):
            plt.subplot(rows, 2, position)
            # Keep only the 10 most frequent values to prevent clutter
            top_counts = df[col].value_counts().nlargest(10)
            sns.barplot(x=top_counts.index, y=top_counts.values)
            plt.title(f'Top Counts for {col}', fontsize=14)
            plt.xticks(rotation=45, ha='right', fontsize=10)
            plt.xlabel(col, fontsize=12)
            plt.ylabel('Count', fontsize=12)

        # Use the caller-supplied label as a heading for the whole grid
        fig.suptitle(f'Distributions for {title} Features', fontsize=18)
        # Reserve a strip at the top so the heading does not overlap the
        # first row of panels.
        fig.tight_layout(rect=(0, 0, 1, 0.97))
        fig.savefig(filename)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly, so
        # release it even when a column is missing or saving fails.
        plt.close(fig)

# Columns to profile in the workout frame: histograms, then bar plots
workout_num = [
    'calories_burned',
    'session_duration_hours',
    'avg_bpm',
    'age',
]
workout_cat = [
    'workout_type',
    'difficulty_level',
    'gender',
]

# Columns to profile in the nutrition frame: histograms, then bar plots
nutrition_num = [
    'calories',
    'carbs',
    'proteins',
    'fats',
]
nutrition_cat = [
    'diet_type',
    'meal_type',
    'cooking_method',
]

# Write one distribution grid per analysis frame, then report the files
plot_distributions(
    df=workout_df,
    numerical_cols=workout_num,
    categorical_cols=workout_cat,
    title='Workout',
    filename='workout_distributions.png',
)
plot_distributions(
    df=nutrition_df,
    numerical_cols=nutrition_num,
    categorical_cols=nutrition_cat,
    title='Nutrition',
    filename='nutrition_distributions.png',
)
print("Generated workout_distributions.png and nutrition_distributions.png")

# Output:
# Generated workout_correlation_heatmap.png and nutrition_correlation_heatmap.png
# Generated workout_distributions.png and nutrition_distributions.png
