In [3]:
import pandas as pd

# -----------------------------
# 1️⃣ LOAD DATA
# -----------------------------
# Raw input files, given as relative paths (resolved against the current
# working directory).
FITNESS_CSV_PATH = "health_fitness_dataset.csv"
MENTAL_CSV_PATH = "mental_health_dataset.csv"

# Read each CSV into its own DataFrame using pandas' default parsing options.
fitness_df = pd.read_csv(FITNESS_CSV_PATH)
mental_df = pd.read_csv(MENTAL_CSV_PATH)

# -----------------------------
# 2️⃣ CLEAN FITNESS DATA
# -----------------------------
# Remove rows that are exact duplicates across every column.
fitness_df = fitness_df.drop_duplicates()

# Fill missing numeric values with the column mean. Columns that are not
# present in the loaded file are skipped.
numeric_cols_fitness = ["hours_sleep", "stress_level", "calories_burned", "daily_steps",
                        "hydration_level", "bmi", "avg_heart_rate", "resting_heart_rate",
                        "blood_pressure_systolic", "blood_pressure_diastolic", "fitness_level"]
for col in numeric_cols_fitness:
    if col in fitness_df.columns:
        fitness_df[col] = fitness_df[col].fillna(fitness_df[col].mean())

# Standardize text columns: strip surrounding whitespace and capitalize
# (first character upper-case, remaining characters lower-case).
text_cols_fitness = ["gender", "intensity", "smoking_status", "activity_type", "health_condition"]
for col in text_cols_fitness:
    if col in fitness_df.columns:
        has_value = fitness_df[col].notna()
        normalized = fitness_df[col].astype(str).str.strip().str.capitalize()
        # astype(str) can render a missing value as the text "nan", which
        # capitalize() then turns into "Nan" -- a fake category that would be
        # written to the cleaned CSV. Restore NaN wherever the original value
        # was missing so that missing data stays missing.
        fitness_df[col] = normalized.where(has_value)

# -----------------------------
# 3️⃣ CLEAN MENTAL HEALTH DATA
# -----------------------------
# Remove rows that are exact duplicates across every column.
mental_df = mental_df.drop_duplicates()

# Rename columns to snake_case for consistency. Keys that do not match an
# existing column are ignored by DataFrame.rename.
mental_df = mental_df.rename(columns={
    "Age": "age",
    "Gender": "gender",
    "Exercise Level": "exercise_frequency",
    "Diet Type": "diet_type",
    "Sleep Hours": "sleep_hours",
    "Stress Level": "stress_level",
    "Mental Health Condition": "mental_health_condition",
    "Social Interaction Score": "social_interaction_score",
    "Happiness Score": "happiness_index",
})

# Standardize text columns: strip surrounding whitespace and capitalize
# (first character upper-case, remaining characters lower-case).
text_cols_mental = ["gender", "diet_type", "exercise_frequency", "stress_level", "mental_health_condition"]
for col in text_cols_mental:
    if col in mental_df.columns:
        has_value = mental_df[col].notna()
        normalized = mental_df[col].astype(str).str.strip().str.capitalize()
        # astype(str) can render a missing value as the text "nan", which
        # capitalize() then turns into "Nan" -- a fake category. Restore NaN
        # wherever the original value was missing.
        mental_df[col] = normalized.where(has_value)

# Fill missing numeric values with the column mean.
# NOTE: "exercise_frequency" appears in both the text list above and this
# numeric list. If it holds labels rather than numbers, coercing it would
# produce an all-NaN column whose mean is also NaN, silently erasing the data.
# The conversion is therefore applied only when at least one value parses as a
# number; otherwise the column is left as standardized text.
numeric_cols_mental = ["sleep_hours", "exercise_frequency", "social_interaction_score", "happiness_index"]
for col in numeric_cols_mental:
    if col in mental_df.columns:
        as_number = pd.to_numeric(mental_df[col], errors='coerce')
        if as_number.notna().any():
            mental_df[col] = as_number.fillna(as_number.mean())

# -----------------------------
# 4️⃣ SAVE CLEANED DATA
# -----------------------------
# Pair each cleaned DataFrame with its output file and write them in order,
# omitting the DataFrame index from the CSV.
cleaned_outputs = (
    (fitness_df, "cleaned_fitness_dataset.csv"),
    (mental_df, "cleaned_mental_health_dataset.csv"),
)
for cleaned_frame, output_path in cleaned_outputs:
    cleaned_frame.to_csv(output_path, index=False)

# Confirmation message naming both output files.
print("✅ Cleaning complete. Files saved as 'cleaned_fitness_dataset.csv' and 'cleaned_mental_health_dataset.csv'")


✅ Cleaning complete. Files saved as 'cleaned_fitness_dataset.csv' and 'cleaned_mental_health_dataset.csv'
