In [3]:
import os
import pandas as pd
import numpy as np

# Create the raw-data folder up front; exist_ok avoids an error on re-runs.
os.makedirs("data/raw", exist_ok=True)

# Seed NumPy's global generator so every run yields the same sample.
np.random.seed(42)

# Number of synthetic records (rows) to generate.
n = 500

# Draw the features one at a time. Every draw consumes the same seeded global
# random stream, so the order below must not change or the values will differ.
columns = {}
columns["age"] = np.random.randint(18, 80, size=n)  # upper bound is exclusive
columns["gender"] = np.random.choice(["Male", "Female"], size=n)
columns["weight_kg"] = np.round(np.random.normal(70, 15, size=n), 1)
columns["height_cm"] = np.round(np.random.normal(170, 10, size=n), 1)
columns["smoking_status"] = np.random.choice(
    ["Never", "Former", "Current"],
    size=n,
    p=[0.6, 0.25, 0.15],
)
columns["exercise_freq_per_week"] = np.random.poisson(3, size=n)
columns["alcohol_intake_per_week"] = np.random.poisson(2, size=n)
columns["family_history"] = np.random.choice(["Yes", "No"], size=n, p=[0.3, 0.7])

# Assemble the table; dict insertion order fixes the column order.
data = pd.DataFrame(columns)

# BMI = weight (kg) / height (m) squared, rounded to one decimal.
# Height is converted from centimetres inline, so no helper column is needed.
data["bmi"] = (data["weight_kg"] / (data["height_cm"] / 100) ** 2).round(1)

# One boolean flag per risk factor; each flag that is true adds one point.
risk_flags = [
    data["age"] > 50,
    data["bmi"] > 30,
    data["smoking_status"] == "Current",
    data["exercise_freq_per_week"] < 1,
    data["alcohol_intake_per_week"] > 4,
    data["family_history"] == "Yes",
]
risk_score = sum(flag.astype(int) for flag in risk_flags)

# Binary label: 1 when at least three risk factors are present, else 0.
data["chronic_illness"] = risk_score.ge(3).astype(int)

# Write the dataset to disk without the row index.
data.to_csv("data/raw/synthetic_health_data.csv", index=False)

# Show the first rows (rendered as the cell output in a notebook).
data.head()


Unnamed: 0,age,gender,weight_kg,height_cm,smoking_status,exercise_freq_per_week,alcohol_intake_per_week,family_history,bmi,chronic_illness
0,56,Female,80.5,163.1,Former,1,3,No,30.3,0
1,69,Male,68.0,174.9,Never,0,2,No,22.2,0
2,46,Female,68.9,174.2,Never,1,4,No,22.7,0
3,32,Male,65.8,174.0,Former,1,3,No,21.7,0
4,60,Male,74.4,167.7,Never,5,1,No,26.5,0
