In [6]:
import os
import pandas as pd
import numpy as np

# ✅ Ensure directory exists
os.makedirs("data/raw", exist_ok=True)

# Set random seed for reproducibility
np.random.seed(42)

# Define number of records
n = 500

# Generate synthetic health data
data = pd.DataFrame({
    "age": np.random.randint(18, 80, size=n),
    "gender": np.random.choice(["Male", "Female"], size=n),
    "weight_kg": np.random.normal(70, 15, size=n).round(1),
    "height_cm": np.random.normal(170, 10, size=n).round(1),
    "smoking_status": np.random.choice(["Never", "Former", "Current"], size=n, p=[0.6, 0.25, 0.15]),
    "exercise_freq_per_week": np.random.poisson(3, size=n),
    "alcohol_intake_per_week": np.random.poisson(2, size=n),
    "family_history": np.random.choice(["Yes", "No"], size=n, p=[0.3, 0.7])
})

# Calculate BMI
data["height_m"] = data["height_cm"] / 100
data["bmi"] = (data["weight_kg"] / (data["height_m"] ** 2)).round(1)
data.drop("height_m", axis=1, inplace=True)

# Create a risk score based on features and generate a binary label
risk_score = (
    (data["age"] > 50).astype(int) +
    (data["bmi"] > 30).astype(int) +
    (data["smoking_status"] == "Current").astype(int) +
    (data["exercise_freq_per_week"] < 1).astype(int) +
    (data["alcohol_intake_per_week"] > 4).astype(int) +
    (data["family_history"] == "Yes").astype(int)
)

data["chronic_illness"] = (risk_score >= 3).astype(int)

# ✅ Save to CSV
data.to_csv("data/raw/synthetic_health_data.csv", index=False)

# Preview
data.tail()


Unnamed: 0,age,gender,weight_kg,height_cm,smoking_status,exercise_freq_per_week,alcohol_intake_per_week,family_history,bmi,chronic_illness
495,63,Male,93.0,171.7,Never,4,3,No,31.5,0
496,41,Female,60.3,158.1,Never,3,0,Yes,24.1,0
497,67,Male,77.2,170.6,Former,1,5,No,26.5,0
498,49,Male,64.4,170.9,Never,4,5,Yes,22.0,0
499,64,Male,57.5,187.2,Never,2,1,No,16.4,0


## Week 2 ETL by Margaret  
Data cleaning, outlier handling, feature engineering and saving cleaned data


In [7]:
import pandas as pd
import numpy as np

# load raw data
df = pd.read_csv("../data/raw/synthetic_health_data.csv")
df.head()


Unnamed: 0,age,gender,weight_kg,height_cm,smoking_status,exercise_freq_per_week,alcohol_intake_per_week,family_history,bmi,chronic_illness
0,56,Female,80.5,163.1,Former,1,3,No,30.3,0
1,69,Male,68.0,174.9,Never,0,2,No,22.2,0
2,46,Female,68.9,174.2,Never,1,4,No,22.7,0
3,32,Male,65.8,174.0,Former,1,3,No,21.7,0
4,60,Male,74.4,167.7,Never,5,1,No,26.5,0


### 🔍 Load Raw Data

Imported the synthetic health dataset generated in Week 1 from the `data/raw/` folder.  
Previewed the first few rows using `.head()` to inspect the structure and verify column types and values.


In [8]:
df.drop_duplicates(inplace=True)


In [9]:
# forward-fill then backward-fill as a fallback
df.fillna(method='ffill', inplace=True)
df.fillna(method='bfill', inplace=True)


  df.fillna(method='ffill', inplace=True)
  df.fillna(method='bfill', inplace=True)


### 🧼 Data Cleaning & Preparation

Began by handling potential data quality issues in the raw synthetic health dataset.  
.Duplicates: Removed any repeated entries to ensure data integrity.  
.Missing Values: Applied forward and backward fill to address gaps in data, simulating a realistic cleaning workflow.  
.This step ensures a reliable foundation before performing feature engineering or statistical analysis.


In [10]:
#: remove extreme BMIs
Q1 = df['bmi'].quantile(0.25)
Q3 = df['bmi'].quantile(0.75)
IQR = Q3 - Q1
mask = df['bmi'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
df = df[mask]


In [11]:
def bmi_category(b):
    if b < 18.5: return "Underweight"
    if b < 25:   return "Healthy"
    if b < 30:   return "Overweight"
    return "Obese"

df['bmi_category'] = df['bmi'].apply(bmi_category)


In [12]:
bins = [17, 30, 45, 60, 80]
labels = ['18–30','31–45','46–60','61–80']
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)


In [13]:
df = pd.get_dummies(
    df,
    columns=['gender','smoking_status','family_history','bmi_category','age_group'],
    drop_first=True
)


In [14]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
numeric_cols = ['age','weight_kg','height_cm','bmi',
                'exercise_freq_per_week','alcohol_intake_per_week']
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


### Feature Engineering & Data Enrichment

Enhanced the dataset by deriving new insights and converting categorical variables for modeling:

.BMI Category: Classified individuals based on BMI ranges (Underweight, Healthy, Overweight, Obese).
.Age Group: Grouped ages into meaningful bins for better pattern analysis across demographics.
.Encoded Categorical Variables: Applied one-hot encoding to variables like gender, smoking status, and family history for model compatibility.
Normalized Numerical Features: Scaled age, weight, height, BMI, exercise, and alcohol intake for consistency across variables.
  
These enriched features lay the foundation for statistical analysis and predictive modeling in the next stage of the pipeline.


In [15]:
import os
os.makedirs("../data/transformed", exist_ok=True)

df.to_csv("../data/transformed/health_cleaned.csv", index=False)
