In [None]:
# Data Cleaning Notebook

from pathlib import Path

import pandas as pd
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load data
df = pd.read_csv('data/raw/ed_data.csv')

# Handle missing values: fill gaps in each column with that column's median.
# Each column is fit separately (same order as before, so `imputer` ends up
# fitted on discharge_time) and assigned through a 2-D selection so the
# (n, 1) array returned by fit_transform matches the shape of its target.
imputer = SimpleImputer(strategy='median')
for column in ('treatment_time', 'discharge_time'):
    df[[column]] = imputer.fit_transform(df[[column]])

# Handle outliers: keep rows whose wait_time z-score is strictly inside (-3, 3).
# wait_time is not imputed above, so it may still contain NaN. With the default
# nan_policy='propagate' a single NaN turns every z-score into NaN and the mask
# below would discard the whole dataset. 'omit' computes mean/std from the
# non-missing values only; rows whose wait_time is missing still get a NaN
# z-score, fail both comparisons, and are dropped.
z_scores = stats.zscore(df['wait_time'].to_numpy(dtype=float), nan_policy='omit')
# .copy() detaches the filtered frame from the original so the column
# assignment below writes to an independent DataFrame (no SettingWithCopyWarning).
df = df[(z_scores < 3) & (z_scores > -3)].copy()

# Normalize data: standardize both columns to zero mean and unit variance,
# using statistics computed on the rows that survived the outlier filter.
scaler = StandardScaler()
scaled_columns = ['treatment_time', 'wait_time']
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

# Save cleaned data, creating the output directory first so a fresh checkout
# without data/processed/ does not fail at the very last step.
output_path = Path('data/processed/cleaned_ed_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
