# In [3]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Preprocessing script: load ../data/heart.csv, impute missing values,
# min-max scale the numerical features, one-hot encode the categorical
# features, and write the result to ../data/cleaned_data.csv.
#
# All intermediate objects (df, scaler, encoder, ...) are kept as module-level
# names so that later code can reuse the fitted transformers.

# Column groups are declared up front because imputation, scaling and
# encoding all need to know which columns are numerical vs. categorical.
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Load dataset
df = pd.read_csv("../data/heart.csv")

# Handle missing values
# Categorical columns are imputed with their most frequent value. A median
# over category codes can land between two codes (e.g. 0.5), which the
# one-hot encoder below would then treat as an extra, spurious category.
category_modes = df[categorical_cols].mode()
if not category_modes.empty:  # mode() has no rows when the frame is empty
    df[categorical_cols] = df[categorical_cols].fillna(category_modes.iloc[0])

# Every other column is imputed with its median. numeric_only=True keeps this
# from raising TypeError on pandas >= 2.0 if a non-numeric column is present.
df = df.fillna(df.median(numeric_only=True))

# Normalize numerical features to the [0, 1] range
scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Encode categorical variables
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(df[categorical_cols]).toarray()
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_cols),
    # Reuse df's index: concat(axis=1) aligns on index labels, so a mismatched
    # index would introduce NaN-padded rows instead of a side-by-side join.
    index=df.index,
)

# Combine numerical and encoded categorical features, replacing the raw
# categorical columns with their one-hot expansion.
df = pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)

# Save cleaned data
df.to_csv("../data/cleaned_data.csv", index=False)