In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Preprocessing pipeline: load -> drop id -> encode categoricals -> split ->
# impute -> scale -> save. Imputation and scaling statistics are learned from
# the training split only, so no information from the test rows leaks into
# the features the model is trained on.

# Load dataset from Azure ML storage
df = pd.read_csv("stroke-prediction-dataset.csv")

# Drop irrelevant columns (assignment instead of inplace=True so the cell
# reads as a plain data-flow step)
df = df.drop(columns=["id"])

categorical_cols = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]
numerical_cols = ["age", "avg_glucose_level", "bmi"]

# Columns eligible for mean imputation: everything that is numeric *before*
# the categorical columns are label-encoded. Calling df.mean() on a frame that
# still holds string columns raises TypeError on pandas >= 2.0, and a mean of
# label codes would be meaningless anyway.
impute_cols = df.select_dtypes(include="number").columns.tolist()

# Encode categorical features. The encoders are fitted on the full frame on
# purpose: LabelEncoder cannot transform categories it has not seen, and the
# mapping is only a sorted enumeration of the labels, not a learned statistic.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Split into train & test sets BEFORE computing any statistics.
# Same frame length, test_size and random_state as before, so the row
# assignment is unchanged.
# NOTE(review): if the target column is imbalanced, consider passing
# stratify=<target column> here -- the target name is not visible in this cell.
train, test = train_test_split(df, test_size=0.2, random_state=42)
# Work on explicit copies so the column assignments below do not write into
# views of df (avoids SettingWithCopyWarning).
train = train.copy()
test = test.copy()

# Handle missing values (fill with mean for simplicity), using means computed
# on the training rows only and applied to both splits.
train_means = train[impute_cols].mean()
train = train.fillna(train_means)
test = test.fillna(train_means)

# Normalize numerical columns: fit the scaler on train, reuse it for test.
scaler = StandardScaler()
train[numerical_cols] = scaler.fit_transform(train[numerical_cols])
test[numerical_cols] = scaler.transform(test[numerical_cols])

# Save preprocessed data
train.to_csv("train_data.csv", index=False)
test.to_csv("test_data.csv", index=False)

print("Preprocessing complete. Train & Test datasets saved.")


In [None]:
# Preview the first rows of each split: train first, then test, one per line.
print(train.head(), test.head(), sep="\n")
