In [1]:
# -------------------------------
# 1. Import Libraries
# -------------------------------
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# -------------------------------
# 2. Load Data
# -------------------------------
data = pd.read_csv("diabetic_data.csv")

# -------------------------------
# 3. Initial Cleaning
# -------------------------------
# Columns removed before modelling:
#   - weight, payer_code, medical_specialty: dropped for having too many
#     missing or irrelevant values
#   - encounter_id, patient_nbr: identifiers, not useful for prediction
columns_to_drop = [
    'weight', 'payer_code', 'medical_specialty',
    'encounter_id', 'patient_nbr',
]

# The raw file marks missing entries with "?"; turn those into real NaN
# in the columns that are kept.
data = data.drop(columns=columns_to_drop).replace("?", np.nan)

# -------------------------------
# 4. Target Variable
# -------------------------------
# Binary target: 1 = readmitted within 30 days ('<30'),
#                0 = not readmitted within 30 days ('>30' or 'NO').
#
# An explicit map + integer cast is used instead of Series.replace:
# replace() on an object column only yields an integer column through
# implicit downcasting (deprecated in recent pandas), and it silently lets
# unexpected labels through as strings.
readmitted_codes = {'>30': 0, 'NO': 0, '<30': 1}
readmitted_encoded = data['readmitted'].map(readmitted_codes)

# map() yields NaN for anything outside the mapping (including missing
# values); fail loudly rather than train on a corrupted target.
unmapped = readmitted_encoded.isna()
if unmapped.any():
    bad_labels = data.loc[unmapped, 'readmitted'].unique().tolist()
    raise ValueError(f"Unexpected values in 'readmitted': {bad_labels}")

data['readmitted'] = readmitted_encoded.astype('int64')

# -------------------------------
# 5. Encode Categorical Variables
# -------------------------------
# Every object-dtype column is converted to integer codes. Values are cast
# to str first so the encoder always sees a single type; missing entries are
# stringified along with everything else and end up as their own category.
categorical_cols = data.select_dtypes(include=['object']).columns

# A single encoder instance is refit for each column, so the per-column
# label -> code mappings are not retained after the loop.
le = LabelEncoder()
for col in categorical_cols:
    data[col] = le.fit_transform(data[col].astype(str))

# -------------------------------
# 6. Split Features & Labels
# -------------------------------
# y is the binary 'readmitted' target; X is every remaining column.
y = data['readmitted']
X = data.drop(columns=['readmitted'])

# -------------------------------
# 7. Train-Test Split
# -------------------------------
# Split BEFORE resampling. If SMOTE runs on the full dataset first, synthetic
# samples interpolated from rows that later land in the test set leak into
# training, and the test set itself contains synthetic points with an
# artificial 50/50 class balance -- both inflate evaluation metrics.
# Stratify on y so the held-out set keeps the real class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------------
# 8. Handle Imbalance using SMOTE (training data only)
# -------------------------------
# NOTE(review): SMOTE interpolates between feature values, so label-encoded
# categorical columns can receive fractional codes; consider SMOTENC if that
# matters for the downstream model.
sm = SMOTE(sampling_strategy='minority', random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# The balanced data replaces the training fold; X_test / y_test stay untouched.
X_train, y_train = X_res, y_res

print("Original class distribution:\n", y.value_counts())
print("\nAfter SMOTE balancing:\n", y_res.value_counts())

# -------------------------------
# 9. Feature Scaling
# -------------------------------
# Learn the scaling statistics from the training data only, then apply the
# same fitted transform to both splits so no test-set information is used
# when fitting the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print("✅ Data Preprocessing & Balancing Complete")
print("Train Shape:", X_train.shape, " Test Shape:", X_test.shape)


NameError: name 'SMOTE' is not defined