In [2]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, LabelEncoder

warnings.filterwarnings('ignore')

In [3]:
# 1. Load Dataset
# Read the raw stroke CSV (path is relative to the notebook's working
# directory) into `df` and report its dimensions.
print("=== Loading Dataset ===")
raw_csv_path = '../data/raw/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(raw_csv_path)
print("Shape awal dataset:", df.shape)

=== Loading Dataset ===
Shape awal dataset: (5110, 12)


In [4]:
# 2. Handling Missing Values
print("\n=== Handling Missing Values ===")
# Report the per-column null counts before imputation.
print("Missing values sebelum preprocessing:")
print(df.isnull().sum())

# Impute missing BMI with the median BMI of the row's gender group.
bmi_group_median = df.groupby('gender')['bmi'].transform('median')
# A group median is NaN when the group has no observed BMI at all (or the
# group key itself is missing); fall back to the overall median of the
# observed BMI values so those rows do not silently stay NaN.
df['bmi'] = df['bmi'].fillna(bmi_group_median).fillna(df['bmi'].median())

# Report the null counts again to confirm the imputation.
print("\nMissing values setelah preprocessing:")
print(df.isnull().sum())


=== Handling Missing Values ===
Missing values sebelum preprocessing:
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

Missing values setelah preprocessing:
id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64


In [5]:
# 3. Handling outliers with the IQR method
# Announce the outlier-handling stage in the cell output.
print("\n=== Handling Outliers ===")
def handle_outliers(df, column, factor=1.5):
    """Cap the values of one numeric column at its IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the column's 25th and 75th percentiles and ``IQR = Q3 - Q1``.
    Values outside the fences are replaced by the nearest fence; missing
    values are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. The column is overwritten on this object,
        so the caller's frame is modified in place.
    column : str
        Name of the numeric column to cap.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with ``column`` stored as
        float64.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    q1, q3 = df[column].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Cast to float64 first so the resulting dtype is the same whether or not
    # any value actually gets clipped (the fences are floats).
    df[column] = df[column].astype('float64').clip(lower=lower_bound,
                                                   upper=upper_bound)
    return df

# Handle outliers for the numeric columns.
# Each listed column is passed through handle_outliers in turn; `df` is
# rebound to the returned frame and a confirmation line is printed per column.
numeric_columns = ['age', 'avg_glucose_level', 'bmi']
for col in numeric_columns:
    df = handle_outliers(df, col)
    print(f"Outliers in {col} have been handled")


=== Handling Outliers ===
Outliers in age have been handled
Outliers in avg_glucose_level have been handled
Outliers in bmi have been handled


In [6]:
# 4. Feature Encoding
print("\n=== Feature Encoding ===")
# Label-encode the binary / ordinal text columns. One LabelEncoder instance is
# refit for every column, so after the loop `le` only holds the classes of the
# last column processed ('Residence_type').
le = LabelEncoder()
for label_col in ('gender', 'ever_married', 'Residence_type'):
    df[label_col] = le.fit_transform(df[label_col])

# One-hot encode the categorical columns that have multiple values; each
# source column is replaced by one indicator column per category.
multi_category_cols = ['work_type', 'smoking_status']
df = pd.get_dummies(df, columns=multi_category_cols)

print("Columns after encoding:")
print(df.columns.tolist())


=== Feature Encoding ===
Columns after encoding:
['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'Residence_type', 'avg_glucose_level', 'bmi', 'stroke', 'work_type_Govt_job', 'work_type_Never_worked', 'work_type_Private', 'work_type_Self-employed', 'work_type_children', 'smoking_status_Unknown', 'smoking_status_formerly smoked', 'smoking_status_never smoked', 'smoking_status_smokes']


In [7]:
# 5. Feature Scaling
print("\n=== Feature Scaling ===")
# Standardize the continuous features and write them back into `df`; the
# fitted `scaler` object stays available to later cells.
features_to_scale = ['age', 'avg_glucose_level', 'bmi']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[features_to_scale])
df[features_to_scale] = scaled_values


=== Feature Scaling ===


In [9]:
# 6. Handling Imbalanced Data
print("\n=== Handling Imbalanced Data ===")
# Separate the target from the features; 'id' is dropped from the features.
y = df['stroke']
X = df.drop(columns=['stroke', 'id'])

print("Distribusi kelas sebelum resampling:")
print(y.value_counts(normalize=True))

# Combine SMOTE over-sampling with random under-sampling, run in that order.
# NOTE(review): resampling is applied to the full dataset here; if a
# train/test split happens downstream, synthetic samples may leak into the
# evaluation set — confirm where the split is done.
smote_step = SMOTE(sampling_strategy=0.1, random_state=42)
under_step = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
sampler = Pipeline(steps=[('smote', smote_step), ('under', under_step)])

X_resampled, y_resampled = sampler.fit_resample(X, y)

print("\nDistribusi kelas setelah resampling:")
print(pd.Series(y_resampled).value_counts(normalize=True))


=== Handling Imbalanced Data ===
Distribusi kelas sebelum resampling:
stroke
0    0.951272
1    0.048728
Name: proportion, dtype: float64

Distribusi kelas setelah resampling:
stroke
0    0.666667
1    0.333333
Name: proportion, dtype: float64


In [10]:
# 7. Save the preprocessing result
print("\n=== Saving Processed Data ===")
# Recombine the resampled features and the target into a single frame.
processed_df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns),
                         pd.Series(y_resampled, name='stroke')], axis=1)

# Write to disk. The output directory is created first so that a checkout
# without ../data/processed does not make to_csv fail with an OSError.
output_path = Path('../data/processed/processed_stroke_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
processed_df.to_csv(output_path, index=False)
print("Data telah disimpan di '../data/processed/processed_stroke_data.csv'")


=== Saving Processed Data ===
Data telah disimpan di '../data/processed/processed_stroke_data.csv'


In [11]:
# 8. Summary
# Print the row counts before and after resampling, the number of feature
# columns, and the list of preprocessing stages that were run.
print("\n=== Summary Preprocessing ===")
print(f"1. Jumlah data awal: {len(df)}")
print(f"2. Jumlah data setelah preprocessing: {len(processed_df)}")
print(f"3. Jumlah features: {len(X_resampled.columns)}")
for summary_line in (
    "4. Preprocessing steps yang telah dilakukan:",
    "   - Handling missing values",
    "   - Handling outliers",
    "   - Feature encoding",
    "   - Feature scaling",
    "   - Handling imbalanced data",
):
    print(summary_line)


=== Summary Preprocessing ===
1. Jumlah data awal: 5110
2. Jumlah data setelah preprocessing: 1458
3. Jumlah features: 17
4. Preprocessing steps yang telah dilakukan:
   - Handling missing values
   - Handling outliers
   - Feature encoding
   - Feature scaling
   - Handling imbalanced data
