# M5 Case Study — Evaluating Normalization Impact (Anti-Leakage)
Dataset: `m5_casestudy.csv` — bandingkan **None/MinMax/Standard/Robust** pada **KNN** & **LogReg** (5-fold Stratified CV).

In [None]:
# =========================
# M5 — Evaluasi Dampak Scaling (Anti-Leakage)
# =========================
import os, numpy as np, pandas as pd

# ===== 0) Load data =====
# Validation uses explicit exceptions rather than `assert`: assertions are
# stripped when Python runs with -O, which would let a wrong path or a missing
# target column surface later as a much less readable error.
CSV_PATH = '/content/datasets/m5_casestudy.csv'   # change if the dataset lives elsewhere
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"Tidak menemukan file: {CSV_PATH}")
df = pd.read_csv(CSV_PATH)

# Target & features: everything except the target column is a feature.
target = 'is_defect'
if target not in df.columns:
    raise KeyError(f"Kolom target '{target}' tidak ada.")
X = df.drop(columns=[target])
y = df[target]

# Numeric / categorical feature names, split purely by dtype.
# NOTE(review): a column whose dtype matches neither selector (e.g. datetime)
# ends up in neither list — confirm the dataset has none, because the
# preprocessing below is only given num_cols and cat_cols.
num_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols = X.select_dtypes(include=['object','category','bool']).columns.tolist()

# Quick summary: feature matrix shape, class counts, and the column split.
print("Ringkasan dataset:")
print("X shape:", X.shape, "| y dist:", y.value_counts().to_dict())
print("num_cols:", num_cols)
print("cat_cols:", cat_cols)

In [None]:
# ===== 1) CV & scorer =====
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, make_scorer

# 5-fold stratified CV, shuffled with a fixed seed so that every evaluation
# that reuses `cv` is scored on the same splits.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
# F1 for the positive class (label 1); zero_division=0 keeps the score
# defined (as 0) when a fold yields a zero denominator, e.g. no positives.
scorer_f1 = make_scorer(
    f1_score,
    pos_label=1,
    zero_division=0,
)

In [None]:
# ===== 2) Pipeline preprocessing =====
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler

In [None]:
# ===== 3) Model & scaler =====
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Classifiers under comparison, keyed by the short name reported in the results.
models = dict(
    knn=KNeighborsClassifier(n_neighbors=5, weights='distance'),
    logreg=LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs'),
)

# Scaling strategies applied to the numeric columns; None is the unscaled baseline.
scalers = dict(
    none=None,
    minmax=MinMaxScaler(),
    standard=StandardScaler(),
    robust=RobustScaler(),
)

In [None]:
# ===== 4) Evaluate every model x scaler combination (anti-leakage) =====
# Imputation, scaling and encoding all live inside the Pipeline handed to
# cross_val_score, so they are re-fit within each CV fold rather than on the
# full dataset.
rows = []
for sc_name, sc in scalers.items():
    # Numeric branch: median imputation, followed by the scaler when one is set.
    if sc is None:
        num_steps = [('imp', SimpleImputer(strategy='median'))]
    else:
        num_steps = [('imp', SimpleImputer(strategy='median')), ('sc', sc)]
    num_pipe = Pipeline(num_steps)

    # Categorical branch: most-frequent imputation, then one-hot encoding that
    # ignores categories not seen during fit.
    cat_steps = [
        ('imp', SimpleImputer(strategy='most_frequent')),
        ('oh', OneHotEncoder(handle_unknown='ignore')),
    ]
    cat_pipe = Pipeline(cat_steps)

    pre = ColumnTransformer([('num', num_pipe, num_cols), ('cat', cat_pipe, cat_cols)])

    for mdl_name, mdl in models.items():
        clf = Pipeline([('pre', pre), ('clf', mdl)])
        scores = cross_val_score(clf, X, y, cv=cv, scoring=scorer_f1)
        # Summarise the per-fold scores once; reused for the table and the log line.
        f1_mean, f1_std = scores.mean(), scores.std()
        rows.append(dict(scaler=sc_name, model=mdl_name, f1_mean=f1_mean, f1_std=f1_std))
        print(f"[F1] scaler={sc_name:8s} model={mdl_name:7s}  mean={f1_mean:.3f}  std={f1_std:.3f}")

# One row per combination, ordered by model and then by scaler name.
res_df = pd.DataFrame(rows)
res_df = res_df.sort_values(['model', 'scaler']).reset_index(drop=True)
print("\n=== Ringkasan F1 (mean±std) ===")
print(res_df)

In [None]:
# ===== 5) (Opsional) Tambahan: Yeo-Johnson + Scaler untuk fitur sangat skew =====
from sklearn.preprocessing import PowerTransformer

rows_yj = []
# Example restricted to Logistic Regression: Yeo-Johnson transform followed by
# a scaler on the numeric columns.
# NOTE(review): PowerTransformer standardizes its output by default, so the
# trailing StandardScaler may be largely redundant — confirm this is intended.
for sc_name, sc in (('standard', StandardScaler()), ('robust', RobustScaler())):
    num_steps_yj = [
        ('imp', SimpleImputer(strategy='median')),
        ('yj', PowerTransformer(method='yeo-johnson')),  # accepts zero/negative values
        ('sc', sc),
    ]
    num_pipe_yj = Pipeline(num_steps_yj)

    cat_steps = [
        ('imp', SimpleImputer(strategy='most_frequent')),
        ('oh', OneHotEncoder(handle_unknown='ignore')),
    ]
    cat_pipe = Pipeline(cat_steps)

    pre_yj = ColumnTransformer([('num', num_pipe_yj, num_cols), ('cat', cat_pipe, cat_cols)])
    logreg_yj = LogisticRegression(max_iter=1000, class_weight='balanced')
    clf_yj = Pipeline([('pre', pre_yj), ('clf', logreg_yj)])

    scores = cross_val_score(clf_yj, X, y, cv=cv, scoring=scorer_f1)
    # Summarise the per-fold scores once; reused for the table and the log line.
    f1_mean, f1_std = scores.mean(), scores.std()
    rows_yj.append(dict(scaler=f'yj+{sc_name}', model='logreg', f1_mean=f1_mean, f1_std=f1_std))
    print(f"[F1] scaler=yj+{sc_name:8s} model=logreg   mean={f1_mean:.3f}  std={f1_std:.3f}")

# Fall back to an empty frame with the expected columns if nothing was collected.
if rows_yj:
    res_yj = pd.DataFrame(rows_yj)
else:
    res_yj = pd.DataFrame(columns=['scaler','model','f1_mean','f1_std'])

In [None]:
# ===== 6) Save results =====
# Write the main model x scaler table; the output directory is created on demand.
os.makedirs('/content/results', exist_ok=True)
out_main = '/content/results/m5_scaler_results.csv'
res_df.to_csv(out_main, index=False)
print("\nSaved:", out_main)

# The Yeo-Johnson section is marked optional, so `res_yj` may not exist when
# that cell was skipped. Look it up defensively instead of raising NameError.
_res_yj = globals().get('res_yj')
if _res_yj is not None and not _res_yj.empty:
    out_yj = '/content/results/m5_scaler_results_yj.csv'
    _res_yj.to_csv(out_yj, index=False)
    print("Saved:", out_yj)

In [None]:
# ===== 7) (Opsional) sanity check extra: Accuracy =====
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score as cvs

# Scoring string passed to cross_val_score for the secondary accuracy check.
scorer_acc = 'accuracy'
for mdl_name, mdl in models.items():
    # StandardScaler serves as the example scaler for this accuracy check.
    num_pipe = Pipeline([
        ('imp', SimpleImputer(strategy='median')),
        ('sc', StandardScaler()),
    ])
    cat_pipe = Pipeline([
        ('imp', SimpleImputer(strategy='most_frequent')),
        ('oh', OneHotEncoder(handle_unknown='ignore')),
    ])
    pre = ColumnTransformer([
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols),
    ])
    clf = Pipeline([('pre', pre), ('clf', mdl)])
    acc = cvs(clf, X, y, cv=cv, scoring=scorer_acc)
    # Summarise the per-fold accuracies once for the log line.
    acc_mean, acc_std = acc.mean(), acc.std()
    print(f"[ACC] model={mdl_name:7s}  mean={acc_mean:.3f}  std={acc_std:.3f}")