# Minggu 4 — Handling Missing Values & Normalization (Colab)

Tujuan: audit pola missing, lakukan imputasi sederhana & per-kelompok, lalu contoh normalisasi/standarisasi dengan anti-leakage pipeline.


In [None]:
# Basic setup: standard library, numerical stack, and the scikit-learn pieces
# used by the cells below (splitting, preprocessing, model, and metrics).
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Print the pandas version so the run can be reproduced later.
print('pandas:', pd.__version__)


## 1) Load dataset
Upload `contoh_simple.csv` ke `/content/data/` (sesuai `CSV_PATH` di bawah); bila file tidak ada, notebook membuat data sintetis untuk demo.


In [None]:
# Location of the input CSV. Change this if the file lives elsewhere.
CSV_PATH = "/content/data/contoh_simple.csv"

# Create the upload folder that actually belongs to CSV_PATH (not a hard-coded
# one), so editing CSV_PATH keeps the two in sync. Creation is best-effort:
# the synthetic fallback below does not need the folder, so an unwritable
# location (e.g. running outside Colab) must not abort the demo.
csv_dir = os.path.dirname(CSV_PATH)
if csv_dir:
    try:
        os.makedirs(csv_dir, exist_ok=True)
    except OSError as err:
        print(f'Peringatan: folder {csv_dir} tidak bisa dibuat ({err}).')

if os.path.exists(CSV_PATH):
    df = pd.read_csv(CSV_PATH)
else:
    print('File tidak ditemukan — membuat data sintetis untuk demo...')
    # Fixed seed so every run produces the same synthetic table.
    rng = np.random.default_rng(11)
    n = 350
    df = pd.DataFrame({
        'order_id': np.arange(1, n+1),
        'shift': rng.choice(['pagi','siang','malam'], size=n),
        'machine_speed': rng.normal(100, 13, size=n),
        'temperature_c': rng.normal(32, 2.8, size=n),
        # abs() keeps downtime non-negative.
        'downtime_min': np.abs(rng.normal(9, 6, size=n)),
        # Imbalanced label: 10% of the rows are drawn as defects.
        'is_defect': rng.choice([0,1], size=n, p=[0.9,0.1])
    })
    # Inject missing values for the exercise; replace=False guarantees the
    # requested number of distinct rows is blanked in each column.
    df.loc[rng.choice(df.index, 20, replace=False), 'machine_speed'] = np.nan
    df.loc[rng.choice(df.index, 12, replace=False), 'temperature_c'] = np.nan
    df.loc[rng.choice(df.index, 10, replace=False), 'downtime_min'] = np.nan
    df.loc[rng.choice(df.index, 15, replace=False), 'shift'] = np.nan

df.head()


## 2) Audit Missing
Cek proporsi missing per kolom dan ringkasan data.


In [None]:
# Share of missing cells per column, largest first.
na_share = df.isnull().mean()
missing_pct = na_share.sort_values(ascending=False)
print('Missing ratio per kolom (desc):', missing_pct, sep='\n')
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')


## 3) Imputasi Sederhana (modus & median) + Per-`shift`


In [None]:
# Work on a copy so the raw frame `df` stays untouched.
df_work = df.copy()

# Numeric columns to impute; any that are absent from the data are skipped.
num_impute_cols = ['machine_speed', 'temperature_c', 'downtime_min']
has_shift = 'shift' in df_work.columns

# Categorical imputation: fill `shift` with its most frequent value, or the
# placeholder 'Unknown' when the column has no observed value at all.
if has_shift:
    mode_val = df_work['shift'].mode(dropna=True)
    fill_val = mode_val.iloc[0] if not mode_val.empty else 'Unknown'
    df_work['shift'] = df_work['shift'].fillna(fill_val)

# Numeric imputation: per-shift median first, global median as fallback.
for col in num_impute_cols:
    if col not in df_work.columns:
        continue
    # Non-numeric entries become NaN and are imputed like any other gap.
    df_work[col] = pd.to_numeric(df_work[col], errors='coerce')
    # Take the global median from observed values only, i.e. before the
    # per-shift fill adds imputed values to the column.
    global_median = df_work[col].median()
    if has_shift:
        med_by_shift = df_work.groupby('shift')[col].transform('median')
        df_work[col] = df_work[col].fillna(med_by_shift)
    # Covers groups whose values are all missing (their median is NaN).
    df_work[col] = df_work[col].fillna(global_median)

# Report only the columns that exist, matching the guards above, so a dataset
# without one of them does not raise KeyError here.
report_cols = [c for c in ['shift'] + num_impute_cols if c in df_work.columns]
print('Sisa missing setelah imputasi:')
print(df_work[report_cols].isna().sum())


## 4) Simpan `interim`


In [None]:
# Persist the imputed frame as the "interim" artifact for this week.
interim_dir = '/content/data/interim'
interim_path = f'{interim_dir}/df_imputed_week4.csv'
os.makedirs(interim_dir, exist_ok=True)
df_work.to_csv(interim_path, index=False)
# Echo the path as the cell output.
interim_path


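## 5) Normalization/Standardization + Pipeline (Anti-Leakage)

Catatan: `df_work` dari langkah 3 sudah diimputasi memakai statistik seluruh data (sebelum split), sehingga imputer di dalam pipeline hanya berfungsi sebagai pengaman; scaler tetap di-fit hanya pada data train. Untuk evaluasi yang benar-benar bebas leakage, lakukan split pada data mentah `df` dan biarkan pipeline yang mengimputasi.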


In [None]:
# Label column. Raise explicitly instead of using `assert`, which is stripped
# when Python runs with -O.
target = 'is_defect'
if target not in df_work.columns:
    raise KeyError('Kolom target is_defect tidak ditemukan.')

# Row identifiers are not features: keep them out of X so they are neither
# scaled nor fed to the model.
id_cols = [c for c in ['order_id'] if c in df_work.columns]

# NOTE(review): df_work was already imputed above using statistics from all
# rows, so the imputers in the pipeline below act only as a safety net. For a
# strictly leakage-free evaluation, split the raw frame instead.
X = df_work.drop(columns=[target] + id_cols)
y = df_work[target]
# Stratify so train and test keep the same defect ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

num_cols = X_train.select_dtypes(include=['number']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object','category','bool']).columns.tolist()

# Choose the scaler: 'standard', 'minmax', or 'robust'.
SCALER = 'standard'
scaler_choices = {
    'standard': StandardScaler,
    'minmax': MinMaxScaler,
    'robust': RobustScaler,
}
# Fail loudly on an unknown name rather than silently using RobustScaler,
# which would mislabel the results of a scaler comparison.
if SCALER not in scaler_choices:
    raise ValueError(f"SCALER must be one of {sorted(scaler_choices)}, got {SCALER!r}")
scaler = scaler_choices[SCALER]()

# All preprocessing lives inside the pipeline, so imputation and scaling
# statistics are learned from X_train only when `clf.fit` runs.
num_pipe = Pipeline([('imp', SimpleImputer(strategy='median')), ('sc', scaler)])
cat_pipe = Pipeline([('imp', SimpleImputer(strategy='most_frequent')), ('oh', OneHotEncoder(handle_unknown='ignore'))])
pre = ColumnTransformer([('num', num_pipe, num_cols), ('cat', cat_pipe, cat_cols)])

clf = Pipeline([('pre', pre), ('clf', LogisticRegression(max_iter=1000))])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# zero_division=0 reports 0.0 without a warning when a class is never
# predicted, which can happen with a rare positive class.
print(classification_report(y_test, y_pred, zero_division=0))


## 6) Simpan `processed` (siap-model)


In [None]:
# Rebuild the preprocessor with a dense one-hot encoder so its output can be
# returned as a DataFrame and written to CSV.
num_pipe = Pipeline([('imp', SimpleImputer(strategy='median')), ('sc', scaler)])
cat_pipe = Pipeline([('imp', SimpleImputer(strategy='most_frequent')), ('oh', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
pre = ColumnTransformer([('num', num_pipe, num_cols), ('cat', cat_pipe, cat_cols)])
# Request pandas output on this transformer only, instead of changing the
# global scikit-learn configuration for the rest of the session.
pre.set_output(transform='pandas')

# Fit on the training split only; the test split is transformed with the
# statistics learned from train.
Xtr_ready = pre.fit_transform(X_train)
Xte_ready = pre.transform(X_test)

# Attach the label by position (to_numpy) rather than by index alignment.
train_ready = Xtr_ready.copy()
train_ready[target] = y_train.to_numpy()
test_ready = Xte_ready.copy()
test_ready[target] = y_test.to_numpy()

os.makedirs('/content/data/processed', exist_ok=True)
train_path = '/content/data/processed/train_ready_week4.csv'
test_path  = '/content/data/processed/test_ready_week4.csv'
train_ready.to_csv(train_path, index=False)
test_ready.to_csv(test_path, index=False)
# Echo both paths as the cell output.
train_path, test_path

## 7) Catatan & Tugas
- Dokumentasikan keputusan imputasi (kolom, metode, parameter) dan scaler yang dipakai.
- Bandingkan performa model saat `SCALER = 'standard'` vs `'minmax'` vs `'robust'` (1–2 paragraf observasi).
- Simpan artefak `interim` & `processed` ke repositori tugas.
