In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from flask import Flask, request, jsonify
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    f1_score, classification_report, confusion_matrix,
    roc_auc_score, roc_curve
)
import warnings
warnings.filterwarnings("ignore")

# Load the preprocessed dataset. The "Lulus" column is the prediction target;
# every remaining column is passed on as a feature.
df = pd.read_csv("processed_kelulusan.csv")

y = df["Lulus"]
X = df.drop(columns="Lulus")

# Hold out 30% of the rows, then cut that hold-out in half to obtain the
# validation and test sets. Stratified splitting is tried first; when
# scikit-learn rejects it with a ValueError, both splits are redone without
# stratification and a warning is printed.
try:
    X_train, X_temp, y_train, y_temp = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=42,
        stratify=y,
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp,
        y_temp,
        test_size=0.5,
        random_state=42,
        stratify=y_temp,
    )
except ValueError:
    print(" Peringatan: Stratify dinonaktifkan karena salah satu kelas terlalu sedikit.")
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, random_state=42, test_size=0.3
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, random_state=42, test_size=0.5
    )

# Sanity check of the resulting partition sizes.
print(X_train.shape, X_val.shape, X_test.shape)

# Only numeric feature columns are modelled; the transformer below drops
# everything else (remainder="drop").
num_cols = X_train.select_dtypes(include="number").columns

# Preprocessing for the numeric columns: fill missing values with the column
# median, then standardize.
pre = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(steps=[
                ("imp", SimpleImputer(strategy="median")),
                ("sc", StandardScaler()),
            ]),
            num_cols,
        ),
    ],
    remainder="drop",
)

# Baseline model: class-weighted logistic regression on the preprocessed data.
pipe_lr = Pipeline(steps=[
    ("pre", pre),
    (
        "clf",
        LogisticRegression(
            class_weight="balanced",
            max_iter=1000,
            random_state=42,
        ),
    ),
])

# Fit on the training split and score on the validation split.
y_val_pred = pipe_lr.fit(X_train, y_train).predict(X_val)

print("\n===== Baseline: Logistic Regression =====")
print("F1 Score (val):", f1_score(y_true=y_val, y_pred=y_val_pred, average="macro"))
print(classification_report(y_true=y_val, y_pred=y_val_pred, digits=3))

# Candidate model: a 300-tree, class-weighted random forest behind the `pre`
# preprocessing step.
pipe_rf = Pipeline(steps=[
    ("pre", pre),
    (
        "clf",
        RandomForestClassifier(
            class_weight="balanced",
            max_features="sqrt",
            n_estimators=300,
            random_state=42,
        ),
    ),
])

# Fit on the training split and predict the validation split.
y_val_rf = pipe_rf.fit(X_train, y_train).predict(X_val)

print("\n===== Random Forest (default) =====")
print("F1 Score (val):", f1_score(y_true=y_val, y_pred=y_val_rf, average="macro"))

# Shuffled 4-fold stratified cross-validation used by the grid search.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Hyper-parameters explored for the "clf" step of the random-forest pipeline.
param = {
    "clf__max_depth": [None, 12, 20, 30],
    "clf__min_samples_split": [2, 5, 10],
}

# Exhaustive search over `param`, ranked by macro-averaged F1, using all
# available cores (n_jobs=-1).
gs = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param,
    scoring="f1_macro",
    cv=skf,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X_train, y_train)

print("\n===== Hasil GridSearch =====")
print("Best Params:", gs.best_params_)
print("Best CV F1:", gs.best_score_)

# Check the winning configuration against the validation split.
best_rf = gs.best_estimator_
y_val_best = best_rf.predict(X_val)
print("Best RF F1 (val):", f1_score(y_true=y_val, y_pred=y_val_best, average="macro"))

# Model that is evaluated on the held-out test split and then persisted.
final_model = best_rf  # switch to pipe_lr if the baseline scores better
y_test_pred = final_model.predict(X_test)

print("\n===== EVALUASI AKHIR (TEST) =====")
print("F1 Score (test):", f1_score(y_test, y_test_pred, average="macro"))
print(classification_report(y_test, y_test_pred, digits=3))
print("Confusion Matrix (test):")
print(confusion_matrix(y_test, y_test_pred))

if hasattr(final_model, "predict_proba"):
    # Column 1 of predict_proba is used as the positive-class score, which
    # assumes a binary target.
    y_test_proba = final_model.predict_proba(X_test)[:, 1]
    try:
        # Both metrics reject the same kinds of input (e.g. a multi-class
        # target), so they are computed together and the plot is skipped as
        # a whole when either one fails.
        auc = roc_auc_score(y_test, y_test_proba)
        fpr, tpr, _ = roc_curve(y_test, y_test_proba)
    except ValueError:
        print("ROC-AUC tidak dapat dihitung (kemungkinan multi-class).")
    else:
        print("ROC-AUC (test):", auc)

        # Save the ROC curve next to the script; the dashed diagonal is the
        # chance-level reference line.
        plt.figure()
        plt.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
        plt.plot([0, 1], [0, 1], "k--")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curve (Test)")
        plt.legend()
        plt.tight_layout()
        plt.savefig("roc_test.png", dpi=120)
        plt.close()

# Persist the whole fitted pipeline (preprocessing + classifier) for serving.
joblib.dump(final_model, "model.pkl")
print("\n Model tersimpan ke 'model.pkl'")

# Flask application that serves predictions from the saved model.
app = Flask(__name__)
# Load the persisted pipeline once at start-up so every request reuses it.
# NOTE: joblib files are pickle-based; only load "model.pkl" from a trusted
# source, because unpickling can execute arbitrary code.
MODEL = joblib.load("model.pkl")

@app.route("/predict", methods=["POST"])
def predict():
    """Score one record sent as a JSON object and return the prediction.

    The request body is parsed as JSON regardless of its Content-Type header
    (``force=True``) and must be a single object mapping feature names to
    values; it is turned into a one-row DataFrame and passed to ``MODEL``.

    Returns:
        A JSON response with ``prediction`` (int) and ``proba`` (float taken
        from column 1 of ``predict_proba``, or ``null`` when the model has no
        ``predict_proba``). A 400 response carrying an ``error`` message is
        returned when the body is not a JSON object or the model rejects the
        record.
    """
    data = request.get_json(force=True)
    # Valid JSON that is not an object (list, string, number, null) cannot be
    # mapped onto feature columns; reject it instead of failing later.
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    X_input = pd.DataFrame([data])
    try:
        y_pred = MODEL.predict(X_input)[0]

        proba = None
        if hasattr(MODEL, "predict_proba"):
            proba = float(MODEL.predict_proba(X_input)[0, 1])
    except (KeyError, ValueError) as exc:
        # Missing feature columns or values the pipeline cannot convert are a
        # client error, not a server fault.
        return jsonify({"error": f"Invalid input: {exc}"}), 400

    return jsonify({
        "prediction": int(y_pred),
        "proba": proba,
    })

# Start Flask's built-in server on port 5000 when this file is executed
# directly (not when it is imported as a module).
if __name__ == "__main__":
    app.run(port=5000)


 Peringatan: Stratify dinonaktifkan karena salah satu kelas terlalu sedikit.
(7, 5) (1, 5) (2, 5)

===== Baseline: Logistic Regression =====
F1 Score (val): 1.0
              precision    recall  f1-score   support

           0      1.000     1.000     1.000         1

    accuracy                          1.000         1
   macro avg      1.000     1.000     1.000         1
weighted avg      1.000     1.000     1.000         1


===== Random Forest (default) =====
F1 Score (val): 1.0
Fitting 4 folds for each of 12 candidates, totalling 48 fits

===== Hasil GridSearch =====
Best Params: {'clf__max_depth': None, 'clf__min_samples_split': 2}
Best CV F1: 1.0
Best RF F1 (val): 1.0

===== EVALUASI AKHIR (TEST) =====
F1 Score (test): 1.0
              precision    recall  f1-score   support

           0      1.000     1.000     1.000         1
           1      1.000     1.000     1.000         1

    accuracy                          1.000         2
   macro avg      1.000     1.000     1

 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
