In [None]:
# Blok 1: impor dan seting dasar
# Jalankan sekali di awal
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Fixed seed passed as `random_state` wherever this notebook samples rows,
# splits data or builds a model, so that repeated runs give the same results.
RANDOM_STATE = 42


In [None]:
# Blok 2: fungsi load data
# Sesuaikan path file CSV Anda
def load_datasets(base_dir):
    """Read the eight source CSV tables located in ``base_dir``.

    Args:
        base_dir: Directory that contains the CSV files listed below.

    Returns:
        dict mapping a short table key (e.g. ``'app_train'``) to the
        DataFrame read from the corresponding CSV file.
    """
    # (result key, file name) pairs; files are read in exactly this order.
    csv_files = (
        ('app_train', 'application_train.csv'),
        ('app_test', 'application_test.csv'),
        ('bureau', 'bureau.csv'),
        ('bureau_bal', 'bureau_balance.csv'),
        ('prev_app', 'previous_application.csv'),
        ('pos_cash', 'POS_CASH_balance.csv'),
        ('cc_bal', 'credit_card_balance.csv'),
        ('inst_pay', 'installments_payments.csv'),
    )
    return {
        key: pd.read_csv(os.path.join(base_dir, file_name))
        for key, file_name in csv_files
    }

# contoh pemanggilan
# data = load_datasets('/path/to/csvs')


In [None]:
# Blok 3: ringkasan cepat dataset
def quick_overview(df, name='data'):
    """Print a short summary of ``df``.

    The summary shows the shape, the ten columns with the highest share of
    missing values, the number of columns per dtype and the first rows.

    Args:
        df: DataFrame to summarise.
        name: Label used in the header line.

    Returns:
        None. Everything is written to the output.
    """
    # `display` is only injected as a builtin inside IPython/Jupyter. Import it
    # explicitly and fall back to `print` so the function also runs in a
    # plain Python script instead of failing with NameError.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print

    print(f'Ringkasan {name}')
    print('shape:', df.shape)
    print('kolom null (top 10):')
    print(df.isnull().mean().sort_values(ascending=False).head(10))
    print('\nTipe data (sample):')
    print(df.dtypes.value_counts())
    show(df.head())

# contoh
# quick_overview(data['app_train'], 'application_train')


In [None]:
# Blok 4: fungsi agregasi bureau sederhana
# Example only: per customer, count active credits, sum credit and debt amounts, and average DAYS_CREDIT.
def aggregate_bureau(bureau, bureau_bal=None):
    """Aggregate bureau records to one row per ``SK_ID_CURR``.

    Args:
        bureau: DataFrame with columns ``SK_ID_CURR``, ``SK_ID_BUREAU``
            (only needed when ``bureau_bal`` is given), ``CREDIT_ACTIVE``,
            ``AMT_CREDIT_SUM``, ``AMT_CREDIT_SUM_DEBT`` and ``DAYS_CREDIT``.
        bureau_bal: Optional DataFrame with columns ``SK_ID_BUREAU`` and
            ``STATUS``. It is not modified.

    Returns:
        DataFrame with columns ``SK_ID_CURR``, ``bureau_active_count``
        (rows whose ``CREDIT_ACTIVE`` equals ``'Active'``),
        ``bureau_credit_sum``, ``bureau_debt_sum``,
        ``bureau_days_credit_mean`` and, when ``bureau_bal`` is given,
        ``bureau_bad_months``.
    """
    # Build the "is active" flag once as a vectorized column instead of
    # evaluating a Python lambda for every group.
    per_loan = bureau[
        ['SK_ID_CURR', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'DAYS_CREDIT']
    ].assign(IS_ACTIVE=bureau['CREDIT_ACTIVE'].eq('Active').astype('int64'))
    agg = per_loan.groupby('SK_ID_CURR').agg(
        bureau_active_count=('IS_ACTIVE', 'sum'),
        bureau_credit_sum=('AMT_CREDIT_SUM', 'sum'),
        bureau_debt_sum=('AMT_CREDIT_SUM_DEBT', 'sum'),
        bureau_days_credit_mean=('DAYS_CREDIT', 'mean'),
    ).reset_index()

    # If bureau_balance is available, add a rough late-payment frequency.
    if bureau_bal is not None:
        # A month is "bad" when STATUS is a digit string greater than zero;
        # any non-digit value (e.g. 'C', 'X') and '0' count as not bad.
        # This is computed column-wise, without copying the whole table
        # or calling a Python function per row.
        status = bureau_bal['STATUS'].astype(str)
        is_bad = status.str.isdigit() & pd.to_numeric(status, errors='coerce').gt(0)
        bad_per_loan = (
            is_bad.astype('int64')
            .groupby(bureau_bal['SK_ID_BUREAU'])
            .sum()
            .rename('BAD_MONTH')
            .reset_index()
        )
        # Attach the per-loan counts to their customer, then total per customer.
        loans = bureau[['SK_ID_CURR', 'SK_ID_BUREAU']].merge(
            bad_per_loan, on='SK_ID_BUREAU', how='left'
        )
        bad_per_customer = (
            loans.groupby('SK_ID_CURR')['BAD_MONTH']
            .sum()
            .rename('bureau_bad_months')
            .reset_index()
        )
        agg = agg.merge(bad_per_customer, on='SK_ID_CURR', how='left')
        agg['bureau_bad_months'] = agg['bureau_bad_months'].fillna(0)
    return agg


In [None]:
# Blok 5: fungsi gabung dan pre-processing dasar
def build_feature_table(app_df, bureau_df=None, prev_app_df=None, drop_cols=None, n_sample=None):
    """Assemble the feature matrix ``X`` and, if present, the target ``y``.

    Args:
        app_df: Application table; must contain ``SK_ID_CURR`` when any
            of the optional tables is merged. It is not modified.
        bureau_df: Optional table keyed by ``SK_ID_CURR`` that is
            left-merged onto the applications.
        prev_app_df: Optional previous-applications table with columns
            ``SK_ID_CURR``, ``SK_ID_PREV`` and ``NAME_CONTRACT_STATUS``.
        drop_cols: Column names to remove from ``X``. Defaults to the
            three identifier columns.
        n_sample: If given, number of application rows to sample (with
            ``RANDOM_STATE``) before any merge.

    Returns:
        Tuple ``(X, y)``; ``y`` is ``None`` when there is no ``TARGET``
        column.
    """
    table = app_df.copy()
    if n_sample is not None:
        table = table.sample(n=n_sample, random_state=RANDOM_STATE)

    # Identifier columns are removed from the features by default.
    if drop_cols is None:
        drop_cols = ['SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV']

    if bureau_df is not None:
        table = table.merge(bureau_df, on='SK_ID_CURR', how='left')

    if prev_app_df is not None:
        # Per customer: number of previous applications and how many of
        # them have status 'Approved'.
        prev_stats = (
            prev_app_df.groupby('SK_ID_CURR')
            .agg(
                prev_app_count=('SK_ID_PREV', 'count'),
                prev_app_approved_count=(
                    'NAME_CONTRACT_STATUS',
                    lambda status: (status == 'Approved').sum(),
                ),
            )
            .reset_index()
        )
        table = table.merge(prev_stats, on='SK_ID_CURR', how='left')
        table['prev_app_approved_rate'] = (
            table['prev_app_approved_count'] / table['prev_app_count']
        )

    # Only drop the requested columns that actually exist in the table.
    removable = [col for col in table.columns if col in drop_cols]

    if 'TARGET' not in table.columns:
        return table.drop(columns=removable), None
    target = table['TARGET']
    features = table.drop(columns=['TARGET'] + removable)
    return features, target


In [None]:
# Blok 6: pipeline numerik & kategorikal dan fungsi training evaluate
def get_column_groups(X):
    """Split the columns of ``X`` into numeric and categorical name lists.

    Every numeric dtype is treated as numeric, not only ``int64`` and
    ``float64``. Columns that were downcast (e.g. ``int32``, ``float32``,
    ``uint8``) would otherwise belong to neither group and be silently
    discarded by a ``ColumnTransformer`` that uses ``remainder='drop'``.
    Boolean columns belong to neither group.

    Args:
        X: Feature DataFrame.

    Returns:
        Tuple ``(num_cols, cat_cols)`` of column-name lists, each in the
        column order of ``X``.
    """
    num_cols = X.select_dtypes(include='number').columns.tolist()
    # 'string' also picks up pandas' dedicated string dtype next to object.
    cat_cols = X.select_dtypes(include=['object', 'category', 'string']).columns.tolist()
    return num_cols, cat_cols

def build_preprocessor(num_cols, cat_cols):
    """Build the ``ColumnTransformer`` used in front of every model.

    Numeric columns are median-imputed and standardised. Categorical
    columns are imputed with the constant ``'MISSING'`` and one-hot encoded
    into a dense array, ignoring categories unseen during fit. Columns in
    neither list are dropped.

    Args:
        num_cols: Names of the numeric columns.
        cat_cols: Names of the categorical columns.

    Returns:
        An unfitted ``ColumnTransformer`` with the transformers ``'num'``
        and ``'cat'``, in that order.
    """
    # scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
    # `sparse_output` and 1.4 removed the old name. Try the current spelling
    # first and fall back for older installations.
    try:
        one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        one_hot = OneHotEncoder(handle_unknown='ignore', sparse=False)

    num_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    cat_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='MISSING')),
        ('ohe', one_hot)
    ])
    preprocessor = ColumnTransformer([
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols)
    ], remainder='drop')
    return preprocessor

def train_and_evaluate(X, y, estimator, preprocessor, cv=3):
    """Fit ``preprocessor`` + ``estimator`` and score it on a hold-out split.

    The data is split 80/20 (stratified on ``y``). The pipeline is fitted on
    the training part and evaluated on the validation part. When the
    estimator offers ``predict_proba``, the validation ROC AUC and a
    cross-validated ROC AUC on the full ``X``/``y`` are reported as well.

    Args:
        X: Feature DataFrame.
        y: Target labels; required.
        estimator: Classifier placed at the end of the pipeline. It is
            fitted in place.
        preprocessor: Transformer placed in front of the classifier. A
            clone is used, so the object passed in is left unfitted and
            can be shared between calls.
        cv: Cross-validation setting forwarded to ``cross_val_score``.

    Returns:
        dict with keys ``'model'`` (fitted pipeline), ``'accuracy'``,
        ``'report'``, ``'confusion'`` and, if probabilities are available,
        ``'roc_auc'`` and ``'cv_roc_auc'``.

    Raises:
        ValueError: If ``y`` is ``None``.
    """
    # Local import: `clone` is not part of the notebook's import cell.
    from sklearn.base import clone

    if y is None:
        raise ValueError('y is required: the feature table has no TARGET column.')

    # Stratified hold-out split.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )
    # Pipeline does not clone its steps. Without the clone every model
    # trained with the same `preprocessor` object would share (and refit)
    # one transformer instance.
    pipe = Pipeline([
        ('prep', clone(preprocessor)),
        ('clf', estimator)
    ])
    pipe.fit(X_train, y_train)

    has_proba = hasattr(estimator, 'predict_proba')
    y_pred = pipe.predict(X_val)
    # Column 1 is taken as the score of the positive class.
    y_proba = pipe.predict_proba(X_val)[:, 1] if has_proba else None
    results = {
        'model': pipe,
        'accuracy': accuracy_score(y_val, y_pred),
        'report': classification_report(y_val, y_pred, output_dict=False),
        'confusion': confusion_matrix(y_val, y_pred)
    }
    if y_proba is not None:
        results['roc_auc'] = roc_auc_score(y_val, y_proba)
    # Cross-validated AUC on the full data when probabilities are available.
    if has_proba:
        cv_scores = cross_val_score(pipe, X, y, cv=cv, scoring='roc_auc')
        results['cv_roc_auc'] = cv_scores
    return results


In [None]:
# Blok 7: jalankan end-to-end contoh dengan 2 model
# Asumsi: data sudah di-load ke variable `data`
def run_full_pipeline(data, n_sample=None):
    """Run the example end to end with two models and print their metrics.

    Args:
        data: dict of DataFrames. ``'app_train'`` is required;
            ``'bureau'``, ``'bureau_bal'`` and ``'prev_app'`` are optional.
        n_sample: Optional number of application rows to sample, passed on
            to ``build_feature_table``.

    Returns:
        dict with the result dicts of both models under ``'lr'`` and
        ``'rf'`` plus the feature matrix ``'X'`` and target ``'y'``.
    """
    app = data['app_train']
    bureau = data.get('bureau')
    bureau_bal = data.get('bureau_bal')
    prev_app = data.get('prev_app')

    # Aggregate bureau data only when the table was supplied.
    bureau_agg = aggregate_bureau(bureau, bureau_bal) if bureau is not None else None

    X, y = build_feature_table(app, bureau_df=bureau_agg, prev_app_df=prev_app, n_sample=n_sample)

    # Remove columns where more than 90% of the values are missing.
    thresh = 0.9
    missing_share = X.isnull().mean()
    X = X.drop(columns=X.columns[missing_share > thresh].tolist())

    num_cols, cat_cols = get_column_groups(X)
    # Keep only categorical columns with fewer than 50 distinct values.
    cat_cols = [col for col in cat_cols if X[col].nunique() < 50]
    preproc = build_preprocessor(num_cols, cat_cols)

    # (result key, printed header, estimator), trained in this order.
    candidates = (
        ('lr', '=== Logistic Regression ===',
         LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)),
        ('rf', '\n=== Random Forest ===',
         RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=RANDOM_STATE)),
    )
    output = {}
    for key, header, estimator in candidates:
        res = train_and_evaluate(X, y, estimator, preproc)
        print(header)
        print('Accuracy:', res['accuracy'])
        if 'roc_auc' in res:
            print('ROC AUC:', res['roc_auc'])
        print(res['report'])
        print('Confusion:\n', res['confusion'])
        output[key] = res
    output['X'] = X
    output['y'] = y
    return output


In [None]:
# Blok 8: fungsi visualisasi ROC dan feature importance sederhana
def plot_roc(model_pipe, X_val, y_val):
    """Draw the ROC curve of ``model_pipe`` on the given validation data.

    Args:
        model_pipe: Fitted model exposing ``predict_proba``; column 1 of
            its output is used as the score.
        X_val: Validation features.
        y_val: True validation labels.
    """
    scores = model_pipe.predict_proba(X_val)[:, 1]
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_val, scores)

    plt.figure(figsize=(6, 4))
    plt.plot(false_pos_rate, true_pos_rate)
    # Dashed diagonal as visual reference.
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.show()

def get_feature_importance(pipe, X):
    """Plot and return the 30 strongest features of a fitted pipeline.

    Feature names are rebuilt as the numeric column names followed by the
    one-hot encoded names of the categorical columns. This assumes the
    ``'prep'`` step holds the numeric transformer first and the categorical
    transformer (with an ``'ohe'`` step) second.

    Args:
        pipe: Fitted pipeline with the steps ``'prep'`` and ``'clf'``.
        X: Unused; kept so existing callers keep working.

    Returns:
        DataFrame with columns ``feature`` and ``importance`` (sorted
        descending) when the classifier has ``feature_importances_``, or
        ``feature`` and ``coef`` (sorted by absolute value, descending) when
        it has ``coef_``. ``None`` when it has neither.

    Raises:
        ValueError: If the number of rebuilt feature names differs from the
            number of importances/coefficients.
    """
    clf = pipe.named_steps['clf']
    pre = pipe.named_steps['prep']

    if hasattr(clf, 'feature_importances_'):
        value_col, title = 'importance', 'Feature importances'
        values = clf.feature_importances_
    elif hasattr(clf, 'coef_'):
        value_col, title = 'coef', 'Top coef'
        values = clf.coef_.ravel()
    else:
        print('Model tidak mendukung feature importances atau coef.')
        return None

    num_names = list(pre.transformers_[0][2])
    cat_cols = list(pre.transformers_[1][2])
    if cat_cols:
        ohe = pre.transformers_[1][1].named_steps['ohe']
        if hasattr(ohe, 'get_feature_names_out'):
            ohe_cols = list(ohe.get_feature_names_out(cat_cols))
        else:
            # Older scikit-learn releases only provide get_feature_names.
            ohe_cols = list(ohe.get_feature_names(cat_cols))
    else:
        # No categorical columns were selected, so there is nothing to expand.
        ohe_cols = []
    feature_names = num_names + ohe_cols

    # Fail with an explicit message instead of a generic length error from
    # the DataFrame constructor.
    if len(feature_names) != len(values):
        raise ValueError(
            f'Feature name count ({len(feature_names)}) does not match '
            f'the number of model values ({len(values)}).'
        )

    imp_df = pd.DataFrame({'feature': feature_names, value_col: values})
    if value_col == 'importance':
        imp_df = imp_df.sort_values('importance', ascending=False).head(30)
    else:
        # Rank coefficients by magnitude but keep their sign in the output.
        order = imp_df['coef'].abs().sort_values(ascending=False).index
        imp_df = imp_df.reindex(order).head(30)

    plt.figure(figsize=(6, 8))
    sns.barplot(x=value_col, y='feature', data=imp_df)
    plt.title(title)
    plt.show()
    return imp_df


In [None]:
# Blok 9: langkah rekomendasi ringkas untuk insight bisnis (fungsi bantu)
def business_insights_report(model_pipe, X_sample, top_n=5):
    """Turn the top features of a model into short recommendation lines.

    Args:
        model_pipe: Fitted pipeline passed to ``get_feature_importance``.
        X_sample: Feature data passed to ``get_feature_importance``.
        top_n: Number of leading features to report.

    Returns:
        List with one message string per reported feature; empty when no
        importance table is available for the model.
    """
    importance = get_feature_importance(model_pipe, X_sample)
    if importance is None:
        return []
    leading_features = importance.head(top_n)['feature'].tolist()
    return [
        f'Fitur penting: {f}. Action: periksa kebijakan kredit terkait fitur ini, buat eksperimen pricing atau campaign target.'
        for f in leading_features
    ]

# contoh penggunaan setelah run_full_pipeline mengembalikan model:
# results = run_full_pipeline(data, n_sample=10000)
# model_pipe = results['rf']['model']
# insights = business_insights_report(model_pipe, results['X'])
# for i in insights: print('-', i)
