<a href="https://colab.research.google.com/github/halimahbatam18/branch-AI/blob/main/bankruptcy_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Bankruptcy Prediction — Notebook


**Tujuan:** Membangun pipeline untuk memprediksi kebangkrutan bank sesuai arahan dosen.


**Termasuk:** EDA (2+ visualisasi terhadap target), penanganan imbalance, modelling (Decision Tree, Random Forest, ANN), evaluasi lengkap (Confusion Matrix, Precision, Recall, F1), dan ekstraksi 5 fitur paling penting.

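**Catatan:** Jika ingin menggunakan file CSV asli, unggah file tersebut lalu ubah `DATA_PATH` ke lokasinya. Jika `DATA_PATH` dibiarkan `None` (atau file tidak ditemukan), notebook memakai dataset sintetis.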


In [None]:

# ====== Imports ======
import pandas as pd, numpy as np, matplotlib.pyplot as plt, os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.utils import resample
import joblib
# Default figure size applied to every matplotlib figure created below.
plt.rcParams["figure.figsize"] = (8,5)

# ====== Data path ======
# Path to a CSV dataset. While it is None (or the file does not exist) the
# loading step below generates a synthetic dataset instead.
DATA_PATH = None  # <-- set this when a dataset is available, e.g. '/mnt/data/yourfile.csv'

def make_synthetic_bankrupt_dataset(n_samples=2000, random_state=42):
    """Generate a reproducible synthetic dataset with a binary ``bankrupt`` label.

    Six financial ratios are sampled independently from fixed distributions.
    A linear risk score over five of them (``net_profit_margin`` is sampled
    but not used in the score) is passed through a logistic function, and the
    rows whose probability lies above the 88th percentile are labelled 1.

    Args:
        n_samples: Number of rows to generate.
        random_state: Seed for ``np.random.RandomState``; the same seed always
            yields the same frame.

    Returns:
        A ``pandas.DataFrame`` with columns ``debt_ratio``, ``roa``,
        ``net_profit_margin``, ``liquidity_ratio``, ``op_exp_ratio``,
        ``asset_growth`` and the integer target ``bankrupt``.
    """
    gen = np.random.RandomState(random_state)

    # The dict values are evaluated top to bottom, so the draws consume the
    # random stream in a fixed order; reordering entries would change the data.
    columns = {
        'debt_ratio': gen.beta(2, 5, size=n_samples) * 2.0,
        'roa': gen.normal(0.02, 0.05, size=n_samples),
        'net_profit_margin': gen.normal(0.05, 0.07, size=n_samples),
        'liquidity_ratio': gen.normal(1.5, 0.7, size=n_samples),
        'op_exp_ratio': gen.normal(0.6, 0.2, size=n_samples),
        'asset_growth': gen.normal(0.03, 0.1, size=n_samples),
    }

    # Debt and operating expenses raise the score; profitability, liquidity
    # and asset growth lower it.
    risk = (
        (columns['debt_ratio'] * 1.8)
        + (-columns['roa'] * 8.0)
        + (columns['op_exp_ratio'] * 1.5)
        + (-columns['liquidity_ratio'] * 0.8)
        + (columns['asset_growth'] * -1.0)
    )
    probability = 1 / (1 + np.exp(-(risk - 0.5)))

    # Label only the top 12% of probabilities as bankrupt to get an
    # imbalanced target.
    cutoff = np.percentile(probability, 88)
    columns['bankrupt'] = (probability > cutoff).astype(int)

    return pd.DataFrame(columns)

# Load the CSV at DATA_PATH when one is configured and present; otherwise
# generate the synthetic dataset.
if DATA_PATH and os.path.exists(DATA_PATH):
    df = pd.read_csv(DATA_PATH)
    print("Loaded:", DATA_PATH)
    # Every later cell reads the 'bankrupt' target column, so fail here with a
    # clear message rather than with a KeyError further down.
    if 'bankrupt' not in df.columns:
        raise ValueError(
            f"{DATA_PATH} has no 'bankrupt' target column; found columns: {list(df.columns)}"
        )
else:
    if DATA_PATH:
        # A path was configured but does not exist: say so, so a typo in
        # DATA_PATH is not mistaken for a successful load.
        print(f"WARNING: DATA_PATH {DATA_PATH!r} not found; falling back to synthetic data")
    df = make_synthetic_bankrupt_dataset(2000)
    print("Using synthetic dataset")
df.head()


## Exploratory Data Analysis (EDA) — minimal 2 visualisasi terhadap target

In [None]:

# ====== EDA ======
# Class balance of the target, as raw counts and as percentages.
print("Class counts:\n", df['bankrupt'].value_counts())
print("\nClass percentages:\n", (df['bankrupt'].value_counts(normalize=True)*100).round(2))

# Summary statistics, one row per column.
display(df.describe().T)

# Visualisation 1: debt ratio distribution by target (boxplot).
# With `by=`, DataFrame.boxplot opens its own figure unless an Axes is passed,
# so a preceding plt.figure() would be left behind as a blank figure. Create
# the Axes explicitly and hand it to pandas instead.
fig, ax = plt.subplots()
df.boxplot(column='debt_ratio', by='bankrupt', ax=ax)
ax.set_title('Debt Ratio by Bankrupt')
fig.suptitle('')  # drop the automatic "Boxplot grouped by ..." super-title
plt.show()

# Visualisation 2: ROA distribution by target (boxplot).
fig, ax = plt.subplots()
df.boxplot(column='roa', by='bankrupt', ax=ax)
ax.set_title('ROA by Bankrupt')
fig.suptitle('')
plt.show()

# Visualisation 3: scatter of roa vs debt_ratio, coloured by target.
# Each class is drawn separately so the legend can say which colour is which.
fig, ax = plt.subplots()
for label, color in ((0, 'C0'), (1, 'C1')):
    subset = df[df['bankrupt'] == label]
    ax.scatter(subset['debt_ratio'], subset['roa'], color=color, alpha=0.6, label=f'bankrupt = {label}')
ax.set_xlabel('debt_ratio')
ax.set_ylabel('roa')
ax.set_title('roa vs debt_ratio (by bankrupt)')
ax.legend()
plt.show()


## Data Preparation — cek imbalance & scaling (sesuai arahan dosen)

In [None]:

# ====== Data Preparation ======
# 1. Missing values (reported only) and duplicate rows (dropped)
print("Missing values per column:\n", df.isna().sum())
before = df.shape[0]
df = df.drop_duplicates().reset_index(drop=True)
print(f"Dropped {before - df.shape[0]} duplicates")

# 2. Feature / label split
X = df.drop(columns=['bankrupt'])
y = df['bankrupt']

# 3. Train-test split; stratify keeps the class ratio equal in both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print("Train:", X_train.shape, "Test:", X_test.shape)

# 4. Check imbalance
print("Train class distribution before handling:", y_train.value_counts().to_dict())

# ====== Option A: Random oversampling (implemented) ======
# Only the training split is resampled; the test split keeps its original
# class distribution.
train = pd.concat([X_train, y_train], axis=1)
minority = train[train['bankrupt']==1]
majority = train[train['bankrupt']==0]
if len(minority)>0 and len(minority) < len(majority):
    # Sample the minority class with replacement up to the majority size,
    # then shuffle so the two classes are interleaved.
    minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)
    train_bal = pd.concat([majority, minority_upsampled]).sample(frac=1, random_state=42).reset_index(drop=True)
    X_train = train_bal.drop(columns=['bankrupt']); y_train = train_bal['bankrupt']
    print("After random oversampling:", y_train.value_counts().to_dict())
else:
    print("No oversampling needed")

# ====== Option B: SMOTE (commented) ======
# from imblearn.over_sampling import SMOTE
# sm = SMOTE(random_state=42)
# X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
# print("After SMOTE:", pd.Series(y_train_sm).value_counts())

# 5. Scaling for ANN: fit on the training data only, then apply the same
#    transform to the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# joblib.dump does not create missing parent directories, and /mnt/data is
# not guaranteed to exist (e.g. on a fresh Colab runtime), so create it first.
os.makedirs('/mnt/data', exist_ok=True)
joblib.dump(scaler, '/mnt/data/scaler.joblib')
print("Saved scaler to /mnt/data/scaler.joblib")


## Modelling & Evaluation — Decision Tree, Random Forest, ANN; tampilkan Confusion Matrix, Precision, Recall, F1

In [None]:

# ====== Helper: plot confusion matrix ======
import seaborn as sns
def plot_cm(cm, labels=None, title='Confusion Matrix'):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Args:
        cm: Square matrix of integer counts, as returned by
            ``sklearn.metrics.confusion_matrix`` (rows are actual classes,
            columns are predicted classes).
        labels: Tick labels for both axes. Defaults to ``[0, 1]``.
        title: Title placed above the plot.
    """
    # Use None as the default so no mutable list is shared between calls.
    if labels is None:
        labels = [0, 1]
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted'); plt.ylabel('Actual'); plt.title(title)
    plt.show()

# ====== Train models ======
# Candidate classifiers, keyed by the name used in the reports and file names.
models = {
    'DecisionTree': DecisionTreeClassifier(max_depth=6, random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(64,32), max_iter=300, random_state=42)
}

trained = {}   # name -> fitted model
results = {}   # name -> dict of precision / recall / f1 / auc on the test split

for name, model in models.items():
    print("\n=== Training", name, "===")
    # The MLP is fitted on standardised features, so every prediction it makes
    # (labels and probabilities alike) must use the standardised test matrix.
    # The tree-based models use the raw features.
    if name == 'MLP':
        X_fit, X_eval = X_train_scaled, X_test_scaled
    else:
        X_fit, X_eval = X_train, X_test

    model.fit(X_fit, y_train)
    y_pred = model.predict(X_eval)
    trained[name] = model

    cm = confusion_matrix(y_test, y_pred)
    plot_cm(cm, labels=[0,1], title=f'Confusion Matrix - {name}')

    # zero_division=0 reports 0 instead of warning when a class is never predicted.
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # AUC needs the probability of the positive class, computed on the same
    # feature representation the model was trained on.
    auc = None
    if hasattr(model, "predict_proba"):
        try:
            auc = roc_auc_score(y_test, model.predict_proba(X_eval)[:, 1])
        except (ValueError, IndexError) as err:
            # e.g. only one class present in y_test or in the training labels;
            # report the reason instead of hiding it.
            print(f"AUC not available for {name}: {err}")

    print(classification_report(y_test, y_pred, zero_division=0))
    results[name] = {'precision': prec, 'recall': rec, 'f1': f1, 'auc': auc}
    auc_text = f"{auc:.3f}" if auc is not None else "None"
    print(f"Metrics -> Precision: {prec:.3f}, Recall: {rec:.3f}, F1: {f1:.3f}, AUC: {auc_text}")


## Feature Importance — sebutkan top-5 variabel

In [None]:

# ====== Feature importance and top-5 features ======
# Rank the features by the impurity-based importances of the fitted random
# forest, list the five highest, and draw all of them as a horizontal bar chart.
rf = trained.get('RandomForest')
if rf is None:
    print("RandomForest not trained")
else:
    feat_names = list(X.columns)
    importances = rf.feature_importances_
    fi = pd.DataFrame({'feature': feat_names, 'importance': importances})
    fi = fi.sort_values(by='importance', ascending=False)
    display(fi)

    print("\nTop-5 features influencing bankruptcy:")
    top_five = fi.head(5)
    print(top_five.to_string(index=False))

    # Bar chart; the y axis is inverted so the most important feature is on top.
    plt.figure()
    plt.barh(fi['feature'], fi['importance'])
    plt.gca().invert_yaxis()
    plt.title('Feature Importance')
    plt.xlabel('Importance')
    plt.show()


## Save models

In [None]:

# ====== Save models ======
# Persist every fitted model with joblib, one file per model name.
output_dir = '/mnt/data'
# joblib.dump does not create missing parent directories, and /mnt/data is
# not guaranteed to exist (e.g. on a fresh Colab runtime), so create it first.
os.makedirs(output_dir, exist_ok=True)
for name, model in trained.items():
    path = f'{output_dir}/model_{name}.joblib'
    joblib.dump(model, path)
    print("Saved", name, "->", path)
print("\nSelesai. Periksa /mnt/data untuk model dan scaler.")
