# SVM classifier for predicting non-performing loans (NPL)

In [None]:
# Filename: svm_npl.py
# Python 3.9+ recommended

import io
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    PrecisionRecallDisplay,
    RocCurveDisplay,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
warnings.filterwarnings("ignore")



In [None]:
from google.colab import files

# 1) Upload file
uploaded = files.upload()  # Opens a file picker in Colab; returns {filename: file bytes}

# An empty result means nothing was selected; fail here with a clear message
# instead of a confusing error further down.
if not uploaded:
    raise ValueError("No file was uploaded; select the dataset CSV and re-run this cell.")

# Read the bytes of the file that was actually uploaded rather than a
# hard-coded name, so a differently named (or re-uploaded) CSV is loaded
# instead of a stale or missing 'npa_dataset.csv'.
uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
df = pd.read_csv(io.BytesIO(uploaded_bytes))

Saving npa_dataset.csv to npa_dataset.csv


In [None]:
# Strip stray leading/trailing whitespace from the column headers so that
# name-based column selection is reliable, then display the cleaned names.
# Re-running this cell is harmless: stripping already-clean names changes nothing.
df.columns = df.columns.str.strip()
print("Cleaned Columns:", df.columns.tolist())

Cleaned Columns: ['Loan_ID', 'Customer_ID', 'Loan_Amount', 'Loan_Type', 'Credit_Score', 'Repayment_History', 'Collateral_Value', 'Loan_Tenure', 'Default_Status']


In [None]:
# 2) Quick sanity check — show the column names and the first five rows
print("Columns:", list(df.columns))
print(df.head(5))

Columns: ['Loan_ID', 'Customer_ID', 'Loan_Amount', 'Loan_Type', 'Credit_Score', 'Repayment_History', 'Collateral_Value', 'Loan_Tenure', 'Default_Status']
  Loan_ID Customer_ID  Loan_Amount Loan_Type  Credit_Score  Repayment_History  \
0  L00001      C00001      1742743  Personal           505          68.029548   
1  L00002      C00002      4354572  Business           654          66.635174   
2  L00003      C00003      4976484   Vehicle           330          55.712543   
3  L00004      C00004      2284489   Vehicle           543          70.504439   
4  L00005      C00005      1620006  Business           788          80.254261   

   Collateral_Value  Loan_Tenure  Default_Status  
0            481766           88               1  
1           2912934           33               0  
2           1347431          217               1  
3           2808402          205               1  
4            680588           73               0  


In [None]:
# 3) Basic cleaning / target + features
# Identifier columns are excluded from the feature set
drop_cols = ['Loan_ID', 'Customer_ID']

# Binary target: 0 = performing, 1 = non-performing
target_col = 'Default_Status'

# Everything that is neither an identifier nor the target is a feature
feature_cols = [
    col for col in df.columns
    if col not in (*drop_cols, target_col)
]

# Work on a copy so later feature edits cannot touch the loaded frame
X = df.loc[:, feature_cols].copy()
# Cast the labels to plain integers
y = df[target_col].astype(int)

In [None]:

# 4) Separate numeric and categorical feature sets -------------------------
# Names must match the DataFrame columns exactly: the previous
# ' Repayment_History ' entry (padded with spaces) matched no column and made
# the ColumnTransformer fail at fit time.
numeric_features = ['Loan_Amount', 'Credit_Score', 'Collateral_Value', 'Loan_Tenure', 'Repayment_History']
categorical_features = ['Loan_Type']

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' avoids errors if unknown categories appear at inference time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Numeric columns: fill gaps with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Route each column group through its own transformer.
preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'  # drop other columns
)


In [None]:
# 5) Train/test split with stratification ----------------------------------
# 70/30 split; stratifying on y keeps the class proportions the same in both
# parts, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.30, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# 6) Build SVM pipeline -----------------------------------------------------
# RBF-kernel SVM. class_weight='balanced' helps when defaults (1s) are rarer
# than non-defaults (0s); probability=True enables predict_proba, which the
# evaluation cells call.
svm_clf = SVC(
    kernel='rbf',
    class_weight='balanced',
    probability=True,
    random_state=42,
)

# Preprocessing and classifier chained into a single estimator
pipe = Pipeline([('prep', preprocess), ('svm', svm_clf)])

In [None]:


# 7) Hyperparameter tuning via cross-validation ----------------------------

# Column groups, spelled without any leading/trailing spaces
categorical_features = ['Loan_Type']
numeric_features = ['Loan_Amount', 'Credit_Score', 'Collateral_Value', 'Loan_Tenure', 'Repayment_History']

# Rebuild the preprocessing from the column lists above
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocess = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',  # any column not listed above is discarded
)

# Rebuild the model pipeline around the fresh preprocessor
svm_clf = SVC(
    kernel='rbf',
    class_weight='balanced',
    probability=True,
    random_state=42,
)
pipe = Pipeline([('prep', preprocess), ('svm', svm_clf)])

# Deliberately small search space and fold count to keep the run time down
param_grid = {
    'svm__C': [1, 2],
    'svm__gamma': ['scale', 0.1],
}
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Candidates are ranked by ROC-AUC
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring='roc_auc',
    n_jobs=-1,
    cv=cv,
    verbose=1,
)
grid.fit(X_train, y_train)

print("\nBest params:", grid.best_params_)
print("Best CV ROC-AUC:", round(grid.best_score_, 4))

# Keep the best estimator found by the search for the evaluation cells
best_model = grid.best_estimator_

Fitting 3 folds for each of 4 candidates, totalling 12 fits

Best params: {'svm__C': 2, 'svm__gamma': 'scale'}
Best CV ROC-AUC: 0.9998


In [None]:
# Summarize the grid-search outcome and keep a handle on the winning estimator
best_cv_auc = round(grid.best_score_, 4)
print("\nBest params:", grid.best_params_)
print("Best CV ROC-AUC:", best_cv_auc)

best_model = grid.best_estimator_

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# 8) Evaluation on Test Set
# ---------------------------
# Hard class predictions plus the predicted probability of class 1
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# Threshold-based metrics use the hard predictions; ROC-AUC uses the probabilities.
# zero_division=0 makes a metric report 0 instead of warning when its denominator is empty.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
roc = roc_auc_score(y_test, y_proba)

print("\n=== Test Metrics ===")
metric_rows = (
    ("Accuracy", acc),
    ("Precision", prec),
    ("Recall", rec),
    ("F1-Score", f1),
    ("ROC-AUC", roc),
)
for metric_name, metric_value in metric_rows:
    # Labels are left-aligned in a 13-character column so the colons line up
    print(f"{metric_name:<13}: {metric_value:.4f}")

report_text = classification_report(y_test, y_pred, digits=4)
print("\nClassification Report:\n", report_text)


=== Test Metrics ===
Accuracy     : 0.9897
Precision    : 0.9668
Recall       : 0.9962
F1-Score     : 0.9813
ROC-AUC      : 0.9998

Classification Report:
               precision    recall  f1-score   support

           0     0.9986    0.9873    0.9929     65665
           1     0.9668    0.9962    0.9813     24402

    accuracy                         0.9897     90067
   macro avg     0.9827    0.9918    0.9871     90067
weighted avg     0.9900    0.9897    0.9898     90067



In [None]:
# 9) Confusion Matrix Plot
# ---------------------------
# Rows of `cm` are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)

# Drawn with scikit-learn's ConfusionMatrixDisplay: the previous sns.heatmap
# call raised NameError because seaborn was never imported in this notebook.
fig, ax = plt.subplots(figsize=(5, 4))
ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax, cmap="Blues", values_format="d")
ax.set_title("Confusion Matrix (SVM - Test Set)")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.tight_layout()
plt.show()


NameError: name 'sns' is not defined

<Figure size 500x400 with 0 Axes>

In [None]:
# 10) ROC Curve Plot
# ---------------------------
# roc_curve comes from sklearn.metrics (imported in the first cell); it was
# previously used here without being imported, which raised NameError.
# `roc` is the test-set ROC-AUC computed in the evaluation cell.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f"ROC curve (AUC = {roc:.3f})")
# Diagonal reference line: the expected curve of a random classifier
ax.plot([0, 1], [0, 1], "k--", label="Random")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve (SVM)")
ax.legend(loc="lower right")
fig.tight_layout()
plt.show()


In [None]:
# 11) Save model
# ---------------------------
# Persist the fitted pipeline (preprocessing + SVM) with joblib.
# joblib and os are imported in the first cell; this cell previously used both
# without importing them and failed with NameError.
# NOTE: the file is pickle-based, so only load it back from a trusted source.
model_path = "svm_loan_default_model.pkl"
joblib.dump(best_model, model_path)
print(f"\nModel saved to: {os.path.abspath(model_path)}")

In [None]:
# 12) Example: Predict on new data
# ---------------------------
# Build a single-row example with the same columns as X (no IDs, no target).
# Repayment_History is a numeric column in the dataset (df.head() shows values
# such as 68.03), so it must be given as a number: the previous "Good" string
# cannot go through the median imputer / scaler and made prediction fail.
# NOTE(review): confirm "Mortgage" is a Loan_Type present in the training data;
# an unseen category does not raise (handle_unknown='ignore') but carries no
# category information into the model.
example = pd.DataFrame([{
    "Loan_Amount": 500000,
    "Loan_Type": "Mortgage",
    "Credit_Score": 690,
    "Repayment_History": 75.0,
    "Collateral_Value": 750000,
    "Loan_Tenure": 240
}])

# Probability of class 1 (default) and the hard class label for the single row
example_pred_proba = best_model.predict_proba(example)[:, 1][0]
example_pred_class = best_model.predict(example)[0]
print(f"\nExample prediction - Probability of default: {example_pred_proba:.4f}, Class: {example_pred_class}")