In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [None]:
import io

from google.colab import files  # Colab-only helper: this cell needs a Colab runtime

# 1. Load dataset
# -----------------------------
# Replace with your actual file name; used to pick the upload and as an on-disk fallback.
DATA_FILE = 'npa_dataset.csv'

# Opens the browser upload widget and blocks until the upload finishes.
# The result maps each uploaded file name to its raw bytes.
uploaded = files.upload()

# Read the bytes that were just uploaded instead of a hard-coded path, so a file that
# was renamed (or saved under a de-duplicated name on re-upload) is still the one loaded
# and a stale copy on disk is never picked up by accident.
if DATA_FILE in uploaded:
    csv_source = io.BytesIO(uploaded[DATA_FILE])
elif uploaded:
    # A differently named file was uploaded: use the first one provided.
    csv_source = io.BytesIO(next(iter(uploaded.values())))
else:
    # Nothing was uploaded in this run: fall back to a copy already on disk.
    csv_source = DATA_FILE

df = pd.read_csv(csv_source)

Saving npa_dataset.csv to npa_dataset.csv


In [None]:
# Quick sanity check - list the column names, then preview the first rows
print("Columns:", list(df.columns))
print(df.head())

Columns: ['Loan_ID', 'Customer_ID', 'Loan_Amount', 'Loan_Type', 'Credit_Score', 'Repayment_History', 'Collateral_Value', 'Loan_Tenure', 'Default_Status']
  Loan_ID Customer_ID  Loan_Amount Loan_Type  Credit_Score  Repayment_History  \
0  L00001      C00001      1742743  Personal           505          68.029548   
1  L00002      C00002      4354572  Business           654          66.635174   
2  L00003      C00003      4976484   Vehicle           330          55.712543   
3  L00004      C00004      2284489   Vehicle           543          70.504439   
4  L00005      C00005      1620006  Business           788          80.254261   

   Collateral_Value  Loan_Tenure  Default_Status  
0            481766           88               1  
1           2912934           33               0  
2           1347431          217               1  
3           2808402          205               1  
4            680588           73               0  


In [None]:
# 2) Basic cleaning / target + features -----------------------------------
# Identifier columns: they label a row rather than describe it, so they are
# kept out of the feature set built from this list.
drop_cols = [
    "Loan_ID",
    "Customer_ID",
]

In [None]:
# Name of the label column (0 = performing, 1 = non-performing)
target_col = "Default_Status"

In [None]:
# Candidate predictors: every column, in its original order, that is neither
# the label nor one of the identifier columns.
feature_cols = [
    col
    for col in df.columns
    if col != target_col and col not in drop_cols
]

In [None]:
# Feature matrix: an independent copy, so later edits to X never touch df
X = df.loc[:, feature_cols].copy()
# Label vector, cast to int so the classifiers see integer binary labels
y = df[target_col].astype(int)

In [None]:
# 3) Separate numeric and categorical feature sets -------------------------
# Repayment_History is a continuous float score (e.g. 68.029548 in df.head()), so it
# belongs with the numeric columns. One-hot encoding it would create roughly one column
# per distinct value, and with handle_unknown='ignore' every value not seen during fit
# encodes to all zeros at predict time - the feature would effectively be thrown away.
numeric_features = [
    'Loan_Amount',
    'Credit_Score',
    'Repayment_History',
    'Collateral_Value',
    'Loan_Tenure',
]
categorical_features = ['Loan_Type']

# Categorical branch: fill gaps with the most frequent category, then one-hot encode.
# If unknown categories appear at inference time, handle_unknown='ignore' avoids errors
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Route each column group through its own branch and concatenate the results.
preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'  # columns not listed above are discarded
)

In [None]:
# 4) Train/test split with stratification ----------------------------------
# Hold out 30% of the rows for evaluation. Stratifying on y keeps the class
# proportions aligned across both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.30,
)


In [None]:
# 3. Apply preprocessing steps using the defined ColumnTransformer
# -----------------------------
# fit_transform is called on the training split only, so the preprocessing is
# fitted to training rows alone.
X_train_processed = preprocess.fit_transform(X_train)
# The test split is only transformed with the already-fitted preprocessor (no re-fit).
X_test_processed = preprocess.transform(X_test)

In [None]:
# 4. Logistic Regression
# -----------------------------
# Linear baseline: lbfgs solver, allowed up to 1000 iterations.
# fit() returns the estimator itself, so construction and training are chained.
log_reg = LogisticRegression(solver="lbfgs", max_iter=1000).fit(
    X_train_processed, y_train
)
# Hard 0/1 predictions for the held-out rows
y_pred_log = log_reg.predict(X_test_processed)

# Report overall accuracy followed by the per-class precision/recall/F1 table
print("=== Logistic Regression Results ===")
print(
    "Accuracy:",
    accuracy_score(y_test, y_pred_log),
)
print(
    classification_report(y_test, y_pred_log)
)


=== Logistic Regression Results ===
Accuracy: 0.7262371345775922
              precision    recall  f1-score   support

           0       0.79      0.85      0.82     65665
           1       0.49      0.39      0.43     24402

    accuracy                           0.73     90067
   macro avg       0.64      0.62      0.63     90067
weighted avg       0.71      0.73      0.72     90067



In [None]:

# 5. SVM Comparison

# Small hand-picked grid: for each gamma setting, try C = 1 and then C = 2.
# Resulting order: (1, scale), (2, scale), (1, 0.1), (2, 0.1).
svm_params = [
    {"C": c_value, "gamma": gamma_value}
    for gamma_value in ("scale", 0.1)
    for c_value in (1, 2)
]

# Train one RBF-kernel SVC per setting and report its held-out performance.
for params in svm_params:
    # Each dict holds exactly the C and gamma keyword arguments for SVC.
    svm = SVC(kernel="rbf", **params)
    svm.fit(X_train_processed, y_train)
    y_pred_svm = svm.predict(X_test_processed)

    print(f"\n=== SVM Results (C={params['C']}, gamma={params['gamma']}) ===")
    print(
        "Accuracy:",
        accuracy_score(y_test, y_pred_svm),
    )
    print(
        classification_report(y_test, y_pred_svm)
    )



=== SVM Results (C=1, gamma=scale) ===
Accuracy: 0.7268477910888561
              precision    recall  f1-score   support

           0       0.79      0.85      0.82     65665
           1       0.49      0.40      0.44     24402

    accuracy                           0.73     90067
   macro avg       0.64      0.62      0.63     90067
weighted avg       0.71      0.73      0.72     90067


=== SVM Results (C=2, gamma=scale) ===
Accuracy: 0.726681248403966
              precision    recall  f1-score   support

           0       0.85      0.75      0.80     65665
           1       0.50      0.65      0.56     24402

    accuracy                           0.73     90067
   macro avg       0.68      0.70      0.68     90067
weighted avg       0.76      0.73      0.74     90067


=== SVM Results (C=1, gamma=0.1) ===
Accuracy: 0.7274029333718232
              precision    recall  f1-score   support

           0       0.77      0.89      0.83     65665
           1       0.49      0.30