<a href="https://colab.research.google.com/github/GouravMidya/DSW-MLtest/blob/main/Different_Model_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# prompt: Code to check gpu availablitiy

import tensorflow as tf

# Check GPU availability
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  1


In [8]:
# Load the data (replace with actual file path)
data = pd.read_excel('/content/drive/MyDrive/DSW Assessment/train_data.xlsx')


In [9]:
# Preprocessing
def preprocess_data(data):
    # Extract year and month from transaction_date
    data['transaction_date'] = pd.to_datetime(data['transaction_date'])
    data['transaction_year'] = data['transaction_date'].dt.year
    data['transaction_month'] = data['transaction_date'].dt.month

    # Drop unnecessary columns
    data = data.drop(['customer_id', 'transaction_date'], axis=1)

    # Handle categorical data
    label_encoders = {}
    categorical_cols = ['sub_grade', 'term', 'home_ownership', 'purpose', 'application_type', 'verification_status']

    for col in categorical_cols:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le

    # Scale numerical data
    scaler = StandardScaler()
    numerical_cols = ['cibil_score', 'total_no_of_acc', 'annual_inc', 'int_rate', 'loan_amnt', 'installment', 'account_bal', 'emp_length', 'transaction_year', 'transaction_month']
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

    return data, label_encoders, scaler

# Preprocess the data
data, label_encoders, scaler = preprocess_data(data)

# Split data into features and target
X = data.drop('loan_status', axis=1)
y = data['loan_status']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models to evaluate
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'Decision Tree': DecisionTreeClassifier(),
}

# Evaluate models
def evaluate_models(models, X_train, X_test, y_train, y_test):
    results = {}

    for name, model in tqdm(models.items(), desc="Evaluating models"):
        print(f"\nEvaluating {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Collect evaluation metrics
        report = classification_report(y_test, y_pred, output_dict=True)
        results[name] = report

        print(f"Confusion Matrix for {name}:\
        {confusion_matrix(y_test, y_pred)}")
        print(f"Classification Report for {name}:\
        {classification_report(y_test, y_pred)}")

    return results


In [16]:

# Add ANN model
def build_ann(input_dim):
    model = Sequential([
        Dense(64, activation='relu', input_dim=input_dim),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train and evaluate ANN
ann_model = build_ann(X_train.shape[1])
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

ann_history = ann_model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=256,
    callbacks=[early_stopping],
    verbose=1
)

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.7107 - loss: 0.6217 - val_accuracy: 0.7563 - val_loss: 0.5158
Epoch 2/50
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.7488 - loss: 0.5269 - val_accuracy: 0.7610 - val_loss: 0.5070
Epoch 3/50
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.7560 - loss: 0.5183 - val_accuracy: 0.7641 - val_loss: 0.5028
Epoch 4/50
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7592 - loss: 0.5103 - val_accuracy: 0.7634 - val_loss: 0.5018
Epoch 5/50
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.7615 - loss: 0.5083 - val_accuracy: 0.7660 - val_loss: 0.4999
Epoch 6/50
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7600 - loss: 0.5099 - val_accuracy: 0.7636 - val_loss: 0.5007
Epoch 7/50
[1m285/285[0m [32m━━━━━━

In [15]:
# Evaluate ANN
ann_eval = ann_model.evaluate(X_test, y_test, verbose=1)
print(f"\nANN Model Accuracy: {ann_eval[1]:.4f}")

[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.7685 - loss: 0.4980

ANN Model Accuracy: 0.7680


In [12]:
# Run evaluation
results = evaluate_models(models, X_train, X_test, y_train, y_test)


Evaluating models:   0%|          | 0/4 [00:00<?, ?it/s]


Evaluating Logistic Regression...


Evaluating models:  25%|██▌       | 1/4 [00:00<00:01,  2.11it/s]

Confusion Matrix for Logistic Regression:        [[ 1551  4366]
 [  977 15847]]
Classification Report for Logistic Regression:                      precision    recall  f1-score   support

           0       0.61      0.26      0.37      5917
           1       0.78      0.94      0.86     16824

    accuracy                           0.77     22741
   macro avg       0.70      0.60      0.61     22741
weighted avg       0.74      0.77      0.73     22741


Evaluating Random Forest...


Parameters: { "use_label_encoder" } are not used.

Evaluating models:  75%|███████▌  | 3/4 [00:24<00:08,  8.08s/it]

Confusion Matrix for Random Forest:        [[ 1806  4111]
 [ 1229 15595]]
Classification Report for Random Forest:                      precision    recall  f1-score   support

           0       0.60      0.31      0.40      5917
           1       0.79      0.93      0.85     16824

    accuracy                           0.77     22741
   macro avg       0.69      0.62      0.63     22741
weighted avg       0.74      0.77      0.74     22741


Evaluating XGBoost...
Confusion Matrix for XGBoost:        [[ 1814  4103]
 [ 1267 15557]]
Classification Report for XGBoost:                      precision    recall  f1-score   support

           0       0.59      0.31      0.40      5917
           1       0.79      0.92      0.85     16824

    accuracy                           0.76     22741
   macro avg       0.69      0.62      0.63     22741
weighted avg       0.74      0.76      0.74     22741


Evaluating Decision Tree...


Evaluating models: 100%|██████████| 4/4 [00:26<00:00,  6.51s/it]

Confusion Matrix for Decision Tree:        [[ 2391  3526]
 [ 3916 12908]]
Classification Report for Decision Tree:                      precision    recall  f1-score   support

           0       0.38      0.40      0.39      5917
           1       0.79      0.77      0.78     16824

    accuracy                           0.67     22741
   macro avg       0.58      0.59      0.58     22741
weighted avg       0.68      0.67      0.68     22741






In [13]:
# Print summary
for model_name, metrics in results.items():
    print(f"{model_name}: Accuracy: {metrics['accuracy']:.4f}, Precision (0): {metrics['0']['precision']:.4f}, Recall (0): {metrics['0']['recall']:.4f}, F1-Score (0): {metrics['0']['f1-score']:.4f}")

Logistic Regression: Accuracy: 0.7650, Precision (0): 0.6135, Recall (0): 0.2621, F1-Score (0): 0.3673
Random Forest: Accuracy: 0.7652, Precision (0): 0.5951, Recall (0): 0.3052, F1-Score (0): 0.4035
XGBoost: Accuracy: 0.7639, Precision (0): 0.5888, Recall (0): 0.3066, F1-Score (0): 0.4032
Decision Tree: Accuracy: 0.6727, Precision (0): 0.3791, Recall (0): 0.4041, F1-Score (0): 0.3912
