<a href="https://colab.research.google.com/github/2303A52367/EXPAILAB-40/blob/main/Lab_Assignment-2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import shap
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Credit-card default pipeline:
#   load CSV -> clean -> split -> scale -> random forest -> metrics -> SHAP.
# Writes summary_plot.png, force_plot.png and waterfall_plot.png to the
# current working directory and prints metrics / feature rankings.
# ---------------------------------------------------------------------------

TARGET_COLUMN = 'default.payment.next.month'
SAMPLE_INDEX = 0  # test-set row explained by the single-sample plots

# Load dataset
df = pd.read_csv('UCI_Credit_Card.csv')

# Drop duplicates and handle missing values
df = df.drop_duplicates()
df = df.dropna()

# Identify categorical columns and encode if present
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# Feature selection
X = df.drop(columns=TARGET_COLUMN)
y = df[TARGET_COLUMN]
# NOTE(review): an 'ID' column, if present, is presumably a row identifier
# with no predictive meaning -- confirm against the dataset description.
# It is excluded so the model cannot key on row order.
if 'ID' in X.columns:
    X = X.drop(columns='ID')
feature_names = list(X.columns)

# Train-test split (done BEFORE scaling so the scaler never sees test rows;
# stratified so both splits keep the same class ratio of the target).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Normalize features: fit on the training split only, then apply to both.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Model training
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Model evaluation
y_pred = clf.predict(X_test)
# ROC AUC needs a continuous score; hard 0/1 labels collapse the curve to a
# single operating point and understate the ranking quality.
y_proba = clf.predict_proba(X_test)[:, 1]
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))

# SHAP Analysis
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_test)

# Depending on the installed SHAP version, shap_values is either a list with
# one (n_samples, n_features) array per class, or a single array shaped
# (n_samples, n_features, n_classes). Normalise to the positive class.
if isinstance(shap_values, list):
    shap_values_pos = np.asarray(shap_values[1])
else:
    shap_values_pos = np.asarray(shap_values)
    if shap_values_pos.ndim == 3:
        shap_values_pos = shap_values_pos[:, :, 1]
# expected_value is per-class for classifiers; the last entry is class 1.
expected_value_pos = float(np.ravel(explainer.expected_value)[-1])

# Save SHAP plots
shap.summary_plot(shap_values_pos, X_test, feature_names=feature_names, show=False)
plt.savefig("summary_plot.png", bbox_inches="tight")
plt.close()

# The matplotlib force plot and the waterfall plot explain ONE prediction,
# so pass a single row rather than the whole test matrix.
shap.force_plot(
    expected_value_pos,
    shap_values_pos[SAMPLE_INDEX],
    X_test[SAMPLE_INDEX],
    feature_names=feature_names,
    matplotlib=True,
    show=False,
)
plt.savefig("force_plot.png", bbox_inches="tight")
plt.close()

sample_explanation = shap.Explanation(
    values=shap_values_pos[SAMPLE_INDEX],
    base_values=expected_value_pos,
    data=X_test[SAMPLE_INDEX],
    feature_names=feature_names,
)
shap.plots.waterfall(sample_explanation, show=False)
plt.savefig("waterfall_plot.png", bbox_inches="tight")
plt.close()

# Compare SHAP and Built-in Feature Importances
rf_importance = clf.feature_importances_
shap_global_importance = np.abs(shap_values_pos).mean(axis=0)
top5_features = pd.Series(shap_global_importance, index=feature_names).sort_values(ascending=False)[:5]
top5_rf_features = pd.Series(rf_importance, index=feature_names).sort_values(ascending=False)[:5]
print("Top 5 SHAP features:\n", top5_features)
print("Top 5 RandomForest features:\n", top5_rf_features)


Accuracy: 0.8135
Precision: 0.6300268096514745
Recall: 0.357958872810358
F1: 0.4565322972316659
ROC AUC: 0.6495362958035148
