<a href="https://colab.research.google.com/github/kusumakodamanchili/Fraud_Transaction_Detection/blob/main/Fraud_Transaction_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

!pip install xgboost
!pip install imbalanced-learn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import warnings

# Suppress every warning category for the rest of the session. This keeps cell
# output short, but it also hides any warnings the libraries would have raised.
warnings.simplefilter('ignore')


In [None]:
# Load 'Fraud.csv', falling back through progressively more lenient readers:
# comma-separated, then semicolon-separated, then the pure-Python parser.
# The parser exception is referenced as `pd.errors.ParserError`; the bare name
# `ParserError` is never imported, so catching it would itself raise NameError.
try:
    df = pd.read_csv('Fraud.csv', sep=',')
except pd.errors.ParserError:
    try:
        df = pd.read_csv('Fraud.csv', sep=';')
    except pd.errors.ParserError:
        try:
            df = pd.read_csv('Fraud.csv', engine='python')
        except Exception as e:
            print(f"Could not read the CSV file: {e}")
            # Re-raise: without a loaded frame `df` is undefined and the next
            # line would fail with a NameError that hides the real cause.
            raise
df.head()

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Preview the last rows of the frame.
df.tail()

In [None]:
# Print a summary of the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Report how many missing values each column holds, then discard every row
# that contains at least one of them.
missing_per_column = df.isna().sum()
print(missing_per_column)
df = df.dropna(axis=0, how='any')

In [None]:
# Count rows that repeat an earlier row exactly. The duplicates are only
# counted here; nothing in this cell removes them.
df.duplicated().sum()

In [None]:
# Drop rows with any missing value and keep the result in `df`. The result was
# previously bound to an unused name `f`, so the shape printed below described
# the frame as it was before this call rather than after it.
df = df.dropna()
print("After dropping missing values:", df.shape)

In [None]:
# Bar chart of how many rows fall under each value of the 'isFraud' column.
ax = sns.countplot(data=df, x='isFraud')
ax.set_title('Fraud vs Non-Fraud Distribution')
plt.show()

In [None]:
# Pairwise correlations are only defined for numeric data, so restrict the
# frame to its numeric columns before computing them.
numeric_df = df.select_dtypes(include='number')
corr_matrix = numeric_df.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(corr_matrix, cmap="coolwarm", annot=True)
heatmap_ax.set_title("Correlation Matrix (Numeric Features Only)")
plt.show()

In [None]:
# Remove the 'nameOrig', 'nameDest' and 'step' columns before modelling.
# `errors='ignore'` skips any label that is not present, so the cell can be
# re-run safely and no longer assumes 'nameDest' exists merely because
# 'nameOrig' does (that assumption raised KeyError when it did not hold).
df = df.drop(columns=['nameOrig', 'nameDest', 'step'], errors='ignore')

In [None]:

# Target vector is the 'isFraud' column; every remaining column is a predictor.
y = df['isFraud']
X = df.drop(columns='isFraud')


In [None]:
# Hold out 20% of the rows for testing. Stratifying on `y` keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:

# One-hot encode the non-numeric columns, dropping the first level of each so
# the indicator columns are not redundant.
df_encoded = pd.get_dummies(df, drop_first=True)

# Rebuild predictors and target from the encoded frame.
y = df_encoded['isFraud']
X = df_encoded.drop(columns='isFraud')


In [None]:

# train_test_split, SMOTE and StandardScaler are imported in the notebook's
# first cell, so they are not imported again here.

# One-hot encode the frame, then separate the target from the predictors.
df_encoded = pd.get_dummies(df, drop_first=True)
y = df_encoded['isFraud']
X = df_encoded.drop(columns='isFraud')

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the training portion only; the test portion is left as drawn.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Learn the scaling statistics from the resampled training data and apply the
# same transformation to the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train_res).transform(X_train_res)
X_test_scaled = scaler.transform(X_test)


In [None]:

from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest (balanced class weights, fixed seed) on the
# resampled, scaled training data.
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
model.fit(X_train_scaled, y_train_res)

# Hard labels feed the confusion matrix and report; the second column of
# predict_proba (the score for the class at index 1) feeds ROC-AUC.
y_pred = model.predict(X_test_scaled)
fraud_scores = model.predict_proba(X_test_scaled)[:, 1]

# The metric functions come from the notebook's first import cell.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, fraud_scores))


In [None]:
import shap

# `shap.Explainer` chooses an explainer suited to the fitted model; the scaled
# training matrix is supplied as the background data.
explainer = shap.Explainer(model, X_train_scaled)

# Compute SHAP values for a small batch of test samples
shap_values = explainer(X_test_scaled[:100])

# For a classifier the explanation can carry a trailing per-class axis
# (samples, features, classes). The summary plot expects a 2-D
# (samples, features) matrix, so keep only the class at index 1.
if len(shap_values.shape) == 3:
    shap_values = shap_values[:, :, 1]

# Display summary plot (with feature names)
shap.summary_plot(shap_values, features=X_test_scaled[:100], feature_names=list(X.columns))


In [None]:
# Label the fitted model's feature importances with the predictor column names
# and chart the ten largest as horizontal bars.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
top_ten = feat_importances.nlargest(10)
ax = top_ten.plot.barh()
ax.set_title("Top 10 Important Features")
plt.show()