<a href="https://colab.research.google.com/github/kevinkurianmathew/Business_Card/blob/main/CreditCardFraud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentinel Analytics — Fraud / Gaming Detection (Colab)

**Instructions:**
1. This notebook now uses `kagglehub` to download the dataset. No need to upload `kaggle.json`.
2. Run cells top → bottom.
3. This notebook starts a local PySpark session during setup, but data loading, preprocessing, and modeling all run in pandas / scikit-learn / XGBoost.



In [None]:
# Install dependencies (run in a code cell)
!pip install kaggle --quiet
!pip install kagglehub --quiet
!pip install pyspark --quiet
!pip install xgboost --quiet
!pip install scikit-learn pandas matplotlib seaborn joblib --quiet


In [None]:
# Download dataset using kagglehub
# Produces two notebook-level names used by later cells:
#   path         - the location kagglehub reports for the downloaded files
#   dataset_file - path to creditcard.csv inside that location
import os

import kagglehub

dataset_name = "mlg-ulb/creditcardfraud"

path = kagglehub.dataset_download(dataset_name)
dataset_file = os.path.join(path, "creditcard.csv")

# Report both locations so a wrong download directory is easy to spot.
for label, location in (
    ("Path to dataset files:", path),
    ("Path to creditcard.csv:", dataset_file),
):
    print(label, location)


Path to dataset files: /kaggle/input/creditcardfraud
Path to creditcard.csv: /kaggle/input/creditcardfraud/creditcard.csv


In [None]:
# Load data into pandas and initialize a SparkSession
import os

import pandas as pd
from pyspark.sql import SparkSession

# Local Spark session named "sentinel"; getOrCreate() hands back an already
# running session if there is one, so re-running this cell is safe.
spark = SparkSession.builder.master("local[*]").appName("sentinel").getOrCreate()

# The dataset file path comes from the kagglehub download cell.
# Fail fast with an explicit error instead of printing and carrying on: every
# later cell needs `df`, so continuing would either crash further down with an
# unrelated NameError or silently reuse a stale `df` from an earlier run.
if 'dataset_file' not in globals():
    raise NameError("dataset_file is not defined. Run the kagglehub download cell first.")
if not os.path.exists(dataset_file):
    raise FileNotFoundError(
        f"Dataset file not found: {dataset_file}. Check the kagglehub download path."
    )

df = pd.read_csv(dataset_file)
print('Loaded creditcard.csv, shape:', df.shape)


Loaded creditcard.csv, shape: (284807, 31)


In [None]:
# Basic EDA & preprocessing (pandas)
# Inputs : df (loaded in the previous cell)
# Outputs: features (column names), X (unscaled frame), scaler (fitted),
#          X_scaled (ndarray), y (label vector)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Resolve the label column once so the same name drives the printout, the
# feature list and the target vector. Previously only 'Class' was excluded
# from the features while y could fall back to 'isFraud', which would have
# left the label inside the feature matrix.
LABEL_CANDIDATES = ('Class', 'isFraud')
label_col = next((c for c in LABEL_CANDIDATES if c in df.columns), None)
if label_col is None:
    raise KeyError(f"No label column found; expected one of {LABEL_CANDIDATES}")

display(df.head())
print('Class distribution (normalized):\n', df[label_col].value_counts(normalize=True))

# Hour-of-day feature; the arithmetic treats 'Time' as a count of seconds.
if 'Time' in df.columns:
    df['hour'] = (df['Time'] // 3600) % 24

# Fill missing values and scale numeric features.
# Every label candidate is kept out of the feature set, not just the one in use.
features = [c for c in df.columns if c not in LABEL_CANDIDATES]
X = df[features].fillna(0)

# NOTE: this scaler is fit on every row of X; the unscaled X is kept around so
# a later step can re-fit on a training subset if needed.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y = df[label_col].values
print('Feature matrix shape:', X_scaled.shape)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


Class distribution (normalized):
 Class
0    0.998273
1    0.001727
Name: proportion, dtype: float64
Feature matrix shape: (284807, 31)


In [None]:
# Train/test split and stacking: IsolationForest + XGBoost
# Inputs : X (unscaled feature frame), y (labels) from the preprocessing cell
# Outputs: scaler, iso, clf (all fitted on training rows only),
#          X_train/X_test, y_train/y_test, *_stack matrices, y_pred_prob
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Split the *unscaled* features first and fit the scaler on the training rows
# only, so no statistics from the held-out rows leak into preprocessing.
# The row partition depends only on y, the row count and random_state, so it
# is the same partition a split of the pre-scaled matrix would produce.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
print('Train/Test shapes:', X_train.shape, X_test.shape)

# Unsupervised anomaly detector. The contamination rate is taken from the
# training labels only (not the full label vector) to keep test labels out.
iso = IsolationForest(n_estimators=200, contamination=y_train.mean(), random_state=42, n_jobs=-1)
iso.fit(X_train)

# decision_function is negated so that larger values mean "more anomalous".
iso_scores_train = -iso.decision_function(X_train)
iso_scores_test = -iso.decision_function(X_test)

# Stack: append the anomaly score as one extra column for the classifier.
X_train_stack = np.hstack([X_train, iso_scores_train.reshape(-1, 1)])
X_test_stack = np.hstack([X_test, iso_scores_test.reshape(-1, 1)])

# `use_label_encoder` is no longer accepted by current XGBoost releases (it only
# produced a "Parameters ... are not used" warning), so it is not passed.
clf = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    eval_metric='logloss',
    random_state=42,
    n_jobs=4,
)
clf.fit(X_train_stack, y_train)

# Probability of the positive (fraud) class for each held-out row.
y_pred_prob = clf.predict_proba(X_test_stack)[:, 1]
print('ROC AUC:', roc_auc_score(y_test, y_pred_prob))

Train/Test shapes: (227845, 31) (56962, 31)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


ROC AUC: 0.9635589534069113


In [None]:
# Threshold selection and evaluation
# Inputs : y_test, y_pred_prob (from the modeling cell)
# Outputs: pr_auc, best_thresh, best_f1, plus a printed classification report
from sklearn.metrics import precision_recall_curve, auc, f1_score

# Area under the precision-recall curve.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
pr_auc = auc(recall, precision)
print('PR AUC:', pr_auc)

# Score every candidate cut-off on the 0.01 .. 0.99 grid by F1.
# NOTE(review): the cut-off is chosen and then reported on the same
# y_test / y_pred_prob, so the figures below are likely optimistic.
candidate_thresholds = np.linspace(0.01, 0.99, 99)
candidate_f1 = [
    f1_score(y_test, (y_pred_prob > cutoff).astype(int))
    for cutoff in candidate_thresholds
]

# Defaults stay in place when no candidate reaches an F1 above zero.
best_thresh, best_f1 = 0.5, 0

# max() returns the first maximal index, i.e. the lowest cut-off among ties.
top = max(range(len(candidate_f1)), key=candidate_f1.__getitem__)
if candidate_f1[top] > best_f1:
    best_thresh = candidate_thresholds[top]
    best_f1 = candidate_f1[top]

print('Best threshold:', best_thresh, 'Best F1:', best_f1)
print(classification_report(y_test, (y_pred_prob > best_thresh).astype(int)))

PR AUC: 0.8445623237767411
Best threshold: 0.89 Best F1: 0.8571428571428571
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.97      0.77      0.86        98

    accuracy                           1.00     56962
   macro avg       0.99      0.88      0.93     56962
weighted avg       1.00      1.00      1.00     56962



In [None]:
# Save pipeline (scaler + iso + clf)
import joblib

# Bundle the fitted scaler and both models into one dict so they are written
# to disk together as a single artifact.
pipeline_bundle = {
    'scaler': scaler,
    'iso': iso,
    'clf': clf,
}
joblib.dump(pipeline_bundle, '/content/sentinel_fraud_pipeline.pkl')
print('Saved /content/sentinel_fraud_pipeline.pkl')

Saved /content/sentinel_fraud_pipeline.pkl


### Notes
- For large-scale production use, move feature engineering into PySpark and use distributed training or a model serving layer.
- Consider adding SHAP explainability cells for stakeholder-ready interpretation.