In [None]:
from google.colab import drive

# Attach Google Drive to this runtime; later cells read the project files from
# beneath this mount point.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Locations on the mounted Drive: the project folder and the CSV inside it.
PROJECT_ROOT = "/content/drive/MyDrive/fraud-detection"
DATA_PATH = PROJECT_ROOT + "/data/creditcard.csv"

# Read the full CSV into memory; the trailing expression displays (rows, columns).
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.shape

(284807, 31)

In [None]:
from sklearn.model_selection import train_test_split
# The label is 'Class'; the features are every other column except 'Time'.
y = df['Class']
X = df.drop(columns=['Class', 'Time'])

# Hold out 20% of the rows. Stratifying on the label keeps the class mix the same
# in both parts, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Sanity check: both parts should show (almost) the same class proportions.
train_ratio = y_train.value_counts(normalize=True)
test_ratio = y_test.value_counts(normalize=True)
print("Train class ratio:", train_ratio)
print("Test class ratio:", test_ratio)

Train class ratio: Class
0    0.998271
1    0.001729
Name: proportion, dtype: float64
Test class ratio: Class
0    0.99828
1    0.00172
Name: proportion, dtype: float64


In [None]:
# The V-columns are already PCA-transformed; summarising the first three shows
# mean ~0 and std < 2, so they need no further scaling.
# NOTE(review): the earlier note said "V1-V17"; with 31 columns in df the range is
# presumably V1-V28 — confirm against df.columns.
preview_columns = ['V1', 'V2', 'V3']
df[preview_columns].describe()

Unnamed: 0,V1,V2,V3
count,284807.0,284807.0,284807.0
mean,1.168375e-15,3.416908e-16,-1.379537e-15
std,1.958696,1.651309,1.516255
min,-56.40751,-72.71573,-48.32559
25%,-0.9203734,-0.5985499,-0.8903648
50%,0.0181088,0.06548556,0.1798463
75%,1.315642,0.8037239,1.027196
max,2.45493,22.05773,9.382558


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise 'Amount'. The scaler is fitted on the training rows only and then
# reused for the test rows, so no test-set statistics leak into training.
#
# Raw amounts are always read back from `df` (by row label) rather than from
# X_train / X_test. That makes this cell idempotent: re-running it no longer
# re-scales values that were already scaled by a previous run.
# `assign` returns new frames instead of writing into the frames produced by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
scaler = StandardScaler()
amount_train_raw = df.loc[X_train.index, ['Amount']]
amount_test_raw = df.loc[X_test.index, ['Amount']]

# fit_transform / transform return an (n, 1) array; ravel() flattens it to one column.
X_train = X_train.assign(Amount=scaler.fit_transform(amount_train_raw).ravel())
X_test = X_test.assign(Amount=scaler.transform(amount_test_raw).ravel())


In [None]:
# Pinned versions this notebook was run with (install once per runtime if needed):
#   !pip install xgboost==1.3.1 --quiet
#   !pip install numpy==1.24.4 --quiet
import xgboost
import numpy as np

# Print each library's version for comparison with the pins above:
# xgboost should report 1.3.1 and numpy should report 1.24.4.
for library in (xgboost, np):
    print(library.__version__)


1.3.1
1.24.4


In [None]:
#####6/26/25P:8:37

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
import pandas as pd
import numpy as np
import os
import tarfile

# 1. Train model
# scale_pos_weight = (#negatives / #positives) in the training labels, so the rare
# positive class carries more weight during boosting.
class_counts = y_train.value_counts()
xgb_model = XGBClassifier(
    n_estimators=100,
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=class_counts[0] / class_counts[1],
    random_state=42
)

# Fit on a plain NumPy array rather than the DataFrame — presumably so the saved
# booster can later be fed unnamed arrays, as the verification cell does.
xgb_model.fit(X_train.to_numpy(), y_train)

# 2. Evaluate on the held-out split, with the same metrics as the Random Forest
# baseline, so the model comparison is backed by numbers computed in this notebook.
X_test_array = X_test.to_numpy()
y_pred_xgb = xgb_model.predict(X_test_array)
y_proba_xgb = xgb_model.predict_proba(X_test_array)[:, 1]
print("XGB ROC AUC:", roc_auc_score(y_test, y_proba_xgb))
print(classification_report(y_test, y_pred_xgb, digits=4))

# 3. Extract booster
booster = xgb_model.get_booster()

# 4. Save model as model.bin
# NOTE(review): only the booster is archived; the StandardScaler fitted on 'Amount'
# is not, so whatever serves this model must reproduce that scaling — confirm.
os.makedirs("flat-model", exist_ok=True)
booster.save_model("flat-model/model.bin")

# 5. Tar the directory; arcname="." places model.bin at the root of the archive.
with tarfile.open("xgb_model_final.tar.gz", "w:gz") as tar:
    tar.add("flat-model", arcname=".")

In [None]:
# List the archive's entries to confirm that model.bin sits at the archive root.
!tar -tzf xgb_model_final.tar.gz

./
./model.bin


In [None]:
# AWS CLI setup (install once per runtime if missing):
#   !pip install awscli boto3
#
# Credentials are never written into the notebook: a value already present in the
# environment is kept, otherwise it is requested through a hidden prompt.
from getpass import getpass

for credential_name in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
    if not os.environ.get(credential_name):
        os.environ[credential_name] = getpass(f"{credential_name}: ")

os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'  # or your region

In [None]:
# Upload the model archive to the artifacts bucket. The AWS CLI picks up the
# credentials and region from the AWS_* environment variables.
!aws s3 cp xgb_model_final.tar.gz s3://fraud-model-artifacts/xgb_model_final.tar.gz

Completed 89.9 KiB/89.9 KiB (197.2 KiB/s) with 1 file(s) remainingupload: ./xgb_model_final.tar.gz to s3://fraud-model-artifacts/xgb_model_final.tar.gz


In [None]:
# Confirm the upload: list the object in the bucket (prints timestamp, size, and key).
!aws s3 ls s3://fraud-model-artifacts/xgb_model_final.tar.gz


2025-06-27 02:01:49      92104 xgb_model_final.tar.gz


In [None]:
# Step 3: Random Forest baseline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report

# 100 trees trained on all cores; 'balanced' weights the classes inversely to
# their frequency, and the fixed seed keeps the forest repeatable.
rf_settings = {
    "n_estimators": 100,
    "class_weight": "balanced",
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestClassifier(**rf_settings)
rf.fit(X_train, y_train)

# Positive-class probabilities feed ROC AUC; hard labels feed the per-class report.
y_proba_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)

print("RF ROC AUC:", roc_auc_score(y_test, y_proba_rf))
print(classification_report(y_test, y_pred_rf, digits=4))


RF ROC AUC: 0.9580130501131234
              precision    recall  f1-score   support

           0     0.9996    0.9999    0.9998     56864
           1     0.9610    0.7551    0.8457        98

    accuracy                         0.9995     56962
   macro avg     0.9803    0.8775    0.9227     56962
weighted avg     0.9995    0.9995    0.9995     56962



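Model chosen: XGBoost
XGBoost was chosen because it delivered higher recall on the fraud class (0.8469 vs. 0.7551 for Random Forest), trained faster, and achieved a better overall ROC AUC. Recall on the fraud class matters most in fraud detection, since every missed fraud case is costly.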

In [None]:
import tarfile
import xgboost as xgb

# Step 1: Unpack the archive into a scratch directory
with tarfile.open(name="xgb_model_final.tar.gz", mode="r:gz") as archive:
    archive.extractall(path="verify-dir")

# Step 2: Load the saved model into a fresh, empty Booster
booster = xgb.Booster()
booster.load_model("verify-dir/model.bin")

# Step 3: Smoke test — score a single all-zero row to prove the model loads and predicts
n_features = 29  # 29 = your feature count
zero_row = [[0] * n_features]
dmatrix = xgb.DMatrix(zero_row)
print(booster.predict(dmatrix))


[6.6519203e-07]
