<a href="https://colab.research.google.com/github/harishkulkarni10/Credit-Card-Fraud-Detection/blob/development/notebooks/02_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Clone repo (development branch)
!git clone -b development https://github.com/harishkulkarni10/Credit-Card-Fraud-Detection.git
%cd Credit-Card-Fraud-Detection

Cloning into 'Credit-Card-Fraud-Detection'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects:   3% (1/27)[Kremote: Counting objects:   7% (2/27)[Kremote: Counting objects:  11% (3/27)[Kremote: Counting objects:  14% (4/27)[Kremote: Counting objects:  18% (5/27)[Kremote: Counting objects:  22% (6/27)[Kremote: Counting objects:  25% (7/27)[Kremote: Counting objects:  29% (8/27)[Kremote: Counting objects:  33% (9/27)[Kremote: Counting objects:  37% (10/27)[Kremote: Counting objects:  40% (11/27)[Kremote: Counting objects:  44% (12/27)[Kremote: Counting objects:  48% (13/27)[Kremote: Counting objects:  51% (14/27)[Kremote: Counting objects:  55% (15/27)[Kremote: Counting objects:  59% (16/27)[Kremote: Counting objects:  62% (17/27)[Kremote: Counting objects:  66% (18/27)[Kremote: Counting objects:  70% (19/27)[Kremote: Counting objects:  74% (20/27)[Kremote: Counting objects:  77% (21/27)[Kremote: Counting objects:  81% (22/27)

In [9]:
# Mount Google Drive and copy the preprocessed dataset into the local models/ folder
from google.colab import drive
import os
import shutil

drive.mount('/content/drive', force_remount=False)

drive_path = "/content/drive/MyDrive/Data Science course/Major Projects/Projects/Credit Card Fraud Detection/Credit Card Fraud Detection/data/preprocessed_data.pkl"
local_data_path = "models/preprocessed_data.pkl"

# Make sure the local target folder exists before copying into it
os.makedirs("models", exist_ok=True)

# Copy file.
# shutil.copy raises (e.g. FileNotFoundError) when the Drive file is missing,
# whereas a failing `!cp` is ignored by the notebook and the success message
# below would still be printed.
shutil.copy(drive_path, local_data_path)
print("preprocessed_data.pkl loaded to local models/")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
preprocessed_data.pkl loaded to local models/


In [10]:
# Read the preprocessed train/validation splits back from the local pickle
import pickle

# NOTE: pickle.load executes whatever the file contains - only use it on files you trust
with open("models/preprocessed_data.pkl", "rb") as pkl_file:
    data = pickle.load(pkl_file)

# Feature matrices are taken as stored
X_train, X_val = data['X_train'], data['X_val']

# Labels are reshaped into single-column arrays of shape (n_samples, 1)
y_train, y_val = (data[key].reshape(-1, 1) for key in ('y_train', 'y_val'))

feature_cols = data['feature_cols']

# Quick look at the training shape and the mean of the labels
print(f"X_train: {X_train.shape}, fraud: {y_train.mean():.5f}")

X_train: (226980, 30), fraud: 0.00167


In [11]:
# Standardize the features with StandardScaler
from sklearn.preprocessing import StandardScaler
import joblib
import os

# Learn the scaling statistics from the training split only ...
scaler = StandardScaler().fit(X_train)

# ... then apply that same fitted transform to both splits
# (the validation split is never used for fitting)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Sanity check: mean and std taken over every entry of the scaled training matrix
print(f"X_train_scaled mean: {X_train_scaled.mean():.6f} ")
print(f"X_train_scaled std:  {X_train_scaled.std():.6f} ")

X_train_scaled mean: -0.000000 
X_train_scaled std:  1.000000 


In [13]:
# Save scaler to local + Google Drive
import os
import shutil

# Save locally (create models/ if this cell is run before any cell that makes it)
local_scaler_path = "models/scaler.pkl"
os.makedirs(os.path.dirname(local_scaler_path), exist_ok=True)
joblib.dump(scaler, local_scaler_path)
print("Scaler saved locally: models/scaler.pkl")

drive_path = "/content/drive/MyDrive/Data Science course/Major Projects/Projects/Credit Card Fraud Detection/Credit Card Fraud Detection/data/scaler.pkl"

os.makedirs(os.path.dirname(drive_path), exist_ok=True)

# Copy to Drive.
# shutil.copy raises if the copy fails; a failing `!cp` is ignored by the
# notebook, so the "backed up" message below could be printed without a backup.
shutil.copy(local_scaler_path, drive_path)
print(f"Scaler backed up to Drive: {drive_path}")

Scaler saved locally: models/scaler.pkl
Scaler backed up to Drive: /content/drive/MyDrive/Data Science course/Major Projects/Projects/Credit Card Fraud Detection/Credit Card Fraud Detection/data/scaler.pkl


In [14]:
# Convert to PyTorch DataLoader
import torch
from torch.utils.data import TensorDataset, DataLoader

# Convert to float32 tensors.
# torch.tensor(data, dtype=...) is the documented way to build a tensor from
# existing data (it always copies); it replaces the legacy torch.FloatTensor(...)
# constructor used previously.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

# Create dataset (features paired with labels, row by row)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

# Data Loaders
BATCH_SIZE = 256
LOADER_SEED = 42

# A dedicated seeded generator makes the shuffled batch order repeatable on a
# fresh Restart & Run All; without it the order differs on every run.
shuffle_generator = torch.Generator().manual_seed(LOADER_SEED)

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=shuffle_generator,
)
# Validation data is iterated in its stored order
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Pull one batch to confirm the shapes the training loop will receive
x_batch, y_batch = next(iter(train_loader))
print(f"Batch shape: X={x_batch.shape}, y={y_batch.shape}")
print(f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")
print("DataLoader ready for training!")


Batch shape: X=torch.Size([256, 30]), y=torch.Size([256, 1])
Train batches: 887, Val batches: 222
DataLoader ready for training!


In [15]:
# CELL: Save scaled tensors to Drive (FAST LOAD LATER)
import torch
import os

# X_train_tensor, X_val_tensor, y_train_tensor and y_val_tensor are reused from
# the "Convert to PyTorch DataLoader" cell instead of being rebuilt here; the
# previous copy-pasted conversion produced identical tensors a second time.

# Save to Drive
drive_path = "/content/drive/MyDrive/Data Science course/Major Projects/Projects/Credit Card Fraud Detection/Credit Card Fraud Detection/data/scaled_tensors.pth"

# Make sure the target folder exists so torch.save does not fail on a missing directory
os.makedirs(os.path.dirname(drive_path), exist_ok=True)

# Bundle both splits and the feature column names into a single file
torch.save({
    'X_train': X_train_tensor,
    'X_val': X_val_tensor,
    'y_train': y_train_tensor,
    'y_val': y_val_tensor,
    'feature_cols': feature_cols
}, drive_path)

print(f"Scaled tensors saved to Drive: {drive_path}")

Scaled tensors saved to Drive: /content/drive/MyDrive/Data Science course/Major Projects/Projects/Credit Card Fraud Detection/Credit Card Fraud Detection/data/scaled_tensors.pth
