In [1]:
# Environment Setup
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from xgboost import XGBClassifier

In [2]:
# Dataset load
dataset = pd.read_csv('dataset/train_transaction.csv')
dataset.set_index("TransactionID", inplace=True)
print(dataset.info())
print(dataset.describe())


<class 'pandas.core.frame.DataFrame'>
Index: 590540 entries, 2987000 to 3577539
Columns: 393 entries, isFraud to V339
dtypes: float64(376), int64(3), object(14)
memory usage: 1.7+ GB
None
             isFraud  TransactionDT  TransactionAmt          card1  \
count  590540.000000   5.905400e+05   590540.000000  590540.000000   
mean        0.034990   7.372311e+06      135.027176    9898.734658   
std         0.183755   4.617224e+06      239.162522    4901.170153   
min         0.000000   8.640000e+04        0.251000    1000.000000   
25%         0.000000   3.027058e+06       43.321000    6019.000000   
50%         0.000000   7.306528e+06       68.769000    9678.000000   
75%         0.000000   1.124662e+07      125.000000   14184.000000   
max         1.000000   1.581113e+07    31937.391000   18396.000000   

               card2          card3          card5          addr1  \
count  581607.000000  588975.000000  586281.000000  524834.000000   
mean      362.555488     153.194925     199

In [3]:
# Thanks to spectacular EDA for Columns V https://www.kaggle.com/code/cdeotte/eda-for-columns-v-and-id#V-Reduced
# 211 V-Columns will be droped

# Define Selected all V-Columns
all_v_columns = [col for col in dataset.columns if col.startswith("V")]

# Define Selected V-Columns
v = [1, 3, 4, 6, 8, 11]
v += [13, 14, 17, 20, 23, 26, 27, 30]
v += [36, 37, 40, 41, 44, 47, 48]
v += [54, 56, 59, 62, 65, 67, 68, 70]
v += [76, 78, 80, 82, 86, 88, 89, 91]
v += [96, 98, 99, 104]
v += [107, 108, 111, 115, 117, 120, 121, 123]
v += [124, 127, 129, 130, 136]
v += [138, 139, 142, 147, 156, 162]
v += [165, 160, 166]
v += [178, 176, 173, 182]
v += [187, 203, 205, 207, 215]
v += [169, 171, 175, 180, 185, 188, 198, 210, 209]
v += [218, 223, 224, 226, 228, 229, 235]
v += [240, 258, 257, 253, 252, 260, 261]
v += [264, 266, 267, 274, 277]
v += [220, 221, 234, 238, 250, 271]
v += [294, 284, 285, 286, 291, 297]
v += [303, 305, 307, 309, 310, 320]
v += [281, 283, 289, 296, 301, 314]
v += [332, 325, 335, 338]

# Convert to column names
selected_v_columns = [f"V{i}" for i in v]

# Determine unimportant V-columns (all V-columns - selected V-columns)
not_important_v_columns = list(set(all_v_columns) - set(selected_v_columns))

# Drop unimportant V-columns while keeping everything else
dataset_filtered = dataset.drop(columns=not_important_v_columns)

# Print the shape to confirm changes
print(f"Original Dataset Shape: {dataset.shape}")
print(f"Filtered Dataset Shape: {dataset_filtered.shape}")
print(f"Dropped {len(not_important_v_columns)} Unimportant V-Columns")

Original Dataset Shape: (590540, 393)
Filtered Dataset Shape: (590540, 182)
Dropped 211 Unimportant V-Columns


In [4]:
# Convert D Columns into Actual Past Time Points
for i in range(1, 16):  # D1 to D15
    if i in [1, 2, 3, 5, 9]: continue  # Skip these columns
    if f'D{i}' in dataset_filtered.columns:
        dataset_filtered[f'D{i}'] = dataset_filtered[f'D{i}'] - dataset_filtered['TransactionDT'] / np.float32(
            24 * 60 * 60)

In [5]:
# Standardize TransactionAmt (If Available)
if 'TransactionAmt' in dataset_filtered.columns:
    scaler = StandardScaler()
    dataset_filtered['TransactionAmt_scaled'] = scaler.fit_transform(dataset_filtered[['TransactionAmt']])

# Feature Engineering for Time-Based Features
dataset_filtered['TransactionDay'] = dataset_filtered['TransactionDT'] // (24 * 3600)
dataset_filtered['TransactionHour'] = (dataset_filtered['TransactionDT'] % (24 * 3600)) // 3600


In [6]:
# Feature Engineering for categorical Freatures
categorical_cols = dataset_filtered.select_dtypes(include=['object']).columns
print("🔹 Categorical Columns:", categorical_cols)

for col in categorical_cols:
    le = LabelEncoder()
    dataset_filtered[col] = le.fit_transform(dataset_filtered[col].astype(str))

print(dataset_filtered.dtypes.value_counts())

🔹 Categorical Columns: Index(['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1',
       'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9'],
      dtype='object')
float64    166
int32       14
int64        5
Name: count, dtype: int64


In [7]:
# Final Dataset After Feature Engineering
X = dataset_filtered.drop(columns=['isFraud'])
y = dataset_filtered['isFraud']

# Fill Missing Values
X = X.fillna(0)

print(f"Feature Engineering Complete! Final Dataset Shape: {X.shape}")


Feature Engineering Complete! Final Dataset Shape: (590540, 184)


In [8]:
# Stratified Split (95% Train / 5% Test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, stratify=y, random_state=42
)

print(f"Training Size: {X_train.shape[0]} rows")
print(f"Test Size: {X_test.shape[0]} rows")

fraud_count = y_train.value_counts()
print(f"Non-Fraud Cases in Training: {fraud_count[0]}")
print(f"Fraud Cases in Training: {fraud_count[1]}")
print(f"Fraud Ratio in Training: {fraud_count[1] / (fraud_count[0] + fraud_count[1]) * 100:.4f}%")


Training Size: 561013 rows
Test Size: 29527 rows
Non-Fraud Cases in Training: 541383
Fraud Cases in Training: 19630
Fraud Ratio in Training: 3.4990%


In [9]:
print("\n🔹 Running XGBoost for Feature Importance Analysis...")

# Feature Selection Using XGBoost

# Initialize XGBoost Model
xgb_model = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    missing=-1,
    eval_metric='auc',
    nthread=4,
    tree_method='hist'
)

# Train XGBoost on Full Feature Set
xgb_model.fit(X_train, y_train, verbose=30)

# Get Feature Importances
importance = xgb_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': importance})
feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False)

# Select Top Features Based on Importance
N_FEATURES = 100
top_features = feature_importance_df.iloc[:N_FEATURES]["Feature"].tolist()
# Print Feature Importances
print("\n🔹 Top 10 Features Based on XGBoost Importance:")
print(top_features[:10])


🔹 Running XGBoost for Feature Importance Analysis...

🔹 Top 10 Features Based on XGBoost Importance:
['V258', 'V257', 'V70', 'V187', 'V91', 'V294', 'C14', 'C1', 'C7', 'C4']


In [10]:
# Create New Training Data with Only These Features
X_train_selected = X_train[top_features]
X_test_selected = X_test[top_features]

print(f"Training Data Shape After Feature Selection: {X_train_selected.shape}")

smote = SMOTE(sampling_strategy=0.3, random_state=42)
X_train_selected, y_train = smote.fit_resample(X_train_selected, y_train)

Training Data Shape After Feature Selection: (561013, 100)


In [11]:
X_train_tensor = torch.tensor(X_train_selected.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test_selected.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

In [12]:
# Define & Train the Model
class FraudDetectionNN(nn.Module):
    def __init__(self, input_dim):
        super(FraudDetectionNN, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.network(x)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = FraudDetectionNN(input_dim=X_train_tensor.shape[1]).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.00005)

class FocalLoss(nn.Module):
    def __init__(self, alpha=1.85, gamma=2.0):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        inputs = torch.sigmoid(inputs)
        BCE_loss = nn.BCELoss(reduction='none')(inputs, targets)
        pt = torch.exp(-BCE_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return focal_loss.mean()

criterion = FocalLoss()


In [13]:
model_save_path = "best_model.pth"
best_auc = 0.0  # Track the best ROC-AUC
num_epochs = 75
patience = 5
counter = 0
threshold = 0.3

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    y_train_true = []
    y_train_pred = []

    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

        # Store predictions & true values for accuracy calculation
        y_train_true.extend(batch_y.cpu().numpy())
        y_train_pred.extend(outputs.cpu().detach().numpy())

    epoch_loss /= len(train_loader)

    # Convert predictions to binary
    y_train_pred = torch.tensor(y_train_pred)
    y_train_pred_class = (y_train_pred > threshold).int()

    # Compute Training Metrics
    train_accuracy = accuracy_score(y_train_true, y_train_pred_class)
    train_precision = precision_score(y_train_true, y_train_pred_class, zero_division=0)
    train_recall = recall_score(y_train_true, y_train_pred_class, zero_division=0)
    train_f1 = f1_score(y_train_true, y_train_pred_class, zero_division=0)
    train_auc = roc_auc_score(y_train_true, y_train_pred)

    # Check Model Predictions
    print("Sample Predictions (First 10):", outputs[:10].cpu().detach().numpy())
    print("Min Prediction Value:", np.min(outputs.cpu().detach().numpy()))
    print("Max Prediction Value:", np.max(outputs.cpu().detach().numpy()))

    # Print Training Metrics After Each Epoch
    print(f"Epoch {epoch + 1}/{num_epochs} | Loss: {epoch_loss:.8f} | Acc: {train_accuracy:.4f} | "
          f"Precision: {train_precision:.4f} | Recall: {train_recall:.4f} | "
          f"F1: {train_f1:.4f} | AUC: {train_auc:.4f}")

    # Save the best model based on ROC-AUC
    if train_auc > best_auc:
        best_auc = train_auc
        torch.save(model.state_dict(), model_save_path)
        counter = 0
    else:
        counter += 1

    # Early Stopping Condition
    if counter >= patience:
        print(f"⏹️ Early stopping triggered at Epoch {epoch + 1} (No improvement for {patience} epochs).")
        break

print(f"Best model saved at Epoch {epoch + 1} with AUC: {best_auc:.4f}")
print("\nModel Training Complete!")

  y_train_pred = torch.tensor(y_train_pred)


Sample Predictions (First 10): [[0.0490536 ]
 [0.05486806]
 [0.21698242]
 [0.03289121]
 [0.03735921]
 [0.04713591]
 [0.06134352]
 [0.66155124]
 [0.0304619 ]
 [0.05397942]]
Min Prediction Value: 0.009933051
Max Prediction Value: 1.0
Epoch 1/75 | Loss: 0.42414446 | Acc: 0.6794 | Precision: 0.3740 | Recall: 0.5781 | F1: 0.4542 | AUC: 0.6890
Sample Predictions (First 10): [[0.01482606]
 [0.01417975]
 [0.0134683 ]
 [0.00226215]
 [0.00195199]
 [0.00537194]
 [0.00366488]
 [0.00619759]
 [1.        ]
 [0.01077743]]
Min Prediction Value: 0.0010385449
Max Prediction Value: 1.0
Epoch 2/75 | Loss: 0.34264555 | Acc: 0.7977 | Precision: 0.6391 | Recall: 0.2831 | F1: 0.3924 | AUC: 0.7171
Sample Predictions (First 10): [[9.9796045e-01]
 [4.5405184e-05]
 [6.5146995e-01]
 [3.8170350e-05]
 [1.8375093e-04]
 [1.5900278e-05]
 [9.7958720e-05]
 [4.7004814e-04]
 [8.3878382e-05]
 [1.3517856e-03]]
Min Prediction Value: 1.023385e-05
Max Prediction Value: 1.0
Epoch 3/75 | Loss: 0.32852484 | Acc: 0.8041 | Precision:

In [14]:
print("\n🔹 Loading Best Model for Final Evaluation...")

# Load the best saved model
model.load_state_dict(torch.load(model_save_path))
model.eval()

y_test_pred = []
y_test_true = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        outputs = model(batch_X)
        y_test_pred.extend(outputs.cpu().numpy())
        y_test_true.extend(batch_y.cpu().numpy())

# Convert predictions to binary
y_test_pred = torch.tensor(y_test_pred)
y_test_pred_class = (y_test_pred > threshold).int()

# Compute Test Metrics
test_accuracy = accuracy_score(y_test_true, y_test_pred_class)
test_precision = precision_score(y_test_true, y_test_pred_class, zero_division=0)
test_recall = recall_score(y_test_true, y_test_pred_class, zero_division=0)
test_f1 = f1_score(y_test_true, y_test_pred_class, zero_division=0)
test_auc = roc_auc_score(y_test_true, y_test_pred)

# Print Final Test Results
print("\nFinal Test Set Metrics (Best Model):")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1-Score: {test_f1:.4f}")
print(f"ROC-AUC: {test_auc:.4f}")


🔹 Loading Best Model for Final Evaluation...


  model.load_state_dict(torch.load(model_save_path))



Final Test Set Metrics (Best Model):
Accuracy: 0.9742
Precision: 0.7482
Recall: 0.3940
F1-Score: 0.5162
ROC-AUC: 0.8527
