In [None]:
pip install torch-geometric

Collecting torch-geometric
  Using cached torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
Using cached torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
Installing collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [None]:
pip install torch_geometric_temporal

Collecting torch_geometric_temporal
  Using cached torch_geometric_temporal-0.56.0-py3-none-any.whl.metadata (1.9 kB)
Collecting torch_sparse (from torch_geometric_temporal)
  Using cached torch_sparse-0.6.18.tar.gz (209 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch_scatter (from torch_geometric_temporal)
  Using cached torch_scatter-2.1.2.tar.gz (108 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->torch_geometric_temporal)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->torch_geometric_temporal)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->torch_geometric_temporal)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting 

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import gc
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.metrics import precision_recall_curve, confusion_matrix # Added confusion_matrix

# --- Device Configuration ---
# Use the GPU when PyTorch can see one; models and tensors below are moved to `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Step 1: Load the Data
# Single place to change where the competition CSV files live.
DATA_DIR = '/content'

train_identity = pd.read_csv(f'{DATA_DIR}/train_identity.csv')
train_transaction = pd.read_csv(f'{DATA_DIR}/train_transaction.csv')
test_identity = pd.read_csv(f'{DATA_DIR}/test_identity.csv')
test_transaction = pd.read_csv(f'{DATA_DIR}/test_transaction.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Step 1.1: Fix column names in test_identity
# The test identity file spells its columns 'id-XX'; rename to 'id_XX' so the
# same column names can be used on both frames.
test_identity.columns = [col.replace('id-', 'id_') for col in test_identity.columns]

# Step 1.2: Combine the datasets
# Left joins keep every transaction even when it has no identity record.
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

# Store original test TransactionIDs for submission alignment.
# NOTE: these are in file order. `test` is re-sorted by TransactionDT further
# down, so this Series is NOT row-aligned with the sorted frame.
test_transaction_ids = test['TransactionID'].copy()

# Clean up memory: the merged frames supersede the four raw inputs.
del train_identity, train_transaction, test_identity, test_transaction
gc.collect()

print(f"Train shape: {train.shape}, Test shape: {test.shape}")

# Step 2: Enhanced Preprocessing
# 2.1 Remove features with high missing values (>80%), measured on train only.
missing_percent = train.isnull().mean()
high_missing_cols = missing_percent[missing_percent > 0.8].index.tolist()
train = train.drop(columns=high_missing_cols)
# The drop list is derived from train, so tolerate a column that test lacks.
test = test.drop(columns=high_missing_cols, errors='ignore')

# 2.2 Define potential categorical columns
potential_categorical_cols = ['ProductCD', 'card4', 'card5', 'card6', 'P_emaildomain', 'R_emaildomain']

# Verify which categorical columns exist
categorical_cols = [col for col in potential_categorical_cols if col in train.columns]
missing_cols = [col for col in potential_categorical_cols if col not in train.columns]
if missing_cols:
    print(f"Note: The following columns are missing in the dataset: {missing_cols}")

# 2.3 Handle missing values for initial numerical columns
# The target, the row id and the timestamp are never imputed.
numerical_cols_initial = [
    col for col in train.select_dtypes(include=['float64', 'int64']).columns
    if col not in ('isFraud', 'TransactionID', 'TransactionDT')
]

# Impute both frames with the TRAIN medians in one call each. DataFrame.fillna
# with a Series only touches the columns named in its index and skips columns
# a frame does not have, so this avoids the column-by-column assignment that
# fragments the frame and cannot raise KeyError on test.
train_medians = train[numerical_cols_initial].median()
train = train.fillna(train_medians)
test = test.fillna(train_medians)

# 2.4 Remove extreme rows (train set only).
# The fences are built from the 5th/95th percentiles (not the quartiles), i.e.
# [p5 - 1.5*(p95 - p5), p95 + 1.5*(p95 - p5)]. Filters are applied one column
# after another, so later columns see the already-filtered rows.
for col in ['TransactionAmt', 'C1', 'C2']:
    if col in train.columns:
        q1 = train[col].quantile(0.05)
        q3 = train[col].quantile(0.95)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        train = train[train[col].between(lower_bound, upper_bound)]

# Materialise independent, consolidated frames: `train` would otherwise be a
# filtered slice (SettingWithCopy risk on the column assignments that follow),
# and copying de-fragments both frames before new columns are added.
train = train.copy()
test = test.copy()

# Step 3: Enhanced Feature Engineering
# The same two derived columns are added to both frames.
for frame in (train, test):
    # 3.1 Time-based feature: TransactionDT is divided by 3600 and wrapped at 24,
    # giving an hour-of-day style bucket.
    frame['hour'] = (frame['TransactionDT'] // 3600) % 24

    # 3.2 Log transform TransactionAmt (log1p keeps a zero amount finite).
    frame['LogTransactionAmt'] = np.log1p(frame['TransactionAmt'])

# 3.3 Enhanced email domain grouping
def group_email(email):
    """Collapse a raw e-mail domain into a coarse provider bucket.

    Args:
        email: Raw domain value; may be a string, NaN/None, or the literal
            placeholder 'missing'.

    Returns:
        One of 'NoInfo', 'Google', 'Yahoo', 'Microsoft', 'AOL', 'Proton'
        or 'Other'.
    """
    if pd.isna(email) or email == 'missing':
        return 'NoInfo'

    domain = str(email).lower()

    # Checked top to bottom; the first provider with a marker that occurs
    # anywhere in the lower-cased domain wins (plain substring match).
    provider_markers = (
        ('Google', ('gmail',)),
        ('Yahoo', ('yahoo',)),
        ('Microsoft', ('hotmail', 'outlook', 'msn', 'live')),
        ('AOL', ('aol', 'aim')),
        ('Proton', ('protonmail', 'proton')),
    )
    for provider, markers in provider_markers:
        for marker in markers:
            if marker in domain:
                return provider
    return 'Other'

# Bucket purchaser / recipient e-mail domains into provider groups, for
# whichever of the two columns survived the missing-value filter.
for email_col in ('P_emaildomain', 'R_emaildomain'):
    if email_col in train.columns:
        train[email_col] = train[email_col].apply(group_email)
        test[email_col] = test[email_col].apply(group_email)

# 3.4 Transaction frequency features
# Counts come from train only; a value never seen in train (or a missing
# value) maps to NaN and is then set to 0 in both frames.
for col in ['card1', 'card2', 'card3', 'card5']:
    if col not in train.columns:
        continue
    freq_map = train[col].value_counts().to_dict()
    freq_col = f'{col}_freq'
    train[freq_col] = train[col].map(freq_map).fillna(0)
    test[freq_col] = test[col].map(freq_map).fillna(0)

# 3.5 Device info features
# Keep only the text before the first '/' of DeviceInfo; missing -> 'unknown'.
# NOTE(review): if the merged identity data already carries a 'DeviceType'
# column, it is replaced here - confirm that is intended.
if 'DeviceInfo' in train.columns:
    train['DeviceType'] = train['DeviceInfo'].str.split('/', expand=True)[0].fillna('unknown')
    test['DeviceType'] = test['DeviceInfo'].str.split('/', expand=True)[0].fillna('unknown')


# Step 4: Prepare Data for GNN and LSTM
# 4.1 Define feature set
numerical_cols = ['LogTransactionAmt', 'hour']

# Add original card columns if they exist
for col in ['card1', 'card2', 'card3', 'card5']:
    if col in train.columns:
        numerical_cols.append(col)

# Add frequency features
numerical_cols.extend([f'{col}_freq' for col in ['card1', 'card2', 'card3', 'card5'] if f'{col}_freq' in train.columns])

# Add C and D columns
numerical_cols.extend([col for col in ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10',
                                     'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6',
                                     'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15'] if col in train.columns])

# Add 'V' and 'M' columns
v_cols = [f'V{i}' for i in range(1, 340) if f'V{i}' in train.columns]
m_cols = [f'M{i}' for i in range(1, 10) if f'M{i}' in train.columns]

numerical_cols.extend(v_cols)

# --- Encode 'M' flag columns as 0/1 floats ---
# 'T' -> 1, 'F' -> 0; anything else (missing values or any other label) becomes 0.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection, which pandas warns will stop working in 3.0.
m_flag_map = {'T': 1, 'F': 0}
for col in m_cols:
    train[col] = train[col].map(m_flag_map).fillna(0).astype(float)
    test[col] = test[col].map(m_flag_map).fillna(0).astype(float)
    numerical_cols.append(col)

feature_selection_categorical_cols = [col for col in ['ProductCD', 'card4', 'card5', 'card6',
                                                     'P_emaildomain', 'R_emaildomain', 'DeviceType']
                                      if col in train.columns]

# 4.2 Encode categorical features (label encoding).
# Each encoder is fitted on train and test values together so that transform()
# never meets an unseen label.
label_encoders = {}
for col in feature_selection_categorical_cols:
    le = LabelEncoder()
    train[col] = train[col].fillna('missing').astype(str)
    test[col] = test[col].fillna('missing').astype(str)
    le.fit(pd.concat([train[col], test[col]], axis=0))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

# 4.3 Combine features
# 'card5' is listed both as a numerical and as a categorical column; de-duplicate
# (keeping first-occurrence order) so the design matrix does not carry the same
# column twice.
features = list(dict.fromkeys(
    col for col in numerical_cols + feature_selection_categorical_cols if col in train.columns
))

# --- CRITICAL: Fill NaNs for all selected features BEFORE Scaling ---
# A column is treated if EITHER frame still has NaNs. Numerical columns get the
# train median; the remaining (label-encoded) columns get 0.
print("Checking and filling any remaining NaNs in selected features...")
for col in features:
    if not (train[col].isnull().any() or test[col].isnull().any()):
        continue
    fill_value = train[col].median() if col in numerical_cols else 0
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

print(f"NaNs in train[features] before scaling: {train[features].isnull().sum().sum()}")
print(f"NaNs in test[features] before scaling: {test[features].isnull().sum().sum()}")

# Sort data by TransactionDT for temporal processing.
# A stable sort keeps rows with equal timestamps in their existing order.
train = train.sort_values(by='TransactionDT', kind='stable').reset_index(drop=True)
test = test.sort_values(by='TransactionDT', kind='stable').reset_index(drop=True)

X = train[features]
y = train['isFraud']
X_test_full = test[features]

# --- L1-Regularized Logistic Regression for Feature Importance ---
# Diagnostic only: the coefficients are printed but not used to select features.
print("Performing L1-Regularized Logistic Regression for Feature Importance...")
temp_scaler = StandardScaler()
X_temp_scaled = temp_scaler.fit_transform(X)
logistic_reg = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=0.1,
    random_state=42,
    class_weight='balanced',
    max_iter=1000,
)
logistic_reg.fit(X_temp_scaled, y)

# Rank features by the magnitude of their coefficient, largest first.
feature_importance = (
    pd.DataFrame({'Feature': features, 'Coefficient': logistic_reg.coef_[0]})
    .assign(Abs_Coefficient=lambda frame: np.abs(frame['Coefficient']))
    .sort_values(by='Abs_Coefficient', ascending=False)
)
print("Top 20 Feature Importances (L1 Logistic Regression):")
print(feature_importance.head(20))

# 4.4 Scale features (StandardScaler first, then MinMaxScaler to [0, 0.7])
# Both scalers are fitted on train and only applied to test.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_full)

# Only the columns listed in numerical_cols are squeezed into [0, 0.7];
# every other column keeps its standardized values.
minmax_scaler = MinMaxScaler(feature_range=(0, 0.7))
numerical_indices = [features.index(col) for col in numerical_cols if col in features]
train_numeric_block = X_scaled[:, numerical_indices]
test_numeric_block = X_test_scaled[:, numerical_indices]
X_scaled[:, numerical_indices] = minmax_scaler.fit_transform(train_numeric_block)
X_test_scaled[:, numerical_indices] = minmax_scaler.transform(test_numeric_block)

# --- Store original shapes and indices before GAN augmentation ---
original_num_train_nodes = X_scaled.shape[0]
original_y = y.copy()  # untouched labels, kept for non-augmented comparison

# --- GAN for Data Augmentation ---
print("\n--- Training GAN for Data Augmentation ---")

# Split the scaled training rows by label: the GAN is trained on the fraud
# (label 1) rows, while the non-fraud (label 0) rows are kept as they are.
fraud_rows = (y == 1).to_numpy()
non_fraud_rows = (y == 0).to_numpy()
X_fraud_real = X_scaled[fraud_rows]
X_non_fraud_real = X_scaled[non_fraud_rows]

# Define GAN Architecture
class Generator(nn.Module):
    """Fully connected generator mapping latent noise to synthetic feature rows.

    Layout: latent_dim -> 256 -> 512 -> output_dim, with ReLU after each hidden
    layer and a final sigmoid, so every generated value lies in [0, 1].
    """

    def __init__(self, latent_dim, output_dim):
        super().__init__()
        layers = []
        in_features = latent_dim
        for width in (256, 512):
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU(inplace=True))
            in_features = width
        layers.append(nn.Linear(in_features, output_dim))
        layers.append(nn.Sigmoid())  # bounds the output to [0, 1]
        self.main = nn.Sequential(*layers)

    def forward(self, input):
        """Return generated rows for a batch of latent vectors."""
        return self.main(input)

class Discriminator(nn.Module):
    """Fully connected critic scoring how "real" a feature row looks.

    Layout: input_dim -> 512 -> 256 -> 1, with LeakyReLU(0.2) after each hidden
    layer and a final sigmoid, so the output is a probability-like score.
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = []
        in_features = input_dim
        for width in (512, 256):
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.LeakyReLU(0.2, inplace=True))
            in_features = width
        layers.append(nn.Linear(in_features, 1))
        layers.append(nn.Sigmoid())  # squashes the score into [0, 1]
        self.main = nn.Sequential(*layers)

    def forward(self, input):
        """Return one score per input row, shaped (batch, 1)."""
        return self.main(input)

# GAN Hyperparameters
# Seed torch and numpy so weight init, noise draws, batch sampling and the
# final shuffle are repeatable from a fresh kernel (42 matches the
# random_state used by the sklearn calls in this notebook).
GAN_SEED = 42
torch.manual_seed(GAN_SEED)
np.random.seed(GAN_SEED)

latent_dim = 100 # Dimension of the noise vector
gan_output_dim = X_scaled.shape[1] # Number of features
lr_gan = 0.0002
b1 = 0.5 # Adam: decay of first order momentum of all gradients
b2 = 0.999 # Adam: decay of second order momentum of all gradients
n_epochs_gan = 3000 # Number of optimisation steps; each step uses ONE random mini-batch
batch_size_gan = 64

# Without any fraud rows there is nothing to learn from; fail with a clear
# message instead of an opaque torch.randint range error inside the loop.
if len(X_fraud_real) == 0:
    raise ValueError("No fraud samples available to train the GAN on.")

generator = Generator(latent_dim, gan_output_dim).to(device)
discriminator = Discriminator(gan_output_dim).to(device)

optimizer_G = torch.optim.Adam(generator.parameters(), lr=lr_gan, betas=(b1, b2))
optimizer_D = torch.optim.Adam(discriminator.parameters(), lr=lr_gan, betas=(b1, b2))

adversarial_loss = nn.BCELoss() # Binary Cross Entropy Loss

# Prepare real fraud data for GAN training
# NOTE(review): the generator ends in a sigmoid, so it can only emit values in
# [0, 1], while only the numerical columns were squeezed into [0, 0.7]; the
# remaining columns are merely standardized and may fall outside that range -
# confirm this mismatch is acceptable.
real_fraud_data_tensor = torch.tensor(X_fraud_real, dtype=torch.float).to(device)
n_real_fraud = real_fraud_data_tensor.size(0)

# Target labels never change between steps, so build them once.
real_labels = torch.ones(batch_size_gan, 1, device=device)
fake_labels = torch.zeros(batch_size_gan, 1, device=device)

# --- GAN Training Loop ---
for epoch in range(n_epochs_gan):
    # --- Train Discriminator ---
    optimizer_D.zero_grad()

    # Real samples: draw a mini-batch (with replacement) from the real fraud rows.
    real_idx = torch.randint(0, n_real_fraud, (batch_size_gan,), device=device)
    real_samples = real_fraud_data_tensor[real_idx]
    d_output_real = discriminator(real_samples)
    d_loss_real = adversarial_loss(d_output_real, real_labels)

    # Fake samples: detach so this step's gradients do not reach the generator.
    z = torch.randn(batch_size_gan, latent_dim, device=device)
    fake_samples = generator(z).detach()
    d_output_fake = discriminator(fake_samples)
    d_loss_fake = adversarial_loss(d_output_fake, fake_labels)

    d_loss = d_loss_real + d_loss_fake
    d_loss.backward()
    optimizer_D.step()

    # --- Train Generator ---
    optimizer_G.zero_grad()
    z = torch.randn(batch_size_gan, latent_dim, device=device)
    gen_samples = generator(z)
    g_output = discriminator(gen_samples)
    # The generator is rewarded when the discriminator labels its output "real".
    g_loss = adversarial_loss(g_output, real_labels)
    g_loss.backward()
    optimizer_G.step()

    if (epoch + 1) % 500 == 0: # Print less frequently for high epochs
        print(f"GAN Epoch {epoch+1}/{n_epochs_gan}, D Loss: {d_loss.item():.4f}, G Loss: {g_loss.item():.4f}")

print("GAN training finished.")

# --- Generate Synthetic Fraud Data ---
# Number of synthetic samples needed to balance the classes (never negative).
num_synthetic_samples_needed = max(len(X_non_fraud_real) - len(X_fraud_real), 0)

print(f"Generating {num_synthetic_samples_needed} synthetic fraud samples...")

generator.eval() # Set generator to evaluation mode
with torch.no_grad():
    synthetic_z = torch.randn(num_synthetic_samples_needed, latent_dim, device=device)
    X_synthetic_fraud = generator(synthetic_z).cpu().numpy()

# Combine real and synthetic fraud samples
X_augmented_fraud = np.vstack([X_fraud_real, X_synthetic_fraud])
y_augmented_fraud = np.ones(X_augmented_fraud.shape[0])

# Combine all augmented data (original non-fraud + augmented fraud)
X_resampled_gan = np.vstack([X_non_fraud_real, X_augmented_fraud])
y_resampled_gan = np.hstack([np.zeros(X_non_fraud_real.shape[0]), y_augmented_fraud])

# Shuffle the augmented data (rows and labels with the same permutation).
# Note that this discards the TransactionDT ordering established earlier.
shuffled_indices = np.random.permutation(len(X_resampled_gan))
X_resampled_gan = X_resampled_gan[shuffled_indices]
y_resampled_gan = y_resampled_gan[shuffled_indices]

print(f"Augmented data shape (X): {X_resampled_gan.shape}")
print(f"Augmented data shape (y): {y_resampled_gan.shape}")
print(f"Fraud count in augmented data: {np.sum(y_resampled_gan == 1)}")
print(f"Non-Fraud count in augmented data: {np.sum(y_resampled_gan == 0)}")


# 4.6 Create graph structure (edges based on shared card1) using augmented data
def create_edge_index_from_series(card_series):
    """Build a bidirectional edge list linking rows that share the same value.

    Nodes are identified by ROW POSITION (0..len-1) in ``card_series``, not by
    the Series' index labels, so the result is valid for any index (custom
    labels, named index, non-contiguous index).

    Args:
        card_series: pandas Series with one grouping value per node.

    Returns:
        torch.LongTensor of shape (2, num_edges). Every pair of rows with an
        equal, non-missing value contributes the two directed edges (a, b) and
        (b, a). An empty (2, 0) tensor is returned, after printing a warning,
        when no two rows share a value.

    Note:
        A group of k rows yields k * (k - 1) edges, so very frequent values
        make the edge list grow quadratically.
    """
    keyed_positions = pd.DataFrame({
        'node': range(len(card_series)),
        'key': card_series.to_numpy(),
    })

    edge_index = []
    # groupby drops NaN keys by default, so rows with a missing value stay
    # isolated; groups are visited in sorted key order.
    for _, nodes in keyed_positions.groupby('key')['node']:
        members = nodes.tolist()
        for i in range(len(members)):
            for j in range(i + 1, len(members)):
                edge_index.append([members[i], members[j]])
                edge_index.append([members[j], members[i]])

    if not edge_index:
        print("Warning: No edges created. Returning empty edge_index.")
        return torch.empty((2, 0), dtype=torch.long)
    # (num_edges, 2) -> (2, num_edges), the layout the graph Data object is built with.
    return torch.tensor(edge_index, dtype=torch.long).t().contiguous()

# Edges are derived from the (scaled) card1 column, so it must be a feature.
if 'card1' not in features:
    raise ValueError("card1 not found in features. This should not happen after fix. Debug.")

card1_idx = features.index('card1')
# Training graph uses the GAN-augmented rows; the test graph uses the untouched test rows.
temp_card1_train = pd.Series(X_resampled_gan[:, card1_idx], name='card1_values')
temp_card1_test = pd.Series(X_test_scaled[:, card1_idx], name='card1_values')

train_edge_index = create_edge_index_from_series(temp_card1_train)
test_edge_index = create_edge_index_from_series(temp_card1_test)

# 4.7 Create PyTorch Geometric Data object for training using GAN augmented data
x_train_gan = torch.tensor(X_resampled_gan, dtype=torch.float)
y_train_gan = torch.tensor(y_resampled_gan, dtype=torch.long)
data = Data(x=x_train_gan, edge_index=train_edge_index, y=y_train_gan)

# 4.8 Create train/val/test masks over the GAN-augmented nodes.
# 70% train, then the remaining 30% is halved into validation and internal
# test; both splits are stratified on the augmented labels.
n_samples_gan = len(y_resampled_gan)
train_idx, temp_idx = train_test_split(
    range(n_samples_gan), test_size=0.3, random_state=42, stratify=y_resampled_gan
)
val_idx, test_idx = train_test_split(
    temp_idx, test_size=0.5, random_state=42, stratify=y_resampled_gan[temp_idx]
)

# One boolean mask per split, True at the node positions belonging to it.
train_mask, val_mask, test_mask = (
    torch.zeros(n_samples_gan, dtype=torch.bool) for _ in range(3)
)
train_mask[train_idx] = True
val_mask[val_idx] = True
test_mask[test_idx] = True

data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask

# 4.9 Create PyTorch Geometric Data object for test set (original, not augmented)
test_data = Data(x=torch.tensor(X_test_scaled, dtype=torch.float), edge_index=test_edge_index)


# Step 5: Define Temporal GNN Model with LSTM
class TemporalGCNLSTM(nn.Module):
    """Two GCN layers followed by an LSTM and a linear classification head.

    The node embeddings produced by the GCN stack are fed to the LSTM as ONE
    sequence (batch of 1) whose time steps are the nodes in their row order;
    the head then emits per-node log-probabilities.

    Args:
        input_dim: Number of input features per node.
        hidden_gnn_dim: Width of both GCN layers (and the LSTM input size).
        hidden_lstm_dim: LSTM hidden size.
        output_dim: Number of classes.
        sequence_length: Stored on the instance; not used by ``forward``.
        dropout_rate: Dropout probability applied after each GCN layer.
    """

    def __init__(self, input_dim, hidden_gnn_dim, hidden_lstm_dim, output_dim, sequence_length, dropout_rate=0.7):
        super().__init__()
        # Plain bookkeeping attributes.
        self.sequence_length = sequence_length
        self.hidden_gnn_dim = hidden_gnn_dim
        self.dropout_rate = dropout_rate

        # Graph encoder: conv -> batch norm, twice.
        self.conv1 = GCNConv(input_dim, hidden_gnn_dim)
        self.bn1 = nn.BatchNorm1d(hidden_gnn_dim)
        self.conv2 = GCNConv(hidden_gnn_dim, hidden_gnn_dim)
        self.bn2 = nn.BatchNorm1d(hidden_gnn_dim)

        # Sequence model over the node embeddings, then the classifier head.
        self.lstm = nn.LSTM(hidden_gnn_dim, hidden_lstm_dim, batch_first=True)
        self.fc = nn.Linear(hidden_lstm_dim, output_dim)

    def _graph_layer(self, conv, norm, h, edge_index):
        """Apply conv -> batch norm -> ReLU -> dropout to node features ``h``."""
        h = F.relu(norm(conv(h, edge_index)))
        return F.dropout(h, p=self.dropout_rate, training=self.training)

    def forward(self, x, edge_index, batch_size=1):
        """Return per-node log-probabilities of shape (num_nodes, output_dim).

        ``batch_size`` is accepted for interface compatibility but is not used.
        """
        h = self._graph_layer(self.conv1, self.bn1, x, edge_index)
        h = self._graph_layer(self.conv2, self.bn2, h, edge_index)

        # (num_nodes, hidden) -> (1, num_nodes, hidden): the whole graph is one sequence.
        lstm_out, _ = self.lstm(h.unsqueeze(0))

        # Drop the batch dimension again before the per-node linear head.
        logits = self.fc(lstm_out.squeeze(0))
        return F.log_softmax(logits, dim=1)


# Step 6: Train Temporal GNN Model
# Dimensions come from the GAN-augmented design matrix.
input_dim = X_resampled_gan.shape[1]
hidden_gnn_dim = 96
hidden_lstm_dim = 96
output_dim = 2
sequence_length = len(X_resampled_gan)
dropout_rate = 0.7

model = TemporalGCNLSTM(
    input_dim,
    hidden_gnn_dim,
    hidden_lstm_dim,
    output_dim,
    sequence_length,
    dropout_rate=dropout_rate,
).to(device)

# Both graphs live on the same device as the model.
data = data.to(device)
test_data = test_data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

# Class weights for NLLLoss: class 0 gets 1.0, class 1 gets (#class-0 / #class-1).
# The counts are taken over ALL augmented labels (not only the training mask),
# so after balancing the ratio is expected to be close to 1.
n_class0 = (y_train_gan == 0).sum()
n_class1 = (y_train_gan == 1).sum()
class_weights = torch.tensor([1.0, n_class0 / n_class1], dtype=torch.float).to(device)
criterion = nn.NLLLoss(weight=class_weights)

def train_model():
    """Run one full-graph optimisation step and return the training loss.

    Uses the module-level ``model``, ``optimizer``, ``criterion`` and ``data``.
    The forward pass covers every node, but the loss is computed only on the
    nodes selected by ``data.train_mask``.

    Returns:
        float: The loss value for this step.
    """
    model.train()
    optimizer.zero_grad()

    log_probs = model(data.x, data.edge_index)
    selected = data.train_mask
    step_loss = criterion(log_probs[selected], data.y[selected])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

# Modified evaluate_model to handle external test set without true labels
def evaluate_model(mask_or_none, data_obj, optimal_threshold=None, is_original_test_set=False):
    """Score a graph with the module-level ``model`` in one of three modes.

    Args:
        mask_or_none: Boolean node mask selecting the subset to score. Ignored
            (may be None) when ``is_original_test_set`` is True.
        data_obj: Graph object exposing ``x`` and ``edge_index`` (and ``y``
            unless ``is_original_test_set`` is True).
        optimal_threshold: Decision threshold. ``None`` switches to
            threshold-search mode. Ignored when ``is_original_test_set`` is True.
        is_original_test_set: True for unlabeled data; only probabilities are
            produced.

    Returns:
        * ``is_original_test_set=True``: 1-D numpy array with the class-1
          probability of every node.
        * ``optimal_threshold is None``: ``(threshold, probs, true)`` where
          ``threshold`` maximises F1 on the selected nodes (0.5 if no
          threshold is available).
        * otherwise: ``(precision, recall, f1, auc, accuracy, sensitivity,
          specificity, g_mean)``. ``auc`` is NaN when the selected labels
          contain a single class.
    """
    model.eval()
    with torch.no_grad():
        out = model(data_obj.x, data_obj.edge_index)

    # softmax over the model output yields per-class probabilities (it is
    # shift-invariant, so it also works when the model emits log-probabilities).
    if is_original_test_set:
        # Unlabeled submission data: metrics are impossible, return probabilities only.
        return torch.softmax(out, dim=1)[:, 1].cpu().numpy()

    probs = torch.softmax(out[mask_or_none], dim=1)[:, 1].cpu().numpy()
    true = data_obj.y[mask_or_none].cpu().numpy()

    if optimal_threshold is None:
        # Threshold search: pick the threshold with the best F1 on this subset.
        precision_curve, recall_curve, thresholds = precision_recall_curve(true, probs)
        f1_scores = 2 * (precision_curve * recall_curve) / (precision_curve + recall_curve + 1e-9)

        if len(thresholds) > 0:
            # The PR curve has one more point than there are thresholds (its
            # final point has no threshold), so only search the indexable part.
            optimal_idx = np.argmax(f1_scores[:len(thresholds)])
            optimal_threshold_found = thresholds[optimal_idx]
        else:
            optimal_threshold_found = 0.5  # nothing to search over
        return optimal_threshold_found, probs, true

    # Metric mode: binarise with the supplied threshold.
    pred = (probs >= optimal_threshold).astype(int)
    precision = precision_score(true, pred, zero_division=0)
    recall = recall_score(true, pred, zero_division=0)
    f1 = f1_score(true, pred, zero_division=0)
    # ROC AUC is undefined for a single class; report NaN instead of raising.
    auc = roc_auc_score(true, probs) if np.unique(true).size > 1 else float('nan')
    accuracy = accuracy_score(true, pred)

    # labels=[0, 1] forces a 2x2 matrix even if a class is absent from both
    # `true` and `pred`; otherwise the four-way unpack below would fail.
    tn, fp, fn, tp = confusion_matrix(true, pred, labels=[0, 1]).ravel()

    # Small epsilon in the denominators guards against division by zero.
    sensitivity = tp / (tp + fn + 1e-9)  # true positive rate
    specificity = tn / (tn + fp + 1e-9)  # true negative rate

    g_mean_sensitivity = np.sqrt(sensitivity * specificity)

    return precision, recall, f1, auc, accuracy, sensitivity, specificity, g_mean_sensitivity

# --- Early Stopping Implementation ---
patience = 30          # epochs without validation-F1 improvement before stopping
best_val_f1 = -1       # below any achievable F1, so the first epoch always counts as an improvement
epochs_no_improve = 0
max_epochs = 400

print("\nTraining TGNN+LSTM with GAN-augmented data and early stopping...")
for epoch in range(1, max_epochs + 1):
    train_loss = train_model()

    # Threshold-search mode on the validation nodes (part of the augmented data).
    current_optimal_threshold, val_probs_epoch, val_true_epoch = evaluate_model(
        data.val_mask, data, optimal_threshold=None, is_original_test_set=False
    )

    # Best achievable validation F1 over all thresholds drives early stopping.
    precision_curve, recall_curve, thresholds = precision_recall_curve(val_true_epoch, val_probs_epoch)
    f1_scores = 2 * (precision_curve * recall_curve) / (precision_curve + recall_curve + 1e-9)
    current_val_f1 = np.max(f1_scores) if len(f1_scores) > 0 else 0.0

    improved = current_val_f1 > best_val_f1
    if improved:
        # New best: remember score and threshold, checkpoint the weights.
        best_val_f1, best_optimal_threshold = current_val_f1, current_optimal_threshold
        epochs_no_improve = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        epochs_no_improve += 1

    # Log on every improvement and on every 20th epoch.
    if improved or epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {train_loss:.4f}, Val F1: {current_val_f1:.4f}, Best Val F1: {best_val_f1:.4f}, Epochs no improve: {epochs_no_improve}')

    if epochs_no_improve == patience:
        print(f"Early stopping at epoch {epoch} as validation F1 did not improve for {patience} epochs.")
        break

print(f"Optimal threshold found on validation set: {best_optimal_threshold:.4f}")

# Load the best model state for final evaluation
model.load_state_dict(torch.load('best_model.pth'))


# Step 7: Evaluate on (GAN-augmented) Test Set (Internal Split, not the external test.csv)
# These numbers describe the held-out part of the AUGMENTED data, scored with
# the threshold selected during training.
(test_precision, test_recall, test_f1, test_auc, test_accuracy,
 test_sensitivity, test_specificity, test_g_mean_sensitivity) = evaluate_model(
    data.test_mask, data, best_optimal_threshold, is_original_test_set=False
)
print("\n".join([
    "\nInternal (GAN-augmented) Test Set Performance:",
    f"Test Precision: {test_precision:.4f}",
    f"Test Recall (Sensitivity): {test_recall:.4f}",
    f"Test Specificity: {test_specificity:.4f}",
    f"Test F1: {test_f1:.4f}",
    f"Test AUC: {test_auc:.4f}",
    f"Test Accuracy: {test_accuracy:.4f}",
    f"Test G-Mean Sensitivity: {test_g_mean_sensitivity:.4f}",
]))


# Step 8: Evaluate on (GAN-augmented) Train Set (Internal Split)
# Same metrics on the training nodes, for comparison with the split above.
(train_precision, train_recall, train_f1, train_auc, train_accuracy,
 train_sensitivity, train_specificity, train_g_mean_sensitivity) = evaluate_model(
    data.train_mask, data, best_optimal_threshold, is_original_test_set=False
)
print("\n".join([
    "\nInternal (GAN-augmented) Train Set Performance:",
    f"Train Precision: {train_precision:.4f}",
    f"Train Recall (Sensitivity): {train_recall:.4f}",
    f"Train Specificity: {train_specificity:.4f}",
    f"Train F1: {train_f1:.4f}",
    f"Train AUC: {train_auc:.4f}",
    f"Train Accuracy: {train_accuracy:.4f}",
    f"Train G-Mean Sensitivity: {train_g_mean_sensitivity:.4f}",
]))


# Step 9: Generate Submission (predictions for the actual unseen test data from test.csv)
print("\nGenerating predictions for the actual unseen test data (submission file)...")
# For the actual test.csv data, we only generate probabilities, no true labels exist for evaluation.
submission_probs = evaluate_model(
    None, # No mask needed, evaluate all nodes in test_data
    test_data,
    best_optimal_threshold,
    is_original_test_set=True
)

# The test graph was built from `test` AFTER it was re-sorted by TransactionDT,
# so the probabilities follow that sorted row order. Take the IDs from the same
# sorted frame; the IDs captured right after loading are in file order and
# would be paired with the wrong rows.
sorted_test_ids = test['TransactionID'].to_numpy()
if len(sorted_test_ids) != len(submission_probs):
    raise ValueError(
        f"Prediction count ({len(submission_probs)}) does not match the number of test rows ({len(sorted_test_ids)})."
    )

test_predictions_sorted_df = pd.DataFrame({
    'TransactionID': sorted_test_ids,
    'isFraud': submission_probs # These are the raw probabilities
})

# Re-order to the sample submission's row order by joining on TransactionID.
final_submission_df = sample_submission[['TransactionID']].merge(
    test_predictions_sorted_df, on='TransactionID', how='left'
)

# IDs present in the sample submission but absent from the predictions get 0.
final_submission_df['isFraud'] = final_submission_df['isFraud'].fillna(0) # Ensure no NaNs in submission

final_submission_df.to_csv('submission.csv', index=False)
print("Submission file generated: submission.csv")
print("Note: Performance metrics (Precision, Recall, F1, AUC, Accuracy, G-Mean Sensitivity) are not reported for the final 'test.csv' dataset as its true labels are unknown.")

Using device: cpu
Train shape: (4878, 434), Test shape: (3666, 433)
Note: The following columns are missing in the dataset: ['R_emaildomain']


  train['hour'] = ((train['TransactionDT'] // 3600) % 24)
  test['hour'] = ((test['TransactionDT'] // 3600) % 24)
  train['LogTransactionAmt'] = np.log1p(train['TransactionAmt'])
  test['LogTransactionAmt'] = np.log1p(test['TransactionAmt'])
  train[f'{col}_freq'] = train[col].map(freq_map)
  test[f'{col}_freq'] = test[col].map(freq_map)
  train[f'{col}_freq'] = train[col].map(freq_map)
  test[f'{col}_freq'] = test[col].map(freq_map)
  train[f'{col}_freq'] = train[col].map(freq_map)
  test[f'{col}_freq'] = test[col].map(freq_map)
  train[f'{col}_freq'] = train[col].map(freq_map)
  test[f'{col}_freq'] = test[col].map(freq_map)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the ori

Checking and filling any remaining NaNs in selected features...
NaNs in train[features] before scaling: 0
NaNs in test[features] before scaling: 0
Performing L1-Regularized Logistic Regression for Feature Importance...
Top 20 Feature Importances (L1 Logistic Regression):
               Feature  Coefficient  Abs_Coefficient
14                  C5    -1.597534         1.597534
10                  C1     1.289573         1.289573
114                V83     0.983129         0.983129
169               V279    -0.914861         0.914861
71                 V40     0.831003         0.831003
121                V90     0.746076         0.746076
23                 C14    -0.676903         0.676903
176               V286     0.643280         0.643280
170               V280     0.632459         0.632459
77                 V46    -0.630896         0.630896
221          ProductCD     0.624086         0.624086
113                V82    -0.622718         0.622718
174               V284    -0.616969    