In [None]:
pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [None]:
pip install torch_geometric_temporal

Collecting torch_geometric_temporal
  Using cached torch_geometric_temporal-0.56.0-py3-none-any.whl.metadata (1.9 kB)
Collecting torch_sparse (from torch_geometric_temporal)
  Using cached torch_sparse-0.6.18.tar.gz (209 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch_scatter (from torch_geometric_temporal)
  Using cached torch_scatter-2.1.2-cp311-cp311-linux_x86_64.whl
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->torch_geometric_temporal)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->torch_geometric_temporal)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->torch_geometric_temporal)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch-

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv
import gc

# Step 1: Load and Prepare Data
# Load credit card dataset
credit_card_data = pd.read_csv('creditcard.csv')  # Standard credit card fraud dataset

# Create train/test split from the single dataset
train_size = int(0.8 * len(credit_card_data))
train = credit_card_data[:train_size].copy()
test = credit_card_data[train_size:].copy()

# Create TransactionID for consistency with original code structure
train['TransactionID'] = range(len(train))
test['TransactionID'] = range(len(train), len(train) + len(test))

# Store TransactionID for test set alignment
test_ids = test['TransactionID'].copy()

# Rename target column to match original code
if 'Class' in train.columns:
    train['isFraud'] = train['Class']
    test['isFraud'] = test['Class']
    train.drop('Class', axis=1, inplace=True)
    test.drop('Class', axis=1, inplace=True)

# Optimize memory by downcasting
def downcast_df(df):
    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    return df

train = downcast_df(train)
test = downcast_df(test)

# Clean up memory
gc.collect()

# Step 2: Preprocessing
# Remove features with high missing values (>80%)
missing_percent = train.isnull().mean()
high_missing_cols = missing_percent[missing_percent > 0.8].index.tolist()
if 'TransactionID' in high_missing_cols:
    high_missing_cols.remove('TransactionID')
train.drop(columns=high_missing_cols, inplace=True)
test.drop(columns=high_missing_cols, inplace=True)

# For credit card dataset, most features are already numerical (V1-V28, Time, Amount)
# Create some categorical features from existing data for demonstration
train['AmountRange'] = pd.cut(train['Amount'], bins=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
test['AmountRange'] = pd.cut(test['Amount'], bins=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])

# Define categorical columns
categorical_cols = ['AmountRange']

# Add 'missing' to the categories of AmountRange
train['AmountRange'] = train['AmountRange'].cat.add_categories(['missing'])
test['AmountRange'] = test['AmountRange'].cat.add_categories(['missing'])

# Handle missing values
numerical_cols = train.select_dtypes(include=['float32', 'float64', 'int8', 'int16', 'int32']).columns.tolist()
if 'isFraud' in numerical_cols:
    numerical_cols.remove('isFraud')
if 'TransactionID' in numerical_cols:
    numerical_cols.remove('TransactionID')

for col in numerical_cols:
    train[col] = train[col].fillna(train[col].median())
    test[col] = test[col].fillna(train[col].median())

for col in categorical_cols:
    train[col] = train[col].fillna('missing')
    test[col] = test[col].fillna('missing')

# Remove outliers using IQR for key columns (train only)
for col in ['Amount', 'V1', 'V2']:
    if col in train.columns:
        q1 = train[col].quantile(0.05)
        q3 = train[col].quantile(0.95)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        train = train[(train[col] >= lower_bound) & (train[col] <= upper_bound)]

# Step 3: Feature Engineering
# Time-based features (assuming Time column exists)
if 'Time' in train.columns:
    train['hour'] = (train['Time'] // 3600) % 24
    test['hour'] = (test['Time'] // 3600) % 24
    train['day_of_week'] = (train['Time'] // (3600 * 24)) % 7
    test['day_of_week'] = (test['Time'] // (3600 * 24)) % 7
else:
    # Create synthetic time features if Time column doesn't exist
    train['hour'] = np.random.randint(0, 24, len(train))
    test['hour'] = np.random.randint(0, 24, len(test))
    train['day_of_week'] = np.random.randint(0, 7, len(train))
    test['day_of_week'] = np.random.randint(0, 7, len(test))

# Log transform Amount
train['LogAmount'] = np.log1p(train['Amount'])
test['LogAmount'] = np.log1p(test['Amount'])

# Amount-based frequency features
amount_bins = pd.qcut(train['Amount'], q=10, duplicates='drop')
train['AmountBin'] = amount_bins
test['AmountBin'] = pd.qcut(test['Amount'], q=10, duplicates='drop')

# Create frequency features based on amount bins
freq_map = train['AmountBin'].value_counts().to_dict()
# Explicitly convert to numeric type to avoid categorical issues
train['AmountBin_freq'] = train['AmountBin'].map(freq_map).astype(float)
test['AmountBin_freq'] = test['AmountBin'].map(freq_map).astype(float)
# Fill missing values with 0
train['AmountBin_freq'] = train['AmountBin_freq'].fillna(0)
test['AmountBin_freq'] = test['AmountBin_freq'].fillna(0)

# Statistical features from V columns
v_cols = [col for col in train.columns if col.startswith('V')]
if v_cols:
    train['V_mean'] = train[v_cols].mean(axis=1)
    test['V_mean'] = test[v_cols].mean(axis=1)
    train['V_std'] = train[v_cols].std(axis=1)
    test['V_std'] = test[v_cols].std(axis=1)

# Step 4: Feature Preparation for Temporal GNN
# Define feature set
numerical_cols = ['LogAmount', 'hour', 'day_of_week', 'AmountBin_freq'] + \
                [col for col in v_cols[:15]] + \
                [col for col in ['V_mean', 'V_std'] if col in train.columns]

feature_selection_categorical_cols = [col for col in ['AmountRange'] if col in train.columns]

# Encode categorical features
label_encoders = {}
for col in feature_selection_categorical_cols:
    le = LabelEncoder()
    combined = pd.concat([train[col], test[col]], axis=0)
    le.fit(combined)
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

# Combine features
features = numerical_cols + feature_selection_categorical_cols
features = [f for f in features if f in train.columns]

X = train[features]
y = train['isFraud']
X_test_full = test[features]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_full)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Step 5: Construct Graph for Temporal GNN
# Create edge index based on amount similarity and time proximity
def create_edge_index(df, time_col='Time', amount_col='Amount', time_threshold=3600, amount_threshold=0.1):
    edge_index = []
    df_sample = df.sample(min(5000, len(df)), random_state=42)  # Sample for efficiency

    for i in range(len(df_sample)):
        for j in range(i + 1, len(df_sample)):
            time_diff = abs(df_sample[time_col].iloc[i] - df_sample[time_col].iloc[j]) if time_col in df_sample.columns else 0
            amount_diff = abs(df_sample[amount_col].iloc[i] - df_sample[amount_col].iloc[j]) / max(df_sample[amount_col].iloc[i], df_sample[amount_col].iloc[j], 1e-6)

            if time_diff <= time_threshold and amount_diff <= amount_threshold:
                edge_index.append([i, j])
                edge_index.append([j, i])

    if not edge_index:
        # Create some random edges if no connections found
        for i in range(min(1000, len(df_sample))):
            j = np.random.randint(0, len(df_sample))
            if i != j:
                edge_index.append([i, j])

    return torch.tensor(edge_index, dtype=torch.long).t().contiguous()

# Create edge index (use smaller sample for efficiency)
time_col = 'Time' if 'Time' in train.columns else 'hour'
edge_index = create_edge_index(train, time_col, 'Amount', time_threshold=3600, amount_threshold=0.1)

# Adjust data size to match edge index
max_node = max(X_resampled.shape[0], edge_index.max().item() + 1) if edge_index.numel() > 0 else X_resampled.shape[0]
if X_resampled.shape[0] < max_node:
    # Pad with zeros if needed
    padding = np.zeros((max_node - X_resampled.shape[0], X_resampled.shape[1]))
    X_resampled = np.vstack([X_resampled, padding])
    y_resampled_padded = np.zeros(max_node)
    y_resampled_padded[:len(y_resampled)] = y_resampled
    y_resampled = y_resampled_padded

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_resampled, dtype=torch.float)
y_tensor = torch.tensor(y_resampled, dtype=torch.float)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float)

# Create PyTorch Geometric Data object
data = Data(x=X_tensor, edge_index=edge_index, y=y_tensor)

# Split into train/val/test
train_mask = torch.zeros(X_tensor.size(0), dtype=torch.bool)
val_mask = torch.zeros(X_tensor.size(0), dtype=torch.bool)
test_mask = torch.zeros(X_tensor.size(0), dtype=torch.bool)

train_idx, temp_idx = train_test_split(range(X_tensor.size(0)), test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

train_mask[train_idx] = True
val_mask[val_idx] = True
test_mask[test_idx] = True

data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask

# Step 6: Define Temporal GNN Model
class TemporalGNN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(TemporalGNN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc = nn.Linear(hidden_channels, out_channels)
        self.dropout = nn.Dropout(0.5)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.fc(x)
        x = self.sigmoid(x)
        return x

# Initialize model, loss, and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TemporalGNN(in_channels=X_tensor.shape[1], hidden_channels=64, out_channels=1).to(device)
data = data.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = nn.BCELoss()

# Step 7: Training Loop
def train_model(model, data, epochs=50):
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        out = model(data.x, data.edge_index).squeeze()
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# Train the model
train_model(model, data)

# Step 8: Optimize Decision Threshold
model.eval()
with torch.no_grad():
    val_probs = model(data.x, data.edge_index).squeeze()[data.val_mask].cpu().numpy()
    y_val = data.y[data.val_mask].cpu().numpy()

precision, recall, thresholds = precision_recall_curve(y_val, val_probs)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal Threshold: {optimal_threshold:.4f}")

# Step 9: Evaluate on Train and Test Sets
model.eval()
with torch.no_grad():
    # Predictions for training set
    train_probs = model(data.x, data.edge_index).squeeze()[data.train_mask].cpu().numpy()
    y_train = data.y[data.train_mask].cpu().numpy()
    train_pred = (train_probs >= optimal_threshold).astype(int)

    # Predictions for test set
    test_probs = model(data.x, data.edge_index).squeeze()[data.test_mask].cpu().numpy()
    y_test = data.y[data.test_mask].cpu().numpy()
    test_pred = (test_probs >= optimal_threshold).astype(int)

    # Train metrics
    train_precision = precision_score(y_train, train_pred)
    train_recall = recall_score(y_train, train_pred)
    train_f1 = f1_score(y_train, train_pred)
    train_auc = roc_auc_score(y_train, train_probs)
    train_accuracy = accuracy_score(y_train, train_pred)

    # Test metrics
    test_precision = precision_score(y_test, test_pred)
    test_recall = recall_score(y_test, test_pred)
    test_f1 = f1_score(y_test, test_pred)
    test_auc = roc_auc_score(y_test, test_probs)
    test_accuracy = accuracy_score(y_test, test_pred)

    # Print train metrics
    print("\nTrain Set Performance:")
    print(f"Train Precision: {train_precision:.4f}")
    print(f"Train Recall: {train_recall:.4f}")
    print(f"Train F1: {train_f1:.4f}")
    print(f"Train AUC: {train_auc:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}")

    # Print test metrics
    print("\nTest Set Performance:")
    print(f"Test Precision: {test_precision:.4f}")
    print(f"Test Recall: {test_recall:.4f}")
    print(f"Test F1: {test_f1:.4f}")
    print(f"Test AUC: {test_auc:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")

# Step 10: Generate Predictions for Submission
edge_index_test = create_edge_index(test, time_col, 'Amount', time_threshold=3600, amount_threshold=0.1)
test_data = Data(x=X_test_tensor, edge_index=edge_index_test).to(device)

model.eval()
with torch.no_grad():
    test_full_probs = model(test_data.x, test_data.edge_index).squeeze().cpu().numpy()

# Create submission DataFrame
test_pred_df = pd.DataFrame({
    'TransactionID': test_ids,
    'isFraud': test_full_probs
})

# Create sample submission format
sample_submission = pd.DataFrame({
    'TransactionID': test_ids,
    'isFraud': test_full_probs
})

# Save submission
sample_submission.to_csv('submission.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['hour'] = (train['Time'] // 3600) % 24
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['day_of_week'] = (train['Time'] // (3600 * 24)) % 7
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['LogAmount'] = np.log1p(train['Amount'])
A value is trying to be set on a copy of a slice from a

Epoch 0, Loss: 1.2547
Epoch 10, Loss: 0.2040
Epoch 20, Loss: 0.1465
Epoch 30, Loss: 0.1160
Epoch 40, Loss: 0.0988
Optimal Threshold: 0.2678

Train Set Performance:
Train Precision: 0.9654
Train Recall: 0.9865
Train F1: 0.9759
Train AUC: 0.9977
Train Accuracy: 0.9756

Test Set Performance:
Test Precision: 0.9641
Test Recall: 0.9864
Test F1: 0.9751
Test AUC: 0.9975
Test Accuracy: 0.9749
