In [None]:
pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [None]:
!pip install torch_geometric_temporal



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv
import gc

# Step 1: Load and Prepare Data
train_identity = pd.read_csv(r'/content/train_identity.csv')
train_transaction = pd.read_csv(r"/content/train_transaction.csv")
test_identity = pd.read_csv(r"/content/test_identity.csv")
test_transaction = pd.read_csv(r"/content/test_transaction.csv")
sample_submission = pd.read_csv(r"/content/sample_submission.csv")

# Fix column names in test_identity
test_identity.columns = [col.replace('id-', 'id_') for col in test_identity.columns]

# Combine datasets
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

# Store TransactionID for test set alignment
test_ids = test['TransactionID'].copy()

# Optimize memory by downcasting
def downcast_df(df):
    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    return df

train = downcast_df(train)
test = downcast_df(test)

# Clean up memory
del train_identity, train_transaction, test_identity, test_transaction
gc.collect()

# Step 2: Preprocessing
# Remove features with high missing values (>80%)
missing_percent = train.isnull().mean()
high_missing_cols = missing_percent[missing_percent > 0.8].index.tolist()
if 'TransactionID' in high_missing_cols:
    high_missing_cols.remove('TransactionID')
train.drop(columns=high_missing_cols, inplace=True)
test.drop(columns=high_missing_cols, inplace=True)

# Define and verify categorical columns
potential_categorical_cols = ['ProductCD', 'card4', 'card5', 'card6', 'P_emaildomain', 'R_emaildomain']
categorical_cols = [col for col in potential_categorical_cols if col in train.columns]

# Handle missing values
numerical_cols = train.select_dtypes(include=['float32', 'float64', 'int8', 'int16', 'int32']).columns.tolist()
if 'isFraud' in numerical_cols:
    numerical_cols.remove('isFraud')
if 'TransactionID' in numerical_cols:
    numerical_cols.remove('TransactionID')

for col in numerical_cols:
    train[col] = train[col].fillna(train[col].median())
    test[col] = test[col].fillna(train[col].median())  # Use train median for consistency

for col in categorical_cols:
    train[col] = train[col].fillna('missing').astype(str)
    test[col] = test[col].fillna('missing').astype(str)

# Remove outliers using IQR for key columns (train only)
for col in ['TransactionAmt', 'C1', 'C2']:
    if col in train.columns:
        q1 = train[col].quantile(0.05)
        q3 = train[col].quantile(0.95)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        train = train[(train[col] >= lower_bound) & (train[col] <= upper_bound)]

# Step 3: Feature Engineering
# Time-based features
train['hour'] = (train['TransactionDT'] // 3600) % 24
test['hour'] = (test['TransactionDT'] // 3600) % 24
train['day_of_week'] = (train['TransactionDT'] // (3600 * 24)) % 7
test['day_of_week'] = (test['TransactionDT'] // (3600 * 24)) % 7

# Log transform TransactionAmt
train['LogTransactionAmt'] = np.log1p(train['TransactionAmt'])
test['LogTransactionAmt'] = np.log1p(test['TransactionAmt'])

# Email domain grouping
def group_email(email):
    if pd.isna(email) or email == 'missing':
        return 'NoInfo'
    email = str(email).lower()
    if 'gmail' in email:
        return 'Google'
    elif 'yahoo' in email:
        return 'Yahoo'
    elif any(x in email for x in ['hotmail', 'outlook', 'msn', 'live']):
        return 'Microsoft'
    elif any(x in email for x in ['aol', 'aim']):
        return 'AOL'
    elif any(x in email for x in ['protonmail', 'proton']):
        return 'Proton'
    else:
        return 'Other'

if 'P_emaildomain' in train.columns:
    train['P_emaildomain'] = train['P_emaildomain'].apply(group_email)
    test['P_emaildomain'] = test['P_emaildomain'].apply(group_email)

if 'R_emaildomain' in train.columns:
    train['R_emaildomain'] = train['R_emaildomain'].apply(group_email)
    test['R_emaildomain'] = test['R_emaildomain'].apply(group_email)

# Transaction frequency features
for col in ['card1', 'card2', 'card3', 'card5']:
    if col in train.columns:
        freq_map = train[col].value_counts().to_dict()
        train[f'{col}_freq'] = train[col].map(freq_map)
        test[f'{col}_freq'] = test[col].map(freq_map)
        train[f'{col}_freq'] = train[f'{col}_freq'].fillna(0)
        test[f'{col}_freq'] = test[f'{col}_freq'].fillna(0)

# Device info features
if 'DeviceInfo' in train.columns:
    train['DeviceType'] = train['DeviceInfo'].str.split('/', expand=True)[0]
    test['DeviceType'] = test['DeviceInfo'].str.split('/', expand=True)[0]
    train['DeviceType'] = train['DeviceType'].fillna('unknown')
    test['DeviceType'] = test['DeviceType'].fillna('unknown')

# Step 4: Feature Preparation for Temporal GNN
# Define feature set
numerical_cols = ['LogTransactionAmt', 'hour', 'day_of_week'] + \
                [f'{col}_freq' for col in ['card1', 'card2', 'card3', 'card5'] if f'{col}_freq' in train.columns] + \
                [col for col in ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10',
                               'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6',
                               'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15'] if col in train.columns]

feature_selection_categorical_cols = [col for col in ['ProductCD', 'card4', 'card5', 'card6',
                                                    'P_emaildomain', 'R_emaildomain', 'DeviceType']
                                    if col in train.columns]

# Encode categorical features
label_encoders = {}
for col in feature_selection_categorical_cols:
    le = LabelEncoder()
    combined = pd.concat([train[col], test[col]], axis=0)
    le.fit(combined)
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

# Combine features
features = numerical_cols + feature_selection_categorical_cols
features = [f for f in features if f in train.columns]

X = train[features]
y = train['isFraud']
X_test_full = test[features]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_full)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Step 5: Construct Graph for Temporal GNN
# Create edge index based on card1 and time proximity
def create_edge_index(df, time_col='TransactionDT', card_col='card1', time_threshold=3600):
    edge_index = []
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            if df[card_col].iloc[i] == df[card_col].iloc[j] and \
               abs(df[time_col].iloc[i] - df[time_col].iloc[j]) <= time_threshold:
                edge_index.append([i, j])
                edge_index.append([j, i])
    return torch.tensor(edge_index, dtype=torch.long).t().contiguous()

edge_index = create_edge_index(train, 'TransactionDT', 'card1', time_threshold=3600)

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_resampled, dtype=torch.float)
y_tensor = torch.tensor(y_resampled.values, dtype=torch.float)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float)

# Create PyTorch Geometric Data object
data = Data(x=X_tensor, edge_index=edge_index, y=y_tensor)

# Split into train/val/test
train_mask = torch.zeros(X_tensor.size(0), dtype=torch.bool)
val_mask = torch.zeros(X_tensor.size(0), dtype=torch.bool)
test_mask = torch.zeros(X_tensor.size(0), dtype=torch.bool)

train_idx, temp_idx = train_test_split(range(X_tensor.size(0)), test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

train_mask[train_idx] = True
val_mask[val_idx] = True
test_mask[test_idx] = True

data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask

# Step 6: Define Temporal GNN Model
class TemporalGNN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(TemporalGNN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc = nn.Linear(hidden_channels, out_channels)
        self.dropout = nn.Dropout(0.5)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.fc(x)
        x = self.sigmoid(x)
        return x

# Initialize model, loss, and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TemporalGNN(in_channels=X_tensor.shape[1], hidden_channels=64, out_channels=1).to(device)
data = data.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = nn.BCELoss()

# Step 7: Training Loop
def train_model(model, data, epochs=50):
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        out = model(data.x, data.edge_index).squeeze()
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# Train the model
train_model(model, data)

# Step 8: Optimize Decision Threshold
model.eval()
with torch.no_grad():
    val_probs = model(data.x, data.edge_index).squeeze()[data.val_mask].cpu().numpy()
    y_val = data.y[data.val_mask].cpu().numpy()

precision, recall, thresholds = precision_recall_curve(y_val, val_probs)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal Threshold: {optimal_threshold:.4f}")

# Step 9: Evaluate on Train and Test Sets
model.eval()
with torch.no_grad():
    # Predictions for training set
    train_probs = model(data.x, data.edge_index).squeeze()[data.train_mask].cpu().numpy()
    y_train = data.y[data.train_mask].cpu().numpy()
    train_pred = (train_probs >= optimal_threshold).astype(int)

    # Predictions for test set
    test_probs = model(data.x, data.edge_index).squeeze()[data.test_mask].cpu().numpy()
    y_test = data.y[data.test_mask].cpu().numpy()
    test_pred = (test_probs >= optimal_threshold).astype(int)

    # Train metrics
    train_precision = precision_score(y_train, train_pred)
    train_recall = recall_score(y_train, train_pred)
    train_f1 = f1_score(y_train, train_pred)
    train_auc = roc_auc_score(y_train, train_probs)
    train_accuracy = accuracy_score(y_train, train_pred)

    # Test metrics
    test_precision = precision_score(y_test, test_pred)
    test_recall = recall_score(y_test, test_pred)
    test_f1 = f1_score(y_test, test_pred)
    test_auc = roc_auc_score(y_test, test_probs)
    test_accuracy = accuracy_score(y_test, test_pred)

    # Print train metrics
    print("\nTrain Set Performance:")
    print(f"Train Precision: {train_precision:.4f}")
    print(f"Train Recall: {train_recall:.4f}")
    print(f"Train F1: {train_f1:.4f}")
    print(f"Train AUC: {train_auc:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}")

    # Print test metrics
    print("\nTest Set Performance:")
    print(f"Test Precision: {test_precision:.4f}")
    print(f"Test Recall: {test_recall:.4f}")
    print(f"Test F1: {test_f1:.4f}")
    print(f"Test AUC: {test_auc:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")

# Step 10: Generate Predictions for Submission
edge_index_test = create_edge_index(test, 'TransactionDT', 'card1', time_threshold=3600)
test_data = Data(x=X_test_tensor, edge_index=edge_index_test).to(device)

model.eval()
with torch.no_grad():
    test_full_probs = model(test_data.x, test_data.edge_index).squeeze().cpu().numpy()

# Create submission DataFrame
test_pred_df = pd.DataFrame({
    'TransactionID': test_ids,
    'isFraud': test_full_probs
})

# Merge with sample_submission
sample_submission = sample_submission.merge(test_pred_df, on='TransactionID', how='left', suffixes=('', '_pred'))
mean_prob = test_full_probs.mean()
sample_submission['isFraud'] = sample_submission['isFraud_pred'].fillna(mean_prob)
sample_submission = sample_submission[['TransactionID', 'isFraud']]

# Save submission
sample_submission.to_csv('submission.csv', index=False)
print("\nSubmission file 'submission.csv' created successfully!")

Epoch 0, Loss: 0.6959
Epoch 10, Loss: 0.4360
Epoch 20, Loss: 0.3093
Epoch 30, Loss: 0.2345
Epoch 40, Loss: 0.1779
Optimal Threshold: 0.5253

Train Set Performance:
Train Precision: 0.9744
Train Recall: 0.9762
Train F1: 0.9753
Train AUC: 0.9958
Train Accuracy: 0.9752

Test Set Performance:
Test Precision: 0.9600
Test Recall: 0.9826
Test F1: 0.9711
Test AUC: 0.9940
Test Accuracy: 0.9709

Submission file 'submission.csv' created successfully!
