In [None]:
qpip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import gc
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.metrics import precision_recall_curve

# Step 1: Load the Data
train_identity = pd.read_csv(r'/content/train_identity.csv')
train_transaction = pd.read_csv(r'/content/train_transaction.csv')
test_identity = pd.read_csv(r'/content/test_identity.csv')
test_transaction = pd.read_csv(r'/content/test_transaction.csv')
sample_submission = pd.read_csv(r'/content/sample_submission.csv')

# Step 1.1: Fix column names in test_identity
test_identity.columns = [col.replace('id-', 'id_') for col in test_identity.columns]

# Step 1.2: Combine the datasets
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

# Store original test TransactionIDs for submission alignment
test_transaction_ids = test['TransactionID'].copy()

# Clean up memory
del train_identity, train_transaction, test_identity, test_transaction
gc.collect()

print(f"Train shape: {train.shape}, Test shape: {test.shape}")

# Step 2: Enhanced Preprocessing
# 2.1 Remove features with high missing values (>80%)
missing_percent = train.isnull().mean()
high_missing_cols = missing_percent[missing_percent > 0.8].index.tolist()
train.drop(columns=high_missing_cols, inplace=True)
test.drop(columns=high_missing_cols, inplace=True)

# 2.2 Define potential categorical columns
potential_categorical_cols = ['ProductCD', 'card4', 'card5', 'card6', 'P_emaildomain', 'R_emaildomain']

# Verify which categorical columns exist
categorical_cols = [col for col in potential_categorical_cols if col in train.columns]
missing_cols = [col for col in potential_categorical_cols if col not in train.columns]
if missing_cols:
    print(f"Note: The following columns are missing in the dataset: {missing_cols}")

# 2.3 Handle missing values
numerical_cols = train.select_dtypes(include=['float64', 'int64']).columns.tolist()
if 'isFraud' in numerical_cols:
    numerical_cols.remove('isFraud')
if 'TransactionID' in numerical_cols:
    numerical_cols.remove('TransactionID')

for col in numerical_cols:
    train[col] = train[col].fillna(train[col].median())
    test[col] = test[col].fillna(train[col].median())  # Use train median for test to avoid data leakage

for col in categorical_cols:
    train[col] = train[col].fillna('missing').astype(str)
    test[col] = test[col].fillna('missing').astype(str)

# 2.4 Remove outliers using IQR (only for train set)
for col in ['TransactionAmt', 'C1', 'C2']:
    if col in train.columns:
        q1 = train[col].quantile(0.05)
        q3 = train[col].quantile(0.95)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        train = train[(train[col] >= lower_bound) & (train[col] <= upper_bound)]

# Step 3: Enhanced Feature Engineering
# 3.1 Time-based features
train['hour'] = ((train['TransactionDT'] // 3600) % 24)
test['hour'] = ((test['TransactionDT'] // 3600) % 24)

# 3.2 Log transform TransactionAmt
train['LogTransactionAmt'] = np.log1p(train['TransactionAmt'])
test['LogTransactionAmt'] = np.log1p(test['TransactionAmt'])

# 3.3 Enhanced email domain grouping
def group_email(email):
    if pd.isna(email) or email == 'missing':
        return 'NoInfo'
    email = str(email).lower()
    if 'gmail' in email:
        return 'Google'
    elif 'yahoo' in email:
        return 'Yahoo'
    elif any(x in email for x in ['hotmail', 'outlook', 'msn', 'live']):
        return 'Microsoft'
    elif any(x in email for x in ['aol', 'aim']):
        return 'AOL'
    elif any(x in email for x in ['protonmail', 'proton']):
        return 'Proton'
    else:
        return 'Other'

if 'P_emaildomain' in train.columns:
    train['P_emaildomain'] = train['P_emaildomain'].apply(group_email)
    test['P_emaildomain'] = test['P_emaildomain'].apply(group_email)

if 'R_emaildomain' in train.columns:
    train['R_emaildomain'] = train['R_emaildomain'].apply(group_email)
    test['R_emaildomain'] = test['R_emaildomain'].apply(group_email)

# 3.4 Transaction frequency features
for col in ['card1', 'card2', 'card3', 'card5']:
    if col in train.columns:
        freq_map = train[col].value_counts().to_dict()
        train[f'{col}_freq'] = train[col].map(freq_map)
        test[f'{col}_freq'] = test[col].map(freq_map)
        train[f'{col}_freq'] = train[f'{col}_freq'].fillna(0)
        test[f'{col}_freq'] = test[f'{col}_freq'].fillna(0)

# 3.5 Device info features
if 'DeviceInfo' in train.columns:
    train['DeviceType'] = train['DeviceInfo'].str.split('/', expand=True)[0]
    test['DeviceType'] = test['DeviceInfo'].str.split('/', expand=True)[0]
    train['DeviceType'] = train['DeviceType'].fillna('unknown')
    test['DeviceType'] = test['DeviceType'].fillna('unknown')

# Step 4: Prepare Data for GNN
# 4.1 Define feature set
numerical_cols = ['LogTransactionAmt', 'hour'] + \
                [f'{col}_freq' for col in ['card1', 'card2', 'card3', 'card5'] if f'{col}_freq' in train.columns] + \
                [col for col in ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10',
                               'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6',
                               'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15'] if col in train.columns]

feature_selection_categorical_cols = [col for col in ['ProductCD', 'card4', 'card5', 'card6',
                                                    'P_emaildomain', 'R_emaildomain', 'DeviceType']
                                    if col in train.columns]

# 4.2 Encode categorical features
label_encoders = {}
for col in feature_selection_categorical_cols:
    le = LabelEncoder()
    combined = pd.concat([train[col], test[col]], axis=0)
    le.fit(combined)
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

# 4.3 Combine features
features = numerical_cols + feature_selection_categorical_cols
features = [f for f in features if f in train.columns]

X = train[features]
y = train['isFraud']
X_test_full = test[features]

# 4.4 Scale features (StandardScaler first, then MinMaxScaler to [0, 0.7])
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_full)

# Apply MinMaxScaler to numerical features only
minmax_scaler = MinMaxScaler(feature_range=(0, 0.7))
numerical_indices = [features.index(col) for col in numerical_cols if col in features]
X_scaled[:, numerical_indices] = minmax_scaler.fit_transform(X_scaled[:, numerical_indices])
X_test_scaled[:, numerical_indices] = minmax_scaler.transform(X_test_scaled[:, numerical_indices])

# 4.5 Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# 4.6 Create graph structure (edges based on shared card1)
def create_edge_index(df, card_col='card1'):
    edge_index = []
    node_to_idx = {tid: idx for idx, tid in enumerate(df.index)}
    for card in df[card_col].unique():
        if pd.notna(card):
            indices = df[df[card_col] == card].index.tolist()
            for i in range(len(indices)):
                for j in range(i + 1, len(indices)):
                    edge_index.append([node_to_idx[indices[i]], node_to_idx[indices[j]]])
                    edge_index.append([node_to_idx[indices[j]], node_to_idx[indices[i]]])
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    return edge_index

# Create edge_index for training data
train_edge_index = create_edge_index(train)

# Create edge_index for test data
test_edge_index = create_edge_index(test)

# 4.7 Create PyTorch Geometric Data object for training
x = torch.tensor(X_resampled, dtype=torch.float)
y = torch.tensor(y_resampled.values, dtype=torch.long)
data = Data(x=x, edge_index=train_edge_index, y=y)

# 4.8 Create train/val/test masks
n_samples = len(y_resampled)
train_idx, temp_idx = train_test_split(range(n_samples), test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

train_mask = torch.zeros(n_samples, dtype=torch.bool)
val_mask = torch.zeros(n_samples, dtype=torch.bool)
test_mask = torch.zeros(n_samples, dtype=torch.bool)

train_mask[train_idx] = True
val_mask[val_idx] = True
test_mask[test_idx] = True

data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask

# 4.9 Create PyTorch Geometric Data object for test set
test_data = Data(x=torch.tensor(X_test_scaled, dtype=torch.float), edge_index=test_edge_index)

# Step 5: Define GNN Model
class GNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.fc(x)
        return F.log_softmax(x, dim=1)

# Step 6: Train GNN Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GNN(input_dim=X_resampled.shape[1], hidden_dim=64, output_dim=2).to(device)
data = data.to(device)
test_data = test_data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def evaluate(mask):
    model.eval()
    with torch.no_grad():
        out = model(data)
        probs = torch.softmax(out[mask], dim=1)[:, 1].cpu().numpy()
        pred = (probs >= optimal_threshold).astype(int)
        true = data.y[mask].cpu().numpy()
        precision = precision_score(true, pred)
        recall = recall_score(true, pred)
        f1 = f1_score(true, pred)
        auc = roc_auc_score(true, probs)
        accuracy = accuracy_score(true, pred)
    return precision, recall, f1, auc, accuracy

# Find optimal threshold on validation set
model.train()
for epoch in range(100):
    loss = train()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss:.4f}')

model.eval()
with torch.no_grad():
    out = model(data)
    val_probs = torch.softmax(out[data.val_mask], dim=1)[:, 1].cpu().numpy()
    val_true = data.y[data.val_mask].cpu().numpy()

precision, recall, thresholds = precision_recall_curve(val_true, val_probs)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]

# Step 7: Evaluate on Test Set
test_precision, test_recall, test_f1, test_auc, test_accuracy = evaluate(data.test_mask)
print("\nTest Set Performance:")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1: {test_f1:.4f}")
print(f"Test AUC: {test_auc:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Step 8: Evaluate on Train Set
train_precision, train_recall, train_f1, train_auc, train_accuracy = evaluate(data.train_mask)
print("\nTrain Set Performance:")
print(f"Train Precision: {train_precision:.4f}")
print(f"Train Recall: {train_recall:.4f}")
print(f"Train F1: {train_f1:.4f}")
print(f"Train AUC: {train_auc:.4f}")
print(f"Train Accuracy: {train_accuracy:.4f}")

# Step 9: Generate Submission
with torch.no_grad():
    model.eval()
    test_out = model(test_data)
    test_probs = torch.softmax(test_out, dim=1)[:, 1].cpu().numpy()

# Ensure test_probs aligns with sample_submission
if len(test_probs) != len(sample_submission):
    print(f"Warning: Test predictions length ({len(test_probs)}) does not match sample_submission length ({len(sample_submission)})")
    # Create a mapping to align predictions with TransactionIDs
    test_pred_df = pd.DataFrame({
        'TransactionID': test_transaction_ids,
        'isFraud': test_probs
    })
    # Merge with sample_submission to ensure correct order and length
    sample_submission = sample_submission.merge(test_pred_df, on='TransactionID', how='left', suffixes=('', '_new'))
    sample_submission['isFraud'] = sample_submission['isFraud_new'].fillna(0)  # Fill missing predictions with 0
    sample_submission = sample_submission.drop(columns=['isFraud_new'])
else:
    sample_submission['isFraud'] = test_probs

sample_submission.to_csv('submission.csv', index=False)
print("Submission file generated: submission.csv")

Train shape: (28215, 434), Test shape: (25629, 433)
Note: The following columns are missing in the dataset: ['R_emaildomain']


  train['hour'] = ((train['TransactionDT'] // 3600) % 24)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['hour'] = ((train['TransactionDT'] // 3600) % 24)
  test['hour'] = ((test['TransactionDT'] // 3600) % 24)
  train['LogTransactionAmt'] = np.log1p(train['TransactionAmt'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['LogTransactionAmt'] = np.log1p(train['TransactionAmt'])
  test['LogTransactionAmt'] = np.log1p(test['TransactionAmt'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

Epoch 0, Loss: 0.6996
Epoch 10, Loss: 0.3987
Epoch 20, Loss: 0.2547
Epoch 30, Loss: 0.2233
Epoch 40, Loss: 0.2066
Epoch 50, Loss: 0.1909
Epoch 60, Loss: 0.1827
Epoch 70, Loss: 0.1772
Epoch 80, Loss: 0.1678
Epoch 90, Loss: 0.1635

Test Set Performance:
Test Precision: 0.9048
Test Recall: 0.7518
Test F1: 0.8212
Test AUC: 0.8900
Test Accuracy: 0.8379

Train Set Performance:
Train Precision: 0.8981
Train Recall: 0.7547
Train F1: 0.8201
Train AUC: 0.8895
Train Accuracy: 0.8344
Submission file generated: submission.csv
