In [None]:
pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import gc
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.metrics import precision_recall_curve
from sklearn.neighbors import NearestNeighbors

# Step 1: Load the Data
data = pd.read_csv('/content/creditcard.csv')

# Clean up memory
gc.collect()

print(f"Data shape: {data.shape}")

# Step 2: Enhanced Preprocessing
# 2.1 Check for missing values
missing_percent = data.isnull().mean()
print(f"Missing values in each column:\n{missing_percent[missing_percent > 0]}")

# 2.2 Check for NaN in 'Class' column specifically
if data['Class'].isna().sum() > 0:
    print(f"Found {data['Class'].isna().sum()} NaN values in 'Class' column.")
    # Option 1: Drop rows with NaN in 'Class'
    data = data.dropna(subset=['Class'])
    print(f"After dropping NaN in 'Class', data shape: {data.shape}")
    # Option 2: (Alternative) Impute with mode (0 or 1, likely 0 for non-fraud)
    # data['Class'] = data['Class'].fillna(data['Class'].mode()[0])

# 2.3 Define feature columns
numerical_cols = ['Time', 'Amount'] + [f'V{i}' for i in range(1, 29)]
target_col = 'Class'

# 2.4 Handle missing values in numerical columns
for col in numerical_cols:
    data[col] = data[col].fillna(data[col].median())

# 2.5 Split data into train and test sets
train, test = train_test_split(data, test_size=0.2, random_state=42, stratify=data['Class'])

# 2.6 Remove outliers using IQR for 'Amount' (only for training data)
for col in ['Amount']:
    q1 = train[col].quantile(0.05)
    q3 = train[col].quantile(0.95)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    train = train[(train[col] >= lower_bound) & (train[col] <= upper_bound)]

# Step 3: Enhanced Feature Engineering
# 3.1 Time-based features
train['hour'] = (train['Time'] // 3600) % 24
test['hour'] = (test['Time'] // 3600) % 24

# 3.2 Log transform Amount
train['LogAmount'] = np.log1p(train['Amount'])
test['LogAmount'] = np.log1p(test['Amount'])

# 3.3 Transaction frequency features (based on 'Time' proximity)
train['time_freq'] = train.groupby('Time')['Time'].transform('count')
test['time_freq'] = test.groupby('Time')['Time'].transform('count')
train['time_freq'] = train['time_freq'].fillna(1)
test['time_freq'] = test['time_freq'].fillna(1)

# Step 4: Prepare Data for GNN
# 4.1 Define feature set
numerical_cols = ['LogAmount', 'hour', 'time_freq'] + [f'V{i}' for i in range(1, 29)]
features = numerical_cols

X = train[features]
y = train['Class']
X_test_full = test[features]

# 4.2 Scale features (StandardScaler first, then MinMaxScaler to [0, 0.7])
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_full)

minmax_scaler = MinMaxScaler(feature_range=(0, 0.7))
X_scaled = minmax_scaler.fit_transform(X_scaled)
X_test_scaled = minmax_scaler.transform(X_test_scaled)

# 4.3 Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# 4.4 Create graph structure (edges based on similarity in features)
def create_edge_index(X, k=5):
    nbrs = NearestNeighbors(n_neighbors=k+1, algorithm='ball_tree').fit(X)
    distances, indices = nbrs.kneighbors(X)
    edge_index = []
    for i in range(len(X)):
        for j in indices[i][1:]:  # Skip self (first neighbor)
            edge_index.append([i, j])
            edge_index.append([j, i])  # Undirected graph
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    return edge_index

# Create edge_index for training and test data
train_edge_index = create_edge_index(X_resampled)
test_edge_index = create_edge_index(X_test_scaled)

# 4.5 Create PyTorch Geometric Data object for training
x = torch.tensor(X_resampled, dtype=torch.float)
y = torch.tensor(y_resampled.values, dtype=torch.long)
data = Data(x=x, edge_index=train_edge_index, y=y)

# 4.6 Create train/val/test masks
n_samples = len(y_resampled)
train_idx, temp_idx = train_test_split(range(n_samples), test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

train_mask = torch.zeros(n_samples, dtype=torch.bool)
val_mask = torch.zeros(n_samples, dtype=torch.bool)
test_mask = torch.zeros(n_samples, dtype=torch.bool)

train_mask[train_idx] = True
val_mask[val_idx] = True
test_mask[test_idx] = True

data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask

# 4.7 Create PyTorch Geometric Data object for test set
test_data = Data(x=torch.tensor(X_test_scaled, dtype=torch.float), edge_index=test_edge_index)

# Step 5: Define GNN Model
class GNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.fc(x)
        return F.log_softmax(x, dim=1)

# Step 6: Train GNN Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GNN(input_dim=X_resampled.shape[1], hidden_dim=64, output_dim=2).to(device)
data = data.to(device)
test_data = test_data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def evaluate(mask):
    model.eval()
    with torch.no_grad():
        out = model(data)
        probs = torch.softmax(out[mask], dim=1)[:, 1].cpu().numpy()
        pred = (probs >= optimal_threshold).astype(int)
        true = data.y[mask].cpu().numpy()
        precision = precision_score(true, pred)
        recall = recall_score(true, pred)
        f1 = f1_score(true, pred)
        auc = roc_auc_score(true, probs)
        accuracy = accuracy_score(true, pred)
    return precision, recall, f1, auc, accuracy

# Find optimal threshold on validation set
model.train()
for epoch in range(100):
    loss = train()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss:.4f}')

model.eval()
with torch.no_grad():
    out = model(data)
    val_probs = torch.softmax(out[data.val_mask], dim=1)[:, 1].cpu().numpy()
    val_true = data.y[data.val_mask].cpu().numpy()

precision, recall, thresholds = precision_recall_curve(val_true, val_probs)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]

# Step 7: Evaluate on Test Set
test_precision, test_recall, test_f1, test_auc, test_accuracy = evaluate(data.test_mask)
print("\nTest Set Performance:")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1: {test_f1:.4f}")
print(f"Test AUC: {test_auc:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Step 8: Evaluate on Train Set
train_precision, train_recall, train_f1, train_auc, train_accuracy = evaluate(data.train_mask)
print("\nTrain Set Performance:")
print(f"Train Precision: {train_precision:.4f}")
print(f"Train Recall: {train_recall:.4f}")
print(f"Train F1: {train_f1:.4f}")
print(f"Train AUC: {train_auc:.4f}")
print(f"Train Accuracy: {train_accuracy:.4f}")

# Step 9: Generate Predictions for Test Set
with torch.no_grad():
    model.eval()
    test_out = model(test_data)
    test_probs = torch.softmax(test_out, dim=1)[:, 1].cpu().numpy()

# Create submission file
submission = pd.DataFrame({
    'Index': test.index,
    'Class': test_probs
})
submission.to_csv('submission.csv', index=False)
print("Submission file generated: submission.csv")

Data shape: (11959, 31)
Missing values in each column:
V20       0.000084
V21       0.000084
V22       0.000084
V23       0.000084
V24       0.000084
V25       0.000084
V26       0.000084
V27       0.000084
V28       0.000084
Amount    0.000084
Class     0.000084
dtype: float64
Found 1 NaN values in 'Class' column.
After dropping NaN in 'Class', data shape: (11958, 31)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].fillna(data[col].median())


Epoch 0, Loss: 0.7063
Epoch 10, Loss: 0.1501
Epoch 20, Loss: 0.0733
Epoch 30, Loss: 0.0472
Epoch 40, Loss: 0.0392
Epoch 50, Loss: 0.0299
Epoch 60, Loss: 0.0239
Epoch 70, Loss: 0.0215
Epoch 80, Loss: 0.0181
Epoch 90, Loss: 0.0187

Test Set Performance:
Test Precision: 0.9957
Test Recall: 0.9979
Test F1: 0.9968
Test AUC: 0.9989
Test Accuracy: 0.9968

Train Set Performance:
Train Precision: 0.9962
Train Recall: 0.9980
Train F1: 0.9971
Train AUC: 0.9997
Train Accuracy: 0.9971
Submission file generated: submission.csv
