In [2]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple, List
import torch.optim as optim
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from torch.utils.data import TensorDataset, DataLoader

In [None]:
categorical_cols = ['hour', 'day_of_week', 'is_weekend', 'day', 'year', 'month', 'week',
                    'Sender_account', 'Receiver_account',
                    'Payment_currency', 'Received_currency',
                    'Sender_bank_location', 'Receiver_bank_location', 'Payment_type']
continuous_cols = ["Amount"]
target_col = 'Is_laundering'

train = pd.read_csv("preprocessed_train.csv")[categorical_cols + continuous_cols + [target_col]]
val = pd.read_csv("preprocessed_validation.csv")[categorical_cols + continuous_cols + [target_col]]
test = pd.read_csv("preprocessed_test.csv")[categorical_cols + continuous_cols + [target_col]]


for df in [train, val, test]:
    df["is_weekend"] = df["is_weekend"].astype(int)
num_categories = []

for col in categorical_cols:
    num_categories.append(train[col].nunique())
num_categories[5] = 12
num_categories[6] = 52
class TabularEmbedding(nn.Module):
    def __init__(self,
                 num_categories,  # list of cardinalities for each categorical feature
                 num_continuous,  # number of continuous features
                 d_model):        # transformer model dimension
        super().__init__()

        self.num_categorical = len(num_categories)
        self.num_continuous = num_continuous
        self.d_model = d_model

        # --- Categorical embeddings (column-based) ---
        self.cat_embeddings = nn.ModuleList([
            nn.Embedding(num_embeddings=cardinality, embedding_dim=d_model)
            for cardinality in num_categories
        ])

        # Initialize with truncated normal (mean=0, std=0.01)
        for emb in self.cat_embeddings:
            nn.init.trunc_normal_(emb.weight, mean=0.0, std=0.01)

        # --- Continuous projection layer ---
        # Project all continuous features into d_model
        if num_continuous > 0:
            self.continuous_proj = nn.Linear(num_continuous, d_model)
            nn.init.trunc_normal_(self.continuous_proj.weight, mean=0.0, std=0.01)
            nn.init.zeros_(self.continuous_proj.bias)

    def forward(self, x_cat, x_cont):
        """
        x_cat: [batch_size, num_categorical]  (long tensor)
        x_cont: [batch_size, num_continuous]  (float tensor)
        """
        # Embed categorical columns
        cat_embs = [emb(x_cat[:, i]) for i, emb in enumerate(self.cat_embeddings)]
        cat_embs = torch.stack(cat_embs, dim=1)  # [batch_size, num_categorical, d_model]

        # Process continuous features (if any)
        if self.num_continuous > 0:
            cont_emb = self.continuous_proj(x_cont).unsqueeze(1)  # [batch_size, 1, d_model]
            x = torch.cat([cat_embs, cont_emb], dim=1)  # total_tokens = num_categorical + 1
        else:
            x = cat_embs

        return x  # [batch_size, num_tokens, d_model]

class ResidualAttentionEncoderLayer(nn.Module):
    """
    Single transformer encoder layer with:
      - multi-head self-attention
      - residual attention integration: attn_out + alpha * prev_attn_out
      - residual connection + dropout + layernorm
      - feed-forward MLP (GeLU)
      - returns (out, attn_out) where attn_out can be passed to next layer
    """

    def __init__(self,
                 d_model: int,
                 nhead: int,
                 dim_feedforward: int = 2048,
                 dropout: float = 0.1,
                 attn_dropout: float = 0.0):
        super().__init__()
        self.d_model = d_model
        self.nhead = nhead

        self.self_attn = nn.MultiheadAttention(embed_dim=d_model, num_heads=nhead, dropout=attn_dropout, batch_first=True)

        # Learnable scalar for residual-attention gating
        self.alpha = nn.Parameter(torch.tensor(0.0))  # initialize near 0, model learns whether to use it

        # Residual dropout
        self.residual_dropout = nn.Dropout(dropout)

        # LayerNorms and FFN
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        self.ffn = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(dim_feedforward, d_model),
        )

        # final dropout on FFN output (already included inside ffn)
        self.final_dropout = nn.Dropout(dropout)

    def forward(self,
                x: torch.Tensor,
                attn_mask: Optional[torch.Tensor] = None,
                key_padding_mask: Optional[torch.Tensor] = None,
                prev_attn_out: Optional[torch.Tensor] = None
                ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        x: [batch, seq_len, d_model]
        attn_mask: optional causal or custom mask (seq_len, seq_len) or (batch, seq_len, seq_len) (PyTorch handles various shapes)
        key_padding_mask: optional (batch, seq_len) boolean mask for padding
        prev_attn_out: previous layer's attention output (same shape as attn_out) OR None
        returns: (out, attn_out)
        """

        # MultiheadAttention in batch_first mode: returns (attn_output, attn_weights)
        attn_out, attn_weights = self.self_attn(query=x,
                                               key=x,
                                               value=x,
                                               attn_mask=attn_mask,
                                               key_padding_mask=key_padding_mask,
                                               need_weights=True,
                                               average_attn_weights=False)  # attn_out: [batch, seq_len, d_model]
        # attn_weights shape: [batch, nhead, seq_len, seq_len]

        # Optionally integrate residual attention (previous layer's attn_out)
        if prev_attn_out is not None:
            # prev_attn_out is expected to be same shape as attn_out: [batch, seq_len, d_model]
            attn_out = attn_out + self.alpha * prev_attn_out

        # Residual + LayerNorm
        x = x + self.residual_dropout(attn_out)
        x = self.norm1(x)

        # Feed-forward
        ffn_out = self.ffn(x)               # [batch, seq_len, d_model]
        x = x + self.final_dropout(ffn_out)
        x = self.norm2(x)

        # Return the post-attention vector as the layer's 'attention output' for residual-attention chaining
        # We return attn_out (before residual-sum into x) so next layer can combine it
        return x, attn_out


class MaskedEncoderStack(nn.Module):
    """
    Stack of ResidualAttentionEncoderLayer that runs on a *subset* of tokens
    (e.g., only sender+receiver tokens). We pass prev_attn between layers to enable
    residual-attention chaining.
    """

    def __init__(self, n_layers: int, **layer_kwargs):
        super().__init__()
        self.layers = nn.ModuleList([ResidualAttentionEncoderLayer(**layer_kwargs) for _ in range(n_layers)])

    def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None, key_padding_mask: Optional[torch.Tensor] = None):
        """
        x: [batch, seq_len_subset, d_model]  (e.g., seq_len_subset==2 for sender+receiver)
        returns: output of stack, plus last layer's attn_out (so caller can optionally inspect)
        """
        prev_attn = None
        out = x
        for layer in self.layers:
            out, prev_attn = layer(out, attn_mask=attn_mask, key_padding_mask=key_padding_mask, prev_attn_out=prev_attn)
        return out, prev_attn


class FullEncoderStack(nn.Module):
    """
    Stack of ResidualAttentionEncoderLayer that runs on the full token sequence (unmasked)
    """

    def __init__(self, n_layers: int, **layer_kwargs):
        super().__init__()
        self.layers = nn.ModuleList([ResidualAttentionEncoderLayer(**layer_kwargs) for _ in range(n_layers)])

    def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None, key_padding_mask: Optional[torch.Tensor] = None):
        prev_attn = None
        out = x
        for layer in self.layers:
            out, prev_attn = layer(out, attn_mask=attn_mask, key_padding_mask=key_padding_mask, prev_attn_out=prev_attn)
        return out, prev_attn


class DualMaskedTransformerEncoder(nn.Module):
    """
    Implements the two-stage encoder:
     1) Masked encoder on subset tokens (sender+receiver)
        - we extract the two tokens, run the masked stack, then reinsert updated embeddings
     2) Full encoder on all tokens
    """

    def __init__(self,
                 d_model: int,
                 nhead: int,
                 n_layers_masked: int = 2,
                 n_layers_full: int = 2,
                 dim_feedforward: int = 2048,
                 dropout: float = 0.1,
                 attn_dropout: float = 0.0):
        super().__init__()

        layer_kwargs = dict(d_model=d_model,
                            nhead=nhead,
                            dim_feedforward=dim_feedforward,
                            dropout=dropout,
                            attn_dropout=attn_dropout)

        self.masked_encoder = MaskedEncoderStack(n_layers=n_layers_masked, **layer_kwargs)
        self.full_encoder = FullEncoderStack(n_layers=n_layers_full, **layer_kwargs)

    def forward(self,
                x: torch.Tensor,
                sender_idx: int,
                receiver_idx: int,
                key_padding_mask: Optional[torch.Tensor] = None
                ) -> torch.Tensor:
        """
        x: [batch, seq_len, d_model]
        sender_idx, receiver_idx: integer token indices (same for all samples in batch)
            - If token positions are different per example, you'd pass per-sample indices and use gather.
        key_padding_mask: optional (batch, seq_len) boolean mask
        returns: output after full encoder: [batch, seq_len, d_model]
        """

        batch, seq_len, d_model = x.shape
        assert d_model == self.full_encoder.layers[0].d_model

        # 1) Extract sender & receiver tokens as a small sequence [batch, 2, d_model]
        pair_indices = torch.tensor([sender_idx, receiver_idx], device=x.device)
        # For simplicity we assume same indices for all samples (common in tabular pipelines)
        x_pairs = x[:, pair_indices, :]  # [batch, 2, d_model]

        # (Optional) We could build a small attention mask for the pair (e.g., allow mutual attention) - not needed here
        masked_out, last_pair_attn = self.masked_encoder(x_pairs)  # [batch, 2, d_model]

        # 2) Reinsert updated pair embeddings back into the full sequence
        x_updated = x.clone()
        x_updated[:, sender_idx, :] = masked_out[:, 0, :]
        x_updated[:, receiver_idx, :] = masked_out[:, 1, :]

        # 3) Run full encoder (unmasked) to integrate pair features into context
        full_out, last_full_attn = self.full_encoder(x_updated, key_padding_mask=key_padding_mask)

        return full_out

d_model = 64

embedding_layer = TabularEmbedding(
    num_categories=num_categories,
    num_continuous=len(continuous_cols),
    d_model=d_model
)

transformer = DualMaskedTransformerEncoder(
    d_model=d_model,
    nhead=8,
    n_layers_masked=2,
    n_layers_full=2,
    dim_feedforward=256,
    dropout=0.1
)

# simple classifier head
class ClassifierHead(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(d_model, d_model//2),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(d_model//2, 1)
        )
    def forward(self, x):
        # Use mean pooling over sequence dimension
        x = x.mean(dim=1)
        print(x.shape)
        return self.fc(x).squeeze(-1)

classifier = ClassifierHead(d_model)
class TabAMLModel(nn.Module):
    def __init__(self, embedding_layer, transformer, classifier,
                 sender_idx=0, receiver_idx=1):
        super().__init__()
        self.embedding = embedding_layer
        self.transformer = transformer
        self.classifier = classifier
        self.sender_idx = sender_idx
        self.receiver_idx = receiver_idx

    def forward(self, x_cat, x_cont):
        # Step 1: embed both categorical and continuous features
        x = self.embedding(x_cat, x_cont)  # [batch, num_tokens, d_model]
        # Step 2: run dual transformer
        x = self.transformer(x, sender_idx=self.sender_idx, receiver_idx=self.receiver_idx)
        # Step 3: predict
        logits = self.classifier(x)
        return logits
model = TabAMLModel(embedding_layer, transformer, classifier)



In [4]:
train.Sender_account.describe()

count    7.429619e+06
mean     1.232359e+05
std      7.083507e+04
min      0.000000e+00
25%      6.226700e+04
50%      1.231110e+05
75%      1.845330e+05
max      2.458900e+05
Name: Sender_account, dtype: float64

In [5]:
train[categorical_cols].dtypes

hour                      int64
day_of_week               int64
is_weekend                int64
day                       int64
year                      int64
month                     int64
week                      int64
Sender_account            int64
Receiver_account          int64
Payment_currency          int64
Received_currency         int64
Sender_bank_location      int64
Receiver_bank_location    int64
Payment_type              int64
dtype: object

In [6]:
for col in categorical_cols:
    # Make sure no negative or non-integer values
    if train[col].dtype == 'int' or np.issubdtype(train[col].dtype, np.integer):
        train[col] -= train[col].min()
        val[col] -= val[col].min()
        test[col] -= test[col].min()

In [7]:
X_cat_train = torch.tensor(train[categorical_cols].values, dtype=torch.long)
X_cont_train = torch.tensor(train[continuous_cols].values, dtype=torch.float)
y_train = torch.tensor(train[target_col].values, dtype=torch.float)

X_cat_val = torch.tensor(val[categorical_cols].values, dtype=torch.long)
X_cont_val = torch.tensor(val[continuous_cols].values, dtype=torch.float)
y_val = torch.tensor(val[target_col].values, dtype=torch.float)

X_cat_test = torch.tensor(test[categorical_cols].values, dtype=torch.long)
X_cont_test = torch.tensor(test[continuous_cols].values, dtype=torch.float)
y_test = torch.tensor(test[target_col].values, dtype=torch.float)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [8]:
train.head(5)

Unnamed: 0,hour,day_of_week,is_weekend,day,year,month,week,Sender_account,Receiver_account,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Amount,Is_laundering
0,10,4,0,6,0,9,39,214498,162977,10,10,16,16,1,1459.15,0
1,10,4,0,6,0,9,39,36586,495398,10,1,16,15,5,6019.64,0
2,10,4,0,6,0,9,39,7222,259992,10,10,16,16,3,14328.44,0
3,10,4,0,6,0,9,39,132152,566042,10,10,16,16,0,11895.0,0
4,10,4,0,6,0,9,39,236371,224426,10,10,16,16,1,115.25,0


In [9]:
print(list(zip(categorical_cols, num_categories)))

[('hour', 24), ('day_of_week', 7), ('is_weekend', 2), ('day', 31), ('year', 2), ('month', 12), ('week', 52), ('Sender_account', 245891), ('Receiver_account', 589380), ('Payment_currency', 13), ('Received_currency', 13), ('Sender_bank_location', 18), ('Receiver_bank_location', 18), ('Payment_type', 7)]


In [10]:
print(X_cat_train.shape, X_cont_train.shape)
print(X_cat_val.shape, X_cont_val.shape)
print(X_cat_test.shape, X_cont_test.shape)

torch.Size([7429619, 14]) torch.Size([7429619, 1])
torch.Size([1044845, 14]) torch.Size([1044845, 1])
torch.Size([1030388, 14]) torch.Size([1030388, 1])


In [11]:
for i, col in enumerate(categorical_cols):
    max_idx = X_cat_train[:, i].max().item()
    min_idx = X_cat_train[:, i].min().item()
    if max_idx >= num_categories[i] or min_idx < 0:
        print(f"❌ Column '{col}' has out-of-range index [{min_idx}, {max_idx}] "
              f"but embedding size is {num_categories[i]}")

In [12]:
print(X_cat_train.shape, X_cat_train.dtype)
print(X_cont_train.shape, X_cont_train.dtype)
print(y_train.shape, y_train.dtype)

torch.Size([7429619, 14]) torch.int64
torch.Size([7429619, 1]) torch.float32
torch.Size([7429619]) torch.float32


In [13]:
# Create datasets
train_dataset = TensorDataset(X_cat_train, X_cont_train, y_train)
val_dataset = TensorDataset(X_cat_val, X_cont_val, y_val)

# Use DataLoader to handle batching
batch_size = 1024  # try 512 if memory is still low
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Training loop
model.train()
for epoch in range(10):
    total_loss = 0
    for X_cat_batch, X_cont_batch, y_batch in train_loader:
        optimizer.zero_grad()
        logits = model(X_cat_batch, X_cont_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(y_batch)
    
    avg_loss = total_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}")

torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size([1024, 64])
torch.Size(

In [15]:
import torch

# Save the model weights
torch.save(model.state_dict(), "model_weights.pth")