In [1]:
import math
import pandas as pd
import numpy as np
import copy
from einops import rearrange
from typing import List, Dict, Union
from argparse import Namespace

import torch
import torch.nn as nn
from torch import einsum
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

from operations.data import generate_dataset
from operations.data import generate_dataloader
from operations.embeds import Embedding
from operations.model import NewGELU
from operations.utils import generate_splits
from operations.utils import preprocess
from operations.utils import CutMix, Mixup

from sklearn.base import TransformerMixin
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw table from disk
data = pd.read_csv('data/creditcard.csv')

# Row-index sets for each split, sized by the number of rows in the table
(sup_train_indices,
 val_indices,
 test_indices,
 ssl_train_indices) = generate_splits(len(data))

# Separate the 'Class' column (target) from the features and preprocess both.
# NOTE(review): the meaning of the trailing positional 0 is defined by
# operations.utils.preprocess and is not visible here -- confirm.
df_proc, y_proc, no_num, no_cat, cats = preprocess(
    data.drop('Class', axis=1), data['Class'], 0)

In [3]:
# Slice the preprocessed features and labels with the positional split indices
train_df, val_df, test_df = (
    df_proc.iloc[rows]
    for rows in (sup_train_indices, val_indices, test_indices))
train_y, val_y, test_y = (
    y_proc.iloc[rows]
    for rows in (sup_train_indices, val_indices, test_indices))

In [4]:
# Shared settings object; it starts out holding only the CSV locations of the
# train/val/test feature tables ("targets") and label tables.
config = Namespace(
    train_csv_path='data/train/target/train_targets.csv',
    val_csv_path='data/val/target/val_targets.csv',
    test_csv_path='data/test/target/test_targets.csv',
    train_y_csv='data/train/label/train_labels.csv',
    val_y_csv='data/val/label/val_labels.csv',
    test_y_csv='data/test/label/test_labels.csv',
)

# Persist the preprocessed splits without the index column: features first ...
train_df.to_csv(config.train_csv_path, index=False)
val_df.to_csv(config.val_csv_path, index=False)
test_df.to_csv(config.test_csv_path, index=False)

# ... then the matching labels
train_y.to_csv(config.train_y_csv, index=False)
val_y.to_csv(config.val_y_csv, index=False)
test_y.to_csv(config.test_y_csv, index=False)

In [5]:
# Every path the dataset / dataloader helpers need, keyed by the keyword names
# those helpers accept
data_paths = dict(
    train_csv_path=config.train_csv_path,
    val_csv_path=config.val_csv_path,
    test_csv_path=config.test_csv_path,
    train_y_csv_path=config.train_y_csv,
    val_y_csv_path=config.val_y_csv,
    test_y_csv_path=config.test_y_csv,
)

# Datasets are built from the files at the designated paths
train_dataset, val_dataset, test_dataset = generate_dataset(**data_paths)

# Train, validation and test loaders (batch size 16, loading in the main process)
train_loader, validation_loader, test_loader = generate_dataloader(
    train_bs=16,
    val_bs=16,
    num_workers=0,
    data_paths=data_paths,
)

In [6]:
# initial configuration

# -- sizes coming out of preprocessing --
config.no_num, config.no_cat, config.cats = no_num, no_cat, cats
config.block_size = no_num + no_cat  # total number of feature columns

# -- embedding / attention geometry --
config.n_embd, config.n_head = 10, 2
config.d_k = config.n_embd // config.n_head
config.d_v = 10
config.dim_head = 16
config.inner_dim = config.n_head * config.dim_head
# NOTE(review): attention scales are usually derived from the per-head width,
# not the head count; this value is not read anywhere in view -- confirm.
config.scale = config.n_head ** -0.5

# -- regularisation / augmentation --
# NOTE(review): 0.8 is a very high dropout rate and the attention code in this
# notebook hard-codes 0.1 instead of reading this value -- confirm.
config.resid_pdrop = 0.8
config.prob_cutmix = 0.3  # used in paper
config.mixup_alpha = 0.2  # used in paper

## Self Supervised Pre-Training
<p align="center">
    <img width="500" height="350" src="media/media.jpg">
</p>

SAINT implements contrastive pre-training, in which the distance between two views of the same point is minimized while the distance between two different points is maximized. This strategy is coupled with denoising to perform pre-training on datasets with varied volumes of labeled data.

In [7]:
# Take element 0 of the first batch yielded by the training loader.
# presumably this is the feature tensor and the remaining elements are labels -- TODO confirm
x = next(iter(train_loader))[0] # (16, 31) per the original note, i.e. (batch, features) -- TODO confirm

The CutMix regularization strategy is used to augment samples in the input space, and mixup for samples in the embedding space. Specifically, mixup generates convex combinations of pairs of examples and their labels to regularize the NN to favor simple linear behaviour in-between training examples.

In [8]:
# Input-space (CutMix) and embedding-space (mixup) augmenters, parameterised
# by the values stored on config
cut_mix, mix_up = CutMix(config.prob_cutmix), Mixup(config.mixup_alpha)

Continuous and categorical features are projected into the higher dimensional embedding space before being passed through the transformer blocks. A separate single fully-connected layer with a ReLU nonlinearity is used for each continuous feature to project the 1-dimensional input into d-dimensional space.

In [9]:
# Two independently constructed embedding modules, one per view.
# NOTE(review): the original comment mentioned a "+1" for a <cls> token, but
# no +1 appears in these arguments -- confirm whether no_cat already counts it.
em_1, em_2 = (
    Embedding(config.n_embd, config.no_num, config.no_cat, config.cats)
    for _ in range(2))

# clean view: embed the batch as-is
pi = em_1(x)

# augmented view: CutMix in input space, embed, then mixup in embedding space
pi_prime_em = em_2(cut_mix(x))
pi_prime = mix_up(pi_prime_em)

## SAINT Architecture

<p align="center">
    <img width="255" height="200" src="media/saint_block.jpg">
</p>
Each layer has two attention blocks: one self-attention block, and one intersample attention block. The former is identical to the transformer block proposed by Vaswani et al., where the model takes in a sequence of feature embeddings and outputs contextual representations of the same dimension. The latter uses intersample attention in lieu of self-attention, that being the only difference in architecture between the two blocks.

In [10]:
# Self-Attention block
def self_attention(x, config, mask=False):
    
    # query, key, and value projections for all heads, but in a batch
    to_qkv = nn.Linear(config.n_embd, 3 * config.n_embd)
    
    # output projection
    c_proj = nn.Linear(config.n_embd, config.n_embd)
    
    # regularization
    attn_dropout = nn.Dropout(0.1)
    resid_dropout = nn.Dropout(0.1)
    
    # causal mask
    if mask == True:
        nn.Module.register_buffer = ("bias", torch.tril(
            torch.ones(config.block_size, config.block_size)).view(
        1, 1, config.block_size, config.block_size))
    
    # num heads
    h = config.n_head
        
    # calculate query, key, values for all heads in batch and move head forward to be the batch dim
    q, k, v = to_qkv(pi).chunk(3, dim=-1)
    q, k, v = map(lambda x: rearrange(x, 'b n (h d) -> b h n d', h=h), (q, k, v))

    att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
    attn = F.softmax(att, dim=-1)
    attn = attn_dropout(attn)
    
    y = att @ v
    y = y.transpose(1, 2).contiguous().view(16, 31, 10)
    
    # output projection
    y = resid_dropout(c_proj(y))
    
    return y

Intersample attention computes attention over samples rather than features.
<p align="center">
    <img width="1000" height="1000" src="media/intersample_attention.jpg">
</p>


In [11]:
# Intersample Attention block
def intersample_attention(x, config):
    x = rearrange(x, 'b w d -> () b (w d)')
    return self_attention(x, config)

In [12]:
# Feed Forward
def feed_forward(x, config):
    """Two-layer feed-forward network applied to the last axis of ``x``.

    Expands ``config.n_embd`` to 20 units, applies NewGELU followed by
    dropout (p=0.1), then projects back to ``config.n_embd``. All layers are
    created inside the call, so their weights are re-initialised every time.
    """
    expand = nn.Linear(config.n_embd, 20)
    contract = nn.Linear(20, config.n_embd)
    drop = nn.Dropout(0.1)
    gelu = NewGELU()

    hidden = expand(x)
    hidden = gelu(hidden)
    hidden = drop(hidden)
    return contract(hidden)

In [13]:
# Saint Block
def saint_block(x, config, n_layers=1):
    """Run ``x`` through ``n_layers`` stacked SAINT layers.

    Each layer applies, in order: self-attention, feed-forward, intersample
    attention and feed-forward. Every sub-layer output is layer-normalised
    and added to that sub-layer's own input (residual connection). The output
    of one layer is the input of the next.

    Args:
        x: embedded batch whose last axis has size ``config.n_embd``.
        config: object exposing ``n_embd`` plus whatever the attention and
            feed-forward helpers read.
        n_layers: number of layers to apply; values <= 0 return ``x``
            unchanged.

    Returns:
        Contextual representation with the same shape as ``x``.
    """
    r = x
    for _ in range(n_layers):
        # Normalise over the embedding axis with the default eps. (Passing
        # n_embd as the second positional argument would set eps=n_embd.)
        LN = nn.LayerNorm(config.n_embd)

        # residuals are taken from this layer's input, never from a global
        z1 = LN(self_attention(r, config)) + r
        z2 = LN(feed_forward(z1, config)) + z1
        z3 = LN(intersample_attention(z2, config)) + z2
        r = LN(feed_forward(z3, config)) + z3

    return r # contextual representation output corresponding to x

ri = saint_block(pi, config)
ri_prime = saint_block(pi_prime, config)

## Projection Heads
Outputs are passed through two projection heads, each consisting of an MLP with one hidden layer and a ReLU. The projection heads are in this case used to reduce dimensionality before computing contrastive loss.

In [14]:
# Flatten each representation to one vector per sample:
# (batch, features, embedding) -> (batch, features * embedding)
ri_ = rearrange(ri, 'b d n -> b (d n)')
ri_prime_ = rearrange(ri_prime, 'b a c -> b (a c)')

# One projection head per view. The input width is read from the flattened
# tensors instead of being hard-coded, so the cell keeps working when the
# number of features or the embedding size changes.
mlp = nn.ModuleDict(dict(
        proj = nn.Linear(ri_.shape[-1], config.d_k),
        activation = nn.ReLU(),
        dropout = nn.Dropout(0.1)))

mlp2 = nn.ModuleDict(dict(
        proj = nn.Linear(ri_prime_.shape[-1], config.d_k),
        activation = nn.ReLU(),
        dropout = nn.Dropout(0.1)))


def mlpf1(x):
    """Projection head for the clean view: linear -> ReLU -> dropout."""
    return mlp.dropout(mlp.activation(mlp.proj(x)))


def mlpf2(x):
    """Projection head for the augmented view: linear -> ReLU -> dropout."""
    return mlp2.dropout(mlp2.activation(mlp2.proj(x)))


zi = mlpf1(ri_)
zi_prime = mlpf2(ri_prime_)

## Loss Functions
For pretraining, contrastive and denoising losses between a given data point, and its views generated by CutMix and mixup, are minimized.

<p align="center">
    <img width="1000" height="1000" src="media/loss_functions.jpg">
</p>

In [16]:
def contrastive_loss(zi, zi_prime, temperature=0.7):
    """Contrastive loss between two batches of projected views.

    Row ``a`` of ``zi`` and row ``a`` of ``zi_prime`` form a positive pair;
    every other row of ``zi_prime`` acts as a negative for it. Similarities
    are dot products divided by ``temperature``. The loss is the sum over
    rows of the negative log-softmax probability assigned to the positive
    pair.

    The log-softmax is evaluated directly rather than as
    ``log(exp(.) / sum(exp(.)))``, which overflows to NaN for large
    similarities. Results differ from that formulation only by its 1e-7
    epsilon inside the log.

    Args:
        zi: tensor of shape (batch, dim).
        zi_prime: tensor of shape (batch, dim).
        temperature: positive divisor applied to the similarities.

    Returns:
        Scalar tensor holding the summed loss.
    """
    logits = (zi @ zi_prime.transpose(0, 1)) / temperature
    log_prob = F.log_softmax(logits, dim=-1)

    # positives sit on the diagonal of the (batch, batch) matrix
    return -1.0 * torch.sum(torch.diag(log_prob))
    

# contrastive loss between the projected clean view and the projected augmented view
c_loss = contrastive_loss(zi, zi_prime)
# .item() extracts the Python scalar for display
print("contrastive loss:\n", c_loss.item())

contrastive loss:
 42.96498489379883


In [17]:
def denoising_loss(xi, ri, ri_prime, config, cats, no_cat, no_num):
    """Reconstruction loss of the original features from the augmented view.

    Each feature of ``xi`` is predicted from the matching slice of
    ``ri_prime`` by its own small head: cross-entropy for categorical
    columns, mean squared error for numerical columns. The heads are created
    inside the call, so their weights are freshly initialised every time.

    Args:
        xi: original batch, read as ``xi[:, feature]``.
        ri: representation of the clean view. Not used by the computation;
            kept so that existing call sites stay valid.
        ri_prime: representation of the augmented view, read as
            ``ri_prime[:, feature, :]``; its last axis must have size
            ``config.n_embd``.
        config: object exposing ``n_embd``.
        cats: ``cats[i]`` is the number of classes of categorical column ``i``.
        no_cat: number of categorical columns, counting the [cls] column at
            index 0, which is skipped.
        no_num: number of numerical columns; they follow the categorical ones.

    Returns:
        Sum of the numerical and categorical losses (a scalar tensor, or the
        float ``0.0`` when there is nothing to reconstruct).
    """
    mse = nn.MSELoss()
    ce = nn.CrossEntropyLoss()

    # one linear classifier per categorical column; column 0 ([cls]) has none,
    # so the head for column i lives at position i - 1
    cat_mlps = nn.ModuleList(
        [nn.Linear(config.n_embd, cats[i]) for i in range(1, no_cat)])

    # one regressor per numerical column; deep copies of a single template,
    # so all of them start from identical weights
    # NOTE(review): the trailing ReLU restricts predictions to >= 0; confirm
    # this is intended if the numerical inputs can be negative.
    num_mlp = nn.Sequential(
        nn.Linear(config.n_embd, 1), nn.ReLU())
    num_mlps = nn.ModuleList(
        [copy.deepcopy(num_mlp) for _ in range(no_num)])

    num_loss = 0.0
    cat_loss = 0.0

    for feat_idx in range(1, no_cat): # exclude [cls]
        # integer indexing already removes the feature axis; no squeeze is
        # applied so that a batch of one keeps its batch axis
        logits = cat_mlps[feat_idx - 1](ri_prime[:, feat_idx, :]) # BS x n_classes

        target = xi[:, feat_idx] # BS

        cat_loss += ce(logits.float(), target.long())

    for feat_idx in range(no_num):
        idx = no_cat + feat_idx

        # get the mlp for the feature
        pred = num_mlps[feat_idx](ri_prime[:, idx, :]) # BS x 1

        target = xi[:, idx] # BS

        # only the trailing singleton axis is removed (safe for BS == 1)
        num_loss += mse(pred.squeeze(-1).float(), target.float())

    return num_loss + cat_loss

In [19]:
# total pre-training objective: contrastive term + denoising term
L_pretraining = contrastive_loss(zi, zi_prime) + denoising_loss(x, ri, ri_prime, config, cats, no_cat, no_num)
print(L_pretraining)