In [1]:
import math
import pandas as pd
import numpy as np
import copy
from einops import rearrange
from typing import List, Dict, Union
from argparse import Namespace

import torch
import torch.nn as nn
from torch import einsum
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

from operations.data import generate_dataset
from operations.data import generate_dataloader
from operations.embeds import Embedding
from operations.model import NewGELU
from operations.utils import generate_splits
from operations.utils import preprocess
from operations.utils import CutMix, Mixup

from sklearn.base import TransformerMixin
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# create dictionary for configuration settings
config = Namespace()

# where to store our train/val/test sets
config.train_csv_path = 'data/train/target/train_targets.csv'
config.train_y_csv_path = 'data/train/label/train_labels.csv'

config.val_csv_path = 'data/val/target/val_targets.csv'
config.val_y_csv_path = 'data/val/label/val_labels.csv'

config.test_csv_path = 'data/test/target/test_targets.csv'
config.test_y_csv_path = 'data/test/label/test_labels.csv'

In [3]:
# read in data
data = pd.read_csv('data/creditcard.csv')

# generate split indices
sup_train_indices, val_indices, test_indices, ssl_train_indices = generate_splits(data.shape[0])

# preprocess data
df_proc, y_proc, no_num, no_cat, cats = preprocess(data.drop(columns=['Class']), data.Class, 0)

In [4]:
# generate train/val/test sets
train_df, train_y = df_proc.iloc[sup_train_indices], y_proc.iloc[sup_train_indices]
val_df, val_y = df_proc.iloc[val_indices], y_proc.iloc[val_indices]
test_df, test_y = df_proc.iloc[test_indices], y_proc.iloc[test_indices]

In [5]:
# dataloader reads in files using their designated paths
train_dataset, val_dataset, test_dataset = generate_dataset(
                                            train_csv_path = config.train_csv_path,
                                            val_csv_path = config.val_csv_path,
                                            test_csv_path = config.test_csv_path,
                                            train_y_csv_path = config.train_y_csv_path,
                                            val_y_csv_path = config.val_y_csv_path,
                                            test_y_csv_path = config.test_y_csv_path)


# prepare our train, validation, and test loaders
train_loader, validation_loader, test_loader = generate_dataloader(train_bs=16, 
                                                                   val_bs=16, 
                                                                   num_workers=0, 
                                                                   data_paths=vars(config),
                                                                  )

In [6]:
class Xi_Pi(nn.Module):
    
    def __init__(self, config):
        super().__init__()
        self.cut_mix = CutMix(config)
        self.mix_up = Mixup(config)
        
        self.em_1 = Embedding(config.n_embd, config.no_num, config.no_cat, config.cats)
        self.em_2 = Embedding(config.n_embd, config.no_num, config.no_cat, config.cats)
        
    def forward(self, x):
        # embed batch
        pi = self.em_1(x)
        # embed cutmixed batch
        pi_prime_em = self.em_2(self.cut_mix(x))
        # mixup embedded cutmixed batch
        pi_prime = self.mix_up(pi_prime_em)
        
        return pi, pi_prime

In [7]:
class MultiHeadAttention(nn.Module):
    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()
        self.n_head = n_head
        
        self.w_qs = nn.Linear(d_model, n_head * d_k)
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)
        
        nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))
        
        self.fc = nn.Linear(n_head * d_v, d_model)
        nn.init.xavier_normal_(self.fc.weight)
        self.dropout = nn.Dropout(p=dropout)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, q, k, v, mask=None):
        residual = q
        q = rearrange(self.w_qs(q), 'b l (head k) -> head b l k', head=self.n_head)
        k = rearrange(self.w_ks(k), 'b t (head k) -> head b t k', head=self.n_head)
        v = rearrange(self.w_vs(v), 'b t (head v) -> head b t v', head=self.n_head)
        attn = torch.einsum('hblk,hbtk->hblt', [q, k]) / np.sqrt(q.shape[-1])
        if mask is not None:
            attn = attn.masked_fill(mask[None], -np.inf)
        attn = torch.softmax(attn, dim=3)
        output = torch.einsum('hblt,hbtv->hblv', [attn, v])
        output = rearrange(output, 'head b l v -> b l (head v)')
        output = self.dropout(self.fc(output))
        output = self.layer_norm(output + residual)
        return output, attn

In [8]:
class IntersampleAttention(nn.Module):
    
    def __init__(self, config):
        super().__init__()
        self.d_model = config.d_model
        self.n_head = config.n_head
        self.d_k = config.d_k
        self.d_v = config.d_v
        self.h_dim = config.h_dim
        
        self.to_qkv = nn.Linear(self.h_dim, 3 * self.h_dim) # [(B, T, 3*C)]
        self.fc = nn.Linear(self.h_dim, self.h_dim)
        
        nn.init.normal_(self.to_qkv.weight, mean=0, std=np.sqrt(2.0 / (self.h_dim + self.d_k)))
        nn.init.xavier_normal_(self.fc.weight)
        
    def attention(self, q, k, v, dropout=None):
        d_k = q.size(-1)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
        p_attn = F.softmax(scores, dim = -1)
        if dropout is not None:
            p_attn = dropout(p_attn)
        output = torch.matmul(p_attn, v)
        return output, p_attn          

    def intersample(self, q, k, v):
        q, k, v = map(lambda x: rearrange(x, 'b w (d h) -> () b h (w d)',h=self.n_head), (q, k, v))
        b, h, n, d = q.shape        
        output, attn = self.attention(q, k, v)
        output = output.squeeze(0)
        output = output.reshape(b, h, n, d)
        return output
        
    def forward(self, x):
        q, k, v = self.to_qkv(x).chunk(3, dim=-1)
        x = self.intersample(q, k, v).view(16, 31, 10)
        return self.fc(x)

In [9]:
class FeedForward(nn.Module):
    
    def __init__(self, config):
        super().__init__()
        
        self.feed_forward = nn.ModuleDict(dict(
            proj_1 = nn.Linear(config.n_embd, 20),
            proj_2 = nn.Linear(20, config.n_embd),
            dropout = nn.Dropout(0.1),
            activation = NewGELU()
            ))

        m = self.feed_forward

        self.mlpf = lambda x: m.proj_2(m.dropout(m.activation(m.proj_1(x))))
        
    def forward(self, x):
        return self.mlpf(x)

In [10]:
class SaintPipeline(nn.Module):
    
    def __init__(self, config):
        super().__init__()
        self.to_qkv = nn.Linear(config.n_embd, 3*config.n_embd)
        self.layer_norm = nn.LayerNorm(config.n_embd)
        self.multihead_attention = MultiHeadAttention(config.n_head, config.n_embd, config.d_k, config.d_v)
        self.FF1 = FeedForward(config)
        self.MISA = IntersampleAttention(config)
        self.FF2 = FeedForward(config)
        
    def forward(self, x):
        q, k, v = self.to_qkv(x).chunk(3, dim=-1)
        output, _ = self.multihead_attention(q, k, v)
        # compute multi-head attention
        z1 = self.layer_norm(output) + x
        z2 = self.layer_norm(self.FF1(z1)) + z1
        z2_attn = self.MISA(z2)
        z3 = self.layer_norm(z2_attn) + z2
        r = self.layer_norm(self.FF2(z3)) + z3
        return r

In [11]:
# Projection Head
class MLP(nn.Module):
    
    def __init__(self, config):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(config.d_model * config.n_embd, config.dim_head),
            nn.ReLU(),
        )
        
    def forward(self, x):
        proj_1 = self.mlp(x)
        return proj_1

In [12]:
class ContrastiveLoss(nn.Module):
    
    def __init__(self, config):
        super().__init__()
        self.proj_1 = MLP(config)
        self.proj_2 = MLP(config)
        
    def forward(self, ri, ri_prime):
        ri = ri.reshape(ri.shape[0], -1)
        ri_prime = ri_prime.reshape(ri_prime.shape[0], -1)
        
        zi = self.proj_1(ri)
        zi_prime = self.proj_2(ri_prime)
        
        z_prod = torch.mm(zi, torch.t(zi_prime)) / 0.7
        zi_exp = torch.exp(z_prod)
        zi_sum = torch.sum(zi_exp, dim=-1, keepdim=True)
        z_loss = -1.0 * torch.log(F.relu(torch.diag(zi_exp / zi_sum)) + 1e-7)
        return z_loss

In [13]:
class DenoisingLoss(nn.Module):
    
    def __init__(self, config):
        
        super().__init__()
        self.no_num = config.no_num
        self.no_cat = config.no_cat
        self.cats = config.cats
        self.h_dim = config.h_dim
        
        self.mse = nn.MSELoss()
        self.ce = nn.CrossEntropyLoss()

        self.num_loss = 0.0
        self.cat_loss = 0.0
            
    # To-do:
    # (1) loss compute for categorical features; add to num. loss for total
    # denoising loss
            
    def forward(self, x, ri_prime):
        
        # each MLP has a single perceptron layer with a ReLU non-linearity
        mlp_cat = nn.ModuleList()
        for i in range(1, self.no_cat): # one MLP for each cat. feat.
            mlp_cat.append(nn.Sequential(
                nn.Linear(self.n_dim, self.cats[i])
            ))        
        
        mlp_cont = nn.ModuleList()
        for i in range(1, self.no_num): # one MLP for each cont. feat.
            mlp_cont.append(nn.Sequential(
                nn.Linear(self.h_dim, 1),
                nn.ReLU()
            ))
        
        for idx in range(self.no_num - 1):
            ri_proj = mlp_cont[idx](ri_prime[:, idx, :])
            xi_feat = x[:, idx]

            self.num_loss += self.mse(ri_proj.squeeze().float(), xi_feat.float())
                
        return self.num_loss

In [14]:
#============== Configuration Settings ============
config.h_dim = 10
config.n_embd = 10
config.no_num = no_num
config.no_cat = no_cat
config.cats = cats
config.n_head = 2
config.resid_pdrop = 0.8
config.prob_cutmix = 0.3 # used in paper
config.mixup_alpha = 0.2 # used in paper
config.d_k = config.n_embd // config.n_head
config.scale = config.n_head ** -0.5
config.d_v = 31
config.dim_head = 16
config.inner_dim = config.n_head * config.dim_head
config.d_model = no_num + no_cat
config.mask = None
config.alpha = 1.0
config.attn_pdrop = 0.1
config.prob_cutmix = 0.8

In [15]:
class Model(nn.Module):
    
    def __init__(self, config):
        super().__init__()
        self.embedding = Xi_Pi(config)
        self.multihead_attention = MultiHeadAttention(config.n_head, config.n_embd, config.d_k, config.d_v)
        self.intersample_attention = IntersampleAttention(config)
        self.saint_pipeline = SaintPipeline(config)
        self.proj_1 = nn.Sequential(nn.Linear(config.n_embd, config.n_embd), nn.ReLU())
        self.proj_2 = nn.Sequential(nn.Linear(config.n_embd, config.n_embd, nn.ReLU()))
        self.contrastive_loss = ContrastiveLoss(config)
        self.denoising_loss = DenoisingLoss(config)

        
        self.to_qkv = nn.Linear(config.n_embd, 3 * config.n_embd)
        
    def forward(self, x):
        pi, pi_prime = self.embedding(x)
        query, key, value = self.to_qkv(pi).chunk(3, dim=-1)
        output, _ = self.multihead_attention(query, key, value)
        output = self.intersample_attention(output)
        ri = self.saint_pipeline(pi)
        ri_prime = self.saint_pipeline(pi_prime)
        zi = self.proj_1(ri).view(ri.size(0), -1)
        zi_prime = self.proj_2(ri_prime).view(ri_prime.size(0), -1)
        c_loss = self.contrastive_loss(zi, zi_prime)
        d_loss = self.denoising_loss(x, ri_prime)
        loss = c_loss + d_loss
        return loss

In [17]:
# determine the device we'll train on
device = 'cpu'

max_iters = 100

model = Model(config).to(device)
print("running on device", device)

# variables that will be assigned to trainer class later for logging and etc
iter_num = 0.0
iter_dt = 0.0

# optimizer
optimizer = torch.optim.Adam(model.parameters())

model.train()
data_iter = iter(train_loader)
while True:
    try:
        batch = next(data_iter)
    except StopIteration:
        data_iter = iter(train_loader)
        batch = next(data_iter)
    # place batch onto device
    batch = [t.to(device) for t in batch]
    x, y = batch
    
    loss = model(x)
    print(loss)
    
    # backprop and update the parameters
    model.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
    iter_num += 1
    
    if max_iters is not None and iter_num >= max_iters:
        break

running on device cpu
tensor([265.1386, 261.5462, 263.0639, 263.0174, 262.3326, 263.8615, 262.8708,
        262.2613, 263.6174, 263.4252, 262.8925, 262.2267, 262.7273, 261.2366,
        262.4987, 261.4953], grad_fn=<AddBackward0>)


RuntimeError: grad can be implicitly created only for scalar outputs

In [None]:
# hacked training loop for development testing
lir = []
lossi = []
stepi = []

In [None]:
parameters = [ri, ri_prime, zi, zi_prime, contrastive_loss, denoise_loss, t_loss]
print(sum(p.nelement() for p in parameters))
for p in parameters:
    p.requires_grad = True

data_iter = iter(train_loader)


for i in range(1):
    x, y = next(data_iter)
    
#     # backward pass
#     for p in parameters