In [4]:
import os
import pandas as pd
import numpy as np
import copy
import math
import torch
from torch import nn
from typing import Optional, Any
from fastprogress.fastprogress import master_bar, progress_bar

## TODO
- shuffle columns except class
- implement selfsupervised as in [TabNet](https://arxiv.org/pdf/1908.07442.pdf)
- implement mask
- pad datasets to same size and average all outputs and compute the regression on them, like  [this](https://keras.io/examples/nlp/text_classification_with_transformer/) OR the BERT approach, like [this](https://stackoverflow.com/questions/58123393/how-to-use-transformers-for-text-classification)
- regress for bounded target like [this](https://stats.stackexchange.com/questions/11985/how-to-model-bounded-target-variable) or [this](https://stackoverflow.com/questions/51693567/best-way-to-bound-outputs-from-neural-networks-on-reinforcement-learning)
- loss crossentropy or mse?

## TASK
- add more algorithms for regression with multitask
- finetune for tree depth, svm kernel, etc
- selfsupervise for data imputation (like in TabNet)
- finetune for best pre-processing pipeline
- inspect attention plots(?)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
class Encoder(nn.Module):
    
    def __init__(self, d_model, nhead, dim_feedforward=256, dropout=0.1):
        super(Encoder, self).__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = torch.nn.functional.relu

    def forward(self, src: torch.Tensor, src_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        src2 = self.self_attn(src, src, src, attn_mask=src_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

In [13]:
class AttentionMetaExtractor(nn.Module):
    
    def __init__(self, ninp, noutput, nhead=8, nhid=256, nlayers=2, dropout=.25):
        super(AttentionMetaExtractor, self).__init__()
        self.model_type = 'Transformer'
        self.norm = nn.BatchNorm1d(ninp, affine=False,     # no learnable parameters as different ..
                                   track_running_stats=False) # .. batchs means different data.
        encoder_block = Encoder(ninp, nhead, nhid)
        self.encoder = nn.ModuleList([copy.deepcopy(encoder_block) for _ in range(nlayers)])
        self.decoder = nn.Linear(ninp, nhid)
        self.ninp = ninp
        self.avg = nn.AvgPool1d(nhid)
        self.output = nn.Linear(nhid, noutput)
        
    def forward(self, src: torch.Tensor) -> torch.Tensor:
        src = self.norm(src)
        src = src * math.sqrt(self.ninp)
        for block in self.encoder:
            src = block(src)
        output = self.decoder(src)
        output = self.output(self.avg(output))
        return output

In [14]:
ninp = 500 # max number of columns in basedata
nhead = 5
noutput = 1 # number of algorithms accuracies being regressed

In [15]:
model = AttentionMetaExtractor(ninp, noutput, nhead).to(device)

In [16]:
model.eval()

AttentionMetaExtractor(
  (norm): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
  (encoder): ModuleList(
    (0): Encoder(
      (self_attn): MultiheadAttention(
        (out_proj): _LinearWithBias(in_features=500, out_features=500, bias=True)
      )
      (linear1): Linear(in_features=500, out_features=256, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=256, out_features=500, bias=True)
      (norm1): LayerNorm((500,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((500,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
    (1): Encoder(
      (self_attn): MultiheadAttention(
        (out_proj): _LinearWithBias(in_features=500, out_features=500, bias=True)
      )
      (linear1): Linear(in_features=500, out_features=256, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Lin

In [17]:
t = torch.rand(1, 2, 500).to(device)

In [18]:
a = model(t)

RuntimeError: mat1 dim 1 must match mat2 dim 0

In [19]:
xtrain = []
ytrain = []

In [20]:
path = "../samples/"
for f in progress_bar(os.listdir(path)):
    df = pd.read_csv(path+f)
    x = df.drop('class', axis=1).values.T
    y = f.split('_')[1]
    xtrain.append(x)
    ytrain.append(y)

In [41]:
batch = 32
def get_batch(i):
    data = torch.tensor(xtrain[i*batch:(i+1)*batch])
    target = torch.tensor(ytrain[i*batch:(i+1)*batch]).reshape(-1)
    return data, target

In [42]:
criterion = nn.CrossEntropyLoss()
lr = 0.1 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [43]:
import time
def train():
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    for batch, i in enumerate(range(0, len(xtrain) - 1)):
        data, targets = get_batch(i)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, batch), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [44]:
def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1):
            data, targets = get_batch(i)
            output = eval_model(data, src_mask)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

In [45]:
best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

ValueError: expected sequence of length 20 at dim 1 (got 16)