In [1]:
import os
import pandas as pd
import numpy as np
import copy
import math
import torch
from torch import nn
from typing import Optional, Any
from fastprogress.fastprogress import master_bar, progress_bar
from sklearn.model_selection import train_test_split

## TODO
- implement selfsupervised as in [TabNet](https://arxiv.org/pdf/1908.07442.pdf)
- implement mask?
- pad datasets to same size and average all outputs and compute the regression on them, like  [this](https://keras.io/examples/nlp/text_classification_with_transformer/) OR the BERT approach, like [this](https://stackoverflow.com/questions/58123393/how-to-use-transformers-for-text-classification)
- regress for bounded target like [this](https://stats.stackexchange.com/questions/11985/how-to-model-bounded-target-variable) or [this](https://stackoverflow.com/questions/51693567/best-way-to-bound-outputs-from-neural-networks-on-reinforcement-learning)
- loss crossentropy or mse?
- add more algorithms for regression with multitask
- finetune for tree depth, svm kernel, etc
- selfsupervise for data imputation (like in TabNet)
- finetune for best pre-processing pipeline
- inspect attention plots(?)

In [2]:
X = []
y = []

In [3]:
path = "../samples_train/"
files = os.listdir(path)
for f in progress_bar(files):
    df = pd.read_csv(path+f)
    data = df.values.astype(float).T
    target = float(f.split('_')[-2])
    X.append(data)
    y.append(target)

In [4]:
xtrain, xvalid, ytrain, yvalid = train_test_split(X, y)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
class Encoder(nn.Module):
    
    def __init__(self, d_model, nhead, dim_feedforward=256, dropout=0.1):
        super(Encoder, self).__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = torch.nn.functional.relu

    def forward(self, src: torch.Tensor, src_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        src2 = self.self_attn(src, src, src, attn_mask=src_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

In [7]:
class AttentionMetaExtractor(nn.Module):
    
    def __init__(self, ninp, noutput, nhead=5, nhid=256, nlayers=10, dropout=.25):
        super(AttentionMetaExtractor, self).__init__()
        self.model_type = 'Transformer'
#         self.norm = nn.BatchNorm1d(ninp, affine=False,     # no learnable parameters as different ..
#                                track_running_stats=False) # .. batchs means different data.
        encoder_block = Encoder(ninp, nhead, nhid)
        self.encoder = nn.ModuleList([copy.deepcopy(encoder_block) for _ in range(nlayers)])
        self.decoder = nn.Linear(ninp, nhid)
        self.ninp = ninp
        self.output = nn.Linear(nhid, noutput)
        
    def forward(self, src: torch.Tensor) -> torch.Tensor:
#         src = self.norm(src) # data already normalized
        output = src * math.sqrt(self.ninp)
        for block in self.encoder:
            output = block(output)
        output = torch.mean(output, dim = 1)
        output = self.decoder(output)
        output = self.output(output)
        return output

In [8]:
ninp = 128 # number of rows in base data
nhead = 8
noutput = 1 # number of algorithms accuracies being regressed
nhid = 64

In [9]:
model = AttentionMetaExtractor(ninp, noutput, nhead, nhid).to(device).float()

In [10]:
model.eval()

AttentionMetaExtractor(
  (encoder): ModuleList(
    (0): Encoder(
      (self_attn): MultiheadAttention(
        (out_proj): _LinearWithBias(in_features=128, out_features=128, bias=True)
      )
      (linear1): Linear(in_features=128, out_features=64, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=64, out_features=128, bias=True)
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
    (1): Encoder(
      (self_attn): MultiheadAttention(
        (out_proj): _LinearWithBias(in_features=128, out_features=128, bias=True)
      )
      (linear1): Linear(in_features=128, out_features=64, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=64, out_features=128, bias=True)
      (norm1): LayerNorm((128,), eps=1e-05, ele

In [11]:
def get_batch(i, data, target):
    data = torch.tensor(data[i]).unsqueeze(0).to(device).float()
    target = torch.tensor(target[i]).reshape(-1).to(device).float()
    return data, target

In [12]:
a, b = get_batch(101, xtrain, ytrain)
print(a, a.shape)
print(b, b.shape)

tensor([[[ 0.5977, -0.8230, -0.7769,  ...,  0.0525, -0.6527,  0.5745],
         [ 0.1358, -0.5994, -0.4055,  ..., -0.3641, -0.2863,  0.3330],
         [-0.1603, -0.5288, -0.2194,  ...,  0.1150,  0.2108, -0.5212],
         ...,
         [-0.6785,  0.6437,  0.8022,  ...,  0.8378, -0.2926,  0.3217],
         [-0.7252,  0.8357,  0.3426,  ...,  0.8005, -0.0738,  0.3687],
         [-0.1428,  0.4012, -0.9255,  ..., -1.0973,  0.0720,  0.2380]]],
       device='cuda:0') torch.Size([1, 10, 128])
tensor([0.8680], device='cuda:0') torch.Size([1])


In [13]:
res = model(a)
print(res)
# print(torch.mean(res, 2), torch.mean(res, 2).shape)
# print(torch.std(res, 2), torch.std(res, 2).shape)
# print(res.shape)

tensor([[0.1113]], device='cuda:0', grad_fn=<AddmmBackward>)


In [14]:
#criterion = nn.BCELoss()
criterion = nn.MSELoss()
lr = 0.01 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [15]:
import time
def train():
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    for batch, i in enumerate(range(len(xtrain))):
        data, targets = get_batch(i, xtrain, ytrain)
        optimizer.zero_grad()
        output = model(data)[0]
        loss = criterion(output, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 100
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.3f} | ppl {:8.3f}'.format(
                    epoch, batch, len(xtrain), scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0

            start_time = time.time()

In [16]:
xtest = xtrain # TODO: alterar treino teste

def evaluate(eval_model):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    with torch.no_grad():
        for i in range(len(xvalid)): 
            data, targets = get_batch(i, xvalid, yvalid)
            output = eval_model(data)
            output_flat = output[0]
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(xtest) - 1)

In [17]:
best_val_loss = float("inf")
epochs = 20 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.3f} | '
          'valid ppl {:8.3f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()



| epoch   1 |   100/ 1270 batches | lr 0.00 | ms/batch 33.04 | loss 0.533 | ppl    1.704
| epoch   1 |   200/ 1270 batches | lr 0.00 | ms/batch 32.58 | loss 0.255 | ppl    1.290
| epoch   1 |   300/ 1270 batches | lr 0.00 | ms/batch 31.91 | loss 0.087 | ppl    1.091
| epoch   1 |   400/ 1270 batches | lr 0.00 | ms/batch 31.18 | loss 0.038 | ppl    1.039
| epoch   1 |   500/ 1270 batches | lr 0.00 | ms/batch 30.79 | loss 0.026 | ppl    1.026
| epoch   1 |   600/ 1270 batches | lr 0.00 | ms/batch 31.04 | loss 0.030 | ppl    1.030
| epoch   1 |   700/ 1270 batches | lr 0.00 | ms/batch 31.04 | loss 0.029 | ppl    1.030
| epoch   1 |   800/ 1270 batches | lr 0.00 | ms/batch 31.19 | loss 0.023 | ppl    1.024
| epoch   1 |   900/ 1270 batches | lr 0.00 | ms/batch 31.03 | loss 0.031 | ppl    1.031
| epoch   1 |  1000/ 1270 batches | lr 0.00 | ms/batch 31.11 | loss 0.024 | ppl    1.025
| epoch   1 |  1100/ 1270 batches | lr 0.00 | ms/batch 31.56 | loss 0.028 | ppl    1.028
| epoch   1 |  1200/ 

| epoch   7 |   400/ 1270 batches | lr 0.00 | ms/batch 30.81 | loss 0.022 | ppl    1.022
| epoch   7 |   500/ 1270 batches | lr 0.00 | ms/batch 31.71 | loss 0.017 | ppl    1.018
| epoch   7 |   600/ 1270 batches | lr 0.00 | ms/batch 31.32 | loss 0.018 | ppl    1.018
| epoch   7 |   700/ 1270 batches | lr 0.00 | ms/batch 31.60 | loss 0.019 | ppl    1.019
| epoch   7 |   800/ 1270 batches | lr 0.00 | ms/batch 31.18 | loss 0.017 | ppl    1.017
| epoch   7 |   900/ 1270 batches | lr 0.00 | ms/batch 31.83 | loss 0.021 | ppl    1.021
| epoch   7 |  1000/ 1270 batches | lr 0.00 | ms/batch 34.32 | loss 0.022 | ppl    1.022
| epoch   7 |  1100/ 1270 batches | lr 0.00 | ms/batch 32.14 | loss 0.021 | ppl    1.021
| epoch   7 |  1200/ 1270 batches | lr 0.00 | ms/batch 32.21 | loss 0.015 | ppl    1.015
-----------------------------------------------------------------------------------------
| end of epoch   7 | time: 43.36s | valid loss 0.007 | valid ppl    1.007
-----------------------------------

| epoch  13 |   700/ 1270 batches | lr 0.00 | ms/batch 31.08 | loss 0.018 | ppl    1.018
| epoch  13 |   800/ 1270 batches | lr 0.00 | ms/batch 31.09 | loss 0.016 | ppl    1.017
| epoch  13 |   900/ 1270 batches | lr 0.00 | ms/batch 30.81 | loss 0.021 | ppl    1.022
| epoch  13 |  1000/ 1270 batches | lr 0.00 | ms/batch 30.71 | loss 0.019 | ppl    1.020
| epoch  13 |  1100/ 1270 batches | lr 0.00 | ms/batch 30.68 | loss 0.017 | ppl    1.018
| epoch  13 |  1200/ 1270 batches | lr 0.00 | ms/batch 30.85 | loss 0.013 | ppl    1.013
-----------------------------------------------------------------------------------------
| end of epoch  13 | time: 41.84s | valid loss 0.007 | valid ppl    1.007
-----------------------------------------------------------------------------------------
| epoch  14 |   100/ 1270 batches | lr 0.00 | ms/batch 31.61 | loss 0.023 | ppl    1.023
| epoch  14 |   200/ 1270 batches | lr 0.00 | ms/batch 30.86 | loss 0.021 | ppl    1.021
| epoch  14 |   300/ 1270 batches 

| epoch  19 |  1000/ 1270 batches | lr 0.00 | ms/batch 30.99 | loss 0.019 | ppl    1.020
| epoch  19 |  1100/ 1270 batches | lr 0.00 | ms/batch 30.79 | loss 0.018 | ppl    1.018
| epoch  19 |  1200/ 1270 batches | lr 0.00 | ms/batch 31.16 | loss 0.013 | ppl    1.013
-----------------------------------------------------------------------------------------
| end of epoch  19 | time: 41.86s | valid loss 0.007 | valid ppl    1.007
-----------------------------------------------------------------------------------------
| epoch  20 |   100/ 1270 batches | lr 0.00 | ms/batch 31.20 | loss 0.023 | ppl    1.023
| epoch  20 |   200/ 1270 batches | lr 0.00 | ms/batch 30.67 | loss 0.021 | ppl    1.021
| epoch  20 |   300/ 1270 batches | lr 0.00 | ms/batch 31.28 | loss 0.018 | ppl    1.019
| epoch  20 |   400/ 1270 batches | lr 0.00 | ms/batch 31.00 | loss 0.020 | ppl    1.020
| epoch  20 |   500/ 1270 batches | lr 0.00 | ms/batch 30.58 | loss 0.017 | ppl    1.017
| epoch  20 |   600/ 1270 batches 

In [18]:
from sklearn.metrics import mean_absolute_error
true = []
pred = []
for i in range(10):
    a,b = get_batch(i, xvalid, yvalid)
    b = b[0].cpu().detach().numpy()
    ev = model(a)[0][0].cpu().detach().numpy()
    print(ev, b)
    true.append(b)
    pred.append(ev)
print("MAE:", mean_absolute_error(true, pred))

0.8510657 0.868
0.99176854 0.954
0.8503028 0.851
0.8623636 0.766
0.8887348 0.961
0.8396364 0.633
0.8843679 0.766
0.93044865 0.961
0.8563968 0.907
0.8071856 0.992
MAE: 0.08150021
