In [1]:
import sys
sys.path.append('../')

In [2]:
import random

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, auc, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.model_selection import train_test_split

In [4]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset 

In [5]:
from mlpack.utils import to_device, to_fp16
from mlpack.trainer import TrainArgs, BaseTrainer

# Data

In [6]:
df = pd.read_csv('../datasets/ENEL/dataset.csv', sep='\t', index_col=0)
df.head()

Unnamed: 0_level_0,2014-01-01 00:00:00,2014-01-02 00:00:00,2014-01-03 00:00:00,2014-01-04 00:00:00,2014-01-05 00:00:00,2014-01-06 00:00:00,2014-01-07 00:00:00,2014-01-08 00:00:00,2014-01-09 00:00:00,2014-01-10 00:00:00,...,2016-10-23 00:00:00,2016-10-24 00:00:00,2016-10-25 00:00:00,2016-10-26 00:00:00,2016-10-27 00:00:00,2016-10-28 00:00:00,2016-10-29 00:00:00,2016-10-30 00:00:00,2016-10-31 00:00:00,flag
CONS_NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0387DD8A07E07FDA6271170F86AD9151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.305338,0.306095,0.360579,0.207343,0.331067,0.351877,0.285285,0.34658,0.255016,1
01D6177B5D4FFE0CABA9EF17DAFC2B84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4B75AC4F2D8434CFF62DB64D0BB43103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.963074,1
B32AC8CC6D5D805AC053557AB05F5343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.497605,0.568034,0.490694,0.644715,0.684207,0.590742,0.633854,0.475885,0.38571,1
EDFC78B07BA2908B3395C4EB2304665E,0.106331,0.248451,0.318474,0.128116,0.143158,0.233409,0.201251,0.146789,0.139008,0.133303,...,0.48601,0.39524,0.27283,0.272311,0.304988,0.345965,0.520762,0.467337,0.408207,1


In [148]:
df_train, df_valid = train_test_split(df, random_state=1, test_size=0.2, shuffle=True)

In [149]:
data_train = df_train.values
data_valid = df_valid.values

In [150]:
train2valid_indices = np.load('train2valid.npy')
valid2train_indices = np.load('valid2train.npy')

In [151]:
train2valid_data = data_train[train2valid_indices]

In [152]:
valid2train_data = data_valid[valid2train_indices]

In [153]:
valid2train_data.shape

(392, 1035)

In [154]:
data_train = np.delete(data_train, train2valid_indices, axis=0)
data_train = np.concatenate((data_train, valid2train_data), axis=0)

In [155]:
data_valid = np.delete(data_valid, valid2train_indices, axis=0)
data_valid = np.concatenate((data_valid, train2valid_data), axis=0)

In [157]:
x_train = data_train[:,:-1]
y_train = data_train[:,-1]

x_valid = data_valid[:,:-1]
y_valid = data_valid[:,-1]

In [158]:
x_train = np.concatenate(([2*[0] for _ in range(len(x_train))], x_train), axis=1)
x_valid = np.concatenate(([2*[0] for _ in range(len(x_valid))], x_valid), axis=1)

In [159]:
x_train.shape, x_valid.shape

((33897, 1036), (8475, 1036))

In [160]:
x_train = x_train.reshape(len(x_train), -1, 7)
x_train = np.expand_dims(x_train, 1)

x_valid = x_valid.reshape(len(x_valid), -1, 7)
x_valid = np.expand_dims(x_valid, 1)

In [161]:
x_train = x_train.transpose(0, 1, 3, 2)

In [162]:
x_valid = x_valid.transpose(0, 1, 3, 2)

In [163]:
len(x_valid), len(x_train)

(8475, 33897)

# Dataset

In [52]:
class ENELDataset(Dataset):
    def __init__(self, x, y):
        self.x, self.y = x, y
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        x, y = self.x[idx], self.y[idx]        
        return x.astype(np.float32), y.astype(np.int64)

In [53]:
ds_train = ENELDataset(x_train, y_train)
ds_valid = ENELDataset(x_valid, y_valid)

In [54]:
dl_train = DataLoader(ds_train, batch_size=64, shuffle=False, pin_memory=True, num_workers=4)
dl_valid = DataLoader(ds_valid, batch_size=64, shuffle=False, pin_memory=True, num_workers=4)

# Models

In [25]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'

In [29]:
class SelfAttentionConv(nn.Module):
    
    def __init__(self, *args, **kwargs):
        super().__init__()
        
        self.query = nn.Conv2d(*args, **kwargs)
        self.key = nn.Conv2d(*args, **kwargs)
        self.value = nn.Conv2d(*args, **kwargs)
        
        self.d = self.query.out_channels**(0.5)
        
    def forward(self, *args, **kwargs):
        query = self.query(*args, **kwargs)
        key = self.key(*args, **kwargs)
        value = self.value(*args, **kwargs)
        
        attention = torch.matmul(query, key.transpose(-2, -1))
        attention = attention / self.d
        attention = nn.Softmax(dim=-1)(attention)
        attention = torch.matmul(attention, value)
        return attention

In [30]:
class MySelfConvModel(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            self.block(1, 90),
            self.block(90,90),
            nn.MaxPool2d(3)
        )
        self.classifier = nn.Linear(90 * 2 * 49, 2)
        
    def block(self, in_channels, out_channels, dropout=0.1):
        return nn.Sequential(
            SelfAttentionConv(in_channels, out_channels, kernel_size=3, padding=1),  # n x out_channels x m x out_features
            nn.ReLU(),
            nn.Dropout(dropout),
        )
    
    def forward(self, x):
        o = self.net(x)
        o = o.view(x.shape[0], -1)
        o = self.classifier(o)
        return o

In [31]:
model = MySelfConvModel()

In [32]:
model.to(device)

MySelfConvModel(
  (net): Sequential(
    (0): Sequential(
      (0): SelfAttentionConv(
        (query): Conv2d(1, 90, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (key): Conv2d(1, 90, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (value): Conv2d(1, 90, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (1): ReLU()
      (2): Dropout(p=0.1, inplace=False)
    )
    (1): Sequential(
      (0): SelfAttentionConv(
        (query): Conv2d(90, 90, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (key): Conv2d(90, 90, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (value): Conv2d(90, 90, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (1): ReLU()
      (2): Dropout(p=0.1, inplace=False)
    )
    (2): MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Linear(in_features=8820, out_features=2, bias=True)
)

In [33]:
x, y = next(iter(dl_train))

In [34]:
with torch.no_grad():
    o = model(x.to(device))
o.shape

torch.Size([64, 2])

# Eval

In [35]:
import numpy as np

def precision_at_k(y_true,class_probs,k,threshold=0.5,class_of_interest=1,isSorted=False):

    if (not isSorted):

        # Selecting the probs on the class of interest (coi)
        coi_probs = class_probs[:,class_of_interest]

        # print('Selecionando as probs da classe de interesse:',class_of_interest)
        # print(coi_probs)

        # print('Ordem de entrada dos targets')
        # print(y_true)

        # Sorting the coi probs and selecting the top k
        # print('Ordenando as probs e os targets correspondentes:')
        sorted_coi_probs = np.sort(coi_probs)[::-1]
        sorted_y = y_true[np.argsort(coi_probs)[::-1]]
        
        # Calculating the precision for 
        # print(sorted_coi_probs)
        # print(sorted_y)

    else:
        # Code otimization so the algorithm doesn't have to sort the data twice
        sorted_coi_probs = class_probs
        sorted_y = y_true


    # Selecting the top k probs and their respective labels
    sorted_coi_probs = sorted_coi_probs[:k]
    sorted_y = sorted_y[:k]

    # print('Selecionando as top',k,'probs')
    # print(sorted_coi_probs)
    # print(sorted_y)

    # Atributing class based on the threshold
    sorted_predicted_classes = np.where(sorted_coi_probs>threshold,
                                        float(class_of_interest),
                                        0.0)

    # print('Atribuindo a classe baseada no threshold de ',threshold)
#     print(sorted_predicted_classes)

    # print('Precisão em K:')

    # print(k)

    precisionK = np.sum(sorted_predicted_classes == sorted_y)/k  

    return precisionK

def map_at_N(y_true,class_probs,N,thrs=0.5,class_of_interest=1):

    # Calls the precision at k function with the values already sorted and calculates the average precision
    # weighted by the number of positive classes inside the sample

    # Declaring the list to hold the precisions
    pks = []

    # Selecting the probs on the class of interest (coi)
    coi_probs = class_probs[:,class_of_interest]

    # Sorting the class of interest and its respective label
    sorted_coi_probs = np.sort(coi_probs)[::-1]
    sorted_y = y_true[np.argsort(coi_probs)[::-1]]

    # Selecting the top N scores 
    sorted_coi_probs = sorted_coi_probs[:N]
    sorted_y = sorted_y[:N]


    ## TESTANDO APENAS ##
    sorted_y[-1] = 1 

    # Identifying the positions of the class of interest inside the top N
    top_coi_indexes = np.argwhere(sorted_y>0)

    for value in top_coi_indexes:
        
        # Adjusting the index
        limite = value[0] + 1
        
        pks.append(
                    precision_at_k(sorted_y[:limite],
                    sorted_coi_probs[:limite],
                    limite,threshold=thrs,isSorted=True)
                    )

    pks = np.array(pks)
    
    # print(pks)
    # print(sorted_coi_probs)
    # print(sorted_y)
    return pks.mean()

In [42]:
class MyTrainer(BaseTrainer):
    
    @staticmethod
    def dataloader_generator(dataloader):
        for x, y in dataloader:
#             x = x.unsqueeze(-1)
            x, y = to_device(x, y, device=device)
            yield {
                'inputs': {
                    'x': x
                },
                'targets': {
                    'y': y
                }
            }
        
    @staticmethod
    def loss_from_model(model_output, targets, loss_fn=None):
        y = targets['y']
        return loss_fn(model_output, y)
    
    def evaluate_fn(self, model, dataloader, loss_fn):
        model.eval()
        losses = []
        preds = []
        trues = []
        probs = []
        dl_gen = self.dataloader_generator(dataloader)
        for batch in self.tqdm(dl_gen, leave=False, desc='Eval...', total=len(dataloader)):
            inputs = batch['inputs']
            targets = batch['targets']

            with torch.no_grad():
                o = model(**inputs)

            loss = self.loss_from_model(o, targets, loss_fn)

            y = targets['y']
            
            probs += torch.softmax(o, dim=-1).detach().cpu().numpy().tolist()
            preds += o.argmax(1).detach().cpu().numpy().tolist()
            trues += y.detach().cpu().numpy().tolist()
            losses.append(loss.item())

        acc = accuracy_score(trues, preds)
        f1 = f1_score(trues, preds)
        conf = confusion_matrix(trues, preds)
        map100 = map_at_N(np.array(trues), np.array(probs), 100)
#         return trues, preds, probs
        roc = roc_auc_score(np.array(trues), np.array(probs)[:,1])

        both = sorted([
            (x, y) for x, y in zip(preds, trues)
        ], key=lambda x:x[0], reverse=True)

        p = [x for x,_ in both]
        t = [x for _,x in both]

        print('--- Validation ---')
        print(f'F1 = {f1}\t Acc = {acc}')
        print(f'AUC = {auc(p, t)}')
        print(f'MAP@100 = {map100}')
        print(f'ROC = {roc}')
        print(conf)
        return trues, preds, probs
#         return np.array(losses).mean(), map100

# Optim

In [40]:
loss_fn = nn.CrossEntropyLoss(torch.tensor([1., 1.]).to(device))
# optimizer = torch.optim.AdamW(model.parameters(), lr=5e-3, weight_decay=2e-3)

In [56]:
args = TrainArgs(40, 'enel_selfatt_conv_5.ckp')

In [57]:
trainer = MyTrainer(grad_steps=1)

In [39]:
model.load_state_dict(torch.load('enel_selfatt_conv_4.ckp'), strict=False)

<All keys matched successfully>

In [129]:
trues, preds, probs = trainer.evaluate_fn(model, dl_valid, loss_fn)

HBox(children=(IntProgress(value=0, description='Eval...', max=133, style=ProgressStyle(description_width='ini…

--- Validation ---
F1 = 0.33308769344141487	 Acc = 0.8932153392330383
AUC = 0.5
MAP@100 = 0.8815388025412902
ROC = 0.7280866531955057
[[7344  392]
 [ 513  226]]


In [130]:
probs = np.array(probs)
preds = np.array(preds)
trues = np.array(trues)

In [131]:
probs_one = probs[:,0]

In [132]:
index_array = np.argsort(probs_one)

In [133]:
index_wrong = (trues == 0) * (preds == 1)

In [134]:
index_ones = index_array[index_wrong]

In [135]:
quantity = min(1000, len(index_ones))
quantity

392

In [136]:
valid_indices_to_change = index_ones[-quantity:] # indices da matriz x_valid que vao para o x_train

In [137]:
# estes sao os casos negativos mais dificeis na validacao
probs_one[valid_indices_to_change]

array([6.03977351e-05, 3.99658224e-04, 1.79992593e-03, 1.15273781e-02,
       3.31634693e-02, 4.33542095e-02, 4.46182042e-02, 6.24680221e-02,
       6.74234480e-02, 7.45389834e-02, 8.40923786e-02, 8.89534429e-02,
       1.00364141e-01, 1.03053436e-01, 1.04054295e-01, 1.20626010e-01,
       1.24012709e-01, 1.69554651e-01, 1.91736534e-01, 2.06555083e-01,
       2.13539004e-01, 2.49864191e-01, 2.63735145e-01, 2.65315384e-01,
       3.25726688e-01, 3.46384883e-01, 3.52418214e-01, 3.71508002e-01,
       3.74995768e-01, 4.01283652e-01, 4.17111039e-01, 4.87894833e-01,
       4.91992354e-01, 5.03432155e-01, 5.26976764e-01, 5.46619773e-01,
       5.53456187e-01, 5.80790162e-01, 5.89053154e-01, 5.90115964e-01,
       5.91422319e-01, 5.92587829e-01, 6.21383965e-01, 6.32724941e-01,
       6.51924372e-01, 6.55579746e-01, 6.76196277e-01, 6.81518137e-01,
       6.89490616e-01, 6.94267511e-01, 7.03260183e-01, 7.08613634e-01,
       7.32004166e-01, 7.33208656e-01, 7.33688712e-01, 7.44545519e-01,
      

# Caso treino

In [138]:
trues, preds, probs = trainer.evaluate_fn(model, dl_train, loss_fn)

HBox(children=(IntProgress(value=0, description='Eval...', max=530, style=ProgressStyle(description_width='ini…

--- Validation ---
F1 = 0.8305873925501432	 Acc = 0.9720919255391333
AUC = 0.5
MAP@100 = 1.0
ROC = 0.9875207747687994
[[30632   389]
 [  557  2319]]


In [139]:
probs = np.array(probs)
preds = np.array(preds)
trues = np.array(trues)

In [140]:
probs_zero = probs[:,0]

In [141]:
index_array = np.argsort(probs_zero)

In [142]:
index_correct = (trues == 0) * (preds == 0)

In [143]:
index_ones = index_array[index_correct]

In [144]:
train_indices_to_change = index_ones[-quantity:] # indices da matriz x_valid que vao para o x_train

In [145]:
# estes sao os casos negativos mais dificeis na validacao
probs_zero[train_indices_to_change]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

# Salvando

In [146]:
np.save('train2valid.npy', train_indices_to_change)

In [147]:
np.save('valid2train.npy', valid_indices_to_change)

In [218]:
# trainer.train(args, model, dl_train, dl_valid, optimizer, loss_fn)

HBox(children=(IntProgress(value=0, description='Training...', max=40, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, max=167), HTML(value='')))

-- Train Loss 0.34318182621887344


HBox(children=(IntProgress(value=0, description='Eval...', max=34, style=ProgressStyle(description_width='init…

--- Validation ---
F1 = 0.3279220779220779	 Acc = 0.8534513274336283
AUC = 0.5
MAP@100 = 0.7368308541053208
ROC = 0.7286458719614672
[[6930  806]
 [ 436  303]]
---Valid
Loss 0.4191767625948962
Metric 0.7368308541053208


HBox(children=(IntProgress(value=0, max=167), HTML(value='')))

-- Train Loss 0.29265127719162465


HBox(children=(IntProgress(value=0, description='Eval...', max=34, style=ProgressStyle(description_width='init…

--- Validation ---
F1 = 0.3089430894308943	 Acc = 0.8696165191740413
AUC = 0.5
MAP@100 = 0.7396782097031307
ROC = 0.7201183542700733
[[7123  613]
 [ 492  247]]
---Valid
Loss 0.3988546264522216
Metric 0.7396782097031307


HBox(children=(IntProgress(value=0, max=167), HTML(value='')))

-- Train Loss 0.2875800022108112


HBox(children=(IntProgress(value=0, description='Eval...', max=34, style=ProgressStyle(description_width='init…

--- Validation ---
F1 = 0.3176620076238882	 Acc = 0.8732743362831858
AUC = 0.5
MAP@100 = 0.7797003082243071
ROC = 0.7271398120381241
[[7151  585]
 [ 489  250]]
---Valid
Loss 0.38085171405006857
Metric 0.7797003082243071


HBox(children=(IntProgress(value=0, max=167), HTML(value='')))

-- Train Loss 0.268930548857786


HBox(children=(IntProgress(value=0, description='Eval...', max=34, style=ProgressStyle(description_width='init…

--- Validation ---
F1 = 0.33053582956746286	 Acc = 0.8776401179941002
AUC = 0.5
MAP@100 = 0.7834990416874006
ROC = 0.7217902382128508
[[7182  554]
 [ 483  256]]
---Valid
Loss 0.3896860725739423
Metric 0.7834990416874006


HBox(children=(IntProgress(value=0, max=167), HTML(value='')))

-- Train Loss 0.2638881217666015


HBox(children=(IntProgress(value=0, description='Eval...', max=34, style=ProgressStyle(description_width='init…

--- Validation ---
F1 = 0.3413014608233732	 Acc = 0.8829498525073747
AUC = 0.5
MAP@100 = 0.7603205275568743
ROC = 0.7258468044941808
[[7226  510]
 [ 482  257]]
---Valid
Loss 0.38137918403920007
Metric 0.7603205275568743


HBox(children=(IntProgress(value=0, max=167), HTML(value='')))

-- Train Loss 0.25488770980677916


HBox(children=(IntProgress(value=0, description='Eval...', max=34, style=ProgressStyle(description_width='init…

--- Validation ---
F1 = 0.31355311355311355	 Acc = 0.8894395280235988
AUC = 0.5
MAP@100 = 0.711244400463223
ROC = 0.722638599493712
[[7324  412]
 [ 525  214]]
---Valid
Loss 0.4088164804612889
Metric 0.711244400463223


HBox(children=(IntProgress(value=0, max=167), HTML(value='')))

-- Train Loss 0.24572848452779347


HBox(children=(IntProgress(value=0, description='Eval...', max=34, style=ProgressStyle(description_width='init…

--- Validation ---
F1 = 0.3315350662089259	 Acc = 0.8391740412979352
AUC = 0.5
MAP@100 = 0.7328787321081119
ROC = 0.7461063715605509
[[6774  962]
 [ 401  338]]
---Valid
Loss 0.4595606554956997
Metric 0.7328787321081119


HBox(children=(IntProgress(value=0, max=167), HTML(value='')))

-- Train Loss 0.2414106287196011


HBox(children=(IntProgress(value=0, description='Eval...', max=34, style=ProgressStyle(description_width='init…

--- Validation ---
F1 = 0.31095406360424027	 Acc = 0.8849557522123894
AUC = 0.5
MAP@100 = 0.7435309083976587
ROC = 0.7168396565693599
[[7280  456]
 [ 519  220]]
---Valid
Loss 0.4113137520411435
Metric 0.7435309083976587


HBox(children=(IntProgress(value=0, max=167), HTML(value='')))

-- Train Loss 0.2333498450810324


HBox(children=(IntProgress(value=0, description='Eval...', max=34, style=ProgressStyle(description_width='init…

--- Validation ---
F1 = 0.328400281888654	 Acc = 0.8875516224188791
AUC = 0.5
MAP@100 = 0.6911254258891234
ROC = 0.7254674907957175
[[7289  447]
 [ 506  233]]
---Valid
Loss 0.4113628627622829
Metric 0.6911254258891234


HBox(children=(IntProgress(value=0, max=167), HTML(value='')))

KeyboardInterrupt: 