In [1]:
import os
import torch
import math
import typing
import time
from torch.utils.data import Dataset
from torch_scatter import scatter_max
from torch.utils.data import DataLoader
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from itertools import chain
import numpy as np
from functools import reduce
from tqdm.notebook import tqdm
import torch.nn.functional as F
from torch_scatter import scatter_add
from sklearn.metrics import f1_score




In [2]:
!ls holstep

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
#Import the dataset
train_dir = r"C:\Users\user\Downloads\hol\holstep\holstep\train"
test_dir = r"C:\Users\user\Downloads\hol\holstep\holstep\test"

In [4]:
#Device check
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda

NVIDIA GeForce RTX 2080 SUPER
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [5]:
# print(torch.__version__)
# Version check on CUDA and pytorch
# Noticable bugs have been fixed later related to library problem

In [6]:
class Theorem:
    def __init__(self, filename, splitting_token = 'c/\\'):
        assert os.path.exists(filename), "Theorem file should exist!"
        with open(filename,'r') as f:
            fl = [x.replace('\n','') for x in f.readlines()]
        self.theorem_name = fl[0].replace('N ','')
        self.theorem_text = fl[1].replace('C ','')
        self.theorem_token = fl[2].replace('T ','')
        self.filename = filename
        self.labels, self.intrm , self.dependencies, i = [], [], [], 3
        self.longest_sequence = 0
        self.nmb_splits = 0
        self.splitting_token = splitting_token
        while i<len(fl):
            if fl[i].startswith('D'):
                self.dependencies.append(fl[i+2].replace('T ','').split(' '))
                i+=3
            elif fl[i].startswith('+') or fl[i].startswith('-'):
                self.intrm.append(fl[i+1].replace('T ','').split(' '))
                self.longest_sequence = max(self.longest_sequence, len(self.intrm[-1]))
                self.nmb_splits = max(self.nmb_splits, self.intrm[-1].count(self.splitting_token)+1)
                self.labels.append(fl[i][0])
                i+=2
            else:
                i+=1
        self.dependencies_length = sum([len(x) for x in self.dependencies])
        self.unique_tokens = set(chain(*self.dependencies)).union(set(chain(*self.intrm))).union(set(chain(*self.theorem_token)))
        self.label_mapping = {'+':1,'-':0}

#         self.intrm.append(self.theorem_token)
        
    def __len__(self):
        return len(self.intrm)
    
    def __str__(self):
        return "Name: {}, text: {}, token: {}".format(self.theorem_name, self.theorem_text, self.theorem_token)
    
    def __repr__(self):
        return self.__str__()
    
    def __getitem__(self, idx):
        return self.intrm[idx], self.dependencies, self.label_mapping[self.labels[idx]]
    

In [7]:
class HOLDataset(Dataset):
    def __init__(self, path, verbose = False, dep_sep = 'SEPARATOR', 
                 null_token = 'NIHIL', intrm_dep_sep = 'SEPARATOR_INTRM', 
                 splitting_token = 'c/\\', emb_dim = 128, theorem_cutoff_size=1000, intrm_cutoff_size=500,
                 intrm_splits = None, 
                 longest_dependencies = None, longest_intrm = None, longest_theorem = None, 
                 unique_tokens = None, dictionary = None
                ):
        self.theorems = []
        if verbose == True: 
            pbar = tqdm(os.listdir(path))
        else:
            pbar = os.listdir(path)
        for f in pbar:
            t = Theorem("{}/{}".format(path,f), splitting_token)
            if len(t)!=0 and len(t.theorem_token)<=theorem_cutoff_size and t.longest_sequence<=intrm_cutoff_size:
                if dictionary is None:
                    self.theorems.append(t)
                else:
                    if t.unique_tokens.issubset(unique_tokens):
                        self.theorems.append(t)
        self.size = sum([len(x) for x in self.theorems])-1
        self.cumulative_size = np.cumsum([len(x) for x in self.theorems])
        self.cumulative_size = np.insert(self.cumulative_size,0,0)
        
        if intrm_splits is None:
            self.intrm_splits = max([x.nmb_splits for x in self.theorems])+1
        else:
            self.intrm_splits = intrm_splits
            
        if longest_dependencies is None:
            self.longest_dependencies = max([x.dependencies_length+len(x.dependencies) for x in self.theorems])
        else:
            self.longest_dependencies = longest_dependencies
        
        if longest_intrm is None:
            self.longest_intrm = max([x.longest_sequence for x in self.theorems])
        else:
            self.longest_intrm = longest_intrm
            
        if longest_theorem is None:
            self.longest_theorem = max([len(x.theorem_token) for x in self.theorems])
        else:
            self.longest_theorem = longest_theorem
        
        if unique_tokens is None:
            self.unique_tokens = set().union(*[x.unique_tokens for x in self.theorems]) # also one new token to separate dependencies and NULL token
        else:
            self.unique_tokens = unique_tokens
            
#         assert dep_sep not in self.unique_tokens, "Dependency separator should be unique!"
#         assert null_token not in self.unique_tokens, "NULL token should be unique!"
#         assert intrm_dep_sep not in self.unique_tokens, "Dep_intrm separator should be unique!"
        self.dep_sep, self.null_token, self.intrm_dep_sep = dep_sep, null_token, intrm_dep_sep
        self.unique_tokens.add(self.dep_sep)
#         self.unique_tokens.add(self.null_token)
        self.unique_tokens.add(self.intrm_dep_sep)
        
        self.split_token = splitting_token
        
#         self.encoder_avg = torch.nn.Embedding(len(self.unique_tokens), emb_dim)
        if dictionary is None:
            self.dictionary = {x:i for i,x in enumerate(self.unique_tokens,1)}
        else:
            self.dictionary = dictionary
        self.dictionary[self.null_token] = 0
        self.unique_tokens.add(self.null_token)
        
    def __len__(self):
        return self.size
    
    def encode_intrm(self, tokens):
        return torch.tensor([self.dictionary[x] for x in tokens])
    
    def encode_dependencies(self, dependencies):
        joined = reduce(lambda a,b:a + [self.dep_sep] + b, dependencies) # hate this function, but looks neat
        return torch.tensor([self.dictionary[x] for x in joined])
    
    def __getitem__(self, idx):
        theorem_index = np.argmax(self.cumulative_size > idx)-1
        if theorem_index<0:
            theorem_index=0
        offset = self.cumulative_size[theorem_index]
        theorem_offset = idx-offset
        try:
            intrm, deps, label = self.theorems[theorem_index][theorem_offset]
        except Exception as inst:
            print(idx, offset, theorem_offset, theorem_index)
            raise inst
        encoded_intrm = self.encode_intrm(intrm)
        encoded_dep = self.encode_dependencies(deps)
        padded_dep = F.pad(encoded_dep,(0,self.longest_dependencies-len(encoded_dep)),value = self.dictionary[self.null_token])
        padded_intrm = F.pad(encoded_intrm, (0,self.longest_intrm-len(encoded_intrm)),value = self.dictionary[self.null_token])
        
        
        splitted = self.encode_intrm(intrm)
        indices = [i for i,x in enumerate(splitted) if x == self.dictionary[self.split_token]]
        
        splitted = torch.split(splitted,
                               tuple(np.diff([0]+indices+[len(splitted)])))
        splitted = torch.stack([F.pad(x,(0,self.longest_intrm-len(x)),value = self.dictionary[self.null_token])
                   for x in splitted])
        null_sequence = [[self.dictionary[self.null_token] for x in range(self.longest_intrm)]]
        padded_avgd = torch.Tensor([null_sequence for x in range(self.intrm_splits-len(splitted))]).long()
        padded_avgd = torch.concat((splitted, padded_avgd.squeeze(1)))
        encoded_proof = self.encode_intrm(self.theorems[theorem_index].theorem_token)
        padded_proof = F.pad(encoded_proof, (0,self.longest_theorem-len(encoded_proof)),value = self.dictionary[self.null_token])
        return {'intrm':padded_intrm, 'deps':padded_dep, 
                'intrm_dep_sep':self.dictionary[self.intrm_dep_sep] ,'label':label,
               'splitted': padded_avgd, 'theorem':padded_proof}
        

In [8]:
train_dataset = HOLDataset(train_dir, verbose=True)

  0%|          | 0/9999 [00:00<?, ?it/s]

In [9]:
test_dataset = HOLDataset(test_dir, verbose=True, intrm_splits = train_dataset.intrm_splits, 
                 longest_dependencies = train_dataset.longest_dependencies, longest_intrm = train_dataset.longest_intrm, 
                          longest_theorem = train_dataset.longest_theorem, 
                 unique_tokens = train_dataset.unique_tokens, dictionary = train_dataset.dictionary)

  0%|          | 0/1411 [00:00<?, ?it/s]

In [23]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [24]:
class TransformerModelMI(nn.Module):

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, null_token:int, 
                 max_length_i: int, 
                 max_length_d: int,
                 max_length_a: int,
                 max_length_proof:int,
                 dropout: float = 0.5, output_classes: int=2):
        super().__init__()
        #if norm_layer is None:
        #   norm_layer = nn.BatchNorm2d
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout, batch_first=True)
     
#       dep_encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        
        proof_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout, batch_first=True)

        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.proof_encoder = TransformerEncoder(proof_layers, nlayers)
        

        self.encoder = nn.Embedding(ntoken, d_model)
        
        self.d_model = d_model
        
        self.null_token = null_token
        self.d_hid = d_hid
        
        self.max_length_i = max_length_i
        self.max_length_d = max_length_d
        self.max_length_a = max_length_a
        self.max_length_proof = max_length_proof
        
        self.decoder_gru = nn.LSTM(d_model, d_hid, batch_first=True, bidirectional=True, num_layers=nlayers)
        self.decoder_final = nn.Linear(2*nlayers*d_hid*2, output_classes)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        
        
    def forward(self, src: Tensor, dependencies:Tensor, splitted: Tensor, proof: Tensor, src_mask: Tensor,
               src_mask_avg:Tensor, src_mask_proof:Tensor):
        
        proof = self.encoder(proof) * math.sqrt(self.d_model)
        proof = self.pos_encoder(proof)
        proof = self.proof_encoder(proof, src_mask_proof)
        
        padding_mask = src==0
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_key_padding_mask=padding_mask)

        
        output = torch.concat((output, proof), axis=1)
#         print(output.shape)
#         print('error concat output')
#         output = torch.transpose(output, 1, 2)
        _, (output_h, output_c) = self.decoder_gru(output)
        output_h = torch.transpose(output_h, 1, 0)
        output_c = torch.transpose(output_c, 1, 0)
        output = torch.concat((output_c, output_h), axis=1)
        

        output = nn.LeakyReLU()(output)
        output = output.flatten(start_dim=1)
#       print('error12')  #bug check
#       print(output.flatten(start_dim=1).shape)
#       print(self.decoder_final.weight.shape)
        output = nn.Softmax(dim=1)(self.decoder_final(output))

        return output

In [25]:
ntokens = len(train_dataset.unique_tokens)  # size of vocabulary
emsize = 64  # embedding dimension
d_hid = 128  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # number of heads in 4nn.MultiheadAttention
dropout = 0.2  # dropout probability
model = TransformerModelMI(ntokens, emsize, nhead, d_hid, nlayers, train_dataset.dictionary[train_dataset.null_token] ,
                             max_length_i = train_dataset.longest_dependencies, 
                             max_length_d = train_dataset.longest_intrm,
                             max_length_a = train_dataset.intrm_splits,
                           max_length_proof = train_dataset.longest_theorem,
                           dropout=dropout,output_classes=2).to(device)

criterion = nn.CrossEntropyLoss()
# criterion = nn.BCELoss()
lr = 0.005  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum = 0.9)


In [26]:
#nvidia-smi for monitoring memory usage 
train_dataloader = DataLoader(train_dataset, batch_size=60, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=10, shuffle=True)

In [27]:
def generate_square_subsequent_mask(sz: int):
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

In [28]:
def train_loop(model, dataloader, optimizer, i_number):
    size = len(dataloader)
    model.train()  
    total_loss = 0.
    log_interval = 500
    start_time = time.time()
    src_mask = generate_square_subsequent_mask(dataloader.dataset.longest_intrm).to(device)
    src_mask_avg = generate_square_subsequent_mask(dataloader.dataset.intrm_splits).to(device)
    src_mask_proof = generate_square_subsequent_mask(dataloader.dataset.longest_theorem).to(device)
    
    loss_history = []
    pbar = tqdm(dataloader, total = i_number)
    for batch, train_data in enumerate(pbar):
        batch_size = train_data['intrm'].size(0)
        X = train_data['intrm'].to(device)
        dep_intrm_vector = train_data['intrm_dep_sep'].to(device)
        y = train_data['label'].to(device).long()
        splitted = train_data['splitted'].to(device).long()
        deps = train_data['deps'].to(device)
        proof = train_data['theorem'].to(device)
        optimizer.zero_grad()
        output = model(X, deps, splitted, proof, src_mask, src_mask_avg, src_mask_proof)
        #         output = model(torch.concat((X, dep_intrm_vector.reshape(-1,1), deps), axis=1), src_mask)
#         output = output.flatten(end_dim=1)
        loss = criterion(output, y)#.float())

#         loss = criterion(output.view(-1), y.float())
        
        if batch % 5 == 0:
            pbar.set_description("{:.2f}".format(f1_score(y.cpu().numpy(), output.argmax(axis=1).cpu().numpy(), average='micro')))
        
        
        loss.backward()
        optimizer.step()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.005)
        if batch == i_number:
            return loss_history

        total_loss += loss.item()
        
        if batch % log_interval == 0 and batch > 0:
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            loss_history.append(cur_loss)
            rolling_loss = np.mean(loss_history[-5:])
            try:
                ppl = math.exp(cur_loss)
            except Exception as inst:
                ppl = float('inf')
            print(f'| {batch:5d}/{size:5d} batches | {rolling_loss:5.2} rolling loss |'
                  f'ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

    return loss_history
    


In [29]:
def evaluate(model, dataloader, criterion, test_size=10):
    model.eval()
    labels = []
    predictions = []
#     src_mask = generate_square_subsequent_mask(bptt).to(device)
    src_mask = generate_square_subsequent_mask(dataloader.dataset.longest_intrm).to(device)
    src_mask_avg = generate_square_subsequent_mask(dataloader.dataset.intrm_splits).to(device)
    src_mask_proof = generate_square_subsequent_mask(dataloader.dataset.longest_theorem).to(device)
    total_loss = 0
    for i, test_data in enumerate(tqdm(dataloader, total = test_size)):
        #batcg_size = 64
        batch_size = test_data['intrm'].size(0)
        X = test_data['intrm'].to(device)
        dep_intrm_vector = test_data['intrm_dep_sep'].to(device)
        y = test_data['label'].to(device).long()
        splitted = test_data['splitted'].to(device).long()
        deps = test_data['deps'].to(device)
        proof = test_data['theorem'].to(device)

        output = model(X, deps, splitted, proof, src_mask, src_mask_avg, src_mask_proof)
        loss = criterion(output, y)
        labels.append(y.cpu().numpy())        
        predictions.append(output.argmax(axis=1).cpu().numpy())
        total_loss+=loss.item()
        if i>=test_size:
            return labels, predictions, total_loss
    return labels, predictions, total_loss

In [30]:
from sklearn.metrics import classification_report
test_dataloader = DataLoader(test_dataset, batch_size=20, shuffle=True)
test_labels, test_predictions, val_loss = evaluate(model, test_dataloader, criterion ,100)
test_labels = np.array(test_labels).flatten()
preds = np.array(test_predictions).flatten()
print(classification_report(test_labels, preds))


  0%|          | 0/100 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.51      0.49      0.50       983
           1       0.53      0.55      0.54      1037

    accuracy                           0.52      2020
   macro avg       0.52      0.52      0.52      2020
weighted avg       0.52      0.52      0.52      2020



In [31]:
#Debug session

#len(train_dataloader)
#len(test_dataloader)
#train_dataloader
#next(iter(train_dataloader))


In [32]:
with open('train_dataset_preprocessed.pickle','wb') as f:
    pickle.dump(train_dataset, f)

In [None]:
from copy import deepcopy
from tqdm.notebook import trange
from sklearn.metrics import f1_score
import pickle
import gc

num_epochs = 1
loss_history_global = []

evaluation_results = []

grad_list = []

num_iters = 10
experiments = 3
for k in trange(experiments):
    model = TransformerModelMI(ntokens, emsize, nhead, d_hid, nlayers, train_dataset.dictionary[train_dataset.null_token] ,
                             max_length_i = train_dataset.longest_dependencies, 
                             max_length_d = train_dataset.longest_intrm,
                             max_length_a = train_dataset.intrm_splits,
                           max_length_proof = train_dataset.longest_theorem,
                           dropout=dropout,output_classes=2).to(device)

    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum = 0.9)
    val_history_global = []
    for i in trange(num_iters):
        train_dataloader = DataLoader(train_dataset, batch_size=60, shuffle=True)
        test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=True)

        test_labels, test_predictions, val_loss = evaluate(model, test_dataloader, criterion, 
                                                           int(len(test_dataloader)/10))
        test_labels = np.array(test_labels).flatten()
        
        
        preds = np.array(test_predictions).flatten()
        print(f1_score(test_labels, preds, average='micro'))
        
        loss_history = train_loop(model, train_dataloader, i_number = int(len(train_dataloader)/num_iters), optimizer = optimizer)
        loss_history_global.append(deepcopy(loss_history))
        
        evaluation_results.append((deepcopy(test_labels), deepcopy(preds)))
        val_history_global.append(val_loss)
        total_norm = 0
        for p in model.parameters():
            param_norm = p.grad.detach().data.norm(2)
            total_norm += param_norm.item() ** 2
            total_norm = total_norm ** 0.5

        with open('{}_{}_history.pickle'.format(i,k),'wb') as f:
            pickle.dump({'eval_res':evaluation_results,"loss_hist":loss_history,'grad':total_norm,
                         'val_loss':val_history_global},f)
        torch.save(model, '{}_{}_.model'.format(i,k))
           
    del model
    gc.collect()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2562 [00:00<?, ?it/s]

0.5034139680062427


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.69 rolling loss |ms/batch 473.95 | loss  0.69 | ppl     2.00
|  1000/19283 batches |  0.68 rolling loss |ms/batch 476.39 | loss  0.67 | ppl     1.96
|  1500/19283 batches |  0.67 rolling loss |ms/batch 472.12 | loss  0.65 | ppl     1.91


  0%|          | 0/2562 [00:00<?, ?it/s]

0.6483612953570035


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.63 rolling loss |ms/batch 479.50 | loss  0.63 | ppl     1.89
|  1000/19283 batches |  0.63 rolling loss |ms/batch 483.15 | loss  0.62 | ppl     1.87
|  1500/19283 batches |  0.63 rolling loss |ms/batch 480.17 | loss  0.62 | ppl     1.86


  0%|          | 0/2562 [00:00<?, ?it/s]

0.6787943815840811


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.61 rolling loss |ms/batch 476.42 | loss  0.61 | ppl     1.84
|  1000/19283 batches |  0.61 rolling loss |ms/batch 479.08 | loss  0.61 | ppl     1.84
|  1500/19283 batches |  0.61 rolling loss |ms/batch 477.95 | loss  0.60 | ppl     1.83


  0%|          | 0/2562 [00:00<?, ?it/s]

0.6913772922356614


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |   0.6 rolling loss |ms/batch 477.17 | loss  0.60 | ppl     1.81
|  1000/19283 batches |   0.6 rolling loss |ms/batch 474.52 | loss  0.59 | ppl     1.81
|  1500/19283 batches |  0.59 rolling loss |ms/batch 473.22 | loss  0.59 | ppl     1.81


  0%|          | 0/2562 [00:00<?, ?it/s]

0.7169332813109637


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.58 rolling loss |ms/batch 475.89 | loss  0.58 | ppl     1.79
|  1000/19283 batches |  0.58 rolling loss |ms/batch 477.86 | loss  0.58 | ppl     1.79
|  1500/19283 batches |  0.58 rolling loss |ms/batch 479.63 | loss  0.58 | ppl     1.78


  0%|          | 0/2562 [00:00<?, ?it/s]

0.7351736246586033


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.58 rolling loss |ms/batch 486.03 | loss  0.58 | ppl     1.78
|  1000/19283 batches |  0.57 rolling loss |ms/batch 472.63 | loss  0.57 | ppl     1.77
|  1500/19283 batches |  0.57 rolling loss |ms/batch 475.90 | loss  0.57 | ppl     1.76


  0%|          | 0/2562 [00:00<?, ?it/s]

0.7376121732344908


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.57 rolling loss |ms/batch 445.70 | loss  0.57 | ppl     1.76
|  1000/19283 batches |  0.56 rolling loss |ms/batch 467.10 | loss  0.56 | ppl     1.75
|  1500/19283 batches |  0.56 rolling loss |ms/batch 464.69 | loss  0.56 | ppl     1.75


  0%|          | 0/2562 [00:00<?, ?it/s]

0.7530238002341008


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.56 rolling loss |ms/batch 467.87 | loss  0.56 | ppl     1.74
|  1000/19283 batches |  0.56 rolling loss |ms/batch 461.83 | loss  0.55 | ppl     1.74
|  1500/19283 batches |  0.55 rolling loss |ms/batch 467.26 | loss  0.55 | ppl     1.74


  0%|          | 0/2562 [00:00<?, ?it/s]

0.7557549746390948


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.55 rolling loss |ms/batch 460.88 | loss  0.55 | ppl     1.74
|  1000/19283 batches |  0.55 rolling loss |ms/batch 456.44 | loss  0.55 | ppl     1.73
|  1500/19283 batches |  0.55 rolling loss |ms/batch 463.41 | loss  0.55 | ppl     1.73


  0%|          | 0/2562 [00:00<?, ?it/s]

0.7657042528287165


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.55 rolling loss |ms/batch 454.46 | loss  0.55 | ppl     1.73
|  1000/19283 batches |  0.55 rolling loss |ms/batch 474.80 | loss  0.54 | ppl     1.72
|  1500/19283 batches |  0.55 rolling loss |ms/batch 457.39 | loss  0.55 | ppl     1.73


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2562 [00:00<?, ?it/s]

0.49951229028482247


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.69 rolling loss |ms/batch 451.78 | loss  0.69 | ppl     1.99
|  1000/19283 batches |  0.68 rolling loss |ms/batch 451.20 | loss  0.67 | ppl     1.96
|  1500/19283 batches |  0.67 rolling loss |ms/batch 450.76 | loss  0.65 | ppl     1.92


  0%|          | 0/2562 [00:00<?, ?it/s]

0.6413382754584471


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.63 rolling loss |ms/batch 462.61 | loss  0.63 | ppl     1.88
|  1000/19283 batches |  0.63 rolling loss |ms/batch 468.31 | loss  0.63 | ppl     1.87
|  1500/19283 batches |  0.63 rolling loss |ms/batch 465.95 | loss  0.62 | ppl     1.86


  0%|          | 0/2562 [00:00<?, ?it/s]

0.6821108076472884


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.61 rolling loss |ms/batch 473.66 | loss  0.61 | ppl     1.85
|  1000/19283 batches |  0.61 rolling loss |ms/batch 466.68 | loss  0.60 | ppl     1.83
|  1500/19283 batches |  0.61 rolling loss |ms/batch 464.98 | loss  0.60 | ppl     1.83


  0%|          | 0/2562 [00:00<?, ?it/s]

0.7019118220834959


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |   0.6 rolling loss |ms/batch 464.22 | loss  0.60 | ppl     1.81
|  1000/19283 batches |  0.59 rolling loss |ms/batch 460.86 | loss  0.59 | ppl     1.81
|  1500/19283 batches |  0.59 rolling loss |ms/batch 462.64 | loss  0.59 | ppl     1.80


  0%|          | 0/2562 [00:00<?, ?it/s]

0.7140070230198986


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.58 rolling loss |ms/batch 463.46 | loss  0.58 | ppl     1.79
|  1000/19283 batches |  0.58 rolling loss |ms/batch 463.65 | loss  0.58 | ppl     1.78
|  1500/19283 batches |  0.58 rolling loss |ms/batch 461.38 | loss  0.57 | ppl     1.78


  0%|          | 0/2562 [00:00<?, ?it/s]

0.7224931720639876


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.57 rolling loss |ms/batch 465.19 | loss  0.57 | ppl     1.77
|  1000/19283 batches |  0.57 rolling loss |ms/batch 461.02 | loss  0.57 | ppl     1.77
|  1500/19283 batches |  0.57 rolling loss |ms/batch 463.49 | loss  0.57 | ppl     1.76


  0%|          | 0/2562 [00:00<?, ?it/s]

0.7341006632852126


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.56 rolling loss |ms/batch 459.48 | loss  0.56 | ppl     1.75
|  1000/19283 batches |  0.56 rolling loss |ms/batch 457.67 | loss  0.56 | ppl     1.76
|  1500/19283 batches |  0.56 rolling loss |ms/batch 456.89 | loss  0.56 | ppl     1.75


  0%|          | 0/2562 [00:00<?, ?it/s]

0.7417089348419822


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.56 rolling loss |ms/batch 450.87 | loss  0.56 | ppl     1.75
|  1000/19283 batches |  0.56 rolling loss |ms/batch 479.63 | loss  0.56 | ppl     1.74
|  1500/19283 batches |  0.56 rolling loss |ms/batch 551.51 | loss  0.56 | ppl     1.75


  0%|          | 0/2562 [00:00<?, ?it/s]

0.7445376511900117


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.55 rolling loss |ms/batch 527.56 | loss  0.55 | ppl     1.74
|  1000/19283 batches |  0.55 rolling loss |ms/batch 527.43 | loss  0.55 | ppl     1.74
|  1500/19283 batches |  0.55 rolling loss |ms/batch 522.60 | loss  0.55 | ppl     1.74


  0%|          | 0/2562 [00:00<?, ?it/s]

0.7617050331642606


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.55 rolling loss |ms/batch 568.35 | loss  0.55 | ppl     1.73
|  1000/19283 batches |  0.55 rolling loss |ms/batch 1260.42 | loss  0.55 | ppl     1.73
|  1500/19283 batches |  0.55 rolling loss |ms/batch 558.39 | loss  0.55 | ppl     1.73


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2562 [00:00<?, ?it/s]

0.5045844713226687


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.69 rolling loss |ms/batch 533.68 | loss  0.69 | ppl     2.00
|  1000/19283 batches |  0.68 rolling loss |ms/batch 540.37 | loss  0.68 | ppl     1.97
|  1500/19283 batches |  0.68 rolling loss |ms/batch 546.13 | loss  0.66 | ppl     1.93


  0%|          | 0/2562 [00:00<?, ?it/s]

0.6196839641045649


  0%|          | 0/1928 [00:00<?, ?it/s]

|   500/19283 batches |  0.64 rolling loss |ms/batch 548.92 | loss  0.64 | ppl     1.89
|  1000/19283 batches |  0.63 rolling loss |ms/batch 554.70 | loss  0.63 | ppl     1.88
|  1500/19283 batches |  0.63 rolling loss |ms/batch 555.26 | loss  0.62 | ppl     1.85


  0%|          | 0/2562 [00:00<?, ?it/s]

0.674112368318377


  0%|          | 0/1928 [00:00<?, ?it/s]

In [None]:
model = torch.load("9_2_.model")

In [None]:
from sklearn.metrics import classification_report
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=True)
test_labels, test_predictions, val_loss = evaluate(model, test_dataloader, criterion ,len(test_dataloader))

In [None]:
test_labels_flatten = list(chain(*[x.tolist() for x in test_labels]))
test_prediction_flatten = list(chain(*[x.tolist() for x in test_predictions]))

In [None]:
print(classification_report(test_labels_flatten, test_prediction_flatten))

In [None]:
import pandas as pd
report = classification_report(test_labels_flatten, test_prediction_flatten, output_dict=True)
df = pd.DataFrame(report).transpose()
df.to_csv('report.csv',  float_format='%.3f')