# BERT Embeddings and Next Sentence Prediction

In [None]:
!pip install transformers datasets tokenizers

In [3]:
import os
from pathlib import Path
import torch
import re
import random
import transformers, datasets
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer
import tqdm
from torch.utils.data import Dataset, DataLoader
import itertools
import math
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.optim import Adam

**Clean data**

In [4]:
def process_text(text: str):    
    pattern = re.compile( 
        r'\*\*\d+\*\*|'  
        r'\(#\w+\)|'  
        r'[\*]|'  
        r'\b\w+\.(?:json)\b|'  
        r'-{2,}|'  
        r'>(?:\s+)?|'  
        r'<[^>]*>|'  
        r"!\S+\.(?:jpg|jpeg|gif|png)|"  
        r'IMAGENIMAGEN|'  
        r'(!image\.png)|'  
        r'(!untitled\.png)|'
        r'(/\.attachments/untitled-[a-zA-Z0-9\-]+\.png)|'
        r'(/\.attachments/image-[a-zA-Z0-9\-]+\.png)|'
        r'!media/[a-zA-Z0-9]+\.png|'
        r'image\d+\.png|'
        r'[^\w\s\\p{L}]+'
        r'([a-zA-ZáéíóúÁÉÍÓÚñÑüÜ]+)(https?://)|',     
        flags=re.IGNORECASE
    )

    
    clean_text = pattern.sub('', text.lower())    
    clean_text = re.sub(r"!\S+\.(?:jpg|jpeg|gif|png)", "IMAGEN", clean_text)
    clean_text = re.sub(r'\s\)\s', ' ', clean_text)
    clean_text = re.sub(r'<[^>]+>', '', clean_text)

    return clean_text

**Chunk data**

In [5]:
os.mkdir('./datasets')

In [6]:
MAX_LEN = 512
corpus_wiki_data = "./datasets/corpus_wtout_nlines.txt"
wiki_lines = list()

with open(corpus_wiki_data, 'r') as w:
  lines = w.readlines()
    
def chunk_text(texto, MAX_LEN=MAX_LEN):  
    if len(texto) <= MAX_LEN:
        return texto
    else:
        split_texts = []
        num_splits = math.ceil(len(texto) // MAX_LEN)
        for i in range(num_splits):
            split_text = texto[i * MAX_LEN: (i+1) * MAX_LEN]
            split_texts.append(split_text)
        if len(texto) % MAX_LEN != 0:
            split_text = texto[num_splits * MAX_LEN:]
            split_texts.append(split_text)
        return split_texts

for line in lines:
  clean_text = process_text(line)
  batch_line = chunk_text(clean_text, MAX_LEN)
  wiki_lines.append(batch_line)

In [7]:
wiki_lines[:5]

['[[_toc_]]\n',
 '# 1. introducción\n',
 'en esta sección se encontrará información sobre que debes tener en cuenta para poder realizar las pruebas de manera local y que están integradas con adp y en ambientesbc.\n',
 '# 2. consideraciones\n',
 ['- aplica para pruebas como: [performance modular](https://grupobancolombia.visualstudio.com/vicepresidencia%20servicios%20de%20tecnolog%c3%ada/_wiki/wikis/vicepresidencia%20servicios%20de%20tecnolog%c3%ada.wiki/1988/proceso-para-realizar-pruebas-de-performance-de-componente-modulares) , [aceptación](https://grupobancolombia.visualstudio.com/vicepresidencia%20servicios%20de%20tecnolog%c3%ada/_wiki/wikis/vicepresidencia%20servicios%20de%20tecnolog%c3%ada.wiki/9573/pruebas-de-integraci%c3%b3n-(acceptancetest))',
  ' y funcionales e2e.\n']]

In [17]:
# Splitting 
lines_dic = {}
for line in range(len(wiki_lines)):  
  lines_dic[f"L{line}"] = wiki_lines[line]

# Pares QA
pairs = []
for line in range(len(wiki_lines)):  
  qa_pairs = list()
  if line == len(wiki_lines) - 1:
    break

  first = lines_dic[f"L{line}"]
  second = lines_dic[f"L{line+1}"]

  qa_pairs.append(first)
  qa_pairs.append(second)
  pairs.append(qa_pairs)

# Data type str
for i in range(len(pairs)):  
  for j in range(len(pairs[i])):
    if type(pairs[i][j]) == list:      
      pairs[i][j] = " ".join(pairs[i][j])    

In [19]:
pairs[105]

['4. [¿cuál es la politica de calidad de devops que se debe tener en cuenta en las etapas iniciales?](https://grupobancolombia.visualstudio.com/vicepresidencia%20servicios%20de%20tecnolog%c3%ada/_wiki/wikis/vicepresidencia%20servicios%20de%20tecnolog%c3%ada.wiki/12944/pol%c3%adtica-de-calidad-de-devops)\n',
 '5. [¿cómo reservo el ambiente de pruebas?](https://grupobancolombia.visualstudio.com/vicepresidencia%20servicios%20de%20tecnolog%c3%ada/_wiki/wikis/vicepresidencia%20servicios%20de%20tecnolog%c3%ada.wiki/4877/como-crear-una-reserva)\n']

# Tokenization

**WordPiece Tokenization**

In [20]:
os.mkdir('./data')

In [21]:
text_data = []
file_count = 0

for sample in tqdm.tqdm([x[0] for x in pairs]):  
  text_data.append(sample)

  if len(text_data) == 1000:
    with open(f'./data/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
      fp.write('\n'.join(text_data))
    text_data = []
    file_count += 1

100%|██████████| 4928/4928 [00:00<00:00, 403260.69it/s]


In [23]:
paths = [str(x) for x in Path('./data').glob('**/*.txt')]

# Entrenando nuestro propio tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True
)
tokenizer.train(
    files=paths,
    vocab_size=30_000,
    min_frequency=2,
    limit_alphabet=1000,
    wordpieces_prefix='',
    special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]']
)

os.mkdir('./bert-it-1')
tokenizer.save_model('./bert-it-1', 'bert-it')
tokenizer = BertTokenizer.from_pretrained('./bert-it-1/bert-it-vocab.txt', local_files_only=True)



In [24]:
class BERTDataset(Dataset):
  def __init__(self, data_pair, tokenizer, seq_len=64):
    self.tokenizer = tokenizer
    self.seq_len = seq_len
    self.corpus_lines = len(data_pair)
    self.lines = data_pair
  
  def __len__(self):
    return self.corpus_lines

  def __getitem__(self, item):
    
    t1, t2, is_next_label = self.get_sent(item)

    t1_random, t1_label = self.random_word(t1)
    t2_random, t2_label = self.random_word(t2)

    t1 = [self.tokenizer.vocab['[CLS]']] + t1_random + [self.tokenizer.vocab['[SEP]']]
    t2 = t2_random + [self.tokenizer.vocab['[SEP]']]
    t1_label = [self.tokenizer.vocab['[PAD]']] + t1_label + [self.tokenizer.vocab['[PAD]']]
    t2_label = t2_label + [self.tokenizer.vocab['[PAD]']]

    segment_label = ([1 for _ in range(len(t1))] + [2 for _ in range(len(t2))])[:self.seq_len]
    bert_input = (t1 + t2)[:self.seq_len]
    bert_label = (t1_label + t2_label)[:self.seq_len]
    padding = [self.tokenizer.vocab['[PAD]'] for _ in range(self.seq_len - len(bert_input))]
    bert_input.extend(padding), bert_label.extend(padding), segment_label.extend(padding)

    output = {"bert_input": bert_input,
              "bert_label": bert_label,
              "segment_label": segment_label,
              "is_next": is_next_label}

    return {key: torch.tensor(value) for key, value in output.items()}

  def random_word(self, sentence):
    tokens = sentence.split()
    output_label = []
    output = []
    # 15% of the tokens
    for i, token in enumerate(tokens):
        prob = random.random()
        # remove cls and sep token
        token_id = self.tokenizer(token)['input_ids'][1:-1]
        if prob < 0.15:
            prob /= 0.15
            # 80% 
            if prob < 0.8:
                for i in range(len(token_id)):
                    output.append(self.tokenizer.vocab['[MASK]'])
            # 10% 
            elif prob < 0.9:
                for i in range(len(token_id)):
                    output.append(random.randrange(len(self.tokenizer.vocab)))
            # 10% 
            else:
                output.append(token_id)
            output_label.append(token_id)
        else:
            output.append(token_id)
            for i in range(len(token_id)):
                output_label.append(0)

    # flattening
    output = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in output]))
    output_label = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in output_label]))
    assert len(output) == len(output_label)
    return output, output_label

  def get_sent(self, index):
    t1, t2 = self.get_corpus_line(index)

    if random.random() > 0.5:
      return t1, t2, 1
    else:
      return t1, self.get_random_line(), 0
  
  def get_corpus_line(self, item):
    return self.lines[item][0], self.lines[item][1]
  
  def get_random_line(self):
    return self.lines[random.randrange(len(self.lines))][1]

In [25]:
train_data = BERTDataset(
    pairs, seq_len=MAX_LEN, tokenizer=tokenizer)
train_loader = DataLoader(
    train_data, batch_size=32, shuffle=True, pin_memory=True)
sample_data = next(iter(train_loader))

In [28]:
train_data[random.randrange(len(train_data))]

{'bert_input': tensor([   1,   16,   35,   37,    3,    3,   38,  336,    4,    3,  115,   36,
          130,  406,   38,    2,   16,  141,  111, 2158,  205, 5032, 1941,   40,
          149, 3950,   95,    3, 1803,  829,   17,  319,  127,  149, 1609,    4,
         2882,   40, 1609, 4909,   17,    2,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,

# Embedding

In [35]:
class PositionalEmbedding(torch.nn.Module):

    def __init__(self, d_model, max_len=128):
        super().__init__()

        # Positional encodings
        pe = torch.zeros(max_len, d_model).float()
        pe.require_grad = False

        for pos in range(max_len):   
            
            for i in range(0, d_model, 2):   
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
        
        self.pe = pe.unsqueeze(0)        

    def forward(self, x):
        return self.pe

class BERTEmbedding(torch.nn.Module):
    """
    BERT Embedding which is consisted with under features
        1. TokenEmbedding : normal embedding matrix
        2. PositionalEmbedding : adding positional information using sin, cos
        2. SegmentEmbedding : adding sentence segment info, (sent_A:1, sent_B:2)
        sum of all these features are output of BERTEmbedding
    """

    def __init__(self, vocab_size, embed_size, seq_len=512, dropout=0.1):
        """
        :param vocab_size: total vocab size
        :param embed_size: embedding size of token embedding
        :param dropout: dropout rate
        """

        super().__init__()
        self.embed_size = embed_size
        # (m, seq_len) --> (m, seq_len, embed_size)        
        self.token = torch.nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.segment = torch.nn.Embedding(3, embed_size, padding_idx=0)
        self.position = PositionalEmbedding(d_model=embed_size, max_len=seq_len)
        self.dropout = torch.nn.Dropout(p=dropout)
       
    def forward(self, sequence, segment_label):
        x = self.token(sequence) + self.position(sequence) + self.segment(segment_label)
        return self.dropout(x)

In [36]:
### attention layers
class MultiHeadedAttention(torch.nn.Module):
    
    def __init__(self, heads, d_model, dropout=0.1):
        super(MultiHeadedAttention, self).__init__()
        
        assert d_model % heads == 0
        self.d_k = d_model // heads
        self.heads = heads
        self.dropout = torch.nn.Dropout(dropout)

        self.query = torch.nn.Linear(d_model, d_model)
        self.key = torch.nn.Linear(d_model, d_model)
        self.value = torch.nn.Linear(d_model, d_model)
        self.output_linear = torch.nn.Linear(d_model, d_model)
        
    def forward(self, query, key, value, mask):
        """
        query, key, value of shape: (batch_size, max_len, d_model)
        mask of shape: (batch_size, 1, 1, max_words)
        """
        # (batch_size, max_len, d_model)
        query = self.query(query)
        key = self.key(key)        
        value = self.value(value)   
        
        # (batch_size, max_len, d_model) --> (batch_size, max_len, h, d_k) --> (batch_size, h, max_len, d_k)
        query = query.view(query.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)   
        key = key.view(key.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)  
        value = value.view(value.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)  
        
        # (batch_size, h, max_len, d_k) matmul (batch_size, h, d_k, max_len) --> (batch_size, h, max_len, max_len)
        scores = torch.matmul(query, key.permute(0, 1, 3, 2)) / math.sqrt(query.size(-1))

        # (batch_size, h, max_len, max_len)
        scores = scores.masked_fill(mask == 0, -1e9)    

        # (batch_size, h, max_len, max_len)
        # Attention weight for all non-pad tokens        
        weights = F.softmax(scores, dim=-1)           
        weights = self.dropout(weights)

        # (batch_size, h, max_len, max_len) matmul (batch_size, h, max_len, d_k) --> (batch_size, h, max_len, d_k)
        context = torch.matmul(weights, value)

        # (batch_size, h, max_len, d_k) --> (batch_size, max_len, h, d_k) --> (batch_size, max_len, d_model)
        context = context.permute(0, 2, 1, 3).contiguous().view(context.shape[0], -1, self.heads * self.d_k)

        # (batch_size, max_len, d_model)
        return self.output_linear(context)

class FeedForward(torch.nn.Module):
    "Implements FFN equation."

    def __init__(self, d_model, middle_dim=2048, dropout=0.1):
        super(FeedForward, self).__init__()
        
        self.fc1 = torch.nn.Linear(d_model, middle_dim)
        self.fc2 = torch.nn.Linear(middle_dim, d_model)
        self.dropout = torch.nn.Dropout(dropout)
        self.activation = torch.nn.GELU()

    def forward(self, x):
        out = self.activation(self.fc1(x))
        out = self.fc2(self.dropout(out))
        return out

class EncoderLayer(torch.nn.Module):
    def __init__(
        self, 
        d_model=768,
        heads=12, 
        feed_forward_hidden=768 * 4, 
        dropout=0.1
        ):
        super(EncoderLayer, self).__init__()
        self.layernorm = torch.nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadedAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model, middle_dim=feed_forward_hidden)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, embeddings, mask):
        # embeddings: (batch_size, max_len, d_model)
        # encoder mask: (batch_size, 1, 1, max_len)
        # result: (batch_size, max_len, d_model)
        interacted = self.dropout(self.self_multihead(embeddings, embeddings, embeddings, mask))
        # residual layer
        interacted = self.layernorm(interacted + embeddings)
        
        feed_forward_out = self.dropout(self.feed_forward(interacted))
        encoded = self.layernorm(feed_forward_out + interacted)
        return encoded

In [37]:
class BERT(torch.nn.Module):
    """
    BERT model : Bidirectional Encoder Representations from Transformers.
    """

    def __init__(self, vocab_size, d_model=768, n_layers=12, heads=12, dropout=0.1):
        """
        :param vocab_size: vocab_size of total words
        :param hidden: BERT model hidden size
        :param n_layers: numbers of Transformer blocks(layers)
        :param attn_heads: number of attention heads
        :param dropout: dropout rate
        """

        super().__init__()
        self.d_model = d_model
        self.n_layers = n_layers
        self.heads = heads

        # paper noted they used 4 * hidden_size for ff_network_hidden_size
        self.feed_forward_hidden = d_model * 4

        # embedding for BERT, sum of positional, segment, token embeddings
        self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=d_model)

        # multi-layers transformer blocks, deep network
        self.encoder_blocks = torch.nn.ModuleList(
            [EncoderLayer(d_model, heads, d_model * 4, dropout) for _ in range(n_layers)])

    def forward(self, x, segment_info):
        # attention masking for padded token
        # (batch_size, 1, seq_len, seq_len)
        mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)

        # embedding the indexed sequence to sequence of vectors
        x = self.embedding(x, segment_info)

        # running over multiple transformer blocks
        for encoder in self.encoder_blocks:
            x = encoder.forward(x, mask)
        return x

class NextSentencePrediction(torch.nn.Module):
    """
    2-class classification model : is_next, is_not_next
    """

    def __init__(self, hidden):
        """
        :param hidden: BERT model output size
        """
        super().__init__()
        self.linear = torch.nn.Linear(hidden, 2)
        self.softmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        # use only the first token which is the [CLS]
        return self.softmax(self.linear(x[:, 0]))

class MaskedLanguageModel(torch.nn.Module):
    """
    predicting origin token from masked input sequence
    n-class classification problem, n-class = vocab_size
    """

    def __init__(self, hidden, vocab_size):
        """
        :param hidden: output size of BERT model
        :param vocab_size: total vocab size
        """
        super().__init__()
        self.linear = torch.nn.Linear(hidden, vocab_size)
        self.softmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        return self.softmax(self.linear(x))

class BERTLM(torch.nn.Module):
    """
    BERT Language Model
    Next Sentence Prediction Model + Masked Language Model
    """

    def __init__(self, bert: BERT, vocab_size):
        """
        :param bert: BERT model which should be trained
        :param vocab_size: total vocab size for masked_lm
        """

        super().__init__()
        self.bert = bert
        self.next_sentence = NextSentencePrediction(self.bert.d_model)
        self.mask_lm = MaskedLanguageModel(self.bert.d_model, vocab_size)

    def forward(self, x, segment_label):
        x = self.bert(x, segment_label)
        return self.next_sentence(x), self.mask_lm(x)

In [38]:
class ScheduledOptim():
    """Wrapper class for learning rate scheduling"""

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):        
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):        
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        return np.min([
            np.power(self.n_current_steps, -0.5),
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])

    def _update_learning_rate(self):
        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()

        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr

In [39]:
class BERTTrainer:
    def __init__(
        self, 
        model, 
        train_dataloader, 
        test_dataloader=None, 
        lr= 1e-4,
        weight_decay=0.01,
        betas=(0.9, 0.999),
        warmup_steps=10000,
        log_freq=10,
        device='cuda'
        ):

        self.device = device
        self.model = model
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(
            self.optim, self.model.bert.d_model, n_warmup_steps=warmup_steps
            )

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = torch.nn.NLLLoss(ignore_index=0)
        self.log_freq = log_freq
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
    
    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        
        avg_loss = 0.0
        total_correct = 0
        total_element = 0
        
        mode = "train" if train else "test"

        # progress bar
        data_iter = tqdm.tqdm(
            enumerate(data_loader),
            desc="EP_%s:%d" % (mode, epoch),
            total=len(data_loader),
            bar_format="{l_bar}{r_bar}"
        )

        for i, data in data_iter:

            # batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            # forward the next_sentence_prediction and masked_lm model
            next_sent_output, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"])

            # NLL(negative log likelihood) loss of is_next classification result
            next_loss = self.criterion(next_sent_output, data["is_next"])
            
            # transpose to (m, vocab_size, seq_len) vs (m, seq_len)
            # criterion(mask_lm_output.view(-1, mask_lm_output.size(-1)), data["bert_label"].view(-1))
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])

            # Adding next_loss and mask_loss : 3.4 Pre-training Procedure
            loss = next_loss + mask_loss

            # backward and optimization only in train
            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()

            # next sentence prediction accuracy
            correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_element += data["is_next"].nelement()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }

            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))
        print(
            f"EP{epoch}, {mode}: \
            avg_loss={avg_loss / len(data_iter)}, \
            total_acc={total_correct * 100.0 / total_element}"
        ) 

In [40]:
train_data = BERTDataset(
   pairs, seq_len=MAX_LEN, tokenizer=tokenizer)

train_loader = DataLoader(
   train_data, batch_size=32, shuffle=True, pin_memory=True)

bert_model = BERT(
  vocab_size=len(tokenizer.vocab),
  d_model=768,
  n_layers=2,
  heads=12,
  dropout=0.1
)

bert_lm = BERTLM(bert_model, len(tokenizer.vocab))
bert_trainer = BERTTrainer(bert_lm, train_loader, device="cpu")
epochs = 1

for epoch in range(epochs):
  bert_trainer.train(epoch)

Total Parameters: 27307105


EP_train:0:   1%|| 1/154 [01:02<2:39:17, 62.47s/it]

{'epoch': 0, 'iter': 0, 'avg_loss': 9.557798385620117, 'avg_acc': 37.5, 'loss': 9.557798385620117}


EP_train:0:   7%|| 11/154 [11:12<2:28:10, 62.17s/it]

{'epoch': 0, 'iter': 10, 'avg_loss': 9.522670659151943, 'avg_acc': 51.13636363636363, 'loss': 9.480085372924805}


EP_train:0:  14%|| 21/154 [21:11<2:15:11, 60.99s/it]

{'epoch': 0, 'iter': 20, 'avg_loss': 9.489825657435826, 'avg_acc': 50.44642857142857, 'loss': 9.434412002563477}


EP_train:0:  20%|| 31/154 [31:17<2:04:53, 60.92s/it]

{'epoch': 0, 'iter': 30, 'avg_loss': 9.461046126581007, 'avg_acc': 49.69758064516129, 'loss': 9.392536163330078}


EP_train:0:  27%|| 41/154 [41:54<2:03:48, 65.74s/it]

{'epoch': 0, 'iter': 40, 'avg_loss': 9.423962895463152, 'avg_acc': 50.381097560975604, 'loss': 9.2989501953125}


EP_train:0:  33%|| 51/154 [52:14<1:46:46, 62.20s/it]

{'epoch': 0, 'iter': 50, 'avg_loss': 9.386271663740569, 'avg_acc': 50.55147058823529, 'loss': 9.237812995910645}


EP_train:0:  40%|| 61/154 [1:02:26<1:36:41, 62.38s/it]

{'epoch': 0, 'iter': 60, 'avg_loss': 9.35043177057485, 'avg_acc': 50.0, 'loss': 9.187257766723633}


EP_train:0:  46%|| 71/154 [1:12:38<1:25:32, 61.83s/it]

{'epoch': 0, 'iter': 70, 'avg_loss': 9.317107012574102, 'avg_acc': 50.220070422535215, 'loss': 9.130542755126953}


EP_train:0:  53%|| 81/154 [1:22:44<1:14:20, 61.10s/it]

{'epoch': 0, 'iter': 80, 'avg_loss': 9.282886622864524, 'avg_acc': 50.27006172839506, 'loss': 9.056076049804688}


EP_train:0:  59%|| 91/154 [1:32:59<1:05:34, 62.45s/it]

{'epoch': 0, 'iter': 90, 'avg_loss': 9.24680620759398, 'avg_acc': 49.96565934065934, 'loss': 8.895912170410156}


EP_train:0:  66%|| 101/154 [1:43:24<55:18, 62.61s/it]

{'epoch': 0, 'iter': 100, 'avg_loss': 9.210352567162845, 'avg_acc': 50.21658415841584, 'loss': 8.820825576782227}


EP_train:0:  72%|| 111/154 [1:53:29<43:24, 60.56s/it]

{'epoch': 0, 'iter': 110, 'avg_loss': 9.172703665656012, 'avg_acc': 50.05630630630631, 'loss': 8.730839729309082}


EP_train:0:  79%|| 121/154 [2:03:36<33:43, 61.31s/it]

{'epoch': 0, 'iter': 120, 'avg_loss': 9.134493583490041, 'avg_acc': 50.05165289256198, 'loss': 8.737908363342285}


EP_train:0:  85%|| 131/154 [2:13:23<22:27, 58.60s/it]

{'epoch': 0, 'iter': 130, 'avg_loss': 9.09056054180815, 'avg_acc': 50.0, 'loss': 8.479402542114258}


EP_train:0:  92%|| 141/154 [2:23:20<12:58, 59.89s/it]

{'epoch': 0, 'iter': 140, 'avg_loss': 9.047352621741329, 'avg_acc': 50.310283687943254, 'loss': 8.420921325683594}


EP_train:0:  98%|| 151/154 [2:33:31<03:01, 60.52s/it]

{'epoch': 0, 'iter': 150, 'avg_loss': 9.00233828310935, 'avg_acc': 50.18625827814569, 'loss': 8.34437084197998}


EP_train:0: 100%|| 154/154 [2:36:29<00:00, 60.97s/it]

EP0, train:             avg_loss=8.988147760366465,             total_acc=50.10146103896104



