## CNN with GLOVE Embeddings for Sentence Classification

In [1]:
# Imports
!pip install portalocker
!pip install torchmetrics

import argparse
import logging
import time
import torch
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer, ngrams_iterator
from torchtext.datasets import DATASETS
from torchtext.prototype.transforms import load_sp_model, PRETRAINED_SP_MODEL, SentencePieceTokenizer
from torchtext.utils import download_from_url
from torchtext.vocab import build_vocab_from_iterator
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torchtext.vocab import GloVe
from tqdm import tqdm

torch.autograd.set_detect_anomaly(True)

Collecting portalocker
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.8.2
Collecting torchmetrics
  Downloading torchmetrics-1.2.1-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.10.0 torchmetrics-1.2.1


<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7a8213e435e0>

In [2]:
# Initialize some variables
DATASET = "AG_NEWS"
DATA_DIR = ".data"
DEVICE = "cpu"
EMBED_DIM = 300
LR = 1.0
BATCH_SIZE = 128
NUM_EPOCHS = 5
PADDING_VALUE = 0
PADDING_IDX = PADDING_VALUE

### Prepare the Vocabulary/Tokenize Text

In [6]:
# Get basic tokenizer
basic_english_tokenizer =get_tokenizer("basic_english")
TOKENIZER = basic_english_tokenizer

# This takes a (x, y) pair from data_iter and returns tokenized list of words
# For every sentence, it should yield a tokenized set of string
# "a b c d" -> ["a", "b", "c", "d"]
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield TOKENIZER(text)

# For text, return the tokens for each word
# This might work like "The man walks" -> ["the", "man", "walks"] -> [17, 123, 5]
def text_pipeline(text):
    return VOCAB(TOKENIZER(text))

# Return the label as an integer between 0 and num_classes - 1 inclusive
def label_pipeline(label):
    return int(label) - 1

# For each batch, return a tensor of labels and a tensor of ints representing the words in associated sentences
def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        # Get the label from {1, 2, 3, 4} to {0, 1, 2, 3}
        label_list.append(label_pipeline(_label))

        # Return a list of ints
        # This basically returns a tensor of the token ids and then puts them into text_list
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        # Append to text_list
        text_list.append(processed_text.clone().detach())


    # Pad and make into tensors as needed
    label_list = torch.tensor(label_list, dtype=torch.int64)

    text_list = pad_sequence(text_list, batch_first=True)

    return label_list.to(DEVICE), text_list.to(DEVICE)

# Load data
train_iter, test_iter = DATASETS[DATASET]()
num_class = len(set([label for (label, _) in train_iter]))
print(f"The number of classes is {num_class} ...")
VOCAB = build_vocab_from_iterator(yield_tokens(train_iter), specials=('<pad>', '<unk>'))
# Make the default index the same as that of the unk_token.
VOCAB.set_default_index(VOCAB['<unk>'])
print("Total number of words:",len(VOCAB))

# Define test/train sets
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

# Set up the dataloader
train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

# Peek at the training data
for yb, xb in train_dataloader:
    print("Shape y:",yb.shape, "Shape x:",xb.shape)
    print( "x:", xb)
    break

The number of classes is 4 ...
Total number of words: 95812
Shape y: torch.Size([128]) Shape x: torch.Size([128, 114])
x: tensor([[ 9742,   896,    12,  ...,     0,     0,     0],
        [ 2143, 39793,     5,  ...,     0,     0,     0],
        [ 4078,  3609,  9705,  ...,     0,     0,     0],
        ...,
        [ 3672, 39012,   147,  ...,     0,     0,     0],
        [ 5579,   721,  8684,  ...,     0,     0,     0],
        [ 1096,     5,   663,  ...,     0,     0,     0]])


### Get GLOVE embeddings

In [8]:
# GLOVE is a pretrained set of word vectors that comes with torchtext; it is roughly 2G of data.
# This will take a bit of time
GLOVE = GloVe()
print("Total number of words:",len(GLOVE), "Shape of embeddings:",GLOVE.vectors.shape)

Total number of words: 2196017 Shape of embeddings: torch.Size([2196017, 300])


### Define the model: CNN with GLOVE Embeddings

In [11]:
class CNN1dTextClassificationModel(nn.Module):
    def __init__(
        self,
        vocab_size,
        num_class,
        embed_dim = 300,
        use_pretrained = True,
        fine_tune_embeddings = True,
        debug = False
    ):

        super(CNN1dTextClassificationModel, self).__init__()

        self.embedding = nn.Embedding(
            vocab_size,
            embed_dim,
            padding_idx=PADDING_IDX
        )

        # A linear map from total number of filters (3) to num_class
        self.fc = nn.Linear(3, num_class)

        if use_pretrained:
            # Set the embeddings to not requiring gradients since we'll try and modify
            self.embedding.weight.requires_grad = False
            for i in range(vocab_size):
                # Get the token for the index i
                token = VOCAB.lookup_token(i)
                # Modify the embedding for index i by the embedding for that token
                # Do this only if token is in the stoi dictionary for GLOVE
                if token in GLOVE.stoi:
                   self.embedding.weight[i, :] = GLOVE.get_vecs_by_tokens(TOKENIZER(token),lower_case_backup=True)
            self.embedding.weight.requires_grad = True
        else:
            # Otherwise, initialize the weights as specified below
            self.init_weights()

        # If weights do not get changed, turn off gradients for the GloVe embeddings
        if not fine_tune_embeddings:
          self.embedding.weight.requires_grad = False

        # Define a Conv1d layer that collapses all the channels and does not collapse the time dimension
        self.cnn1 = nn.Conv1d(in_channels=embed_dim, out_channels=1, kernel_size=1)

        # Define 3 Conv1d layers each having 1 filter and kernel sizes 2, 3 and 4
        self.cnn2 = nn.Conv1d(in_channels=embed_dim, out_channels=1, kernel_size=2)
        self.cnn3 = nn.Conv1d(in_channels=embed_dim, out_channels=1, kernel_size=3)
        self.cnn4 = nn.Conv1d(in_channels=embed_dim, out_channels=1, kernel_size=4)

        self.debug = debug

    def init_weights(self):
        initrange = 0.5
        # Initialize the embedding weight matrix to uniform between the [-0.5, 0.5]
        self.embedding.weight.data.uniform_(-initrange,initrange)

        # Initialize the weight matrix of fc to uniform between the [-0.5, 0.5]
        self.fc.weight.data.uniform_(-initrange,initrange)

        # Initialize the bias for fc to zero
        self.fc.bias.data.fill_(0.0)


    # B = batch_size, L = sequence length, D = vector dimension
    def forward(self, text):
        # B X L X D
        # Get the embeddings for the text passed in
        embedded = self.embedding(text)

        if self.debug:
            print('embedding', embedded.shape)
        # B X D X L
        # Transpose the embedding above as needed
        embedded = embedded.transpose(1, 2)

        # B X 1 X L
        # Pass through cnn1
        cnn1 =  self.cnn1(embedded)
        if self.debug:
            print('cnn1', cnn1.shape)

        # B X 1
        # Take Max pooling over time
        cnn1 = nn.MaxPool1d(kernel_size =cnn1.shape[2], stride=1)(cnn1).squeeze(-1)
        if self.debug:
          print('cnn1 after max pool', cnn1.shape)

        # B X 1 X L - 1
        # Pass through cnn2 and add a RELU
        cnn2 =F.relu(self.cnn2(embedded))
        if self.debug:
            print('cnn2', cnn2.shape)

        # B X 1 X L - 2
        # Pass through cnn3 and add a RELU
        cnn3 = F.relu(self.cnn3(embedded))
        if self.debug:
            print('cnn3', cnn3.shape)

        # B X 1 X L - 3
        # Pass through cnn4 and add a RELU
        cnn4 = F.relu(self.cnn4(embedded))
        if self.debug:
            print('cnn4', cnn4.shape)

        # B X 1 in all cases
        # Apply max pooling to each of cnn2, cnn3 and cnn4
        cnn2 = nn.MaxPool1d(kernel_size =cnn2.shape[2], stride=1)(cnn2).squeeze(-1)
        cnn3 = nn.MaxPool1d(kernel_size =cnn3.shape[2], stride=1)(cnn3).squeeze(-1)
        cnn4 = nn.MaxPool1d(kernel_size =cnn4.shape[2], stride=1)(cnn4).squeeze(-1)

        # B X 1 in all cases
        # Apply max pooling over time
        if self.debug:
            print('cnn2 after max', cnn2.shape)

        # Add to each cnn2, 3, 4 a skip connection to cnn1 and average the results
        cnn2 = (cnn1 + cnn2)/2
        cnn3 = (cnn1 + cnn3)/2
        cnn4 = (cnn1 + cnn4)/2
        if self.debug:
            print('cnn2 after skip connection', cnn2.shape)

        # B X 3
        # Concatenate the above
        cnn_concat = torch.cat((cnn2, cnn3,cnn4), 1)
        if self.debug:
            print('cnn concat', cnn_concat.shape)
            # Set the debug to False after the first forward pass
            self.debug = False

        # Pass through an appropriate Linear layer to get the right dimensions needed
        out = self.fc(cnn_concat)
        return out

### Training the Model

In [12]:
# If this is True, we will initialize the Embedding layer with GLOVE
USE_PRETRANED = True

# If this is True, we will allow for gradient updates on the nn.Embedding layer
FINE_TUNE_EMBEDDINGS = True

# Set the loss appropriately
criterion = nn.CrossEntropyLoss()

model =  CNN1dTextClassificationModel(
        len(VOCAB),
        num_class,
        embed_dim = EMBED_DIM,
        use_pretrained = USE_PRETRANED,
        fine_tune_embeddings = FINE_TUNE_EMBEDDINGS,
        debug = True
    ).to(DEVICE)

# Set the optimizer to SGD
# Add an L2 regularizer of 0.00001
optimizer = torch.optim.SGD(model.parameters(), lr=LR, weight_decay =0.00001)

# Set the scheduler to StepLR with gamma=0.1 and step_size = 1.0
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma=0.1)


def train(dataloader, model, optimizer, criterion, epoch):
    model.train()
    total_acc, total_count = 0, 0
    total_loss, total_batches = 0.0, 0
    total_zero_gradients_percentage = []
    log_interval = 10

    for idx, (label, text) in tqdm(enumerate(dataloader)):
        # Set gradients to zero
        optimizer.zero_grad()

        # Get the predictions
        predicted_label = model(text)

        # Get the loss
        loss = criterion(predicted_label, label)

        # Do back propagation and get the gradients
        loss.backward()

        # Get the loss per batch and the number of batches
        with torch.no_grad():
            total_loss+=loss.item()
            total_batches+=1

        # Loop through all the parameters
        # Specifically, for this batch, get the percentage of zero gradients across all parameters
        # Append this to the list above which will print out the total every 10 batches
        total_nonzero_gradients = 0.0
        total_param_count = 0.0

        for i in model.parameters():
          if i.grad!= None:
            total_nonzero_gradients+= torch.count_nonzero(i.grad).item()
            total_param_count += i.grad.data.numel()

        total_zero_gradients_percentage+=[1-(total_nonzero_gradients/total_param_count)]

        # # Clip the gradient at 10.0
        torch.nn.utils.clip_grad_norm_(model.parameters(), 10.0)

        # Do an optimization step
        optimizer.step()

        # Get the accuracy
        _, prediction = torch.max(predicted_label.data, 1)
        total_acc += (label==prediction).sum().item()

        total_count += label.size(0)

        # Log results
        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f} "
                "| loss {:8.3f} "
                "| zero gradients percentage {:8.3f}".format(
                    epoch, idx,
                    len(dataloader),
                    total_acc / total_count,
                    total_loss / total_batches,
                    torch.tensor(total_zero_gradients_percentage).mean().item()
                    )
            )
            # Reset variables as needed
            total_acc, total_count = 0, 0
            total_loss, total_batches = 0.0, 0
            total_zero_gradients = []

### Model Validation:

In [13]:
def evaluate(dataloader, model, criterion):
    model.eval()
    total_acc, total_count = 0, 0
    total_loss = 0.0

    with torch.no_grad():
        for idx, (label, text) in enumerate(dataloader):
            # Get the predicted labels
            predicted_label = model(text)
            # Get the accuracy
            _, prediction = torch.max(predicted_label.data, 1)
            loss = criterion(input=predicted_label, target=label)
            total_acc += (label==prediction).sum().item()
            # Update the total count
            total_count +=  label.size(0)
            # Update the total loss
            total_loss += loss
    return total_acc / total_count, total_loss / total_count

### Model Execution:

In [None]:
for epoch in range(1, NUM_EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader, model, optimizer, criterion, epoch)
    accu_val, loss_val = evaluate(valid_dataloader, model, criterion)
    scheduler.step()
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch,
            time.time() - epoch_start_time,
            accu_val,
            loss_val
            )
    )
    print("-" * 59)

print("Checking the results of test dataset.")
accu_test, loss_test = evaluate(test_dataloader, model, criterion)
print("test accuracy {:8.3f}".format(accu_test))
print("test loss {:8.3f}".format(loss_test))

0it [00:00, ?it/s]

embedding torch.Size([128, 75, 300])
cnn1 torch.Size([128, 1, 75])
cnn1 after max pool torch.Size([128, 1])
cnn2 torch.Size([128, 1, 74])
cnn3 torch.Size([128, 1, 73])
cnn4 torch.Size([128, 1, 72])
cnn2 after max torch.Size([128, 1])
cnn2 after skip connection torch.Size([128, 1])
cnn concat torch.Size([128, 3])


11it [00:04,  2.08it/s]

| epoch   1 |    10/  891 batches | accuracy    0.385 | loss    1.266 | zero gradients percentage    0.994


21it [00:10,  1.75it/s]

| epoch   1 |    20/  891 batches | accuracy    0.607 | loss    0.972 | zero gradients percentage    0.994


31it [00:15,  2.23it/s]

| epoch   1 |    30/  891 batches | accuracy    0.693 | loss    0.798 | zero gradients percentage    0.994


41it [00:19,  2.46it/s]

| epoch   1 |    40/  891 batches | accuracy    0.721 | loss    0.729 | zero gradients percentage    0.994


51it [00:25,  1.60it/s]

| epoch   1 |    50/  891 batches | accuracy    0.768 | loss    0.649 | zero gradients percentage    0.994


61it [00:30,  2.10it/s]

| epoch   1 |    60/  891 batches | accuracy    0.770 | loss    0.643 | zero gradients percentage    0.994


71it [00:35,  2.35it/s]

| epoch   1 |    70/  891 batches | accuracy    0.804 | loss    0.564 | zero gradients percentage    0.994


81it [00:40,  1.57it/s]

| epoch   1 |    80/  891 batches | accuracy    0.784 | loss    0.572 | zero gradients percentage    0.993


91it [00:45,  2.08it/s]

| epoch   1 |    90/  891 batches | accuracy    0.803 | loss    0.566 | zero gradients percentage    0.993


101it [00:49,  2.35it/s]

| epoch   1 |   100/  891 batches | accuracy    0.802 | loss    0.547 | zero gradients percentage    0.993


111it [00:55,  1.67it/s]

| epoch   1 |   110/  891 batches | accuracy    0.805 | loss    0.551 | zero gradients percentage    0.993


121it [01:01,  2.11it/s]

| epoch   1 |   120/  891 batches | accuracy    0.816 | loss    0.516 | zero gradients percentage    0.993


131it [01:05,  2.38it/s]

| epoch   1 |   130/  891 batches | accuracy    0.820 | loss    0.521 | zero gradients percentage    0.993


141it [01:10,  1.99it/s]

| epoch   1 |   140/  891 batches | accuracy    0.834 | loss    0.483 | zero gradients percentage    0.993


151it [01:15,  1.99it/s]

| epoch   1 |   150/  891 batches | accuracy    0.827 | loss    0.507 | zero gradients percentage    0.993


161it [01:20,  2.31it/s]

| epoch   1 |   160/  891 batches | accuracy    0.830 | loss    0.475 | zero gradients percentage    0.993


171it [01:24,  2.32it/s]

| epoch   1 |   170/  891 batches | accuracy    0.818 | loss    0.503 | zero gradients percentage    0.993


181it [01:30,  1.84it/s]

| epoch   1 |   180/  891 batches | accuracy    0.848 | loss    0.454 | zero gradients percentage    0.993


191it [01:35,  2.00it/s]

| epoch   1 |   190/  891 batches | accuracy    0.845 | loss    0.471 | zero gradients percentage    0.993


201it [01:40,  2.14it/s]

| epoch   1 |   200/  891 batches | accuracy    0.840 | loss    0.500 | zero gradients percentage    0.993


211it [01:46,  1.72it/s]

| epoch   1 |   210/  891 batches | accuracy    0.805 | loss    0.578 | zero gradients percentage    0.993


221it [01:50,  1.99it/s]

| epoch   1 |   220/  891 batches | accuracy    0.830 | loss    0.477 | zero gradients percentage    0.993


231it [01:55,  2.38it/s]

| epoch   1 |   230/  891 batches | accuracy    0.851 | loss    0.449 | zero gradients percentage    0.993


241it [02:00,  1.72it/s]

| epoch   1 |   240/  891 batches | accuracy    0.848 | loss    0.447 | zero gradients percentage    0.993


251it [02:05,  2.30it/s]

| epoch   1 |   250/  891 batches | accuracy    0.826 | loss    0.477 | zero gradients percentage    0.993


261it [02:09,  2.33it/s]

| epoch   1 |   260/  891 batches | accuracy    0.827 | loss    0.487 | zero gradients percentage    0.993


271it [02:14,  1.71it/s]

| epoch   1 |   270/  891 batches | accuracy    0.845 | loss    0.465 | zero gradients percentage    0.993


281it [02:19,  2.27it/s]

| epoch   1 |   280/  891 batches | accuracy    0.850 | loss    0.423 | zero gradients percentage    0.993


291it [02:23,  2.36it/s]

| epoch   1 |   290/  891 batches | accuracy    0.850 | loss    0.457 | zero gradients percentage    0.993


301it [02:28,  1.96it/s]

| epoch   1 |   300/  891 batches | accuracy    0.836 | loss    0.454 | zero gradients percentage    0.993


311it [02:34,  1.94it/s]

| epoch   1 |   310/  891 batches | accuracy    0.852 | loss    0.448 | zero gradients percentage    0.993


321it [02:38,  2.22it/s]

| epoch   1 |   320/  891 batches | accuracy    0.845 | loss    0.464 | zero gradients percentage    0.993


331it [02:42,  2.41it/s]

| epoch   1 |   330/  891 batches | accuracy    0.836 | loss    0.464 | zero gradients percentage    0.993


341it [02:48,  1.72it/s]

| epoch   1 |   340/  891 batches | accuracy    0.838 | loss    0.479 | zero gradients percentage    0.993


351it [02:53,  2.23it/s]

| epoch   1 |   350/  891 batches | accuracy    0.849 | loss    0.441 | zero gradients percentage    0.993


361it [02:57,  2.13it/s]

| epoch   1 |   360/  891 batches | accuracy    0.866 | loss    0.409 | zero gradients percentage    0.993


371it [03:03,  1.57it/s]

| epoch   1 |   370/  891 batches | accuracy    0.864 | loss    0.443 | zero gradients percentage    0.993


381it [03:07,  2.34it/s]

| epoch   1 |   380/  891 batches | accuracy    0.844 | loss    0.429 | zero gradients percentage    0.993


391it [03:12,  2.44it/s]

| epoch   1 |   390/  891 batches | accuracy    0.841 | loss    0.473 | zero gradients percentage    0.993


401it [03:17,  1.64it/s]

| epoch   1 |   400/  891 batches | accuracy    0.859 | loss    0.431 | zero gradients percentage    0.993


411it [03:22,  2.11it/s]

| epoch   1 |   410/  891 batches | accuracy    0.853 | loss    0.494 | zero gradients percentage    0.993


421it [03:26,  2.34it/s]

| epoch   1 |   420/  891 batches | accuracy    0.851 | loss    0.457 | zero gradients percentage    0.993


431it [03:31,  1.74it/s]

| epoch   1 |   430/  891 batches | accuracy    0.841 | loss    0.481 | zero gradients percentage    0.993


441it [03:37,  2.05it/s]

| epoch   1 |   440/  891 batches | accuracy    0.834 | loss    0.496 | zero gradients percentage    0.993


451it [03:41,  2.02it/s]

| epoch   1 |   450/  891 batches | accuracy    0.834 | loss    0.505 | zero gradients percentage    0.993


461it [03:46,  2.15it/s]

| epoch   1 |   460/  891 batches | accuracy    0.855 | loss    0.484 | zero gradients percentage    0.993


471it [03:52,  1.81it/s]

| epoch   1 |   470/  891 batches | accuracy    0.863 | loss    0.403 | zero gradients percentage    0.993


481it [03:56,  2.22it/s]

| epoch   1 |   480/  891 batches | accuracy    0.854 | loss    0.411 | zero gradients percentage    0.993


491it [04:01,  2.09it/s]

| epoch   1 |   490/  891 batches | accuracy    0.854 | loss    0.435 | zero gradients percentage    0.993


501it [04:07,  1.59it/s]

| epoch   1 |   500/  891 batches | accuracy    0.852 | loss    0.426 | zero gradients percentage    0.993


511it [04:11,  2.34it/s]

| epoch   1 |   510/  891 batches | accuracy    0.866 | loss    0.378 | zero gradients percentage    0.993


521it [04:16,  2.11it/s]

| epoch   1 |   520/  891 batches | accuracy    0.855 | loss    0.421 | zero gradients percentage    0.993


531it [04:22,  1.70it/s]

| epoch   1 |   530/  891 batches | accuracy    0.846 | loss    0.445 | zero gradients percentage    0.993


541it [04:26,  2.25it/s]

| epoch   1 |   540/  891 batches | accuracy    0.833 | loss    0.483 | zero gradients percentage    0.993


551it [04:31,  2.10it/s]

| epoch   1 |   550/  891 batches | accuracy    0.870 | loss    0.401 | zero gradients percentage    0.993


561it [04:36,  1.66it/s]

| epoch   1 |   560/  891 batches | accuracy    0.845 | loss    0.464 | zero gradients percentage    0.993


571it [04:41,  2.51it/s]

| epoch   1 |   570/  891 batches | accuracy    0.868 | loss    0.405 | zero gradients percentage    0.993


581it [04:44,  2.60it/s]

| epoch   1 |   580/  891 batches | accuracy    0.837 | loss    0.459 | zero gradients percentage    0.993


591it [04:48,  2.15it/s]

| epoch   1 |   590/  891 batches | accuracy    0.841 | loss    0.459 | zero gradients percentage    0.993


601it [04:54,  1.91it/s]

| epoch   1 |   600/  891 batches | accuracy    0.843 | loss    0.440 | zero gradients percentage    0.993


611it [04:58,  2.68it/s]

| epoch   1 |   610/  891 batches | accuracy    0.859 | loss    0.425 | zero gradients percentage    0.993


621it [05:02,  2.27it/s]

| epoch   1 |   620/  891 batches | accuracy    0.866 | loss    0.401 | zero gradients percentage    0.993


631it [05:07,  1.90it/s]

| epoch   1 |   630/  891 batches | accuracy    0.845 | loss    0.459 | zero gradients percentage    0.993


641it [05:12,  2.14it/s]

| epoch   1 |   640/  891 batches | accuracy    0.828 | loss    0.470 | zero gradients percentage    0.993


651it [05:16,  2.36it/s]

| epoch   1 |   650/  891 batches | accuracy    0.850 | loss    0.420 | zero gradients percentage    0.993


661it [05:22,  1.72it/s]

| epoch   1 |   660/  891 batches | accuracy    0.863 | loss    0.433 | zero gradients percentage    0.993


671it [05:27,  2.17it/s]

| epoch   1 |   670/  891 batches | accuracy    0.847 | loss    0.442 | zero gradients percentage    0.993


681it [05:31,  2.52it/s]

| epoch   1 |   680/  891 batches | accuracy    0.877 | loss    0.366 | zero gradients percentage    0.993


691it [05:35,  2.15it/s]

| epoch   1 |   690/  891 batches | accuracy    0.871 | loss    0.376 | zero gradients percentage    0.993


701it [05:42,  1.80it/s]

| epoch   1 |   700/  891 batches | accuracy    0.863 | loss    0.412 | zero gradients percentage    0.993


711it [05:46,  2.34it/s]

| epoch   1 |   710/  891 batches | accuracy    0.859 | loss    0.403 | zero gradients percentage    0.993


721it [05:50,  2.21it/s]

| epoch   1 |   720/  891 batches | accuracy    0.862 | loss    0.399 | zero gradients percentage    0.993


731it [05:57,  1.57it/s]

| epoch   1 |   730/  891 batches | accuracy    0.857 | loss    0.412 | zero gradients percentage    0.993


741it [06:01,  2.24it/s]

| epoch   1 |   740/  891 batches | accuracy    0.844 | loss    0.444 | zero gradients percentage    0.993


751it [06:05,  2.42it/s]

| epoch   1 |   750/  891 batches | accuracy    0.864 | loss    0.402 | zero gradients percentage    0.993


761it [06:11,  1.64it/s]

| epoch   1 |   760/  891 batches | accuracy    0.874 | loss    0.379 | zero gradients percentage    0.993


771it [06:16,  2.18it/s]

| epoch   1 |   770/  891 batches | accuracy    0.850 | loss    0.407 | zero gradients percentage    0.993


781it [06:20,  2.37it/s]

| epoch   1 |   780/  891 batches | accuracy    0.874 | loss    0.392 | zero gradients percentage    0.993


791it [06:25,  1.78it/s]

| epoch   1 |   790/  891 batches | accuracy    0.884 | loss    0.356 | zero gradients percentage    0.993


801it [06:30,  2.13it/s]

| epoch   1 |   800/  891 batches | accuracy    0.859 | loss    0.399 | zero gradients percentage    0.993


811it [06:34,  2.43it/s]

| epoch   1 |   810/  891 batches | accuracy    0.869 | loss    0.398 | zero gradients percentage    0.993


821it [06:39,  1.82it/s]

| epoch   1 |   820/  891 batches | accuracy    0.877 | loss    0.354 | zero gradients percentage    0.993


831it [06:44,  2.10it/s]

| epoch   1 |   830/  891 batches | accuracy    0.859 | loss    0.425 | zero gradients percentage    0.993


841it [06:49,  2.40it/s]

| epoch   1 |   840/  891 batches | accuracy    0.868 | loss    0.395 | zero gradients percentage    0.993


851it [06:53,  2.45it/s]

| epoch   1 |   850/  891 batches | accuracy    0.867 | loss    0.399 | zero gradients percentage    0.993


861it [06:59,  1.57it/s]

| epoch   1 |   860/  891 batches | accuracy    0.860 | loss    0.419 | zero gradients percentage    0.993


871it [07:03,  2.38it/s]

| epoch   1 |   870/  891 batches | accuracy    0.856 | loss    0.419 | zero gradients percentage    0.993


881it [07:08,  2.25it/s]

| epoch   1 |   880/  891 batches | accuracy    0.865 | loss    0.402 | zero gradients percentage    0.993


891it [07:13,  2.05it/s]

| epoch   1 |   890/  891 batches | accuracy    0.870 | loss    0.390 | zero gradients percentage    0.993





-----------------------------------------------------------
| end of epoch   1 | time: 437.16s | valid accuracy    0.869 
-----------------------------------------------------------


11it [00:04,  2.25it/s]

| epoch   2 |    10/  891 batches | accuracy    0.876 | loss    0.366 | zero gradients percentage    0.993


21it [00:09,  1.91it/s]

| epoch   2 |    20/  891 batches | accuracy    0.859 | loss    0.385 | zero gradients percentage    0.993


31it [00:14,  2.16it/s]

| epoch   2 |    30/  891 batches | accuracy    0.869 | loss    0.384 | zero gradients percentage    0.993


41it [00:19,  2.16it/s]

| epoch   2 |    40/  891 batches | accuracy    0.872 | loss    0.366 | zero gradients percentage    0.993


51it [00:23,  2.17it/s]

| epoch   2 |    50/  891 batches | accuracy    0.887 | loss    0.355 | zero gradients percentage    0.993


61it [00:30,  1.83it/s]

| epoch   2 |    60/  891 batches | accuracy    0.869 | loss    0.356 | zero gradients percentage    0.993


71it [00:34,  2.35it/s]

| epoch   2 |    70/  891 batches | accuracy    0.879 | loss    0.344 | zero gradients percentage    0.993


81it [00:38,  2.27it/s]

| epoch   2 |    80/  891 batches | accuracy    0.870 | loss    0.381 | zero gradients percentage    0.993


91it [00:44,  1.73it/s]

| epoch   2 |    90/  891 batches | accuracy    0.884 | loss    0.357 | zero gradients percentage    0.993


101it [00:48,  2.30it/s]

| epoch   2 |   100/  891 batches | accuracy    0.863 | loss    0.430 | zero gradients percentage    0.993


111it [00:53,  2.20it/s]

| epoch   2 |   110/  891 batches | accuracy    0.880 | loss    0.370 | zero gradients percentage    0.993


121it [00:59,  1.50it/s]

| epoch   2 |   120/  891 batches | accuracy    0.887 | loss    0.360 | zero gradients percentage    0.993


131it [01:03,  2.28it/s]

| epoch   2 |   130/  891 batches | accuracy    0.864 | loss    0.369 | zero gradients percentage    0.993


141it [01:07,  2.41it/s]

| epoch   2 |   140/  891 batches | accuracy    0.874 | loss    0.370 | zero gradients percentage    0.993


151it [01:12,  1.90it/s]

| epoch   2 |   150/  891 batches | accuracy    0.878 | loss    0.350 | zero gradients percentage    0.993


161it [01:17,  2.21it/s]

| epoch   2 |   160/  891 batches | accuracy    0.881 | loss    0.371 | zero gradients percentage    0.993


171it [01:21,  2.34it/s]

| epoch   2 |   170/  891 batches | accuracy    0.887 | loss    0.347 | zero gradients percentage    0.993


181it [01:26,  2.23it/s]

| epoch   2 |   180/  891 batches | accuracy    0.878 | loss    0.357 | zero gradients percentage    0.992


191it [01:31,  1.94it/s]

| epoch   2 |   190/  891 batches | accuracy    0.887 | loss    0.358 | zero gradients percentage    0.992


201it [01:36,  2.31it/s]

| epoch   2 |   200/  891 batches | accuracy    0.868 | loss    0.388 | zero gradients percentage    0.992


211it [01:40,  2.31it/s]

| epoch   2 |   210/  891 batches | accuracy    0.880 | loss    0.362 | zero gradients percentage    0.992


221it [01:46,  1.54it/s]

| epoch   2 |   220/  891 batches | accuracy    0.874 | loss    0.379 | zero gradients percentage    0.992


231it [01:51,  2.23it/s]

| epoch   2 |   230/  891 batches | accuracy    0.878 | loss    0.377 | zero gradients percentage    0.992


241it [01:55,  2.41it/s]

| epoch   2 |   240/  891 batches | accuracy    0.884 | loss    0.346 | zero gradients percentage    0.992


251it [02:01,  1.67it/s]

| epoch   2 |   250/  891 batches | accuracy    0.870 | loss    0.383 | zero gradients percentage    0.992


261it [02:05,  2.17it/s]

| epoch   2 |   260/  891 batches | accuracy    0.873 | loss    0.380 | zero gradients percentage    0.992


271it [02:10,  2.31it/s]

| epoch   2 |   270/  891 batches | accuracy    0.866 | loss    0.369 | zero gradients percentage    0.992


281it [02:14,  1.87it/s]

| epoch   2 |   280/  891 batches | accuracy    0.869 | loss    0.352 | zero gradients percentage    0.992


291it [02:20,  2.01it/s]

| epoch   2 |   290/  891 batches | accuracy    0.875 | loss    0.361 | zero gradients percentage    0.992


301it [02:24,  2.44it/s]

| epoch   2 |   300/  891 batches | accuracy    0.884 | loss    0.363 | zero gradients percentage    0.992


311it [02:29,  1.98it/s]

| epoch   2 |   310/  891 batches | accuracy    0.880 | loss    0.344 | zero gradients percentage    0.992


321it [02:35,  1.90it/s]

| epoch   2 |   320/  891 batches | accuracy    0.890 | loss    0.319 | zero gradients percentage    0.992


331it [02:39,  2.27it/s]

| epoch   2 |   330/  891 batches | accuracy    0.887 | loss    0.342 | zero gradients percentage    0.992


341it [02:43,  2.29it/s]

| epoch   2 |   340/  891 batches | accuracy    0.892 | loss    0.348 | zero gradients percentage    0.992


351it [02:49,  1.79it/s]

| epoch   2 |   350/  891 batches | accuracy    0.869 | loss    0.367 | zero gradients percentage    0.992


361it [02:53,  2.36it/s]

| epoch   2 |   360/  891 batches | accuracy    0.855 | loss    0.414 | zero gradients percentage    0.992


371it [02:57,  2.36it/s]

| epoch   2 |   370/  891 batches | accuracy    0.888 | loss    0.339 | zero gradients percentage    0.992


381it [03:03,  1.78it/s]

| epoch   2 |   380/  891 batches | accuracy    0.882 | loss    0.361 | zero gradients percentage    0.992


391it [03:07,  2.44it/s]

| epoch   2 |   390/  891 batches | accuracy    0.884 | loss    0.344 | zero gradients percentage    0.992


401it [03:12,  2.44it/s]

| epoch   2 |   400/  891 batches | accuracy    0.885 | loss    0.360 | zero gradients percentage    0.992


411it [03:16,  1.96it/s]

| epoch   2 |   410/  891 batches | accuracy    0.885 | loss    0.347 | zero gradients percentage    0.992


421it [03:22,  2.19it/s]

| epoch   2 |   420/  891 batches | accuracy    0.890 | loss    0.330 | zero gradients percentage    0.992


431it [03:26,  2.43it/s]

| epoch   2 |   430/  891 batches | accuracy    0.882 | loss    0.328 | zero gradients percentage    0.992


441it [03:30,  2.11it/s]

| epoch   2 |   440/  891 batches | accuracy    0.876 | loss    0.349 | zero gradients percentage    0.992


451it [03:36,  1.77it/s]

| epoch   2 |   450/  891 batches | accuracy    0.889 | loss    0.344 | zero gradients percentage    0.992


461it [03:41,  2.13it/s]

| epoch   2 |   460/  891 batches | accuracy    0.871 | loss    0.382 | zero gradients percentage    0.992


471it [03:46,  2.20it/s]

| epoch   2 |   470/  891 batches | accuracy    0.870 | loss    0.381 | zero gradients percentage    0.992


481it [03:51,  1.78it/s]

| epoch   2 |   480/  891 batches | accuracy    0.872 | loss    0.368 | zero gradients percentage    0.992


491it [03:56,  2.39it/s]

| epoch   2 |   490/  891 batches | accuracy    0.889 | loss    0.346 | zero gradients percentage    0.992


501it [04:00,  2.46it/s]

| epoch   2 |   500/  891 batches | accuracy    0.888 | loss    0.334 | zero gradients percentage    0.992


511it [04:05,  1.74it/s]

| epoch   2 |   510/  891 batches | accuracy    0.895 | loss    0.326 | zero gradients percentage    0.992


521it [04:10,  2.09it/s]

| epoch   2 |   520/  891 batches | accuracy    0.872 | loss    0.358 | zero gradients percentage    0.992


531it [04:15,  2.25it/s]

| epoch   2 |   530/  891 batches | accuracy    0.858 | loss    0.418 | zero gradients percentage    0.992


541it [04:20,  1.81it/s]

| epoch   2 |   540/  891 batches | accuracy    0.860 | loss    0.411 | zero gradients percentage    0.992


551it [04:25,  2.23it/s]

| epoch   2 |   550/  891 batches | accuracy    0.886 | loss    0.353 | zero gradients percentage    0.992


561it [04:29,  2.56it/s]

| epoch   2 |   560/  891 batches | accuracy    0.869 | loss    0.370 | zero gradients percentage    0.992


571it [04:33,  2.27it/s]

| epoch   2 |   570/  891 batches | accuracy    0.876 | loss    0.373 | zero gradients percentage    0.992


581it [04:39,  1.86it/s]

| epoch   2 |   580/  891 batches | accuracy    0.870 | loss    0.385 | zero gradients percentage    0.992


591it [04:43,  2.36it/s]

| epoch   2 |   590/  891 batches | accuracy    0.870 | loss    0.391 | zero gradients percentage    0.992


601it [04:48,  2.10it/s]

| epoch   2 |   600/  891 batches | accuracy    0.880 | loss    0.359 | zero gradients percentage    0.992


611it [04:54,  1.61it/s]

| epoch   2 |   610/  891 batches | accuracy    0.882 | loss    0.337 | zero gradients percentage    0.992


621it [04:58,  2.42it/s]

| epoch   2 |   620/  891 batches | accuracy    0.886 | loss    0.365 | zero gradients percentage    0.992


631it [05:03,  2.33it/s]

| epoch   2 |   630/  891 batches | accuracy    0.891 | loss    0.347 | zero gradients percentage    0.992


641it [05:08,  1.64it/s]

| epoch   2 |   640/  891 batches | accuracy    0.864 | loss    0.396 | zero gradients percentage    0.992


651it [05:13,  2.32it/s]

| epoch   2 |   650/  891 batches | accuracy    0.879 | loss    0.383 | zero gradients percentage    0.992


661it [05:17,  2.37it/s]

| epoch   2 |   660/  891 batches | accuracy    0.898 | loss    0.356 | zero gradients percentage    0.992


671it [05:22,  1.84it/s]

| epoch   2 |   670/  891 batches | accuracy    0.871 | loss    0.380 | zero gradients percentage    0.992


681it [05:27,  2.03it/s]

| epoch   2 |   680/  891 batches | accuracy    0.882 | loss    0.351 | zero gradients percentage    0.992


691it [05:32,  2.36it/s]

| epoch   2 |   690/  891 batches | accuracy    0.884 | loss    0.352 | zero gradients percentage    0.992


701it [05:36,  1.95it/s]

| epoch   2 |   700/  891 batches | accuracy    0.895 | loss    0.325 | zero gradients percentage    0.992


711it [05:42,  2.14it/s]

| epoch   2 |   710/  891 batches | accuracy    0.877 | loss    0.371 | zero gradients percentage    0.992


721it [05:46,  2.39it/s]

| epoch   2 |   720/  891 batches | accuracy    0.883 | loss    0.374 | zero gradients percentage    0.992


731it [05:51,  2.34it/s]

| epoch   2 |   730/  891 batches | accuracy    0.880 | loss    0.330 | zero gradients percentage    0.992


741it [05:57,  1.84it/s]

| epoch   2 |   740/  891 batches | accuracy    0.880 | loss    0.388 | zero gradients percentage    0.992


751it [06:01,  2.07it/s]

| epoch   2 |   750/  891 batches | accuracy    0.870 | loss    0.375 | zero gradients percentage    0.992


761it [06:05,  2.29it/s]

| epoch   2 |   760/  891 batches | accuracy    0.890 | loss    0.325 | zero gradients percentage    0.992


771it [06:11,  1.63it/s]

| epoch   2 |   770/  891 batches | accuracy    0.878 | loss    0.374 | zero gradients percentage    0.992


781it [06:16,  2.19it/s]

| epoch   2 |   780/  891 batches | accuracy    0.879 | loss    0.329 | zero gradients percentage    0.992


791it [06:20,  2.40it/s]

| epoch   2 |   790/  891 batches | accuracy    0.878 | loss    0.377 | zero gradients percentage    0.992


801it [06:26,  1.66it/s]

| epoch   2 |   800/  891 batches | accuracy    0.886 | loss    0.348 | zero gradients percentage    0.992


811it [06:31,  2.26it/s]

| epoch   2 |   810/  891 batches | accuracy    0.891 | loss    0.318 | zero gradients percentage    0.992


821it [06:35,  2.27it/s]

| epoch   2 |   820/  891 batches | accuracy    0.880 | loss    0.362 | zero gradients percentage    0.992


831it [06:40,  1.75it/s]

| epoch   2 |   830/  891 batches | accuracy    0.870 | loss    0.377 | zero gradients percentage    0.992


841it [06:45,  2.13it/s]

| epoch   2 |   840/  891 batches | accuracy    0.881 | loss    0.354 | zero gradients percentage    0.992


851it [06:49,  2.31it/s]

| epoch   2 |   850/  891 batches | accuracy    0.884 | loss    0.364 | zero gradients percentage    0.992


861it [06:54,  2.15it/s]

| epoch   2 |   860/  891 batches | accuracy    0.887 | loss    0.356 | zero gradients percentage    0.992


871it [07:00,  1.88it/s]

| epoch   2 |   870/  891 batches | accuracy    0.870 | loss    0.359 | zero gradients percentage    0.992


881it [07:04,  2.30it/s]

| epoch   2 |   880/  891 batches | accuracy    0.873 | loss    0.392 | zero gradients percentage    0.992


891it [07:08,  2.08it/s]

| epoch   2 |   890/  891 batches | accuracy    0.899 | loss    0.315 | zero gradients percentage    0.992





-----------------------------------------------------------
| end of epoch   2 | time: 432.96s | valid accuracy    0.878 
-----------------------------------------------------------


11it [00:05,  2.45it/s]

| epoch   3 |    10/  891 batches | accuracy    0.889 | loss    0.328 | zero gradients percentage    0.993


21it [00:09,  2.06it/s]

| epoch   3 |    20/  891 batches | accuracy    0.888 | loss    0.363 | zero gradients percentage    0.992


31it [00:14,  1.71it/s]

| epoch   3 |    30/  891 batches | accuracy    0.884 | loss    0.371 | zero gradients percentage    0.992


41it [00:19,  2.37it/s]

| epoch   3 |    40/  891 batches | accuracy    0.886 | loss    0.339 | zero gradients percentage    0.992


51it [00:24,  2.37it/s]

| epoch   3 |    50/  891 batches | accuracy    0.889 | loss    0.319 | zero gradients percentage    0.992


61it [00:29,  1.71it/s]

| epoch   3 |    60/  891 batches | accuracy    0.875 | loss    0.370 | zero gradients percentage    0.992


71it [00:34,  2.15it/s]

| epoch   3 |    70/  891 batches | accuracy    0.890 | loss    0.331 | zero gradients percentage    0.992


81it [00:38,  2.32it/s]

| epoch   3 |    80/  891 batches | accuracy    0.884 | loss    0.339 | zero gradients percentage    0.992


91it [00:42,  2.36it/s]

| epoch   3 |    90/  891 batches | accuracy    0.892 | loss    0.337 | zero gradients percentage    0.992


101it [00:48,  1.61it/s]

| epoch   3 |   100/  891 batches | accuracy    0.874 | loss    0.376 | zero gradients percentage    0.992


111it [00:53,  2.30it/s]

| epoch   3 |   110/  891 batches | accuracy    0.901 | loss    0.296 | zero gradients percentage    0.992


121it [00:57,  2.35it/s]

| epoch   3 |   120/  891 batches | accuracy    0.884 | loss    0.325 | zero gradients percentage    0.992


131it [01:03,  1.57it/s]

| epoch   3 |   130/  891 batches | accuracy    0.887 | loss    0.337 | zero gradients percentage    0.992


141it [01:08,  2.14it/s]

| epoch   3 |   140/  891 batches | accuracy    0.884 | loss    0.358 | zero gradients percentage    0.992


151it [01:12,  2.35it/s]

| epoch   3 |   150/  891 batches | accuracy    0.872 | loss    0.381 | zero gradients percentage    0.992


161it [01:18,  1.73it/s]

| epoch   3 |   160/  891 batches | accuracy    0.884 | loss    0.347 | zero gradients percentage    0.992


171it [01:22,  2.41it/s]

| epoch   3 |   170/  891 batches | accuracy    0.887 | loss    0.335 | zero gradients percentage    0.992


181it [01:26,  2.33it/s]

| epoch   3 |   180/  891 batches | accuracy    0.879 | loss    0.359 | zero gradients percentage    0.992


191it [01:31,  1.91it/s]

| epoch   3 |   190/  891 batches | accuracy    0.881 | loss    0.367 | zero gradients percentage    0.992


201it [01:37,  2.21it/s]

| epoch   3 |   200/  891 batches | accuracy    0.894 | loss    0.315 | zero gradients percentage    0.992


211it [01:41,  2.32it/s]

| epoch   3 |   210/  891 batches | accuracy    0.887 | loss    0.365 | zero gradients percentage    0.992


221it [01:46,  1.92it/s]

| epoch   3 |   220/  891 batches | accuracy    0.892 | loss    0.335 | zero gradients percentage    0.992


231it [01:51,  2.11it/s]

| epoch   3 |   230/  891 batches | accuracy    0.888 | loss    0.313 | zero gradients percentage    0.992


241it [01:55,  2.42it/s]

| epoch   3 |   240/  891 batches | accuracy    0.881 | loss    0.348 | zero gradients percentage    0.992


251it [02:00,  2.34it/s]

| epoch   3 |   250/  891 batches | accuracy    0.869 | loss    0.377 | zero gradients percentage    0.992


261it [02:06,  1.49it/s]

| epoch   3 |   260/  891 batches | accuracy    0.893 | loss    0.337 | zero gradients percentage    0.992


271it [02:10,  2.44it/s]

| epoch   3 |   270/  891 batches | accuracy    0.892 | loss    0.335 | zero gradients percentage    0.992


281it [02:14,  2.28it/s]

| epoch   3 |   280/  891 batches | accuracy    0.875 | loss    0.357 | zero gradients percentage    0.992


291it [02:20,  1.71it/s]

| epoch   3 |   290/  891 batches | accuracy    0.883 | loss    0.332 | zero gradients percentage    0.992


301it [02:25,  2.20it/s]

| epoch   3 |   300/  891 batches | accuracy    0.887 | loss    0.347 | zero gradients percentage    0.992


311it [02:29,  2.35it/s]

| epoch   3 |   310/  891 batches | accuracy    0.885 | loss    0.334 | zero gradients percentage    0.992


321it [02:34,  1.70it/s]

| epoch   3 |   320/  891 batches | accuracy    0.887 | loss    0.358 | zero gradients percentage    0.992


331it [02:39,  2.09it/s]

| epoch   3 |   330/  891 batches | accuracy    0.880 | loss    0.360 | zero gradients percentage    0.992


341it [02:43,  2.48it/s]

| epoch   3 |   340/  891 batches | accuracy    0.880 | loss    0.365 | zero gradients percentage    0.992


351it [02:48,  1.91it/s]

| epoch   3 |   350/  891 batches | accuracy    0.878 | loss    0.366 | zero gradients percentage    0.992


361it [02:54,  1.78it/s]

| epoch   3 |   360/  891 batches | accuracy    0.891 | loss    0.332 | zero gradients percentage    0.992


371it [02:58,  2.44it/s]

| epoch   3 |   370/  891 batches | accuracy    0.898 | loss    0.294 | zero gradients percentage    0.992


381it [03:02,  2.36it/s]

| epoch   3 |   380/  891 batches | accuracy    0.894 | loss    0.323 | zero gradients percentage    0.992


391it [03:08,  1.78it/s]

| epoch   3 |   390/  891 batches | accuracy    0.884 | loss    0.357 | zero gradients percentage    0.992


401it [03:13,  2.36it/s]

| epoch   3 |   400/  891 batches | accuracy    0.889 | loss    0.331 | zero gradients percentage    0.992


411it [03:17,  2.17it/s]

| epoch   3 |   410/  891 batches | accuracy    0.865 | loss    0.399 | zero gradients percentage    0.992


421it [03:23,  1.65it/s]

| epoch   3 |   420/  891 batches | accuracy    0.883 | loss    0.361 | zero gradients percentage    0.992


431it [03:28,  2.26it/s]

| epoch   3 |   430/  891 batches | accuracy    0.879 | loss    0.358 | zero gradients percentage    0.992


441it [03:32,  2.39it/s]

| epoch   3 |   440/  891 batches | accuracy    0.887 | loss    0.332 | zero gradients percentage    0.992


451it [03:37,  1.54it/s]

| epoch   3 |   450/  891 batches | accuracy    0.884 | loss    0.370 | zero gradients percentage    0.992


461it [03:43,  2.23it/s]

| epoch   3 |   460/  891 batches | accuracy    0.887 | loss    0.355 | zero gradients percentage    0.992


471it [03:47,  2.47it/s]

| epoch   3 |   470/  891 batches | accuracy    0.880 | loss    0.370 | zero gradients percentage    0.992


481it [03:51,  1.96it/s]

| epoch   3 |   480/  891 batches | accuracy    0.876 | loss    0.378 | zero gradients percentage    0.992


491it [03:57,  2.06it/s]

| epoch   3 |   490/  891 batches | accuracy    0.881 | loss    0.350 | zero gradients percentage    0.992


501it [04:01,  2.39it/s]

| epoch   3 |   500/  891 batches | accuracy    0.885 | loss    0.355 | zero gradients percentage    0.992


511it [04:06,  2.21it/s]

| epoch   3 |   510/  891 batches | accuracy    0.873 | loss    0.368 | zero gradients percentage    0.992


521it [04:11,  1.84it/s]

| epoch   3 |   520/  891 batches | accuracy    0.873 | loss    0.371 | zero gradients percentage    0.992


531it [04:16,  2.33it/s]

| epoch   3 |   530/  891 batches | accuracy    0.874 | loss    0.358 | zero gradients percentage    0.992


541it [04:20,  2.45it/s]

| epoch   3 |   540/  891 batches | accuracy    0.870 | loss    0.351 | zero gradients percentage    0.992


551it [04:25,  1.73it/s]

| epoch   3 |   550/  891 batches | accuracy    0.893 | loss    0.337 | zero gradients percentage    0.992


561it [04:30,  2.32it/s]

| epoch   3 |   560/  891 batches | accuracy    0.878 | loss    0.362 | zero gradients percentage    0.992


571it [04:34,  2.54it/s]

| epoch   3 |   570/  891 batches | accuracy    0.874 | loss    0.369 | zero gradients percentage    0.992


581it [04:39,  1.73it/s]

| epoch   3 |   580/  891 batches | accuracy    0.887 | loss    0.337 | zero gradients percentage    0.992


591it [04:45,  1.97it/s]

| epoch   3 |   590/  891 batches | accuracy    0.880 | loss    0.331 | zero gradients percentage    0.992


601it [04:49,  2.40it/s]

| epoch   3 |   600/  891 batches | accuracy    0.875 | loss    0.373 | zero gradients percentage    0.992


611it [04:54,  1.69it/s]

| epoch   3 |   610/  891 batches | accuracy    0.880 | loss    0.346 | zero gradients percentage    0.992


621it [04:59,  2.21it/s]

| epoch   3 |   620/  891 batches | accuracy    0.889 | loss    0.330 | zero gradients percentage    0.992


631it [05:03,  2.37it/s]

| epoch   3 |   630/  891 batches | accuracy    0.905 | loss    0.318 | zero gradients percentage    0.992


641it [05:07,  2.51it/s]

| epoch   3 |   640/  891 batches | accuracy    0.870 | loss    0.402 | zero gradients percentage    0.992


651it [05:13,  1.63it/s]

| epoch   3 |   650/  891 batches | accuracy    0.884 | loss    0.343 | zero gradients percentage    0.992


661it [05:18,  2.31it/s]

| epoch   3 |   660/  891 batches | accuracy    0.884 | loss    0.362 | zero gradients percentage    0.992


671it [05:22,  2.49it/s]

| epoch   3 |   670/  891 batches | accuracy    0.894 | loss    0.332 | zero gradients percentage    0.992


681it [05:27,  1.74it/s]

| epoch   3 |   680/  891 batches | accuracy    0.888 | loss    0.333 | zero gradients percentage    0.992


691it [05:32,  2.16it/s]

| epoch   3 |   690/  891 batches | accuracy    0.877 | loss    0.351 | zero gradients percentage    0.992


701it [05:36,  2.44it/s]

| epoch   3 |   700/  891 batches | accuracy    0.893 | loss    0.322 | zero gradients percentage    0.992


711it [05:41,  1.81it/s]

| epoch   3 |   710/  891 batches | accuracy    0.865 | loss    0.357 | zero gradients percentage    0.992


721it [05:46,  2.35it/s]

| epoch   3 |   720/  891 batches | accuracy    0.884 | loss    0.336 | zero gradients percentage    0.992


731it [05:50,  2.50it/s]

| epoch   3 |   730/  891 batches | accuracy    0.878 | loss    0.336 | zero gradients percentage    0.992


741it [05:54,  2.72it/s]

| epoch   3 |   740/  891 batches | accuracy    0.887 | loss    0.335 | zero gradients percentage    0.992


751it [06:00,  1.79it/s]

| epoch   3 |   750/  891 batches | accuracy    0.883 | loss    0.339 | zero gradients percentage    0.992


761it [06:04,  2.59it/s]

| epoch   3 |   760/  891 batches | accuracy    0.873 | loss    0.368 | zero gradients percentage    0.992


771it [06:08,  2.32it/s]

| epoch   3 |   770/  891 batches | accuracy    0.900 | loss    0.335 | zero gradients percentage    0.992


781it [06:12,  1.87it/s]

| epoch   3 |   780/  891 batches | accuracy    0.878 | loss    0.372 | zero gradients percentage    0.992


791it [06:17,  2.43it/s]

| epoch   3 |   790/  891 batches | accuracy    0.886 | loss    0.333 | zero gradients percentage    0.992


801it [06:21,  2.68it/s]

| epoch   3 |   800/  891 batches | accuracy    0.879 | loss    0.348 | zero gradients percentage    0.992


811it [06:25,  2.48it/s]

| epoch   3 |   810/  891 batches | accuracy    0.880 | loss    0.373 | zero gradients percentage    0.992


821it [06:31,  1.91it/s]

| epoch   3 |   820/  891 batches | accuracy    0.863 | loss    0.391 | zero gradients percentage    0.992


831it [06:35,  2.55it/s]

| epoch   3 |   830/  891 batches | accuracy    0.873 | loss    0.367 | zero gradients percentage    0.992


841it [06:39,  2.65it/s]

| epoch   3 |   840/  891 batches | accuracy    0.865 | loss    0.371 | zero gradients percentage    0.992


851it [06:43,  1.89it/s]

| epoch   3 |   850/  891 batches | accuracy    0.866 | loss    0.384 | zero gradients percentage    0.992


861it [06:48,  2.26it/s]

| epoch   3 |   860/  891 batches | accuracy    0.884 | loss    0.370 | zero gradients percentage    0.992


871it [06:52,  2.44it/s]

| epoch   3 |   870/  891 batches | accuracy    0.880 | loss    0.344 | zero gradients percentage    0.992


881it [06:56,  2.46it/s]

| epoch   3 |   880/  891 batches | accuracy    0.883 | loss    0.366 | zero gradients percentage    0.992


891it [07:02,  2.11it/s]

| epoch   3 |   890/  891 batches | accuracy    0.886 | loss    0.350 | zero gradients percentage    0.992





-----------------------------------------------------------
| end of epoch   3 | time: 425.61s | valid accuracy    0.876 
-----------------------------------------------------------


11it [00:04,  2.30it/s]

| epoch   4 |    10/  891 batches | accuracy    0.881 | loss    0.372 | zero gradients percentage    0.992


21it [00:10,  1.71it/s]

| epoch   4 |    20/  891 batches | accuracy    0.874 | loss    0.345 | zero gradients percentage    0.992


31it [00:15,  2.23it/s]

| epoch   4 |    30/  891 batches | accuracy    0.891 | loss    0.311 | zero gradients percentage    0.992


41it [00:19,  2.46it/s]

| epoch   4 |    40/  891 batches | accuracy    0.884 | loss    0.342 | zero gradients percentage    0.992


51it [00:24,  1.92it/s]

| epoch   4 |    50/  891 batches | accuracy    0.885 | loss    0.346 | zero gradients percentage    0.992


61it [00:30,  1.94it/s]

| epoch   4 |    60/  891 batches | accuracy    0.893 | loss    0.351 | zero gradients percentage    0.992


71it [00:34,  2.41it/s]

| epoch   4 |    70/  891 batches | accuracy    0.866 | loss    0.370 | zero gradients percentage    0.992


81it [00:38,  2.65it/s]

| epoch   4 |    80/  891 batches | accuracy    0.899 | loss    0.309 | zero gradients percentage    0.992


91it [00:43,  1.74it/s]

| epoch   4 |    90/  891 batches | accuracy    0.893 | loss    0.324 | zero gradients percentage    0.992


101it [00:47,  2.72it/s]

| epoch   4 |   100/  891 batches | accuracy    0.898 | loss    0.308 | zero gradients percentage    0.992


111it [00:51,  2.31it/s]

| epoch   4 |   110/  891 batches | accuracy    0.879 | loss    0.323 | zero gradients percentage    0.992


121it [00:56,  1.98it/s]

| epoch   4 |   120/  891 batches | accuracy    0.882 | loss    0.339 | zero gradients percentage    0.992


131it [01:01,  2.27it/s]

| epoch   4 |   130/  891 batches | accuracy    0.889 | loss    0.328 | zero gradients percentage    0.992


141it [01:05,  2.49it/s]

| epoch   4 |   140/  891 batches | accuracy    0.894 | loss    0.330 | zero gradients percentage    0.992


151it [01:08,  2.77it/s]

| epoch   4 |   150/  891 batches | accuracy    0.878 | loss    0.351 | zero gradients percentage    0.992


161it [01:13,  1.90it/s]

| epoch   4 |   160/  891 batches | accuracy    0.887 | loss    0.344 | zero gradients percentage    0.992


171it [01:18,  2.17it/s]

| epoch   4 |   170/  891 batches | accuracy    0.893 | loss    0.349 | zero gradients percentage    0.992


181it [01:22,  2.32it/s]

| epoch   4 |   180/  891 batches | accuracy    0.878 | loss    0.369 | zero gradients percentage    0.992


191it [01:26,  2.20it/s]

| epoch   4 |   190/  891 batches | accuracy    0.873 | loss    0.365 | zero gradients percentage    0.992


201it [01:31,  2.10it/s]

| epoch   4 |   200/  891 batches | accuracy    0.878 | loss    0.385 | zero gradients percentage    0.992


211it [01:36,  2.46it/s]

| epoch   4 |   210/  891 batches | accuracy    0.868 | loss    0.386 | zero gradients percentage    0.992


221it [01:40,  2.44it/s]

| epoch   4 |   220/  891 batches | accuracy    0.896 | loss    0.304 | zero gradients percentage    0.992


231it [01:45,  1.61it/s]

| epoch   4 |   230/  891 batches | accuracy    0.884 | loss    0.361 | zero gradients percentage    0.992


241it [01:50,  2.35it/s]

| epoch   4 |   240/  891 batches | accuracy    0.877 | loss    0.360 | zero gradients percentage    0.992


251it [01:54,  2.35it/s]

| epoch   4 |   250/  891 batches | accuracy    0.881 | loss    0.352 | zero gradients percentage    0.992


261it [01:59,  1.79it/s]

| epoch   4 |   260/  891 batches | accuracy    0.885 | loss    0.346 | zero gradients percentage    0.992


271it [02:04,  2.14it/s]

| epoch   4 |   270/  891 batches | accuracy    0.881 | loss    0.362 | zero gradients percentage    0.992


281it [02:09,  2.26it/s]

| epoch   4 |   280/  891 batches | accuracy    0.865 | loss    0.393 | zero gradients percentage    0.992


291it [02:14,  1.84it/s]

| epoch   4 |   290/  891 batches | accuracy    0.891 | loss    0.334 | zero gradients percentage    0.992


301it [02:19,  2.00it/s]

| epoch   4 |   300/  891 batches | accuracy    0.891 | loss    0.337 | zero gradients percentage    0.992


311it [02:24,  2.17it/s]

| epoch   4 |   310/  891 batches | accuracy    0.898 | loss    0.313 | zero gradients percentage    0.992


321it [02:29,  1.70it/s]

| epoch   4 |   320/  891 batches | accuracy    0.881 | loss    0.359 | zero gradients percentage    0.992


331it [02:35,  2.05it/s]

| epoch   4 |   330/  891 batches | accuracy    0.891 | loss    0.329 | zero gradients percentage    0.992


341it [02:39,  2.15it/s]

| epoch   4 |   340/  891 batches | accuracy    0.885 | loss    0.358 | zero gradients percentage    0.992


351it [02:43,  2.38it/s]

| epoch   4 |   350/  891 batches | accuracy    0.887 | loss    0.346 | zero gradients percentage    0.992


361it [02:49,  1.85it/s]

| epoch   4 |   360/  891 batches | accuracy    0.874 | loss    0.383 | zero gradients percentage    0.992


371it [02:53,  2.38it/s]

| epoch   4 |   370/  891 batches | accuracy    0.909 | loss    0.282 | zero gradients percentage    0.992


381it [02:58,  2.48it/s]

| epoch   4 |   380/  891 batches | accuracy    0.873 | loss    0.372 | zero gradients percentage    0.992


391it [03:03,  1.78it/s]

| epoch   4 |   390/  891 batches | accuracy    0.869 | loss    0.359 | zero gradients percentage    0.992


401it [03:07,  2.35it/s]

| epoch   4 |   400/  891 batches | accuracy    0.888 | loss    0.343 | zero gradients percentage    0.992


411it [03:12,  2.43it/s]

| epoch   4 |   410/  891 batches | accuracy    0.880 | loss    0.369 | zero gradients percentage    0.992


421it [03:17,  1.58it/s]

| epoch   4 |   420/  891 batches | accuracy    0.883 | loss    0.352 | zero gradients percentage    0.992


431it [03:22,  2.12it/s]

| epoch   4 |   430/  891 batches | accuracy    0.887 | loss    0.331 | zero gradients percentage    0.992


441it [03:27,  2.33it/s]

| epoch   4 |   440/  891 batches | accuracy    0.901 | loss    0.322 | zero gradients percentage    0.992


451it [03:31,  1.90it/s]

| epoch   4 |   450/  891 batches | accuracy    0.866 | loss    0.384 | zero gradients percentage    0.992


461it [03:37,  1.89it/s]

| epoch   4 |   460/  891 batches | accuracy    0.880 | loss    0.346 | zero gradients percentage    0.992


471it [03:42,  2.28it/s]

| epoch   4 |   470/  891 batches | accuracy    0.875 | loss    0.355 | zero gradients percentage    0.992


481it [03:46,  2.13it/s]

| epoch   4 |   480/  891 batches | accuracy    0.899 | loss    0.309 | zero gradients percentage    0.992


491it [03:52,  1.98it/s]

| epoch   4 |   490/  891 batches | accuracy    0.870 | loss    0.374 | zero gradients percentage    0.992


501it [03:57,  2.02it/s]

| epoch   4 |   500/  891 batches | accuracy    0.877 | loss    0.358 | zero gradients percentage    0.992


511it [04:01,  2.37it/s]

| epoch   4 |   510/  891 batches | accuracy    0.905 | loss    0.310 | zero gradients percentage    0.992


521it [04:07,  1.64it/s]

| epoch   4 |   520/  891 batches | accuracy    0.884 | loss    0.355 | zero gradients percentage    0.992


531it [04:11,  2.28it/s]

| epoch   4 |   530/  891 batches | accuracy    0.880 | loss    0.350 | zero gradients percentage    0.992


541it [04:16,  2.04it/s]

| epoch   4 |   540/  891 batches | accuracy    0.873 | loss    0.392 | zero gradients percentage    0.992


551it [04:21,  1.72it/s]

| epoch   4 |   550/  891 batches | accuracy    0.887 | loss    0.341 | zero gradients percentage    0.992


561it [04:26,  2.36it/s]

| epoch   4 |   560/  891 batches | accuracy    0.875 | loss    0.361 | zero gradients percentage    0.992


571it [04:30,  2.35it/s]

| epoch   4 |   570/  891 batches | accuracy    0.895 | loss    0.335 | zero gradients percentage    0.992


581it [04:36,  1.63it/s]

| epoch   4 |   580/  891 batches | accuracy    0.869 | loss    0.384 | zero gradients percentage    0.992


591it [04:41,  2.25it/s]

| epoch   4 |   590/  891 batches | accuracy    0.885 | loss    0.351 | zero gradients percentage    0.992


601it [04:45,  2.30it/s]

| epoch   4 |   600/  891 batches | accuracy    0.891 | loss    0.347 | zero gradients percentage    0.992


611it [04:50,  1.92it/s]

| epoch   4 |   610/  891 batches | accuracy    0.890 | loss    0.360 | zero gradients percentage    0.992


621it [04:55,  2.06it/s]

| epoch   4 |   620/  891 batches | accuracy    0.878 | loss    0.338 | zero gradients percentage    0.992


631it [05:00,  2.36it/s]

| epoch   4 |   630/  891 batches | accuracy    0.880 | loss    0.360 | zero gradients percentage    0.992


641it [05:04,  2.14it/s]

| epoch   4 |   640/  891 batches | accuracy    0.885 | loss    0.365 | zero gradients percentage    0.992


651it [05:10,  1.89it/s]

| epoch   4 |   650/  891 batches | accuracy    0.880 | loss    0.347 | zero gradients percentage    0.992


661it [05:14,  2.42it/s]

| epoch   4 |   660/  891 batches | accuracy    0.872 | loss    0.373 | zero gradients percentage    0.992


671it [05:19,  2.01it/s]

| epoch   4 |   670/  891 batches | accuracy    0.882 | loss    0.363 | zero gradients percentage    0.992


681it [05:25,  1.62it/s]

| epoch   4 |   680/  891 batches | accuracy    0.872 | loss    0.381 | zero gradients percentage    0.992


691it [05:29,  2.44it/s]

| epoch   4 |   690/  891 batches | accuracy    0.876 | loss    0.364 | zero gradients percentage    0.992


701it [05:34,  2.33it/s]

| epoch   4 |   700/  891 batches | accuracy    0.892 | loss    0.339 | zero gradients percentage    0.992


711it [05:40,  1.52it/s]

| epoch   4 |   710/  891 batches | accuracy    0.870 | loss    0.354 | zero gradients percentage    0.992


721it [05:44,  2.27it/s]

| epoch   4 |   720/  891 batches | accuracy    0.888 | loss    0.332 | zero gradients percentage    0.992


731it [05:49,  2.18it/s]

| epoch   4 |   730/  891 batches | accuracy    0.906 | loss    0.301 | zero gradients percentage    0.992


741it [05:54,  1.66it/s]

| epoch   4 |   740/  891 batches | accuracy    0.873 | loss    0.372 | zero gradients percentage    0.992


751it [05:59,  2.22it/s]

| epoch   4 |   750/  891 batches | accuracy    0.883 | loss    0.328 | zero gradients percentage    0.992


761it [06:03,  2.39it/s]

| epoch   4 |   760/  891 batches | accuracy    0.898 | loss    0.312 | zero gradients percentage    0.992


771it [06:08,  1.99it/s]

| epoch   4 |   770/  891 batches | accuracy    0.877 | loss    0.354 | zero gradients percentage    0.992


781it [06:13,  2.09it/s]

| epoch   4 |   780/  891 batches | accuracy    0.878 | loss    0.357 | zero gradients percentage    0.992


791it [06:18,  2.51it/s]

| epoch   4 |   790/  891 batches | accuracy    0.889 | loss    0.319 | zero gradients percentage    0.992


801it [06:22,  2.51it/s]

| epoch   4 |   800/  891 batches | accuracy    0.877 | loss    0.371 | zero gradients percentage    0.992


811it [06:28,  1.71it/s]

| epoch   4 |   810/  891 batches | accuracy    0.885 | loss    0.348 | zero gradients percentage    0.992


821it [06:32,  2.31it/s]

| epoch   4 |   820/  891 batches | accuracy    0.881 | loss    0.361 | zero gradients percentage    0.992


831it [06:37,  2.28it/s]

| epoch   4 |   830/  891 batches | accuracy    0.888 | loss    0.326 | zero gradients percentage    0.992


841it [06:42,  1.60it/s]

| epoch   4 |   840/  891 batches | accuracy    0.867 | loss    0.401 | zero gradients percentage    0.992


851it [06:47,  2.20it/s]

| epoch   4 |   850/  891 batches | accuracy    0.888 | loss    0.369 | zero gradients percentage    0.992


861it [06:51,  2.26it/s]

| epoch   4 |   860/  891 batches | accuracy    0.882 | loss    0.342 | zero gradients percentage    0.992


871it [06:57,  1.66it/s]

| epoch   4 |   870/  891 batches | accuracy    0.884 | loss    0.367 | zero gradients percentage    0.992


881it [07:02,  2.25it/s]

| epoch   4 |   880/  891 batches | accuracy    0.879 | loss    0.328 | zero gradients percentage    0.992


891it [07:06,  2.09it/s]

| epoch   4 |   890/  891 batches | accuracy    0.890 | loss    0.335 | zero gradients percentage    0.992





-----------------------------------------------------------
| end of epoch   4 | time: 429.29s | valid accuracy    0.876 
-----------------------------------------------------------


11it [00:06,  1.92it/s]

| epoch   5 |    10/  891 batches | accuracy    0.883 | loss    0.333 | zero gradients percentage    0.992


21it [00:10,  2.52it/s]

| epoch   5 |    20/  891 batches | accuracy    0.874 | loss    0.387 | zero gradients percentage    0.992


31it [00:14,  2.40it/s]

| epoch   5 |    30/  891 batches | accuracy    0.884 | loss    0.347 | zero gradients percentage    0.992


41it [00:19,  1.82it/s]

| epoch   5 |    40/  891 batches | accuracy    0.884 | loss    0.361 | zero gradients percentage    0.992


51it [00:24,  2.58it/s]

| epoch   5 |    50/  891 batches | accuracy    0.884 | loss    0.355 | zero gradients percentage    0.992


61it [00:28,  2.61it/s]

| epoch   5 |    60/  891 batches | accuracy    0.873 | loss    0.367 | zero gradients percentage    0.992


71it [00:31,  2.38it/s]

| epoch   5 |    70/  891 batches | accuracy    0.880 | loss    0.362 | zero gradients percentage    0.992


81it [00:37,  2.06it/s]

| epoch   5 |    80/  891 batches | accuracy    0.877 | loss    0.361 | zero gradients percentage    0.992


91it [00:40,  2.67it/s]

| epoch   5 |    90/  891 batches | accuracy    0.871 | loss    0.382 | zero gradients percentage    0.992


101it [00:45,  2.42it/s]

| epoch   5 |   100/  891 batches | accuracy    0.882 | loss    0.372 | zero gradients percentage    0.992


111it [00:49,  1.98it/s]

| epoch   5 |   110/  891 batches | accuracy    0.882 | loss    0.368 | zero gradients percentage    0.992


121it [00:54,  2.55it/s]

| epoch   5 |   120/  891 batches | accuracy    0.882 | loss    0.346 | zero gradients percentage    0.992


131it [00:58,  2.62it/s]

| epoch   5 |   130/  891 batches | accuracy    0.880 | loss    0.352 | zero gradients percentage    0.992


141it [01:01,  2.69it/s]

| epoch   5 |   140/  891 batches | accuracy    0.887 | loss    0.344 | zero gradients percentage    0.992


151it [01:07,  1.93it/s]

| epoch   5 |   150/  891 batches | accuracy    0.874 | loss    0.359 | zero gradients percentage    0.992


161it [01:11,  2.54it/s]

| epoch   5 |   160/  891 batches | accuracy    0.890 | loss    0.354 | zero gradients percentage    0.992


171it [01:15,  2.26it/s]

| epoch   5 |   170/  891 batches | accuracy    0.880 | loss    0.348 | zero gradients percentage    0.992


181it [01:20,  1.72it/s]

| epoch   5 |   180/  891 batches | accuracy    0.897 | loss    0.329 | zero gradients percentage    0.992


191it [01:25,  2.25it/s]

| epoch   5 |   190/  891 batches | accuracy    0.889 | loss    0.353 | zero gradients percentage    0.992


201it [01:30,  2.22it/s]

| epoch   5 |   200/  891 batches | accuracy    0.886 | loss    0.325 | zero gradients percentage    0.992


211it [01:34,  1.78it/s]

| epoch   5 |   210/  891 batches | accuracy    0.883 | loss    0.334 | zero gradients percentage    0.992


221it [01:40,  2.12it/s]

| epoch   5 |   220/  891 batches | accuracy    0.880 | loss    0.374 | zero gradients percentage    0.992


231it [01:44,  2.28it/s]

| epoch   5 |   230/  891 batches | accuracy    0.881 | loss    0.360 | zero gradients percentage    0.992


241it [01:48,  2.45it/s]

| epoch   5 |   240/  891 batches | accuracy    0.887 | loss    0.331 | zero gradients percentage    0.992


251it [01:54,  1.67it/s]

| epoch   5 |   250/  891 batches | accuracy    0.877 | loss    0.371 | zero gradients percentage    0.992


261it [01:58,  2.33it/s]

| epoch   5 |   260/  891 batches | accuracy    0.888 | loss    0.343 | zero gradients percentage    0.992


271it [02:03,  2.29it/s]

| epoch   5 |   270/  891 batches | accuracy    0.884 | loss    0.332 | zero gradients percentage    0.992


281it [02:08,  1.74it/s]

| epoch   5 |   280/  891 batches | accuracy    0.883 | loss    0.341 | zero gradients percentage    0.992


291it [02:13,  2.12it/s]

| epoch   5 |   290/  891 batches | accuracy    0.895 | loss    0.319 | zero gradients percentage    0.992


301it [02:18,  2.21it/s]

| epoch   5 |   300/  891 batches | accuracy    0.887 | loss    0.336 | zero gradients percentage    0.992


311it [02:23,  1.85it/s]

| epoch   5 |   310/  891 batches | accuracy    0.879 | loss    0.346 | zero gradients percentage    0.992


321it [02:28,  2.24it/s]

| epoch   5 |   320/  891 batches | accuracy    0.889 | loss    0.340 | zero gradients percentage    0.992


331it [02:32,  2.44it/s]

| epoch   5 |   330/  891 batches | accuracy    0.877 | loss    0.335 | zero gradients percentage    0.992


341it [02:37,  1.89it/s]

| epoch   5 |   340/  891 batches | accuracy    0.888 | loss    0.343 | zero gradients percentage    0.992


351it [02:42,  2.01it/s]

| epoch   5 |   350/  891 batches | accuracy    0.883 | loss    0.346 | zero gradients percentage    0.992


361it [02:47,  2.45it/s]

| epoch   5 |   360/  891 batches | accuracy    0.895 | loss    0.307 | zero gradients percentage    0.992


371it [02:51,  2.43it/s]

| epoch   5 |   370/  891 batches | accuracy    0.884 | loss    0.355 | zero gradients percentage    0.992


381it [02:57,  1.70it/s]

| epoch   5 |   380/  891 batches | accuracy    0.884 | loss    0.353 | zero gradients percentage    0.992


391it [03:01,  2.14it/s]

| epoch   5 |   390/  891 batches | accuracy    0.887 | loss    0.351 | zero gradients percentage    0.992


401it [03:05,  2.37it/s]

| epoch   5 |   400/  891 batches | accuracy    0.886 | loss    0.348 | zero gradients percentage    0.992


411it [03:11,  1.58it/s]

| epoch   5 |   410/  891 batches | accuracy    0.870 | loss    0.369 | zero gradients percentage    0.992


421it [03:16,  2.39it/s]

| epoch   5 |   420/  891 batches | accuracy    0.876 | loss    0.381 | zero gradients percentage    0.992


431it [03:20,  2.47it/s]

| epoch   5 |   430/  891 batches | accuracy    0.881 | loss    0.339 | zero gradients percentage    0.992


441it [03:26,  1.62it/s]

| epoch   5 |   440/  891 batches | accuracy    0.900 | loss    0.329 | zero gradients percentage    0.992


451it [03:31,  2.10it/s]

| epoch   5 |   450/  891 batches | accuracy    0.863 | loss    0.376 | zero gradients percentage    0.992


461it [03:35,  2.31it/s]

| epoch   5 |   460/  891 batches | accuracy    0.893 | loss    0.336 | zero gradients percentage    0.992


471it [03:39,  1.98it/s]

| epoch   5 |   470/  891 batches | accuracy    0.888 | loss    0.354 | zero gradients percentage    0.992


481it [03:45,  1.89it/s]

| epoch   5 |   480/  891 batches | accuracy    0.875 | loss    0.357 | zero gradients percentage    0.992


491it [03:50,  2.39it/s]

| epoch   5 |   490/  891 batches | accuracy    0.902 | loss    0.298 | zero gradients percentage    0.992


501it [03:54,  2.04it/s]

| epoch   5 |   500/  891 batches | accuracy    0.880 | loss    0.367 | zero gradients percentage    0.992


511it [04:00,  1.94it/s]

| epoch   5 |   510/  891 batches | accuracy    0.900 | loss    0.349 | zero gradients percentage    0.992


521it [04:04,  2.48it/s]

| epoch   5 |   520/  891 batches | accuracy    0.894 | loss    0.351 | zero gradients percentage    0.992


531it [04:08,  2.40it/s]

| epoch   5 |   530/  891 batches | accuracy    0.894 | loss    0.308 | zero gradients percentage    0.992


541it [04:14,  1.68it/s]

| epoch   5 |   540/  891 batches | accuracy    0.873 | loss    0.352 | zero gradients percentage    0.992


551it [04:19,  2.23it/s]

| epoch   5 |   550/  891 batches | accuracy    0.880 | loss    0.369 | zero gradients percentage    0.992


561it [04:23,  2.14it/s]

| epoch   5 |   560/  891 batches | accuracy    0.882 | loss    0.335 | zero gradients percentage    0.992


571it [04:28,  1.58it/s]

| epoch   5 |   570/  891 batches | accuracy    0.873 | loss    0.361 | zero gradients percentage    0.992


581it [04:33,  2.38it/s]

| epoch   5 |   580/  891 batches | accuracy    0.875 | loss    0.360 | zero gradients percentage    0.992


591it [04:37,  2.32it/s]

| epoch   5 |   590/  891 batches | accuracy    0.873 | loss    0.361 | zero gradients percentage    0.992


601it [04:42,  1.80it/s]

| epoch   5 |   600/  891 batches | accuracy    0.894 | loss    0.346 | zero gradients percentage    0.992


611it [04:48,  2.19it/s]

| epoch   5 |   610/  891 batches | accuracy    0.886 | loss    0.333 | zero gradients percentage    0.992


621it [04:52,  2.17it/s]

| epoch   5 |   620/  891 batches | accuracy    0.886 | loss    0.352 | zero gradients percentage    0.992


631it [04:56,  2.19it/s]

| epoch   5 |   630/  891 batches | accuracy    0.884 | loss    0.347 | zero gradients percentage    0.992


641it [05:02,  1.76it/s]

| epoch   5 |   640/  891 batches | accuracy    0.877 | loss    0.328 | zero gradients percentage    0.992


651it [05:06,  2.31it/s]

| epoch   5 |   650/  891 batches | accuracy    0.893 | loss    0.319 | zero gradients percentage    0.992


661it [05:11,  2.34it/s]

| epoch   5 |   660/  891 batches | accuracy    0.880 | loss    0.350 | zero gradients percentage    0.992


671it [05:17,  1.59it/s]

| epoch   5 |   670/  891 batches | accuracy    0.884 | loss    0.333 | zero gradients percentage    0.992


681it [05:21,  2.51it/s]

| epoch   5 |   680/  891 batches | accuracy    0.876 | loss    0.337 | zero gradients percentage    0.992


691it [05:25,  2.34it/s]

| epoch   5 |   690/  891 batches | accuracy    0.887 | loss    0.345 | zero gradients percentage    0.992


701it [05:30,  1.80it/s]

| epoch   5 |   700/  891 batches | accuracy    0.889 | loss    0.339 | zero gradients percentage    0.992


711it [05:35,  2.46it/s]

| epoch   5 |   710/  891 batches | accuracy    0.893 | loss    0.316 | zero gradients percentage    0.992


721it [05:39,  2.58it/s]

| epoch   5 |   720/  891 batches | accuracy    0.885 | loss    0.353 | zero gradients percentage    0.992


731it [05:43,  2.13it/s]

| epoch   5 |   730/  891 batches | accuracy    0.882 | loss    0.351 | zero gradients percentage    0.992


741it [05:48,  1.95it/s]

| epoch   5 |   740/  891 batches | accuracy    0.902 | loss    0.324 | zero gradients percentage    0.992


751it [05:52,  2.41it/s]

| epoch   5 |   750/  891 batches | accuracy    0.871 | loss    0.341 | zero gradients percentage    0.992


761it [05:56,  2.34it/s]

| epoch   5 |   760/  891 batches | accuracy    0.892 | loss    0.342 | zero gradients percentage    0.992


771it [06:02,  1.86it/s]

| epoch   5 |   770/  891 batches | accuracy    0.861 | loss    0.378 | zero gradients percentage    0.992


781it [06:07,  2.19it/s]

| epoch   5 |   780/  891 batches | accuracy    0.882 | loss    0.356 | zero gradients percentage    0.992


791it [06:11,  1.93it/s]

| epoch   5 |   790/  891 batches | accuracy    0.884 | loss    0.356 | zero gradients percentage    0.992


801it [06:16,  1.86it/s]

| epoch   5 |   800/  891 batches | accuracy    0.863 | loss    0.388 | zero gradients percentage    0.992


811it [06:22,  1.76it/s]

| epoch   5 |   810/  891 batches | accuracy    0.885 | loss    0.366 | zero gradients percentage    0.992


821it [06:26,  2.49it/s]

| epoch   5 |   820/  891 batches | accuracy    0.883 | loss    0.365 | zero gradients percentage    0.992


831it [06:30,  2.60it/s]

| epoch   5 |   830/  891 batches | accuracy    0.893 | loss    0.329 | zero gradients percentage    0.992


841it [06:35,  1.77it/s]

| epoch   5 |   840/  891 batches | accuracy    0.888 | loss    0.341 | zero gradients percentage    0.992


851it [06:40,  2.29it/s]

| epoch   5 |   850/  891 batches | accuracy    0.877 | loss    0.377 | zero gradients percentage    0.992


861it [06:44,  2.64it/s]

| epoch   5 |   860/  891 batches | accuracy    0.881 | loss    0.345 | zero gradients percentage    0.992


871it [06:48,  2.08it/s]

| epoch   5 |   870/  891 batches | accuracy    0.884 | loss    0.366 | zero gradients percentage    0.992


881it [06:54,  1.90it/s]

| epoch   5 |   880/  891 batches | accuracy    0.881 | loss    0.373 | zero gradients percentage    0.992


891it [06:58,  2.13it/s]

| epoch   5 |   890/  891 batches | accuracy    0.909 | loss    0.316 | zero gradients percentage    0.992





-----------------------------------------------------------
| end of epoch   5 | time: 421.52s | valid accuracy    0.876 
-----------------------------------------------------------
Checking the results of test dataset.
test accuracy    0.875
test loss    0.003
