# For-Debugging Notebook

This notebook is a notebook for debugging `unlearning` function, based on the following notebook:
- https://www.kaggle.com/code/eleni30fillou/run-unlearn-finetune

## How to Use

1. implement your `unlearning` function;
2. turn on `internet on` in the right panel;
3. set the variable `USE_MOCK` to `True` in the 2nd code cell;
4. (Optional) modity other parameters in the same cell like `n_checkpoints`;
5. if your codes work,
   - turn off `internet on` in the right panel;
   - set the variable `USE_MOCK` to `False` in the 2nd code cell;
   - save the notebook;
   - and submit!

## Updates
- Ver.5:
  - add a stopwatch decorator
  - make `unlearning` return a updated model
- Ver.4: fix seed

In [1]:
import os
import subprocess

import pandas as pd
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet18
from torch.utils.data import DataLoader, Dataset

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' 
DEVICE

'cuda'

In [2]:
torch.manual_seed(128)

Gr = torch.Generator()
Gr.manual_seed(256)

Gf = torch.Generator()
Gf.manual_seed(512)

Gv = torch.Generator()
Gv.manual_seed(1024)

<torch._C.Generator at 0x7cb10d583a70>

In [3]:
# Mock setting

import logging
import requests
import tqdm
from torch.utils.data import Subset
from torchvision import transforms

USE_MOCK: bool = True

if USE_MOCK:
    logging.warning('Running with Mock')
    logging.warning('In this mode, internet access may be required.')

    # The number of checkpoints in this mode.
    # NOTE: 512 checkpoints are required in this competition.
    n_checkpoints = 5
    
    # The directory for a dataset and a pretrained model
    mock_dir = './mock'
    mock_model_path = os.path.join(mock_dir, "weights_resnet18_cifar10.pth")
    os.makedirs(mock_dir, exist_ok=True)

In [4]:
# It's really important to add an accelerator to your notebook, as otherwise the submission will fail.
# We recomment using the P100 GPU rather than T4 as it's faster and will increase the chances of passing the time cut-off threshold.

if DEVICE != 'cuda':
    raise RuntimeError('Make sure you have added an accelerator to your notebook; the submission will fail otherwise!')

In [6]:
# Helper functions for loading the hidden dataset.

if USE_MOCK:
    
    class DatasetWrapper(Dataset):
        
        def __init__(self, ds: Dataset):
            self._ds = ds
    
        def __len__(self):
            return len(self._ds)
    
        def __getitem__(self, index):
            item = self._ds[index]
            result = {
                'image': item[0],
                'image_id': index,
                'age_group': item[1],
                'age': item[1],
                'person_id': index,
            }
            return result
    
    def get_dataset(batch_size, retain_ratio=0.8, thinning_param: int=1000, root=mock_dir) -> tuple[DataLoader, DataLoader, DataLoader]:
        
        # utils
        normalize = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ])

        # create dataset
        train_ds = DatasetWrapper(torchvision.datasets.CIFAR10(root=mock_dir, train=True, download=True, transform=normalize))
        retain_ds = Subset(train_ds, range(0, int(len(train_ds)*retain_ratio), thinning_param))
        forget_ds = Subset(train_ds, range(int(len(train_ds)*retain_ratio), len(train_ds), thinning_param))
        val_ds = DatasetWrapper(torchvision.datasets.CIFAR10(root=mock_dir, train=False, download=True, transform=normalize))

        retain_loader = DataLoader(retain_ds, batch_size=batch_size, shuffle=True)
        forget_loader = DataLoader(forget_ds, batch_size=batch_size, shuffle=True)
        validation_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=True)

        return retain_loader, forget_loader, validation_loader
    
    # For test
#     for sample in get_dataset(3)[0]:
#         print(sample)
#         break
    
else:
    def load_example(df_row):
        image = torchvision.io.read_image(df_row['image_path'])
        result = {
            'image': image,
            'image_id': df_row['image_id'],
            'age_group': df_row['age_group'],
            'age': df_row['age'],
            'person_id': df_row['person_id']
        }
        return result


    class HiddenDataset(Dataset):
        '''The hidden dataset.'''
        def __init__(self, split='train'):
            super().__init__()
            self.examples = []

            df = pd.read_csv(f'/kaggle/input/neurips-2023-machine-unlearning/{split}.csv')
            df['image_path'] = df['image_id'].apply(lambda x: os.path.join('/kaggle/input/neurips-2023-machine-unlearning/', 'images', x.split('-')[0], x.split('-')[1] + '.png'))
            df = df.sort_values(by='image_path')
            df.apply(lambda row: self.examples.append(load_example(row)), axis=1)
            if len(self.examples) == 0:
                raise ValueError('No examples.')

        def __len__(self):
            return len(self.examples)

        def __getitem__(self, idx):
            example = self.examples[idx]
            image = example['image']
            image = image.to(torch.float32)
            example['image'] = image
            return example


    def get_dataset(batch_size):
        '''Get the dataset.'''
        retain_ds = HiddenDataset(split='retain')
        forget_ds = HiddenDataset(split='forget')
        val_ds = HiddenDataset(split='validation')

        retain_loader = DataLoader(retain_ds, batch_size=batch_size, shuffle=True, generator=Gr)
        forget_loader = DataLoader(forget_ds, batch_size=batch_size, shuffle=True, generator=Gf)
        validation_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=True, generator=Gv)

        return retain_loader, forget_loader, validation_loader

In [7]:
# Utils
from contextlib import contextmanager
import time

@contextmanager
def stopwatch(name='STOPWATCH'):
    s = time.time()
    try:
        yield
    finally:
        print(f"{name}: {time.time()-s} seconds passed")
        
# for test
# with stopwatch():
#     for i in range(5):
#         time.sleep(1)

In [10]:
import copy
import itertools
from tqdm import tqdm

def get_mean_var(p, is_base_dist=False, alpha=3e-6):
        
    var = copy.deepcopy(1./(p.grad2_acc+1e-8))
    
    var = var.clamp(max=1e3)
    
    #if p.size(0) == num_classes:
    #    var = var.clamp(max=1e2)
    
    var = alpha * var

    if p.ndim > 1:
        var = var.mean(dim=1, keepdim=True).expand_as(p).clone()
        
    if not is_base_dist:
        mu = copy.deepcopy(p.data0.clone())
    else:
        mu = copy.deepcopy(p.data0.clone())
        
    #if p.size(0) == num_classes and num_to_forget is None:
    #    mu[class_to_forget] = 0
    #    var[class_to_forget] = 0.0001
    #if p.size(0) == num_classes:
        # Last layer
    #    var *= 10
    if p.ndim == 1:
        # BatchNorm
        var *= 10
        var*=1
    return mu, var


def unlearning(
    net, 
    retain_loader, 
    forget_loader, 
    val_loader):

    # netf = copy.deepcopy(net)
    for p in itertools.chain(net.parameters()):
        p.data0 = copy.deepcopy(p.data.clone())
    
    net.eval()
    loss_fn = nn.CrossEntropyLoss()

    for p in net.parameters():
        p.grad_acc = 0
        p.grad2_acc = 0
        
    
    for sample in tqdm(retain_loader):

        data = sample["image"]
        orig_target = sample["age_group"]
        print(orig_target)
        
        data, orig_target = data.to(DEVICE), orig_target.to(DEVICE)
        
        output = net(data)
        print(output)
        
        prob = torch.nn.functional.softmax(output, dim=-1).data
        print(prob)
        
        print(output.shape[1])

        for y in range(output.shape[1]):
            
            target = torch.empty_like(orig_target).fill_(y)
            print(target)
            
            loss = loss_fn(output, target)
            print(loss)
            
            net.zero_grad()
            
            loss.backward(retain_graph=True)
            
            for p in net.parameters():
                if p.requires_grad:
                    p.grad_acc += (orig_target == target).float() * p.grad.data
                    p.grad2_acc += prob[:, y] * p.grad.data.pow(2)
                    
    for p in net.parameters():
        
        p.grad_acc /= len(retain_loader)
        p.grad2_acc /= len(retain_loader)
        print(p.grad_acc.dim)
        print(p.grad2_acc.dim)
        
    alpha = 1e-6
    torch.manual_seed(1756)
    for i, p in enumerate(net.parameters()):
        mu, var = get_mean_var(p, False, alpha=alpha)
        p.data = mu + var* torch.empty_like(p.data0).normal_()
    
    net.eval()
    return net

In [None]:
import copy
import itertools
from tqdm import tqdm


def unlearning(
    net, 
    retain_loader, 
    forget_loader, 
    val_loader):

    for p in itertools.chain(net.parameters()):
        p.data0 = copy.deepcopy(p.data.clone())

    loss_fn = nn.CrossEntropyLoss()

    for p in net.parameters():
        p.grad_acc = 0
        p.grad2_acc = 0
        
    
    for sample in tqdm(retain_loader):

        data = sample["image"]
        orig_target = sample["age_group"]

        data, orig_target = data.to(DEVICE), orig_target.to(DEVICE)
        output = net(data)
        prob = torch.nn.functional.softmax(output, dim=-1).data

        for y in range(output.shape[1]):
            
            target = torch.empty_like(orig_target).fill_(y)
            print(target)
            
            loss = loss_fn(output, target)
            print(loss)
            
            net.zero_grad()
            
            loss.backward(retain_graph=True)
            
            for p in net.parameters():
                if p.requires_grad:
                    p.grad_acc += (orig_target == target).float() * p.grad.data
                    p.grad2_acc += prob[:, y] * p.grad.data.pow(2)
                    
    for p in net.parameters():
        
        p.grad_acc /= len(retain_loader)
        p.grad2_acc /= len(retain_loader)
        print(p.grad_acc.dim)
        print(p.grad2_acc.dim)
        
    alpha = 1e-6
    torch.manual_seed(1756)
    for i, p in enumerate(net.parameters()):
        mu, var = get_mean_var(p, False, alpha=alpha)
        p.data = mu + var* grad_acc  * torch.empty_like(p.data0).normal_()
    
    net.eval()
    return net

In [11]:
if USE_MOCK:
    
    # NOTE: Almost same as the original codes
    
    # Download
    if not os.path.exists(mock_model_path):
        response = requests.get("https://storage.googleapis.com/unlearning-challenge/weights_resnet18_cifar10.pth")
        open(mock_model_path, "wb").write(response.content)    
    
    os.makedirs('/kaggle/tmp2', exist_ok=True)
    retain_loader, forget_loader, validation_loader = get_dataset(1)
    net = resnet18(weights=None, num_classes=10)
    
    #for batch in retain_loader:
        # Print the type and structure of the batch
    #    print(type(batch))
    #    print(len(batch))  # To see how many values are in the tuple

        # Uncomment the following line to print the batch itself
    #    print(batch)

    #loader_length = len(validation_loader)
    #print("Number of values returned by train_loader:", loader_length)
    #loader_length = len(forget_loader)
    #print("Number of values returned by forget_loader:", loader_length)
    net.to(DEVICE)
    for i in range(n_checkpoints):
        net.load_state_dict(torch.load(mock_model_path))
        net_ = unlearning(net, retain_loader, forget_loader, validation_loader)
        state = net_.state_dict()
        torch.save(state, f'/kaggle/tmp2/unlearned_checkpoint_{i}.pth')

    # Ensure that submission.zip will contain exactly 512 checkpoints 
    # (if this is not the case, an exception will be thrown).
    unlearned_ckpts = os.listdir('/kaggle/tmp2')
    if len(unlearned_ckpts) != n_checkpoints:
        raise RuntimeError('Expected exactly 512 checkpoints. The submission will throw an exception otherwise.')
    
        
    print(os.listdir('/kaggle/tmp2'))
    #subprocess.run('zip submission.zip /kaggle/tmp2/*.pth', shell=True)
    subprocess.run('zip submission.zip /kaggle/tmp2/*.pth', shell=True)
    
else:
    if os.path.exists('/kaggle/input/neurips-2023-machine-unlearning/empty.txt'):
        # mock submission
        subprocess.run('touch submission.zip', shell=True)
    else:

        # Note: it's really important to create the unlearned checkpoints outside of the working directory 
        # as otherwise this notebook may fail due to running out of disk space.
        # The below code saves them in /kaggle/tmp to avoid that issue.

        os.makedirs('/kaggle/tmp', exist_ok=True)
        retain_loader, forget_loader, validation_loader = get_dataset(64)
        net = resnet18(weights=None, num_classes=10)
        net.to(DEVICE)
        for i in range(512):
            net.load_state_dict(torch.load('/kaggle/input/neurips-2023-machine-unlearning/original_model.pth'))
            net_ = unlearning(net, retain_loader, forget_loader, validation_loader)
            state = net_.state_dict()
            torch.save(state, f'/kaggle/tmp/unlearned_checkpoint_{i}.pth')

        # Ensure that submission.zip will contain exactly 512 checkpoints 
        # (if this is not the case, an exception will be thrown).
        unlearned_ckpts = os.listdir('/kaggle/tmp')
        if len(unlearned_ckpts) != 512:
            raise RuntimeError('Expected exactly 512 checkpoints. The submission will throw an exception otherwise.')

        subprocess.run('zip submission.zip /kaggle/tmp/*.pth', shell=True)


Files already downloaded and verified
Files already downloaded and verified


  2%|▎         | 1/40 [00:00<00:04,  9.68it/s]

tensor([8])
tensor([[-0.1813, -1.9498,  0.6527, -0.8094, -3.3110, -0.5479, -2.3913, -1.2656,
         10.9227, -1.1194]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[1.5050e-05, 2.5676e-06, 3.4656e-05, 8.0316e-06, 6.5822e-07, 1.0432e-05,
         1.6510e-06, 5.0894e-06, 9.9992e-01, 5.8908e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(11.1041, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(12.8725, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(10.2700, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.7321, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(14.2337, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(11.4706, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.3141, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.1883, 

 10%|█         | 4/40 [00:00<00:03,  9.85it/s]

tensor([1])
tensor([[-0.8671,  8.4216, -1.7000, -1.5789, -2.7154, -0.6648, -0.3491, -0.8559,
          0.1213,  0.1882]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.2369e-05, 9.9893e-01, 4.0160e-05, 4.5331e-05, 1.4548e-05, 1.1308e-04,
         1.5505e-04, 9.3414e-05, 2.4819e-04, 2.6536e-04]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.2897, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(10.1226, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(10.0015, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(11.1381, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.0874, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(8.7718, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(9.2785, devic

 12%|█▎        | 5/40 [00:00<00:03,  9.63it/s]

tensor([6])
tensor([[ 1.0570, -3.5207,  2.4737,  0.7948, -1.7219,  1.6019,  6.8481, -4.5315,
          0.9258, -3.9275]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[2.9768e-03, 3.0596e-05, 1.2274e-02, 2.2901e-03, 1.8487e-04, 5.1329e-03,
         9.7447e-01, 1.1135e-05, 2.6106e-03, 2.0370e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(5.8169, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.3946, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(4.4003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(6.0792, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(8.5959, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(5.2721, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0259, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.4054, device

 18%|█▊        | 7/40 [00:00<00:03,  9.29it/s]

tensor([9], device='cuda:0')
tensor(12.3975, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6])
tensor([[-2.1914, -2.1133, -1.0397,  3.0514, -2.4000,  1.1859,  8.8577, -0.7359,
         -1.9526, -2.6624]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[1.5844e-05, 1.7131e-05, 5.0123e-05, 2.9976e-03, 1.2861e-05, 4.6409e-04,
         9.9634e-01, 6.7915e-05, 2.0117e-05, 9.8919e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(11.0528, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.9746, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(9.9010, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(5.8099, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(11.2613, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(7.6754, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0037, devi

 20%|██        | 8/40 [00:00<00:03,  9.48it/s]

tensor([9], device='cuda:0')
tensor(10.9986, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[10.8194, -0.1051, -0.9614, -3.4720, -1.9014, -1.6733, -2.0099, -2.1061,
          0.7421,  0.6674]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9988e-01, 1.8009e-05, 7.6488e-06, 6.2126e-07, 2.9879e-06, 3.7535e-06,
         2.6808e-06, 2.4350e-06, 4.2018e-05, 3.8994e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.9246, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.7810, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(14.2915, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(12.7210, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(12.4928, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(12.8294, d

 30%|███       | 12/40 [00:01<00:02,  9.84it/s]

tensor([6])
tensor([[-0.4976, -3.9528,  1.7074,  0.4844, -0.4140,  3.4785,  8.1837, -3.5865,
         -3.5089, -1.8946]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[1.6782e-04, 5.2994e-06, 1.5221e-03, 4.4804e-04, 1.8245e-04, 8.9461e-03,
         9.8867e-01, 7.6443e-06, 8.2612e-06, 4.1505e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(8.6926, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(12.1479, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(6.4877, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(7.7106, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(8.6090, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(4.7165, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0114, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.7815, device

 35%|███▌      | 14/40 [00:01<00:02,  9.90it/s]

tensor([0])
tensor([[10.8824, -2.6733, -1.9444,  0.5012,  1.3101, -0.8342, -2.5633, -1.5779,
          0.1563, -3.2573]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9986e-01, 1.2964e-06, 2.6873e-06, 3.1006e-05, 6.9621e-05, 8.1556e-06,
         1.4472e-06, 3.8769e-06, 2.1960e-05, 7.2299e-07]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(13.5559, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(12.8270, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(10.3813, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(9.5724, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(11.7168, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.4459, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.4605, de

 40%|████      | 16/40 [00:01<00:02,  9.94it/s]

tensor([[10.6275, -0.7283, -0.4201, -1.6450, -1.2606, -2.3152, -2.5185, -2.7050,
          2.4562, -1.4913]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9967e-01, 1.1698e-05, 1.5919e-05, 4.6770e-06, 6.8693e-06, 2.3929e-06,
         1.9526e-06, 1.6203e-06, 2.8256e-04, 5.4541e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.3561, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.0480, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(12.2729, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(11.8884, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(12.9430, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.1464, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(13.3329, device='cuda:

 45%|████▌     | 18/40 [00:01<00:02,  9.88it/s]

tensor([[2.7771e-05, 1.2010e-05, 2.0971e-03, 4.5599e-05, 2.9284e-05, 3.1284e-06,
         9.9775e-01, 3.8791e-06, 2.9121e-05, 3.3506e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(10.4915, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.3298, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(6.1672, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(9.9956, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(10.4385, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(12.6750, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0023, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.4599, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.4440, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tens

 50%|█████     | 20/40 [00:02<00:02,  9.88it/s]

tensor([[ 9.8667, -2.6076,  2.2265,  1.3939, -0.4768, -1.7761, -3.1672, -2.5319,
         -0.3874, -2.5405]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9922e-01, 3.8207e-06, 4.8033e-04, 2.0891e-04, 3.2175e-05, 8.7750e-06,
         2.1832e-06, 4.1212e-06, 3.5185e-05, 4.0859e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(12.4751, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(7.6410, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(8.4736, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(10.3443, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(11.6436, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.0347, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.3994, device='cuda:0'

 55%|█████▌    | 22/40 [00:02<00:01,  9.88it/s]

tensor([[ 0.1308,  1.3012, -2.8782, -1.4426, -2.1191, -2.2085, -2.1480, -1.9619,
          1.5066,  9.8196]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[6.1941e-05, 1.9965e-04, 3.0560e-06, 1.2842e-05, 6.5286e-06, 5.9705e-06,
         6.3428e-06, 7.6402e-06, 2.4516e-04, 9.9945e-01]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.6893, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(8.5190, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(12.6984, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.2628, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(11.9393, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(12.0287, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(11.9682, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.7821, device='cuda:0

 57%|█████▊    | 23/40 [00:02<00:01,  9.92it/s]

tensor([[-0.6042, -1.4840, -1.4724, -1.3033, -0.1522,  0.7553, -0.7027,  9.1970,
         -2.2136, -2.0201]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.5359e-05, 2.2965e-05, 2.3235e-05, 2.7515e-05, 8.6993e-05, 2.1558e-04,
         5.0165e-05, 9.9949e-01, 1.1072e-05, 1.3436e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.8017, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.6815, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(10.6699, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(10.5008, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(9.3497, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(8.4422, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.9002, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(0.0005, device='cuda:0', 

 65%|██████▌   | 26/40 [00:02<00:01, 10.00it/s]

tensor([1], device='cuda:0')
tensor(11.2328, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.3296, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(9.2128, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(5.8942, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.4429, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(10.3002, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(0.0041, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(9.1658, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(8.9409, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7])
tensor([[-0.7409, -2.9658, -0.2267,  1.1246, -2.8687,  1.9476, -1.0061,  8.4390,
         -1.9760, -1.7271]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[1.0282e-04, 1.1113e-05

 70%|███████   | 28/40 [00:02<00:01,  9.88it/s]

tensor([2], device='cuda:0')
tensor(10.2772, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(9.2805, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(9.9453, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.6148, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.8470, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(10.1653, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.1796, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[ 8.9780, -1.5297, -0.3120, -2.2158, -1.3013, -2.9711, -0.6241, -0.9367,
          1.8987, -0.9862]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9882e-01, 2.7291e-05, 9.2228e-05, 1.3743e-05, 3.4294e-05, 6.4571e-06,
         6.7502e-05, 4.9382e-05, 8.4136

 75%|███████▌  | 30/40 [00:03<00:01,  9.90it/s]

tensor([2], device='cuda:0')
tensor(7.5162, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(0.0282, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(7.6526, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(4.8239, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(8.7608, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(6.3297, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(8.4541, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(4.3932, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6])
tensor([[-0.4190, -1.0823, -1.3970, -1.3568, -1.2319, -0.2008,  9.7629, -1.2554,
         -1.0703, -1.7496]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[3.7844e-05, 1.9494e-05, 1.4232e-05, 1.4816e-05, 1.6786e-05, 4.7071e-05,
         9.9980e-01, 1.6397e-05, 1.9729e-0

 78%|███████▊  | 31/40 [00:03<00:00,  9.85it/s]

tensor(13.5391, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(4.9127, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(12.4925, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(0.0078, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.2113, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.9738, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(15.3389, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(13.1690, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6])
tensor([[-2.8801, -1.7697,  2.4038,  2.3613,  1.6345, -2.0878,  9.2529, -3.6529,
         -3.3121, -1.9502]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.3653e-06, 1.6287e-05, 1.0577e-03, 1.0136e-03, 4.9006e-04, 1.1849e-05,
         9.9739e-01, 2.4771e-06, 3.4832e-06, 1.3597e-05]], device

 82%|████████▎ | 33/40 [00:03<00:00,  9.98it/s]

tensor([3], device='cuda:0')
tensor(6.6898, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(7.8006, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.9082, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(10.0372, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.2108, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(9.9918, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3])
tensor([[-0.8539, -1.3425, -0.7991,  8.7618, -3.1169,  0.2326,  3.1779, -3.0092,
         -1.9416, -1.1092]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[6.6395e-05, 4.0735e-05, 7.0136e-05, 9.9580e-01, 6.9078e-06, 1.9680e-04,
         3.7422e-03, 7.6931e-06, 2.2374e-05, 5.1434e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.6199, device

 88%|████████▊ | 35/40 [00:03<00:00, 10.00it/s]

tensor([4], device='cuda:0')
tensor(10.9041, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.6455, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(8.6135, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(7.0210, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.2080, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[ 9.9581,  0.5512, -0.5752, -1.7119,  0.9371, -2.1866, -2.7616, -2.8858,
          0.5560, -1.8814]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9966e-01, 8.2129e-05, 2.6627e-05, 8.5433e-06, 1.2080e-04, 5.3147e-06,
         2.9905e-06, 2.6415e-06, 8.2522e-05, 7.2114e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(9.4072, device

 95%|█████████▌| 38/40 [00:03<00:00, 10.00it/s]

tensor([5], device='cuda:0')
tensor(14.7692, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.0833e-05, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.0850, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(11.0378, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.6542, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4])
tensor([[-1.8237, -2.5839, -0.9481, -1.6009, 10.9585,  1.5663, -0.0274, -0.7193,
         -2.3457, -2.4760]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[2.8101e-06, 1.3138e-06, 6.7450e-06, 3.5113e-06, 9.9987e-01, 8.3360e-05,
         1.6936e-05, 8.4788e-06, 1.6672e-06, 1.4635e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(12.7823, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(13.5426, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.906

100%|██████████| 40/40 [00:04<00:00,  9.87it/s]

tensor([6], device='cuda:0')
tensor(13.5130, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(13.8028, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(11.5909, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(12.7198, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[11.0398, -2.2049, -0.8019, -0.7685, -2.5368, -0.8875, -1.6842, -0.7550,
         -0.2986, -1.1026]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9995e-01, 1.7697e-06, 7.1977e-06, 7.4422e-06, 1.2698e-06, 6.6070e-06,
         2.9787e-06, 7.5436e-06, 1.1907e-05, 5.3284e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(5.2093e-05, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(13.2447, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.8418, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.808


  2%|▎         | 1/40 [00:00<00:03,  9.98it/s]

tensor([6])
tensor([[ 1.0570, -3.5207,  2.4737,  0.7948, -1.7219,  1.6019,  6.8481, -4.5315,
          0.9258, -3.9275]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[2.9768e-03, 3.0596e-05, 1.2274e-02, 2.2901e-03, 1.8487e-04, 5.1329e-03,
         9.7447e-01, 1.1135e-05, 2.6106e-03, 2.0370e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(5.8169, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.3946, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(4.4003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(6.0792, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(8.5959, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(5.2721, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0259, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.4054, device

  8%|▊         | 3/40 [00:00<00:03, 10.17it/s]

tensor([[-0.7045, -1.7519,  0.3266, -1.3801, -0.6670,  0.2259, -0.2169, -3.2389,
         -1.8716,  9.2782]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[4.6174e-05, 1.6200e-05, 1.2947e-04, 2.3494e-05, 4.7934e-05, 1.1707e-04,
         7.5188e-05, 3.6618e-06, 1.4372e-05, 9.9953e-01]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.9831, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.0305, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(8.9520, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(10.6588, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(9.9457, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.0528, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.4955, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.5176, device='cuda:0', 

 12%|█▎        | 5/40 [00:00<00:03, 10.18it/s]

tensor([1], device='cuda:0')
tensor(9.6995, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(10.9599, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(6.6898, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(7.8006, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.9082, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(10.0372, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.2108, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(9.9918, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9])
tensor([[-1.1582,  0.6457, -1.5494, -0.5527, -1.2174, -0.8870, -1.1192, -1.4374,
         -1.4517,  8.7271]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.0879e-05, 3.0900e-04

 18%|█▊        | 7/40 [00:00<00:03, 10.22it/s]

tensor([2], device='cuda:0')
tensor(6.4877, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(7.7106, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(8.6090, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(4.7165, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0114, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.7815, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(11.7039, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.0897, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3])
tensor([[-0.4311,  0.1607,  3.5763,  7.9414,  1.7337, -3.2024, -1.6653, -4.9305,
         -1.6649, -1.5182]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[2.2759e-04, 4.1129e-04, 1.2518e-02, 9.8463e-01, 1.9829e-03, 1.4243e-05,
         6.6243e-05, 2.5298e-06, 6.6269

 22%|██▎       | 9/40 [00:00<00:03, 10.24it/s]

tensor([3], device='cuda:0')
tensor(11.5938, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(14.3292, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.4767e-05, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.6510, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.5531, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(12.0194, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(12.3975, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6])
tensor([[-0.4190, -1.0823, -1.3970, -1.3568, -1.2319, -0.2008,  9.7629, -1.2554,
         -1.0703, -1.7496]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[3.7844e-05, 1.9494e-05, 1.4232e-05, 1.4816e-05, 1.6786e-05, 4.7071e-05,
         9.9980e-01, 1.6397e-05, 1.9729e-05, 1.0003e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(10.182

 28%|██▊       | 11/40 [00:01<00:02, 10.21it/s]

tensor([4], device='cuda:0')
tensor(9.0213, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(12.1450, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(12.7201, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.8442, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(9.4024, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(11.8398, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7])
tensor([[-0.6042, -1.4840, -1.4724, -1.3033, -0.1522,  0.7553, -0.7027,  9.1970,
         -2.2136, -2.0201]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.5359e-05, 2.2965e-05, 2.3235e-05, 2.7515e-05, 8.6993e-05, 2.1558e-04,
         5.0165e-05, 9.9949e-01, 1.1072e-05, 1.3436e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.8017, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.6815, dev

 32%|███▎      | 13/40 [00:01<00:02, 10.23it/s]

tensor([5], device='cuda:0')
tensor(4.8239, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(8.7608, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(6.3297, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(8.4541, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(4.3932, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1])
tensor([[-0.6000,  9.2703, -2.1742, -1.2984, -1.5062, -0.8327, -1.7229, -1.2212,
          0.4756, -0.3906]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.1669e-05, 9.9959e-01, 1.0704e-05, 2.5699e-05, 2.0876e-05, 4.0940e-05,
         1.6809e-05, 2.7761e-05, 1.5148e-04, 6.3702e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.8706, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.4448, device=

 38%|███▊      | 15/40 [00:01<00:02, 10.23it/s]

tensor([6], device='cuda:0')
tensor(8.9873, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.1196, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.8123, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.9986, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8])
tensor([[-0.1813, -1.9498,  0.6527, -0.8094, -3.3110, -0.5479, -2.3913, -1.2656,
         10.9227, -1.1194]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[1.5050e-05, 2.5676e-06, 3.4656e-05, 8.0316e-06, 6.5822e-07, 1.0432e-05,
         1.6510e-06, 5.0894e-06, 9.9992e-01, 5.8908e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(11.1041, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(12.8725, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(10.2700, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.7321, d

 42%|████▎     | 17/40 [00:01<00:02, 10.23it/s]

tensor([7], device='cuda:0')
tensor(11.1239, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.4896, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7])
tensor([[-1.9353,  0.6932, -3.3635, -0.9198,  0.8928, -1.2848,  0.9007,  7.9944,
         -0.8237, -2.1541]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[4.8574e-05, 6.7288e-04, 1.1644e-05, 1.3409e-04, 8.2149e-04, 9.3090e-05,
         8.2800e-04, 9.9720e-01, 1.4762e-04, 3.9025e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.9324, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(7.3039, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.3607, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(8.9170, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(7.1044, devic

 48%|████▊     | 19/40 [00:01<00:02, 10.24it/s]

tensor([8], device='cuda:0')
tensor(6.2564, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(6.7699, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6])
tensor([[-0.9328, -1.5432, -2.9490, -1.2920, -2.0524, -0.7938,  8.8504,  0.2382,
          1.8307, -1.3563]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.6315e-05, 3.0590e-05, 7.4990e-06, 3.9323e-05, 1.8382e-05, 6.4718e-05,
         9.9867e-01, 1.8164e-04, 8.9295e-04, 3.6873e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.7845, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.3949, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.8007, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(10.1437, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(10.9041, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.6455, devi

 52%|█████▎    | 21/40 [00:02<00:01, 10.19it/s]

tensor([9], device='cuda:0')
tensor(10.6542, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[11.9811, -1.8371,  0.6634, -3.0956, -0.9524, -3.0579, -1.5319, -1.8216,
          0.3903, -0.7386]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9997e-01, 9.9730e-07, 1.2155e-05, 2.8330e-07, 2.4157e-06, 2.9418e-07,
         1.3532e-06, 1.0128e-06, 9.2502e-06, 2.9914e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(3.0756e-05, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(13.8182, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.3177, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(15.0768, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(12.9335, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(15.0391, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.513

 62%|██████▎   | 25/40 [00:02<00:01, 10.19it/s]

tensor([6])
tensor([[-0.8305, -1.6688,  3.4938, -0.3346, -0.7775, -3.0140,  9.6587, -2.7989,
         -0.7830, -2.9454]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[2.7771e-05, 1.2010e-05, 2.0971e-03, 4.5599e-05, 2.9284e-05, 3.1284e-06,
         9.9775e-01, 3.8791e-06, 2.9121e-05, 3.3506e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(10.4915, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.3298, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(6.1672, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(9.9956, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(10.4385, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(12.6750, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0023, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.4599, dev

 68%|██████▊   | 27/40 [00:02<00:01, 10.14it/s]

tensor([[9.9995e-01, 1.7697e-06, 7.1977e-06, 7.4422e-06, 1.2698e-06, 6.6070e-06,
         2.9787e-06, 7.5436e-06, 1.1907e-05, 5.3284e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(5.2093e-05, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(13.2447, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.8418, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.8083, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(13.5767, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(11.9274, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(12.7240, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.7948, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(11.3384, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0'

 72%|███████▎  | 29/40 [00:02<00:01, 10.15it/s]

tensor([1], device='cuda:0')
tensor(11.4074, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(8.6683, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(7.3170, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(11.3103, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(6.4940, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.4476, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(0.0026, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.4175, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.1686, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6])
tensor([[-2.1914, -2.1133, -1.0397,  3.0514, -2.4000,  1.1859,  8.8577, -0.7359,
         -1.9526, -2.6624]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[1.5844e-05, 1.7131e-0

 78%|███████▊  | 31/40 [00:03<00:00, 10.09it/s]

tensor([2], device='cuda:0')
tensor(9.9504, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.1972, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(10.0145, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.1261, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(8.9376, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.7359, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.6334, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[10.8824, -2.6733, -1.9444,  0.5012,  1.3101, -0.8342, -2.5633, -1.5779,
          0.1563, -3.2573]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9986e-01, 1.2964e-06, 2.6873e-06, 3.1006e-05, 6.9621e-05, 8.1556e-06,
         1.4472e-06, 3.8769e-06, 2.196

 82%|████████▎ | 33/40 [00:03<00:00, 10.10it/s]

tensor([3], device='cuda:0')
tensor(0.0042, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(11.8829, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(8.5333, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(5.5881, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.7752, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.7076, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(9.8752, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1])
tensor([[-0.8671,  8.4216, -1.7000, -1.5789, -2.7154, -0.6648, -0.3491, -0.8559,
          0.1213,  0.1882]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.2369e-05, 9.9893e-01, 4.0160e-05, 4.5331e-05, 1.4548e-05, 1.1308e-04,
         1.5505e-04, 9.3414e-05, 2.4819e-04, 2.6536e-04]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.2897, devic

 88%|████████▊ | 35/40 [00:03<00:00, 10.08it/s]

tensor([4], device='cuda:0')
tensor(10.0445, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.8864, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.1881, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.8261, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(11.8081, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[10.6275, -0.7283, -0.4201, -1.6450, -1.2606, -2.3152, -2.5185, -2.7050,
          2.4562, -1.4913]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9967e-01, 1.1698e-05, 1.5919e-05, 4.6770e-06, 6.8693e-06, 2.3929e-06,
         1.9526e-06, 1.6203e-06, 2.8256e-04, 5.4541e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.3561, devi

 92%|█████████▎| 37/40 [00:03<00:00,  9.90it/s]

tensor([4], device='cuda:0')
tensor(10.3443, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(11.6436, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.0347, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.3994, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.2549, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(12.4080, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[10.5270, -0.9814, -1.4233, -2.7062, -2.6090, -0.8791, -1.8054, -1.9808,
          0.3190,  1.5389]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9980e-01, 1.0043e-05, 6.4562e-06, 1.7898e-06, 1.9726e-06, 1.1125e-05,
         4.4058e-06, 3.6970e-06, 3.6868e-05, 1.2486e-04]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.5086, d

 98%|█████████▊| 39/40 [00:03<00:00,  9.97it/s]

tensor([4], device='cuda:0')
tensor(5.8942, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.4429, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(10.3002, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(0.0041, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(9.1658, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(8.9409, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7])
tensor([[-2.1841, -2.8058, -1.7487,  1.9596,  4.3715, -1.3251,  1.7265,  4.6650,
         -3.3622, -1.2969]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.6586e-04, 3.0386e-04, 8.7458e-04, 3.5669e-02, 3.9786e-01, 1.3359e-03,
         2.8252e-02, 5.3359e-01, 1.7419e-04, 1.3740e-03]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(7.4772, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(8.0989, device=

100%|██████████| 40/40 [00:03<00:00, 10.12it/s]


tensor([5], device='cuda:0')
tensor(0.0078, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.2113, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.9738, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(15.3389, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(13.1690, device='cuda:0', grad_fn=<NllLossBackward0>)
<built-in method dim of Tensor object at 0x7cb101ba2200>
<built-in method dim of Tensor object at 0x7cb101ba2de0>
<built-in method dim of Tensor object at 0x7cb101ba1f80>
<built-in method dim of Tensor object at 0x7cb101ba2c50>
<built-in method dim of Tensor object at 0x7cb101ba2570>
<built-in method dim of Tensor object at 0x7cb101ba25c0>
<built-in method dim of Tensor object at 0x7cb101ba1ad0>
<built-in method dim of Tensor object at 0x7cb101ba2430>
<built-in method dim of Tensor object at 0x7cb101ba0a40>
<built-in method dim of Tensor object 

  5%|▌         | 2/40 [00:00<00:03, 10.19it/s]

tensor([6])
tensor([[ 1.0570, -3.5207,  2.4737,  0.7948, -1.7219,  1.6019,  6.8481, -4.5315,
          0.9258, -3.9275]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[2.9768e-03, 3.0596e-05, 1.2274e-02, 2.2901e-03, 1.8487e-04, 5.1329e-03,
         9.7447e-01, 1.1135e-05, 2.6106e-03, 2.0370e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(5.8169, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.3946, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(4.4003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(6.0792, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(8.5959, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(5.2721, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0259, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.4054, device

 10%|█         | 4/40 [00:00<00:03, 10.19it/s]

tensor([[-0.7045, -1.7519,  0.3266, -1.3801, -0.6670,  0.2259, -0.2169, -3.2389,
         -1.8716,  9.2782]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[4.6174e-05, 1.6200e-05, 1.2947e-04, 2.3494e-05, 4.7934e-05, 1.1707e-04,
         7.5188e-05, 3.6618e-06, 1.4372e-05, 9.9953e-01]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.9831, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.0305, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(8.9520, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(10.6588, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(9.9457, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.0528, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.4955, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.5176, device='cuda:0', 

 15%|█▌        | 6/40 [00:00<00:03, 10.21it/s]

tensor([1], device='cuda:0')
tensor(9.6995, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(10.9599, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(6.6898, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(7.8006, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.9082, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(10.0372, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.2108, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(9.9918, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9])
tensor([[-1.1582,  0.6457, -1.5494, -0.5527, -1.2174, -0.8870, -1.1192, -1.4374,
         -1.4517,  8.7271]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.0879e-05, 3.0900e-04

 20%|██        | 8/40 [00:00<00:03, 10.12it/s]

tensor([2], device='cuda:0')
tensor(6.4877, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(7.7106, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(8.6090, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(4.7165, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0114, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.7815, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(11.7039, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.0897, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3])
tensor([[-0.4311,  0.1607,  3.5763,  7.9414,  1.7337, -3.2024, -1.6653, -4.9305,
         -1.6649, -1.5182]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[2.2759e-04, 4.1129e-04, 1.2518e-02, 9.8463e-01, 1.9829e-03, 1.4243e-05,
         6.6243e-05, 2.5298e-06, 6.6269

 25%|██▌       | 10/40 [00:00<00:02, 10.15it/s]

tensor([3], device='cuda:0')
tensor(11.5938, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(14.3292, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.4767e-05, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.6510, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.5531, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(12.0194, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(12.3975, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6])
tensor([[-0.4190, -1.0823, -1.3970, -1.3568, -1.2319, -0.2008,  9.7629, -1.2554,
         -1.0703, -1.7496]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[3.7844e-05, 1.9494e-05, 1.4232e-05, 1.4816e-05, 1.6786e-05, 4.7071e-05,
         9.9980e-01, 1.6397e-05, 1.9729e-05, 1.0003e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(10.182

 30%|███       | 12/40 [00:01<00:02, 10.12it/s]

tensor([4], device='cuda:0')
tensor(9.0213, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(12.1450, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(12.7201, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.8442, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(9.4024, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(11.8398, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7])
tensor([[-0.6042, -1.4840, -1.4724, -1.3033, -0.1522,  0.7553, -0.7027,  9.1970,
         -2.2136, -2.0201]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.5359e-05, 2.2965e-05, 2.3235e-05, 2.7515e-05, 8.6993e-05, 2.1558e-04,
         5.0165e-05, 9.9949e-01, 1.1072e-05, 1.3436e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.8017, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.6815, dev

 35%|███▌      | 14/40 [00:01<00:02, 10.11it/s]

tensor([5], device='cuda:0')
tensor(4.8239, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(8.7608, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(6.3297, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(8.4541, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(4.3932, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1])
tensor([[-0.6000,  9.2703, -2.1742, -1.2984, -1.5062, -0.8327, -1.7229, -1.2212,
          0.4756, -0.3906]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.1669e-05, 9.9959e-01, 1.0704e-05, 2.5699e-05, 2.0876e-05, 4.0940e-05,
         1.6809e-05, 2.7761e-05, 1.5148e-04, 6.3702e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.8706, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.4448, device=

 40%|████      | 16/40 [00:01<00:02, 10.08it/s]

tensor([6], device='cuda:0')
tensor(8.9873, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.1196, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.8123, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.9986, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8])
tensor([[-0.1813, -1.9498,  0.6527, -0.8094, -3.3110, -0.5479, -2.3913, -1.2656,
         10.9227, -1.1194]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[1.5050e-05, 2.5676e-06, 3.4656e-05, 8.0316e-06, 6.5822e-07, 1.0432e-05,
         1.6510e-06, 5.0894e-06, 9.9992e-01, 5.8908e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(11.1041, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(12.8725, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(10.2700, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.7321, d

 45%|████▌     | 18/40 [00:01<00:02, 10.07it/s]

tensor([7], device='cuda:0')
tensor(11.1239, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.4896, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7])
tensor([[-1.9353,  0.6932, -3.3635, -0.9198,  0.8928, -1.2848,  0.9007,  7.9944,
         -0.8237, -2.1541]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[4.8574e-05, 6.7288e-04, 1.1644e-05, 1.3409e-04, 8.2149e-04, 9.3090e-05,
         8.2800e-04, 9.9720e-01, 1.4762e-04, 3.9025e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.9324, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(7.3039, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.3607, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(8.9170, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(7.1044, devic

 50%|█████     | 20/40 [00:01<00:01, 10.09it/s]

tensor([8], device='cuda:0')
tensor(6.2564, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(6.7699, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6])
tensor([[-0.9328, -1.5432, -2.9490, -1.2920, -2.0524, -0.7938,  8.8504,  0.2382,
          1.8307, -1.3563]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.6315e-05, 3.0590e-05, 7.4990e-06, 3.9323e-05, 1.8382e-05, 6.4718e-05,
         9.9867e-01, 1.8164e-04, 8.9295e-04, 3.6873e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.7845, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.3949, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.8007, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(10.1437, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(10.9041, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.6455, devi

 55%|█████▌    | 22/40 [00:02<00:01, 10.04it/s]

tensor([9], device='cuda:0')
tensor(10.6542, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[11.9811, -1.8371,  0.6634, -3.0956, -0.9524, -3.0579, -1.5319, -1.8216,
          0.3903, -0.7386]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9997e-01, 9.9730e-07, 1.2155e-05, 2.8330e-07, 2.4157e-06, 2.9418e-07,
         1.3532e-06, 1.0128e-06, 9.2502e-06, 2.9914e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(3.0756e-05, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(13.8182, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.3177, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(15.0768, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(12.9335, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(15.0391, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.513

 60%|██████    | 24/40 [00:02<00:01,  9.73it/s]

tensor([7], device='cuda:0')
tensor(11.7821, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(8.3136, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6])
tensor([[-0.8305, -1.6688,  3.4938, -0.3346, -0.7775, -3.0140,  9.6587, -2.7989,
         -0.7830, -2.9454]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[2.7771e-05, 1.2010e-05, 2.0971e-03, 4.5599e-05, 2.9284e-05, 3.1284e-06,
         9.9775e-01, 3.8791e-06, 2.9121e-05, 3.3506e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(10.4915, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.3298, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(6.1672, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(9.9956, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(10.4385, devi

 62%|██████▎   | 25/40 [00:02<00:01,  9.78it/s]

tensor([8], device='cuda:0')
tensor(13.3044, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(13.4347, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[11.0398, -2.2049, -0.8019, -0.7685, -2.5368, -0.8875, -1.6842, -0.7550,
         -0.2986, -1.1026]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9995e-01, 1.7697e-06, 7.1977e-06, 7.4422e-06, 1.2698e-06, 6.6070e-06,
         2.9787e-06, 7.5436e-06, 1.1907e-05, 5.3284e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(5.2093e-05, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(13.2447, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.8418, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.8083, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(13.5767, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(11.927

 70%|███████   | 28/40 [00:02<00:01,  9.90it/s]

tensor([9], device='cuda:0')
tensor(11.2056, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7])
tensor([[-0.7409, -2.9658, -0.2267,  1.1246, -2.8687,  1.9476, -1.0061,  8.4390,
         -1.9760, -1.7271]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[1.0282e-04, 1.1113e-05, 1.7195e-04, 6.6413e-04, 1.2246e-05, 1.5125e-03,
         7.8875e-05, 9.9738e-01, 2.9903e-05, 3.8355e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.1825, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.4074, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(8.6683, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(7.3170, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(11.3103, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(6.4940, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.4476, devic

 75%|███████▌  | 30/40 [00:02<00:00, 10.02it/s]

tensor([5])
tensor([[ 0.0709, -1.6504, -0.9286, -2.1754, -0.9926,  9.0213, -0.1043,  0.0842,
         -1.7140, -1.6115]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[1.2961e-04, 2.3179e-05, 4.7707e-05, 1.3712e-05, 4.4748e-05, 9.9946e-01,
         1.0879e-04, 1.3135e-04, 2.1751e-05, 2.4098e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(8.9510, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.6723, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(9.9504, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.1972, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(10.0145, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.1261, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(8.9376, devic

 80%|████████  | 32/40 [00:03<00:00, 10.06it/s]

tensor(9.6199, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.1084, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(9.5651, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(0.0042, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(11.8829, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(8.5333, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(5.5881, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.7752, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.7076, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(9.8752, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1])
tensor([[-0.8671,  8.4216, -1.7000, -1.5789, -2.7154, -0.6648, -0.3491, -0.8559,
          0.1213,  0.1882]], device='cuda:

 85%|████████▌ | 34/40 [00:03<00:00, 10.05it/s]

tensor([1], device='cuda:0')
tensor(9.3980, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(9.5252, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(10.6814, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(10.0445, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.8864, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.1881, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.8261, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(11.8081, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[10.6275, -0.7283, -0.4201, -1.6450, -1.2606, -2.3152, -2.5185, -2.7050,
          2.4562, -1.4913]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9967e-01, 1.1698e-0

 90%|█████████ | 36/40 [00:03<00:00, 10.06it/s]

tensor([2], device='cuda:0')
tensor(7.6410, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(8.4736, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(10.3443, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(11.6436, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.0347, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.3994, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.2549, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(12.4080, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[10.5270, -0.9814, -1.4233, -2.7062, -2.6090, -0.8791, -1.8054, -1.9808,
          0.3190,  1.5389]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9980e-01, 1.0043e-05, 6.4562e-06, 1.7898e-06, 1.9726e-06, 1.1125e-05,
         4.4058e-06, 3.6970e-06, 3.6

 95%|█████████▌| 38/40 [00:03<00:00, 10.09it/s]

tensor([3], device='cuda:0')
tensor(9.2128, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(5.8942, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.4429, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(10.3002, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(0.0041, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(9.1658, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(8.9409, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7])
tensor([[-2.1841, -2.8058, -1.7487,  1.9596,  4.3715, -1.3251,  1.7265,  4.6650,
         -3.3622, -1.2969]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.6586e-04, 3.0386e-04, 8.7458e-04, 3.5669e-02, 3.9786e-01, 1.3359e-03,
         2.8252e-02, 5.3359e-01, 1.7419e-04, 1.3740e-03]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(7.4772, device=

100%|██████████| 40/40 [00:03<00:00, 10.04it/s]


tensor([4], device='cuda:0')
tensor(12.4925, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(0.0078, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.2113, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.9738, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(15.3389, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(13.1690, device='cuda:0', grad_fn=<NllLossBackward0>)
<built-in method dim of Tensor object at 0x7cb101c61a80>
<built-in method dim of Tensor object at 0x7cb10f4b8590>
<built-in method dim of Tensor object at 0x7cb10d45f790>
<built-in method dim of Tensor object at 0x7cb10d45e7f0>
<built-in method dim of Tensor object at 0x7cb10d45e340>
<built-in method dim of Tensor object at 0x7cb10d45e3e0>
<built-in method dim of Tensor object at 0x7cb101ba5f30>
<built-in method dim of Tensor object at 0x7cb101ba48b0>
<buil

  0%|          | 0/40 [00:00<?, ?it/s]

tensor([6])
tensor([[ 1.0570, -3.5207,  2.4737,  0.7948, -1.7219,  1.6019,  6.8481, -4.5315,
          0.9258, -3.9275]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[2.9768e-03, 3.0596e-05, 1.2274e-02, 2.2901e-03, 1.8487e-04, 5.1329e-03,
         9.7447e-01, 1.1135e-05, 2.6106e-03, 2.0370e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(5.8169, device='cuda:0', grad_fn=<NllLossBackward0>)


  2%|▎         | 1/40 [00:00<00:04,  9.74it/s]

tensor([1], device='cuda:0')
tensor(10.3946, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(4.4003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(6.0792, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(8.5959, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(5.2721, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0259, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.4054, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(5.9482, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.8015, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[ 8.9780, -1.5297, -0.3120, -2.2158, -1.3013, -2.9711, -0.6241, -0.9367,
          1.8987, -0.9862]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9882e-01, 2.7291e-05

  8%|▊         | 3/40 [00:00<00:03,  9.97it/s]

tensor([2], device='cuda:0')
tensor(8.9520, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(10.6588, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(9.9457, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.0528, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.4955, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.5176, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(11.1502, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[10.8194, -0.1051, -0.9614, -3.4720, -1.9014, -1.6733, -2.0099, -2.1061,
          0.7421,  0.6674]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9988e-01, 1.8009e-05, 7.6488e-06, 6.2126e-07, 2.9879e-06, 3.7535e-06,
         2.6808e-06, 2.4350e-06, 4.2018

 15%|█▌        | 6/40 [00:00<00:03,  9.99it/s]

tensor([3], device='cuda:0')
tensor(6.6898, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(7.8006, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.9082, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(10.0372, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.2108, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(9.9918, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9])
tensor([[-1.1582,  0.6457, -1.5494, -0.5527, -1.2174, -0.8870, -1.1192, -1.4374,
         -1.4517,  8.7271]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.0879e-05, 3.0900e-04, 3.4409e-05, 9.3225e-05, 4.7953e-05, 6.6732e-05,
         5.2906e-05, 3.8483e-05, 3.7937e-05, 9.9927e-01]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.8861, device

 20%|██        | 8/40 [00:00<00:03,  9.93it/s]

tensor([3], device='cuda:0')
tensor(7.7106, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(8.6090, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(4.7165, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0114, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.7815, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(11.7039, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.0897, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3])
tensor([[-0.4311,  0.1607,  3.5763,  7.9414,  1.7337, -3.2024, -1.6653, -4.9305,
         -1.6649, -1.5182]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[2.2759e-04, 4.1129e-04, 1.2518e-02, 9.8463e-01, 1.9829e-03, 1.4243e-05,
         6.6243e-05, 2.5298e-06, 6.6269e-05, 7.6738e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(8.3880, devic

 25%|██▌       | 10/40 [00:01<00:03,  9.91it/s]

tensor([3], device='cuda:0')
tensor(11.5938, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(14.3292, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.4767e-05, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.6510, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.5531, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(12.0194, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(12.3975, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6])
tensor([[-0.4190, -1.0823, -1.3970, -1.3568, -1.2319, -0.2008,  9.7629, -1.2554,
         -1.0703, -1.7496]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[3.7844e-05, 1.9494e-05, 1.4232e-05, 1.4816e-05, 1.6786e-05, 4.7071e-05,
         9.9980e-01, 1.6397e-05, 1.9729e-05, 1.0003e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(10.182

 28%|██▊       | 11/40 [00:01<00:03,  9.60it/s]

tensor([3], device='cuda:0')
tensor(11.6704, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(9.0213, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(12.1450, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(12.7201, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.8442, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(9.4024, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(11.8398, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7])
tensor([[-0.6042, -1.4840, -1.4724, -1.3033, -0.1522,  0.7553, -0.7027,  9.1970,
         -2.2136, -2.0201]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.5359e-05, 2.2965e-05, 2.3235e-05, 2.7515e-05, 8.6993e-05, 2.1558e-04,
         5.0165e-05, 9.9949e-01, 1.1072e-05, 1.3436e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.8017, dev

 30%|███       | 12/40 [00:01<00:03,  9.28it/s]

tensor([9], device='cuda:0')
tensor(11.2175, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3])
tensor([[ 0.6650, -2.2376, -1.3248,  6.1632, -1.4612,  1.3675, -2.5694, -0.1383,
         -2.2628,  1.7982]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[3.9803e-03, 2.1846e-04, 5.4419e-04, 9.7223e-01, 4.7482e-04, 8.0358e-03,
         1.5676e-04, 1.7826e-03, 2.1302e-04, 1.2361e-02]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(5.5264, device='cuda:0', grad_fn=<NllLossBackward0>)


 32%|███▎      | 13/40 [00:01<00:02,  9.13it/s]

tensor([1], device='cuda:0')
tensor(8.4289, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(7.5162, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(0.0282, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(7.6526, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(4.8239, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(8.7608, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(6.3297, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(8.4541, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(4.3932, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1])
tensor([[-0.6000,  9.2703, -2.1742, -1.2984, -1.5062, -0.8327, -1.7229, -1.2212,
          0.4756, -0.3906]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.1669e-05, 9.9959e-01, 1

 38%|███▊      | 15/40 [00:01<00:02,  9.06it/s]

tensor([4])
tensor([[-0.3432, -1.7097, -0.9237, -1.8010,  9.8037, -1.3243,  0.8166, -2.3156,
         -1.0083, -1.1947]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[3.9190e-05, 9.9933e-06, 2.1932e-05, 9.1214e-06, 9.9974e-01, 1.4691e-05,
         1.2499e-04, 5.4518e-06, 2.0151e-05, 1.6725e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(10.1471, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.5136, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(10.7276, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.6049, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(11.1283, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(8.9873, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.1196, de

 42%|████▎     | 17/40 [00:01<00:02,  9.07it/s]

tensor([9], device='cuda:0')
tensor(12.0421, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8])
tensor([[-2.3996, -0.3248,  0.5186, -2.0738, -3.5619,  0.1140,  0.0469, -1.3591,
          9.7644, -0.7249]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.2132e-06, 4.1513e-05, 9.6485e-05, 7.2210e-06, 1.6305e-06, 6.4379e-05,
         6.0203e-05, 1.4756e-05, 9.9968e-01, 2.7825e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(12.1643, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.0895, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(9.2461, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.8385, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(13.3266, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.6507, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.7178, dev

 48%|████▊     | 19/40 [00:02<00:02,  9.16it/s]

tensor([7], device='cuda:0')
tensor(0.0028, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(8.8209, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.1513, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3])
tensor([[ 1.7581, -4.3443, -1.1574,  6.6546, -0.0511,  0.9876, -4.0018, -0.1622,
          0.4149, -0.0987]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[7.3494e-03, 1.6445e-05, 3.9818e-04, 9.8346e-01, 1.2038e-03, 3.4011e-03,
         2.3160e-05, 1.0772e-03, 1.9182e-03, 1.1478e-03]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(4.9131, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.0155, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(7.8286, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(0.0167, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(6.7223, device

 52%|█████▎    | 21/40 [00:02<00:02,  9.27it/s]

tensor([6], device='cuda:0')
tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(8.6135, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(7.0210, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.2080, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6])
tensor([[-1.5233, -1.6362, -1.6831, -0.3905, -0.1150, -3.9904, 10.7787, -1.3062,
         -0.2590,  0.1246]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[4.5422e-06, 4.0573e-06, 3.8714e-06, 1.4100e-05, 1.8573e-05, 3.8532e-07,
         9.9991e-01, 5.6436e-06, 1.6083e-05, 2.3602e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(12.3021, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(12.4150, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(12.4619, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.1693, dev

 57%|█████▊    | 23/40 [00:02<00:01,  9.56it/s]

tensor(15.0391, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.5130, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(13.8028, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(11.5909, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(12.7198, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9])
tensor([[ 0.1308,  1.3012, -2.8782, -1.4426, -2.1191, -2.2085, -2.1480, -1.9619,
          1.5066,  9.8196]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[6.1941e-05, 1.9965e-04, 3.0560e-06, 1.2842e-05, 6.5286e-06, 5.9705e-06,
         6.3428e-06, 7.6402e-06, 2.4516e-04, 9.9945e-01]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.6893, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(8.5190, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(12.6984, device='cuda:0', grad_fn=<NllLo

 60%|██████    | 24/40 [00:02<00:01,  9.67it/s]

tensor(12.6750, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0023, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.4599, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.4440, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(12.6064, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4])
tensor([[-1.8237, -2.5839, -0.9481, -1.6009, 10.9585,  1.5663, -0.0274, -0.7193,
         -2.3457, -2.4760]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[2.8101e-06, 1.3138e-06, 6.7450e-06, 3.5113e-06, 9.9987e-01, 8.3360e-05,
         1.6936e-05, 8.4788e-06, 1.6672e-06, 1.4635e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(12.7823, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(13.5426, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.9067, device='cuda:0', grad_fn=<NllL

 65%|██████▌   | 26/40 [00:02<00:01,  9.83it/s]

tensor([6], device='cuda:0')
tensor(12.7240, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.7948, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(11.3384, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(12.1425, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6])
tensor([[-2.8801, -1.7697,  2.4038,  2.3613,  1.6345, -2.0878,  9.2529, -3.6529,
         -3.3121, -1.9502]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.3653e-06, 1.6287e-05, 1.0577e-03, 1.0136e-03, 4.9006e-04, 1.1849e-05,
         9.9739e-01, 2.4771e-06, 3.4832e-06, 1.3597e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(12.1356, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.0252, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(6.8517, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(6.8942, de

 70%|███████   | 28/40 [00:02<00:01, 10.00it/s]

tensor([7], device='cuda:0')
tensor(0.0026, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.4175, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.1686, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6])
tensor([[-2.1914, -2.1133, -1.0397,  3.0514, -2.4000,  1.1859,  8.8577, -0.7359,
         -1.9526, -2.6624]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[1.5844e-05, 1.7131e-05, 5.0123e-05, 2.9976e-03, 1.2861e-05, 4.6409e-04,
         9.9634e-01, 6.7915e-05, 2.0117e-05, 9.8919e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(11.0528, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.9746, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(9.9010, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(5.8099, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(11.2613, dev

 75%|███████▌  | 30/40 [00:03<00:00, 10.07it/s]

tensor([8], device='cuda:0')
tensor(10.7359, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.6334, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[10.8824, -2.6733, -1.9444,  0.5012,  1.3101, -0.8342, -2.5633, -1.5779,
          0.1563, -3.2573]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9986e-01, 1.2964e-06, 2.6873e-06, 3.1006e-05, 6.9621e-05, 8.1556e-06,
         1.4472e-06, 3.8769e-06, 2.1960e-05, 7.2299e-07]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(13.5559, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(12.8270, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(10.3813, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(9.5724, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(11.7168, de

 80%|████████  | 32/40 [00:03<00:00, 10.11it/s]

tensor([9], device='cuda:0')
tensor(9.8752, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1])
tensor([[-0.8671,  8.4216, -1.7000, -1.5789, -2.7154, -0.6648, -0.3491, -0.8559,
          0.1213,  0.1882]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.2369e-05, 9.9893e-01, 4.0160e-05, 4.5331e-05, 1.4548e-05, 1.1308e-04,
         1.5505e-04, 9.3414e-05, 2.4819e-04, 2.6536e-04]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.2897, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(10.1226, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(10.0015, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(11.1381, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.0874, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(8.7718, devic

 85%|████████▌ | 34/40 [00:03<00:00, 10.15it/s]

tensor([0])
tensor([[10.6275, -0.7283, -0.4201, -1.6450, -1.2606, -2.3152, -2.5185, -2.7050,
          2.4562, -1.4913]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9967e-01, 1.1698e-05, 1.5919e-05, 4.6770e-06, 6.8693e-06, 2.3929e-06,
         1.9526e-06, 1.6203e-06, 2.8256e-04, 5.4541e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.3561, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.0480, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(12.2729, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(11.8884, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(12.9430, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.1464, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(13.3329, d

 90%|█████████ | 36/40 [00:03<00:00, 10.16it/s]

tensor([9], device='cuda:0')
tensor(12.4080, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[10.5270, -0.9814, -1.4233, -2.7062, -2.6090, -0.8791, -1.8054, -1.9808,
          0.3190,  1.5389]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9980e-01, 1.0043e-05, 6.4562e-06, 1.7898e-06, 1.9726e-06, 1.1125e-05,
         4.4058e-06, 3.6970e-06, 3.6868e-05, 1.2486e-04]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.5086, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.9505, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(13.2334, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(13.1362, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(11.4063, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(12.3326, d

 95%|█████████▌| 38/40 [00:03<00:00, 10.18it/s]

tensor([7])
tensor([[-2.1841, -2.8058, -1.7487,  1.9596,  4.3715, -1.3251,  1.7265,  4.6650,
         -3.3622, -1.2969]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.6586e-04, 3.0386e-04, 8.7458e-04, 3.5669e-02, 3.9786e-01, 1.3359e-03,
         2.8252e-02, 5.3359e-01, 1.7419e-04, 1.3740e-03]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(7.4772, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|██████████| 40/40 [00:04<00:00,  9.76it/s]


tensor([1], device='cuda:0')
tensor(8.0989, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(7.0418, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(3.3335, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(0.9217, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(6.6182, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(3.5666, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(0.6281, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(8.6553, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(6.5900, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5])
tensor([[ 2.9645, -3.6526, -2.7475,  5.8789, -1.7009, 10.7838, -2.4197, -2.1822,
         -4.5472, -2.3774]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[3.9878e-04, 5.3330e-07, 1

  5%|▌         | 2/40 [00:00<00:03,  9.96it/s]

tensor([6])
tensor([[ 1.0570, -3.5207,  2.4737,  0.7948, -1.7219,  1.6019,  6.8481, -4.5315,
          0.9258, -3.9275]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[2.9768e-03, 3.0596e-05, 1.2274e-02, 2.2901e-03, 1.8487e-04, 5.1329e-03,
         9.7447e-01, 1.1135e-05, 2.6106e-03, 2.0370e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(5.8169, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.3946, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(4.4003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(6.0792, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(8.5959, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(5.2721, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0259, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.4054, device

  8%|▊         | 3/40 [00:00<00:03,  9.92it/s]

tensor([9])
tensor([[-0.7045, -1.7519,  0.3266, -1.3801, -0.6670,  0.2259, -0.2169, -3.2389,
         -1.8716,  9.2782]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[4.6174e-05, 1.6200e-05, 1.2947e-04, 2.3494e-05, 4.7934e-05, 1.1707e-04,
         7.5188e-05, 3.6618e-06, 1.4372e-05, 9.9953e-01]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.9831, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.0305, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(8.9520, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(10.6588, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(9.9457, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.0528, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.4955, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.5176, devic

 15%|█▌        | 6/40 [00:00<00:03,  9.99it/s]

tensor([[-1.3409, -1.1839, -2.4443,  1.8258,  0.7150, -1.3926,  8.5136, -1.5216,
         -1.6952, -1.4762]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.2405e-05, 6.1313e-05, 1.7386e-05, 1.2435e-03, 4.0950e-04, 4.9766e-05,
         9.9804e-01, 4.3740e-05, 3.6770e-05, 4.5774e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.8565, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(9.6995, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(10.9599, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(6.6898, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(7.8006, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.9082, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(10.0372, device='cuda:0', g

 18%|█▊        | 7/40 [00:00<00:03,  9.91it/s]

tensor([[-0.4976, -3.9528,  1.7074,  0.4844, -0.4140,  3.4785,  8.1837, -3.5865,
         -3.5089, -1.8946]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[1.6782e-04, 5.2994e-06, 1.5221e-03, 4.4804e-04, 1.8245e-04, 8.9461e-03,
         9.8867e-01, 7.6443e-06, 8.2612e-06, 4.1505e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(8.6926, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(12.1479, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(6.4877, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(7.7106, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(8.6090, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(4.7165, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(0.0114, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.7815, device='cuda:0', g

 25%|██▌       | 10/40 [00:01<00:03,  9.90it/s]

tensor([5])
tensor([[-0.8969, -1.4454,  1.1196, -0.6272, -3.3625, 10.9666, -2.6843, -0.5865,
         -1.0528, -1.4309]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[7.0428e-06, 4.0693e-06, 5.2905e-05, 9.2226e-06, 5.9831e-07, 9.9991e-01,
         1.1789e-06, 9.6059e-06, 6.0260e-06, 4.1288e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(11.8635, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(12.4120, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(9.8470, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.5938, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(14.3292, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.4767e-05, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.6510, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.5531

 28%|██▊       | 11/40 [00:01<00:02,  9.92it/s]

tensor([[ 9.9581,  0.5512, -0.5752, -1.7119,  0.9371, -2.1866, -2.7616, -2.8858,
          0.5560, -1.8814]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9966e-01, 8.2129e-05, 2.6627e-05, 8.5433e-06, 1.2080e-04, 5.3147e-06,
         2.9905e-06, 2.6415e-06, 8.2522e-05, 7.2114e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(9.4072, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(10.5336, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.6704, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(9.0213, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(12.1450, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(12.7201, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.8442, device='cuda:0'

 32%|███▎      | 13/40 [00:01<00:02, 10.02it/s]

tensor([1], device='cuda:0')
tensor(8.4289, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(7.5162, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(0.0282, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(7.6526, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(4.8239, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(8.7608, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(6.3297, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(8.4541, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(4.3932, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1])
tensor([[-0.6000,  9.2703, -2.1742, -1.2984, -1.5062, -0.8327, -1.7229, -1.2212,
          0.4756, -0.3906]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.1669e-05, 9.9959e-01, 1

 40%|████      | 16/40 [00:01<00:02,  9.99it/s]

tensor([2], device='cuda:0')
tensor(10.7276, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.6049, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(11.1283, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(8.9873, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.1196, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(10.8123, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.9986, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8])
tensor([[-0.1813, -1.9498,  0.6527, -0.8094, -3.3110, -0.5479, -2.3913, -1.2656,
         10.9227, -1.1194]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[1.5050e-05, 2.5676e-06, 3.4656e-05, 8.0316e-06, 6.5822e-07, 1.0432e-05,
         1.6510e-06, 5.0894e-06, 9.9

 42%|████▎     | 17/40 [00:01<00:02,  9.97it/s]

tensor([2], device='cuda:0')
tensor(9.2461, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.8385, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(13.3266, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.6507, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.7178, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.1239, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.4896, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7])
tensor([[-1.9353,  0.6932, -3.3635, -0.9198,  0.8928, -1.2848,  0.9007,  7.9944,
         -0.8237, -2.1541]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[4.8574e-05, 6.7288e-04, 1.1644e-05, 1.3409e-04, 8.2149e-04, 9.3090e-05,
         8.2800e-04, 9.9720e-01, 1.476

 48%|████▊     | 19/40 [00:01<00:02, 10.03it/s]

tensor([3], device='cuda:0')
tensor(0.0167, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(6.7223, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(5.6836, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(10.6731, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(6.8334, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(6.2564, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(6.7699, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6])
tensor([[-0.9328, -1.5432, -2.9490, -1.2920, -2.0524, -0.7938,  8.8504,  0.2382,
          1.8307, -1.3563]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[5.6315e-05, 3.0590e-05, 7.4990e-06, 3.9323e-05, 1.8382e-05, 6.4718e-05,
         9.9867e-01, 1.8164e-04, 8.9295e-04, 3.6873e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.7845, device=

 52%|█████▎    | 21/40 [00:02<00:01, 10.08it/s]

tensor([4], device='cuda:0')
tensor(10.8938, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(14.7692, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.0833e-05, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.0850, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(11.0378, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(10.6542, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[11.9811, -1.8371,  0.6634, -3.0956, -0.9524, -3.0579, -1.5319, -1.8216,
          0.3903, -0.7386]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9997e-01, 9.9730e-07, 1.2155e-05, 2.8330e-07, 2.4157e-06, 2.9418e-07,
         1.3532e-06, 1.0128e-06, 9.2502e-06, 2.9914e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(3.0756e-05, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(13.

 57%|█████▊    | 23/40 [00:02<00:01, 10.11it/s]

tensor([5], device='cuda:0')
tensor(12.0287, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(11.9682, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.7821, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(8.3136, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6])
tensor([[-0.8305, -1.6688,  3.4938, -0.3346, -0.7775, -3.0140,  9.6587, -2.7989,
         -0.7830, -2.9454]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[2.7771e-05, 1.2010e-05, 2.0971e-03, 4.5599e-05, 2.9284e-05, 3.1284e-06,
         9.9775e-01, 3.8791e-06, 2.9121e-05, 3.3506e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(10.4915, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.3298, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(6.1672, dev

 62%|██████▎   | 25/40 [00:02<00:01, 10.10it/s]

tensor([6], device='cuda:0')
tensor(10.9860, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.6779, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(13.3044, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(13.4347, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([0])
tensor([[11.0398, -2.2049, -0.8019, -0.7685, -2.5368, -0.8875, -1.6842, -0.7550,
         -0.2986, -1.1026]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9995e-01, 1.7697e-06, 7.1977e-06, 7.4422e-06, 1.2698e-06, 6.6070e-06,
         2.9787e-06, 7.5436e-06, 1.1907e-05, 5.3284e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(5.2093e-05, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(13.2447, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(11.8418, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.808

 68%|██████▊   | 27/40 [00:02<00:01, 10.11it/s]

tensor([7], device='cuda:0')
tensor(12.9084, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([8], device='cuda:0')
tensor(12.5676, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(11.2056, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7])
tensor([[-0.7409, -2.9658, -0.2267,  1.1246, -2.8687,  1.9476, -1.0061,  8.4390,
         -1.9760, -1.7271]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[1.0282e-04, 1.1113e-05, 1.7195e-04, 6.6413e-04, 1.2246e-05, 1.5125e-03,
         7.8875e-05, 9.9738e-01, 2.9903e-05, 3.8355e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.1825, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(11.4074, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(8.6683, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(7.3170, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(11.3103, dev

 72%|███████▎  | 29/40 [00:02<00:01, 10.10it/s]

tensor([8], device='cuda:0')
tensor(10.8139, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([9], device='cuda:0')
tensor(11.5238, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5])
tensor([[ 0.0709, -1.6504, -0.9286, -2.1754, -0.9926,  9.0213, -0.1043,  0.0842,
         -1.7140, -1.6115]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[1.2961e-04, 2.3179e-05, 4.7707e-05, 1.3712e-05, 4.4748e-05, 9.9946e-01,
         1.0879e-04, 1.3135e-04, 2.1751e-05, 2.4098e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(8.9510, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.6723, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(9.9504, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(11.1972, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(10.0145, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(0.0005, dev

 78%|███████▊  | 31/40 [00:03<00:00, 10.11it/s]

tensor([9], device='cuda:0')
tensor(14.1399, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3])
tensor([[-0.8539, -1.3425, -0.7991,  8.7618, -3.1169,  0.2326,  3.1779, -3.0092,
         -1.9416, -1.1092]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[6.6395e-05, 4.0735e-05, 7.0136e-05, 9.9580e-01, 6.9078e-06, 1.9680e-04,
         3.7422e-03, 7.6931e-06, 2.2374e-05, 5.1434e-05]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(9.6199, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(10.1084, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(9.5651, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(0.0042, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(11.8829, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(8.5333, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(5.5881, devic

 88%|████████▊ | 35/40 [00:03<00:00, 10.08it/s]

tensor([9])
tensor([[-1.2875, -0.1040, -0.2313, -1.3875, -0.7505, -0.5924,  0.1058, -2.5322,
         -2.5142,  9.2935]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[2.5381e-05, 8.2891e-05, 7.2987e-05, 2.2967e-05, 4.3425e-05, 5.0864e-05,
         1.0224e-04, 7.3110e-06, 7.4439e-06, 9.9958e-01]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(10.5815, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(9.3980, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(9.5252, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(10.6814, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(10.0445, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(9.8864, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(9.1881, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(11.8261, devi

 92%|█████████▎| 37/40 [00:03<00:00, 10.07it/s]

tensor([[ 9.8667, -2.6076,  2.2265,  1.3939, -0.4768, -1.7761, -3.1672, -2.5319,
         -0.3874, -2.5405]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[9.9922e-01, 3.8207e-06, 4.8033e-04, 2.0891e-04, 3.2175e-05, 8.7750e-06,
         2.1832e-06, 4.1212e-06, 3.5185e-05, 4.0859e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(12.4751, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(7.6410, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(8.4736, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(10.3443, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(11.6436, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.0347, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([7], device='cuda:0')
tensor(12.3994, device='cuda:0'

100%|██████████| 40/40 [00:04<00:00,  9.95it/s]

tensor([9], device='cuda:0')
tensor(6.5900, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5])
tensor([[ 2.9645, -3.6526, -2.7475,  5.8789, -1.7009, 10.7838, -2.4197, -2.1822,
         -4.5472, -2.3774]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[3.9878e-04, 5.3330e-07, 1.3183e-06, 7.3527e-03, 3.7545e-06, 9.9224e-01,
         1.8299e-06, 2.3203e-06, 2.1798e-07, 1.9089e-06]], device='cuda:0')
10
tensor([0], device='cuda:0')
tensor(7.8271, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([1], device='cuda:0')
tensor(14.4442, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([2], device='cuda:0')
tensor(13.5391, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([3], device='cuda:0')
tensor(4.9127, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([4], device='cuda:0')
tensor(12.4925, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([5], device='cuda:0')
tensor(0.0078, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([6], device='cuda:0')
tensor(13.2113, devi




['unlearned_checkpoint_0.pth', 'unlearned_checkpoint_2.pth', 'unlearned_checkpoint_1.pth', 'unlearned_checkpoint_3.pth', 'unlearned_checkpoint_4.pth']
  adding: kaggle/tmp2/unlearned_checkpoint_0.pth (deflated 7%)
  adding: kaggle/tmp2/unlearned_checkpoint_1.pth (deflated 7%)
  adding: kaggle/tmp2/unlearned_checkpoint_2.pth (deflated 7%)
  adding: kaggle/tmp2/unlearned_checkpoint_3.pth (deflated 7%)
  adding: kaggle/tmp2/unlearned_checkpoint_4.pth (deflated 7%)
