# Define some paths

In [4]:
# Base directory under which all outputs are stored. Create the folder or point this at an existing, writable location.
YOUR_BASE_PATH='shared_fs'
# Location passed to Determined as checkpoint storage in the training cell.
CHECKPOINT_STORAGE="/run/determined/workdir/shared_fs/checkpoints"

# The following paths are appended to the base path (e.g. audio files are stored in 'shared_fs/alice_in_wonderland_audio').
AUDIO_PATH='alice_in_wonderland_audio' # where audio files are downloaded
SNIPPETS_PATH='alice_snippets' # where 10s clips are stored

# Populate only ONE of the following: either a YouTube playlist or a single video (the playlist wins if both are set).
# Make sure the video has no age restriction, otherwise a login is required.
PLAYLIST_URL=''
# YouTube video IDs are exactly 11 characters long; the previous value carried a stray 12th character.
# NOTE(review): confirm this is the intended video.
SINGLE_VIDEO_URL='https://www.youtube.com/watch?v=27SwZZ8jiBc'

# Audio Downloading + Model Training Installations

In [21]:
# -y answers the confirmation prompt automatically; a notebook cell has no interactive stdin to answer it.
!apt-get install -y git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 9 not upgraded.


In [22]:
# Clone the SpeechTokenizer repository only when it is not already checked out, so re-running this cell does not fail.
![ -d SpeechTokenizer ] || git clone https://huggingface.co/fnlp/SpeechTokenizer
!pip3 install speechtokenizer

fatal: destination path 'SpeechTokenizer' already exists and is not an empty directory.
^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [23]:
# Dataset loading (load_dataset / Dataset.map are used further down).
!pip install datasets
# Provides the Mamba layer used inside the model's Block; the only dependency pinned to an exact version here.
!pip install mamba-ssm==1.1.0
!pip install huggingface_hub
# Used to print the model summary before and after training.
!pip install torchinfo

Collecting argparse (from buildtools->causal-conv1d>=1.1.0->mamba-ssm==1.1.0)
  Downloading argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Installing collected packages: argparse
Successfully installed argparse-1.4.0



KeyboardInterrupt



In [None]:
# !pip install pytube
# Temporarily installing pytube straight from GitHub instead of the PyPI release above, to work around
# "AgeRestrictedError: v70VhIeurKQ is age restricted, and can't be accessed without logging in."
!python -m pip install git+https://github.com/pytube/pytube
!pip install moviepy
# Imported as `sf` in the tokenization cell.
!pip install soundfile

# Define some tokenization functions

In [8]:
from speechtokenizer import SpeechTokenizer
import soundfile as sf
import torchaudio
import torch
import numpy as np
# Run the tokenizer on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Number of leading codebooks kept by tokenize(); the remaining ones are discarded.
NUM_QUANTIZERS_USED = 4
# Batch size passed to the train/test DataLoaders further down.
batch_size = 1

# Config and weights are read from the local SpeechTokenizer checkout.
config_path = 'SpeechTokenizer/speechtokenizer_hubert_avg/config.json'
ckpt_path = 'SpeechTokenizer/speechtokenizer_hubert_avg/SpeechTokenizer.pt'
speech_tokenizer = SpeechTokenizer.load_from_checkpoint(config_path, ckpt_path).to(device)
# Inference only: the tokenizer itself is never trained in this notebook.
speech_tokenizer.eval()


def normalize_waveform(waveform, sr):
    """Collapse a waveform to a single row and resample it to the tokenizer's rate.

    Args:
        waveform: audio samples; a 2-D input is averaged over its first axis.
        sr: sampling rate of ``waveform``.

    Returns:
        A ``(1, num_samples)`` waveform resampled from ``sr`` to
        ``speech_tokenizer.sample_rate``.
    """
    is_two_dimensional = len(waveform.shape) == 2
    if is_two_dimensional and waveform.shape[1] > 0:
        # Average over the first axis, keeping it as a singleton dimension.
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    single_row = waveform.reshape(1, -1)
    target_rate = speech_tokenizer.sample_rate
    return torchaudio.functional.resample(single_row, sr, target_rate)


# tokenize() adds a leading batch axis itself, so pass a single, un-batched waveform
# (presumably the (1, num_samples) output of normalize_waveform -- confirm against callers).
# The encoder output is indexed as (num codebooks, batch, timestep). Batch sizes > 1 produced
# many errors and were not pursued. Only the first NUM_QUANTIZERS_USED codebooks are kept;
# the later ones carry less information and are dropped before flattening.
def tokenize(waveform):
    """Encode one waveform into a flat, timestep-interleaved token sequence."""
    with torch.no_grad():
        batched_waveform = waveform.unsqueeze(0).to(device)
        codes = speech_tokenizer.encode(batched_waveform)  # codes: (n_q, B, T)
    kept_codes = codes[:NUM_QUANTIZERS_USED].cpu()
    return flatten_tokens(kept_codes)

def save_waveform(filename, waveform):
  """Write the first item of a batch of waveforms to ``filename``.

  The file is written at ``speech_tokenizer.sample_rate`` -- the same rate
  normalize_waveform() resamples to -- instead of a hard-coded 16000, so the
  two cannot drift apart if a different tokenizer checkpoint is loaded.
  """
  torchaudio.save(filename, waveform[0].detach().cpu(), speech_tokenizer.sample_rate)

def decode_tokens(tokens):
  """Undo the flattening of ``tokens`` and decode the result with the speech tokenizer."""
  return speech_tokenizer.decode(unflatten_tokens(tokens))

def save_to_file(tok, filename):
  """Decode the flat token tensor ``tok`` and save the resulting waveform to ``filename``."""
  tokens_on_device = tok.detach().to(device)
  save_waveform(filename, decode_tokens(tokens_on_device))

# The codes are interleaved per timestep (a1, b1, c1, a2, b2, c2, ...) rather than per codebook
# (a1, a2, a3, b1, b2, b3, ...): some codebooks are thrown away, and generation proceeds timestep by timestep.
def flatten_tokens(tokens):
    """Flatten codes of shape (n_q, B, T) into one interleaved sequence per batch item.

    Args:
        tokens: tensor of shape (n_q, B, T).

    Returns:
        Tensor of shape (B, T * n_q) where row ``b`` is
        ``[tokens[0, b, 0], tokens[1, b, 0], ..., tokens[0, b, 1], ...]``.
    """
    n_q, B, T = tokens.shape
    # Move the batch axis to the front BEFORE reshaping. Transposing only the
    # codebook and time axes leaves the batch axis in the middle, so the
    # reshape would mix different batch items together whenever B > 1.
    # For B == 1 the result is identical to the previous implementation.
    return tokens.permute(1, 2, 0).reshape(B, T * n_q)


def unflatten_tokens(tokens):
    """Turn interleaved sequences of shape (B, T * NUM_QUANTIZERS_USED) back into (n_q, B, T).

    Each row of ``tokens`` is read as ``T`` consecutive groups of
    ``NUM_QUANTIZERS_USED`` codes (one group per timestep).
    """
    B, L = tokens.shape
    T = L // NUM_QUANTIZERS_USED
    # Split each row into (T, n_q) while the batch axis is still leading, then
    # move the codebook axis to the front. Reshaping to (T, B, n_q) directly
    # would assign tokens to the wrong batch item whenever B > 1; for B == 1
    # the result is identical to the previous implementation.
    return tokens.reshape(B, T, NUM_QUANTIZERS_USED).permute(2, 0, 1)

# Download Audio

In [None]:
from pytube import Playlist, YouTube

def download_audio_from_playlist(playlist_url, output_path):
    """Download the audio-only stream of every video in a playlist into ``output_path``.

    Files are named after the video title (with characters that are unsafe in
    file names replaced by ``_``) plus an ``.mp4`` suffix. Videos for which no
    audio-only stream is returned are skipped with a message.
    """
    import re  # local import: this cell otherwise only imports pytube

    for video in Playlist(playlist_url).videos:
        audio_stream = video.streams.get_audio_only()
        if audio_stream is None:
            # presumably returned when no matching audio-only stream exists -- verify against pytube docs
            print(f"Skipping '{video.title}': no audio-only stream available")
            continue
        # A raw title can contain '/' or other characters that are invalid in file
        # names; used verbatim it would point into a directory that does not exist.
        safe_title = re.sub(r'[\\/:*?"<>|]', "_", video.title)
        audio_stream.download(output_path=output_path, filename=safe_title + ".mp4")

def download_audio_from_video(video_url, output_path):
    """Download the audio-only stream of a single video into ``output_path``.

    The file is named after the video title (with characters that are unsafe
    in file names replaced by ``_``) plus an ``.mp4`` suffix.

    Raises:
        RuntimeError: if no audio-only stream is returned for the video.
    """
    import re  # local import: this cell otherwise only imports pytube

    video = YouTube(video_url)
    audio_stream = video.streams.get_audio_only()
    if audio_stream is None:
        raise RuntimeError(f"No audio-only stream available for {video_url}")
    # A raw title can contain '/' or other characters that are invalid in file
    # names; used verbatim it would point into a directory that does not exist.
    safe_title = re.sub(r'[\\/:*?"<>|]', "_", video.title)
    audio_stream.download(output_path=output_path, filename=safe_title + ".mp4")
    
playlist_url = PLAYLIST_URL
video_url = SINGLE_VIDEO_URL

# A configured playlist takes precedence; the single video is only used when no playlist is set.
# Nothing is downloaded when both are empty.
if playlist_url:
    download_audio_from_playlist(playlist_url, f"{YOUR_BASE_PATH}/{AUDIO_PATH}")
else:
    if video_url:
        download_audio_from_video(video_url, f"{YOUR_BASE_PATH}/{AUDIO_PATH}")

# Install audio processing tools

In [None]:
# Refresh the package index first.
!apt-get update
# ffmpeg (and the bundled ffprobe) are invoked by extract_and_split_audio below; -y skips the confirmation prompt.
!apt-get install -y ffmpeg

In [None]:
!ffmpeg -version

# Split audio into 10s snippets

In [None]:
import subprocess
import os

def extract_and_split_audio(mp4_file, output_dir, clip_length=10):
    """Extract the audio track of ``mp4_file`` and split it into fixed-length wav clips.

    The audio is first written to a temporary wav file inside ``output_dir`` and
    then cut into ``<name>_clip_<start>_<end>.wav`` files. The temporary file is
    always removed, even when ffmpeg/ffprobe fails, so it never ends up among the clips.

    Args:
        mp4_file: path of the input file.
        output_dir: directory receiving the clips (created if missing).
        clip_length: clip duration in whole seconds; must be positive.

    Raises:
        ValueError: if ``clip_length`` is not positive.
        subprocess.CalledProcessError: if ffmpeg or ffprobe exits with an error.
    """
    if clip_length <= 0:
        raise ValueError("clip_length must be a positive number of seconds")

    os.makedirs(output_dir, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(mp4_file))[0]
    output_format = "wav"
    temp_audio_file = os.path.join(output_dir, f"{base_name}_temp.{output_format}")

    # Output of the ffmpeg calls is discarded; stdin is detached so ffmpeg can
    # never wait for input from the notebook.
    quiet = dict(stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL,
                 stderr=subprocess.STDOUT, check=True)

    try:
        # Extract the audio from the video. "-y" overwrites a stale temp file
        # instead of asking for confirmation.
        subprocess.run(["ffmpeg", "-y", "-i", mp4_file, "-q:a", "0", "-map", "a", temp_audio_file],
                       **quiet)

        # Get the duration of the audio file
        result = subprocess.run(["ffprobe", "-v", "error", "-show_entries",
                                 "format=duration", "-of",
                                 "default=noprint_wrappers=1:nokey=1", temp_audio_file],
                                text=True, capture_output=True, check=True)
        total_duration = float(result.stdout)

        # Split the audio file into chunks; the duration is truncated to whole
        # seconds, so the last clip may be shorter than clip_length.
        whole_seconds = int(total_duration)
        for start in range(0, whole_seconds, clip_length):
            end = min(start + clip_length, whole_seconds)
            output_file = os.path.join(output_dir, f"{base_name}_clip_{start}_{end}.{output_format}")
            subprocess.run(["ffmpeg", "-y", "-i", temp_audio_file, "-ss", str(start), "-to",
                            str(end), "-c", "copy", output_file],
                           **quiet)
    finally:
        # Remove the temporary audio file
        if os.path.exists(temp_audio_file):
            os.remove(temp_audio_file)

def process_all_files(input_dir, output_dir):
    """Run extract_and_split_audio on every ``.mp4`` entry found directly in ``input_dir``."""
    mp4_names = (name for name in os.listdir(input_dir) if name.endswith('.mp4'))
    for name in mp4_names:
        extract_and_split_audio(os.path.join(input_dir, name), output_dir)

In [14]:
input_directory = f"{YOUR_BASE_PATH}/{AUDIO_PATH}" # Folder where your MP4 files are located
output_directory = f"{YOUR_BASE_PATH}/{SNIPPETS_PATH}"  # Folder where you want to save the clips

In [None]:
process_all_files(input_directory, output_directory)

# Normalize + tokenize the waveforms

In [None]:
!pip install librosa

In [None]:
# Follow YOUR_BASE_PATH instead of a hard-coded folder; -p keeps the cell re-runnable when the directory already exists.
!mkdir -p {YOUR_BASE_PATH}/testfiles

In [15]:
from datasets import load_dataset
import torch
import numpy as np


print("Loading Dataset")
# Select subset from 10s clips
audio_dataset = load_dataset("audiofolder", data_dir=output_directory)["train"]

print(audio_dataset)


def _normalize_example(example):
    """Replace the decoded audio by its normalized waveform, remembering the original rate."""
    source_rate = example["audio"]["sampling_rate"]
    samples = torch.tensor(example["audio"]["array"])
    return {
        "original_sampling_rate": source_rate,
        "audio_array": normalize_waveform(samples, source_rate),
    }


print("Normalizing the waveforms")
audio_dataset = audio_dataset.map(
    _normalize_example,
    remove_columns=["audio"],
    # keep_in_memory=True,
    writer_batch_size=15000,
)

print(audio_dataset)

Loading Dataset


Resolving data files:   0%|          | 0/1555 [00:00<?, ?it/s]

Dataset({
    features: ['audio'],
    num_rows: 1555
})
Normalizing the waveforms


Map:   0%|          | 0/1555 [00:00<?, ? examples/s]

Dataset({
    features: ['original_sampling_rate', 'audio_array'],
    num_rows: 1555
})


In [22]:
print("Making sure the dataset is in the correct format")

def standardize_audio_length_with_tolerance(example, expected_length):
    """Zero-pad or trim ``example["audio_array"]`` to exactly ``expected_length`` samples.

    Args:
        example: mapping with an ``"audio_array"`` entry laid out as [channels, length];
            non-tensor values are converted with ``torch.tensor``.
        expected_length: required size of the second axis.

    Returns:
        ``{"audio_array": tensor}`` whose second axis has ``expected_length`` entries.
    """
    samples = example["audio_array"]
    samples = samples if isinstance(samples, torch.Tensor) else torch.tensor(samples)

    # Positive: too short (pad at the end); negative: too long (cut the tail).
    shortfall = expected_length - samples.shape[1]
    if shortfall > 0:
        silence = torch.zeros((samples.shape[0], shortfall), dtype=samples.dtype)
        samples = torch.cat([samples, silence], dim=1)
    elif shortfall < 0:
        samples = samples[:, :expected_length]
    return {"audio_array": samples}
    
# Pad or trim every example to exactly 160000 samples.
def _standardize_to_expected_length(example):
    return standardize_audio_length_with_tolerance(example, expected_length=160000)


audio_dataset = audio_dataset.map(_standardize_to_expected_length, batched=False)


print(audio_dataset)

Making sure the dataset is in the correct format


Map:   0%|          | 0/1555 [00:00<?, ? examples/s]

Dataset({
    features: ['original_sampling_rate', 'audio_array'],
    num_rows: 1555
})


In [23]:
import os

print("Tokenizing the waveforms")
# Replace each waveform by its flat token sequence.
audio_dataset = audio_dataset.map(
    lambda x: {"tokens": tokenize(torch.tensor(x["audio_array"]))
               },
    remove_columns=[
        "audio_array",
    ],
    writer_batch_size=15000,
)

print(audio_dataset)

# Checking the files to see if the tokenization worked correctly: decode the first few
# examples back to audio so they can be listened to.
test_dir = f"{YOUR_BASE_PATH}/testfiles"
os.makedirs(test_dir, exist_ok=True)  # do not rely on a separate mkdir cell having been run
num_samples_to_check = min(10, len(audio_dataset))  # select() raises on out-of-range indices for small datasets
for idx, t in enumerate(audio_dataset.select(range(num_samples_to_check))):
    save_to_file(torch.tensor(t["tokens"]).to(device), f"{test_dir}/{idx}_test.wav")

Tokenizing the waveforms


Map:   0%|          | 0/1555 [00:00<?, ? examples/s]

Dataset({
    features: ['original_sampling_rate', 'tokens'],
    num_rows: 1555
})


# Create train/test dataloaders

In [72]:
print(audio_dataset)

audio_dataset = audio_dataset.with_format('torch')
# Hold out 5% of the clips for evaluation. The seed pins the split so that a
# kernel restart reproduces the same train/test partition.
audio_dataset = audio_dataset.train_test_split(0.05, seed=42)

# Setting up the train and test dataloader
train_dataloader = torch.utils.data.DataLoader(audio_dataset['train'], batch_size=batch_size, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(audio_dataset['test'], batch_size=batch_size, shuffle=True)

Dataset({
    features: ['original_sampling_rate', 'tokens'],
    num_rows: 1555
})


In [73]:
print(len(train_dataloader))
print(len(test_dataloader))
# Fetch a single batch instead of materialising the whole loader with list(...).
# The same test batch is used for both prints below, so the token sequence and
# its length refer to the same example even though the loader shuffles.
first_train_batch = next(iter(train_dataloader))
first_test_sequence = next(iter(test_dataloader))['tokens'][0][0]
print(first_train_batch)
print(first_test_sequence)
print(len(first_test_sequence))

1477
78
{'original_sampling_rate': tensor([44100]), 'tokens': tensor([[[511, 496, 571,  ..., 552, 511, 364]]])}
tensor([474, 957, 882,  ...,  45, 496, 107])
2000


# Define model + training functions

In [2]:
from tqdm import tqdm
from mamba_ssm import Mamba
import matplotlib.pyplot as plt
import numpy as np
import torchinfo
import torchaudio
from torch.utils.data import DataLoader
import os
import torch
import torch.nn as nn
from torch.nn import  functional as F
#hyperparams

epochs = 20  # passes over train_dataloader in the training loop
lr = 1e-3  # learning rate given to AdamW
block_size = 2000  # size of the position embedding table; training batches with a different token length are skipped
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): max_iters, print_iters, eval_iters and eval_interval are not referenced by the code visible in this notebook.
max_iters = 10000
print_iters = 100
eval_iters = 10
eval_interval = 300
n_embed=384  # embedding / model width
n_heads = 6  # head count; only used to derive Block.head_size
n_layers = 6  # number of stacked Blocks
dropout = 0.2  # dropout probability used by the dropout layers
vocab_size=1024  # size of the token embedding table and of the output logits
from tqdm import tqdm

# ---------




class SelfAttentionHead(nn.Module):
  """Single causal attention head that weights values with softplus scores instead of softmax.

  Input ``x`` has shape (B, T, n_embed) with ``T <= block_size``; the output has
  shape (B, T, head_size).
  """
  def __init__(self, head_size):
    super().__init__()
    self.keys = nn.Linear(n_embed, head_size)
    self.queries = nn.Linear(n_embed, head_size)
    self.values = nn.Linear(n_embed, head_size)
    self.head_size = head_size
    self.n_embed = n_embed
    # Lower-triangular mask; forward() uses it to blank out entries above the diagonal.
    self.register_buffer('tril', torch.tril(torch.ones((block_size,block_size))).to(device))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    B,T,C = x.shape
    k = self.keys(x) # (B,T,C_h)
    q = self.queries(x) # (B,T,C_h)
    v = self.values(x) # (B,T,C_h)
    # NOTE(review): scores are scaled by the embedding width C, not by head_size -- confirm this is intended.
    wei = k @ q.transpose(-1,-2) * C**(-0.5)# (B,T,T)
    wei = wei.masked_fill( self.tril[:T,:T]==0, float('-inf'))
    # wei = F.softmax(wei, dim=-1) # (B,T,T)
    # softplus(x) = log(exp(x) + 1). F.softplus computes the same function without
    # overflowing for large scores, where the explicit exp() would return inf.
    # Masked (-inf) entries still map to exactly 0.
    wei = F.softplus(wei) # (B,T,T)
    wei = self.dropout(wei)
    out = wei @ v # (B,T,C_h)
    return out


class MultiHeadAttention(nn.Module):
  """Runs several SelfAttentionHead modules side by side and projects their concatenated outputs."""
  def __init__(self, n_heads, head_size) -> None:
    super().__init__()
    head_list = [SelfAttentionHead(head_size) for _ in range(n_heads)]
    self.heads = nn.ModuleList(head_list)
    self.proj = nn.Linear(n_embed, n_embed)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    # Each head sees the full input; their outputs are joined along the feature axis.
    per_head = [head(x) for head in self.heads]
    combined = torch.cat(per_head, dim=-1)
    return self.dropout(self.proj(combined))

class FeedForward(nn.Module):
  """Position-wise MLP: expand to 4x the width, ReLU, project back, then dropout."""
  def __init__(self, n_embed) -> None:
    super().__init__()
    hidden_width = 4*n_embed
    layers = [
      nn.Linear(n_embed, hidden_width),
      nn.ReLU(),
      nn.Linear(hidden_width, n_embed),
      nn.Dropout(dropout),
    ]
    self.ffn = nn.Sequential(*layers)

  def forward(self, x):
    out = self.ffn(x)
    return out

class Block(nn.Module):
  """Pre-norm residual block: a Mamba sequence-mixing layer followed by a feed-forward layer."""
  def __init__(self, n_embed, n_heads) -> None:
    super().__init__( )
    # Only relevant for the attention variant that is commented out below.
    self.head_size = n_embed // n_heads
    # self.sa_head = MultiHeadAttention(n_heads, self.head_size)
    # Placed on the notebook-wide `device` instead of a hard-coded "cuda", so
    # the block follows the same device selection as the rest of the model.
    self.sa_head = Mamba(
      # This module uses roughly 3 * expand * d_model^2 parameters
      d_model=n_embed, # Model dimension d_model
      d_state=16,  # SSM state expansion factor
      d_conv=4,    # Local convolution width
      expand=1,    # Block expansion factor
    ).to(device)
    self.ffn = FeedForward(n_embed)
    self.ln1 = nn.LayerNorm(n_embed)
    self.ln2 = nn.LayerNorm(n_embed)


  def forward(self, x):
    # Both sub-layers are applied to a layer-normalized input and added back residually.
    x = x + self.sa_head(self.ln1(x))
    x = x + self.ffn(self.ln2(x))

    return x

class MambaAudioModel(nn.Module):
  """Next-token model over audio tokens: token + position embeddings, stacked Blocks, linear head.

  ``forward(idx, targets)`` takes integer token ids of shape (B, T) with
  ``T <= block_size`` and returns ``(logits, loss)``; ``logits`` has shape
  (B, T, vocab_size) and ``loss`` is the cross-entropy against ``targets``
  (same shape as ``idx``), or ``None`` when no targets are given.
  """
  def __init__(self,vocab_size):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size,n_embed)
    self.position_embedding_table = nn.Embedding(block_size,n_embed)
    self.lm_head = nn.Linear(n_embed,vocab_size)
    # NOTE(review): not used in forward(); kept so the parameter set / state_dict keys of
    # already stored checkpoints stay unchanged.
    self.ffn = FeedForward(n_embed)
    print("layers", n_layers)
    self.blocks = nn.Sequential(*[Block(n_embed,n_heads=n_heads) for _ in range(n_layers)])


  def forward(self, idx, targets=None):
    # idx = idx[:,-block_size:]
    B,T = idx.shape
    if T > block_size:
      # The position table only has block_size rows; fail with a readable message
      # instead of an out-of-range embedding lookup.
      raise ValueError(f"sequence length {T} exceeds block_size {block_size}")
    tok_emb = self.token_embedding_table(idx) # (B,T, C_e)
    # Build the positions on the same device as the input rather than on the global `device`.
    pos_emb = self.position_embedding_table(torch.arange(T,device=idx.device)) # (T, C_e)
    x = tok_emb + pos_emb # (B,T, C_e)
    x = self.blocks(x) # (B,T, C_e)
    logits = self.lm_head(x) # (B,T,vocab_size)
    if targets is None:
      return logits, None
    B,T,C = logits.shape
    # cross_entropy expects (N, C) logits and (N,) targets; reshape also accepts non-contiguous targets.
    loss = F.cross_entropy(logits.view(B*T,C), targets.reshape(B*T))
    return logits, loss


def estimate_test_loss(model, dataset):
  """Return the mean next-token loss of ``model`` over ``dataset['test']['tokens']``.

  Each entry is indexed as (batch, sequence); the model is fed all tokens but
  the last and scored against all tokens but the first. The model is switched
  to eval mode for the duration of the call and put back into whatever mode it
  was in before, even if evaluation raises.

  Raises:
    ValueError: if the test split contains no examples.
  """
  was_training = model.training
  model.eval()
  test_losses = []
  try:
    with torch.no_grad():
      for tokens in tqdm(dataset['test']['tokens']):
        tokens = tokens.to(device)
        x = tokens[:,:-1].contiguous()
        y = tokens[:,1:].contiguous()
        _, loss = model(x, y)
        test_losses.append(loss)
  finally:
    # Never leave the model stuck in eval mode (dropout disabled) after an
    # interrupted or failed evaluation.
    model.train(was_training)
  if not test_losses:
    raise ValueError("cannot estimate test loss: the test split is empty")
  return sum(test_losses)/len(test_losses)

# Install Determined and verify

In [32]:
!pip install --upgrade determined
!pip install tensorboard

[0m

In [33]:
import determined as det
!det --version

det 0.27.1


# Train model using Detached Mode (report metrics and store checkpoints in Determined)

In [79]:
import torchinfo
from tqdm.notebook import tqdm
import random
from determined.experimental import core_v2


# Initialize the Determined core context; the training loop below reports metrics and
# stores checkpoints through core_v2.

core_v2.init(
    defaults=core_v2.DefaultConfig(
        # Name given to this run.
        name="alice_in_wonderland_4quantizers",
        # Checkpoint location defined in the first configuration cell.
        checkpoint_storage=CHECKPOINT_STORAGE,
    ),
    
    # No unmanaged-mode settings are overridden.
    unmanaged=core_v2.UnmanagedConfig(
    ),
)

    
# initialize model 
model = MambaAudioModel(vocab_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(),lr=lr)


# load checkpoint helper function
def load_state(checkpoint_directory, trial_id):
    """Load what was stored in a checkpoint directory written by the training loop.

    Args:
        checkpoint_directory: directory containing ``checkpoint.pt`` and ``state``.
        trial_id: id of the current trial.

    Returns:
        ``(model, epochs_completed)`` where ``model`` is whatever object was saved
        to ``checkpoint.pt`` (the training loop saves ``model.state_dict()``) and
        ``epochs_completed`` is the first field of the ``state`` file, or 0 when
        the checkpoint was written by a different trial.
    """
    import pathlib  # local import: pathlib is not imported anywhere else in this notebook

    checkpoint_directory = pathlib.Path(checkpoint_directory)

    # NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted location.
    with checkpoint_directory.joinpath("checkpoint.pt").open("rb") as f:
        model = torch.load(f)
    # The "state" file holds "<epoch>,<trial_id>".
    with checkpoint_directory.joinpath("state").open("r") as f:
        epochs_completed, ckpt_trial_id = [int(field) for field in f.read().split(",")]

    # Progress recorded by another trial does not count for this one.
    if ckpt_trial_id != trial_id:
        epochs_completed = 0

    return model, epochs_completed


initial_i = 0

# print model + put in train mode
print(torchinfo.summary(model))
model.train()


def _store_checkpoint(steps_completed, epoch_index):
    """Store the model weights plus a "state" file ("<epoch>,<trial_id>") as a Determined checkpoint."""
    with core_v2.checkpoint.store_path({"steps_completed": steps_completed}) as (path, storage_id):
        torch.save(model.state_dict(), path / "checkpoint.pt")
        with path.joinpath("state").open("w") as f:
            f.write(f"{epoch_index},{core_v2.info.trial.trial_id}")


# training loop
ind = initial_i  # global step counter; only advanced for batches that are actually trained on
epoch = 0

try:
    for epoch in tqdm(range(epochs)):
        for batch in tqdm(train_dataloader):

            # Skip examples whose token sequence does not have the expected length.
            if batch['tokens'].shape[-1] != block_size:
                continue
            # Local name on purpose: do not overwrite the notebook-level `batch_size`.
            current_batch_size = batch['tokens'].shape[0]
            tokens = batch['tokens'].to(device).reshape(current_batch_size,block_size)

            # Next-token prediction: inputs are all tokens but the last, targets all but the first.
            x = tokens[:,:-1].contiguous()
            y = tokens[:,1:].contiguous()
            logits, loss = model(x,y)

            if ind % 5 == 0:
                print(loss)
                core_v2.train.report_training_metrics(steps_completed=ind, metrics={"loss": loss.item()})

            if ind % 10 == 0:
                tl = estimate_test_loss(model, audio_dataset)
                print("testloss", tl)
                core_v2.train.report_validation_metrics(steps_completed=ind, metrics={"loss": tl.item()})

            if ind % 100 == 0:
                # Record the CURRENT epoch (previously the total `epochs` was written here,
                # which disagreed with the final checkpoint's state file).
                _store_checkpoint(ind, epoch)

            optimizer.zero_grad(set_to_none=True)
            loss.backward()

            # Prevents gradient explosion.
            torch.nn.utils.clip_grad.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            ind += 1

    # Final checkpoint after the last epoch.
    _store_checkpoint(ind, epoch)
finally:
    # Close the core context even when training is interrupted or fails.
    core_v2.close()
print(torchinfo.summary(model))

TensorFlow writer not found


layers 6
Layer (type:depth-idx)                   Param #
MambaAudioModel                          --
├─Embedding: 1-1                         393,216
├─Embedding: 1-2                         768,000
├─Linear: 1-3                            394,240
├─FeedForward: 1-4                       --
│    └─Sequential: 2-1                   --
│    │    └─Linear: 3-1                  591,360
│    │    └─ReLU: 3-2                    --
│    │    └─Linear: 3-3                  590,208
│    │    └─Dropout: 3-4                 --
├─Sequential: 1-5                        --
│    └─Block: 2-2                        --
│    │    └─Mamba: 3-5                   481,920
│    │    └─FeedForward: 3-6             1,181,568
│    │    └─LayerNorm: 3-7               768
│    │    └─LayerNorm: 3-8               768
│    └─Block: 2-3                        --
│    │    └─Mamba: 3-9                   481,920
│    │    └─FeedForward: 3-10            1,181,568
│    │    └─LayerNorm: 3-11              768
│    │    

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(7.2986, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(7.2858, device='cuda:0')
tensor(6.7379, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(6.5748, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(6.2993, device='cuda:0')
tensor(5.3363, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(6.2164, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(5.8910, device='cuda:0')
tensor(5.8314, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(5.0076, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(5.4353, device='cuda:0')
tensor(5.0022, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(5.2857, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(5.1449, device='cuda:0')
tensor(5.1319, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.8760, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(4.9760, device='cuda:0')
tensor(4.9891, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.7021, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(4.7828, device='cuda:0')
tensor(4.3998, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.5754, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(4.6843, device='cuda:0')
tensor(4.3927, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(5.1889, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(4.5606, device='cuda:0')
tensor(5.2044, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.4545, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(4.4689, device='cuda:0')
tensor(4.1986, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.1146, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(4.4639, device='cuda:0')
tensor(4.9506, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0722, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(4.2847, device='cuda:0')
tensor(3.9095, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.0261, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(4.1974, device='cuda:0')
tensor(4.6598, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.5859, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(4.1048, device='cuda:0')
tensor(3.8586, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.1287, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.9996, device='cuda:0')
tensor(4.2599, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.4755, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.9409, device='cuda:0')
tensor(3.5780, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.0726, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.8782, device='cuda:0')
tensor(4.2188, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.7676, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.8152, device='cuda:0')
tensor(3.5904, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.7937, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7389, device='cuda:0')
tensor(3.2008, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.6779, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6805, device='cuda:0')
tensor(2.9183, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.5113, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6445, device='cuda:0')
tensor(3.2724, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.8603, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5991, device='cuda:0')
tensor(3.2121, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.9374, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5469, device='cuda:0')
tensor(3.2466, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.4411, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4896, device='cuda:0')
tensor(3.2253, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.6780, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4791, device='cuda:0')
tensor(3.0839, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3938, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4353, device='cuda:0')
tensor(1.5862, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.5146, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4337, device='cuda:0')
tensor(3.6822, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6629, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3843, device='cuda:0')
tensor(3.7196, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.5866, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3732, device='cuda:0')
tensor(3.3264, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.5585, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3324, device='cuda:0')
tensor(2.7838, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.2712, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3118, device='cuda:0')
tensor(2.9898, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.1201, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2764, device='cuda:0')
tensor(3.0782, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8486, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2527, device='cuda:0')
tensor(3.2457, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7744, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2382, device='cuda:0')
tensor(2.9938, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.9910, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2158, device='cuda:0')
tensor(2.7235, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.9533, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1930, device='cuda:0')
tensor(3.7606, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.3381, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1779, device='cuda:0')
tensor(2.5923, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0085, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2003, device='cuda:0')
tensor(3.3542, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.3553, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1660, device='cuda:0')
tensor(3.1456, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.3945, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1547, device='cuda:0')
tensor(3.3309, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.3188, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1391, device='cuda:0')
tensor(3.2282, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2469, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1105, device='cuda:0')
tensor(3.0550, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4277, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1153, device='cuda:0')
tensor(2.8926, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8402, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1159, device='cuda:0')
tensor(3.0617, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.4066, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1078, device='cuda:0')
tensor(3.0742, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.5693, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0782, device='cuda:0')
tensor(2.7254, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.4071, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0632, device='cuda:0')
tensor(2.6479, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.4263, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0598, device='cuda:0')
tensor(2.9946, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7098, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0665, device='cuda:0')
tensor(3.0513, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.4152, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0379, device='cuda:0')
tensor(3.1880, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.1919, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0150, device='cuda:0')
tensor(2.8708, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0372, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0243, device='cuda:0')
tensor(3.1702, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0658, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0065, device='cuda:0')
tensor(2.3308, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8953, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0073, device='cuda:0')
tensor(2.7149, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.2543, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0104, device='cuda:0')
tensor(2.8761, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.3995, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9780, device='cuda:0')
tensor(2.5008, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.9610, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9814, device='cuda:0')
tensor(3.0466, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.4266, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9745, device='cuda:0')
tensor(2.8261, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0447, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9588, device='cuda:0')
tensor(3.1193, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.2184, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9623, device='cuda:0')
tensor(2.3600, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0309, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9470, device='cuda:0')
tensor(3.3836, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8123, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9424, device='cuda:0')
tensor(3.3810, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4072, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9277, device='cuda:0')
tensor(2.7320, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8196, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9335, device='cuda:0')
tensor(2.9218, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7359, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9137, device='cuda:0')
tensor(2.7227, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.9885, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9188, device='cuda:0')
tensor(3.0398, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0033, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9004, device='cuda:0')
tensor(2.6978, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0727, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8951, device='cuda:0')
tensor(3.0650, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5217, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8915, device='cuda:0')
tensor(3.4170, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4499, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8815, device='cuda:0')
tensor(2.9015, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.9205, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8701, device='cuda:0')
tensor(2.9477, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.2681, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8658, device='cuda:0')
tensor(3.1524, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.2831, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8728, device='cuda:0')
tensor(2.6090, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6287, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8749, device='cuda:0')
tensor(2.6555, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6285, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8791, device='cuda:0')
tensor(2.9917, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7862, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8584, device='cuda:0')
tensor(2.8095, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.3612, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8562, device='cuda:0')
tensor(2.2607, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0542, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8613, device='cuda:0')
tensor(2.4588, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7085, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8547, device='cuda:0')
tensor(3.0418, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7362, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8504, device='cuda:0')
tensor(2.9488, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0784, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8607, device='cuda:0')
tensor(2.7233, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2922, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8688, device='cuda:0')
tensor(2.9881, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.9788, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8428, device='cuda:0')
tensor(2.6309, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.1608, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8193, device='cuda:0')
tensor(2.8494, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8044, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8174, device='cuda:0')
tensor(3.2969, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6611, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8117, device='cuda:0')
tensor(2.5514, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8704, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8016, device='cuda:0')
tensor(2.3701, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6329, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7914, device='cuda:0')
tensor(2.8593, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.1469, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7917, device='cuda:0')
tensor(2.5525, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0430, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7913, device='cuda:0')
tensor(2.8433, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.9033, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8006, device='cuda:0')
tensor(3.2728, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2884, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7793, device='cuda:0')
tensor(3.3093, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5302, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7936, device='cuda:0')
tensor(2.6298, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8197, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7716, device='cuda:0')
tensor(2.9908, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3167, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7583, device='cuda:0')
tensor(2.6506, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6271, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7801, device='cuda:0')
tensor(3.3621, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4206, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7800, device='cuda:0')
tensor(2.9953, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.9811, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7767, device='cuda:0')
tensor(3.1018, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.9057, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7611, device='cuda:0')
tensor(3.1307, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0418, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7585, device='cuda:0')
tensor(2.8198, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7096, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7476, device='cuda:0')
tensor(2.9408, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.2603, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7329, device='cuda:0')
tensor(2.5817, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6718, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7442, device='cuda:0')
tensor(2.6600, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4748, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7530, device='cuda:0')
tensor(2.9703, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3145, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7548, device='cuda:0')
tensor(2.9281, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7801, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7383, device='cuda:0')
tensor(2.5486, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5482, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7312, device='cuda:0')
tensor(2.9450, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4889, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7211, device='cuda:0')
tensor(2.8560, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7899, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7367, device='cuda:0')
tensor(2.9267, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4006, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7344, device='cuda:0')
tensor(2.8355, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0632, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7337, device='cuda:0')
tensor(2.4638, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8467, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7170, device='cuda:0')
tensor(1.0506, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7457, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7248, device='cuda:0')
tensor(2.6397, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6822, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7184, device='cuda:0')
tensor(2.9881, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5854, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7036, device='cuda:0')
tensor(2.2636, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5860, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7380, device='cuda:0')
tensor(3.0196, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7824, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7129, device='cuda:0')
tensor(2.7934, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7248, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7042, device='cuda:0')
tensor(2.8695, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.9473, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6890, device='cuda:0')
tensor(2.7487, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8151, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7055, device='cuda:0')
tensor(2.5325, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3702, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6983, device='cuda:0')
tensor(2.7424, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6813, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6880, device='cuda:0')
tensor(3.1596, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7909, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6810, device='cuda:0')
tensor(2.6985, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4362, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6827, device='cuda:0')
tensor(2.7311, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0275, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6828, device='cuda:0')
tensor(2.4759, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3909, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6893, device='cuda:0')
tensor(2.5883, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4568, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6916, device='cuda:0')
tensor(2.8387, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6944, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6673, device='cuda:0')
tensor(2.4555, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0566, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6658, device='cuda:0')
tensor(3.1453, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5191, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6632, device='cuda:0')
tensor(2.7621, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6260, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6627, device='cuda:0')
tensor(2.3000, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8543, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6493, device='cuda:0')
tensor(2.5550, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.9823, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6584, device='cuda:0')
tensor(2.8075, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7291, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6616, device='cuda:0')
tensor(2.8152, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.1099, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6606, device='cuda:0')
tensor(2.7364, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1653, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6597, device='cuda:0')
tensor(2.5132, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4003, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6571, device='cuda:0')
tensor(2.4375, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8164, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6608, device='cuda:0')
tensor(2.0784, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6601, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6799, device='cuda:0')
tensor(3.0343, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0141, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6748, device='cuda:0')
tensor(2.4040, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6370, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6583, device='cuda:0')
tensor(3.0777, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4868, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6616, device='cuda:0')
tensor(2.2829, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5725, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6494, device='cuda:0')
tensor(2.9053, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8940, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6517, device='cuda:0')
tensor(2.7443, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8525, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6392, device='cuda:0')
tensor(2.5078, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5909, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6350, device='cuda:0')
tensor(2.4731, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7274, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6313, device='cuda:0')
tensor(2.4701, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.2169, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6334, device='cuda:0')
tensor(2.2926, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(2.6151, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6234, device='cuda:0')
tensor(2.3776, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5715, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6209, device='cuda:0')
tensor(2.4018, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4188, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6244, device='cuda:0')
tensor(2.3613, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.1487, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6611, device='cuda:0')
tensor(2.3633, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2986, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6976, device='cuda:0')
tensor(2.3590, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6730, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6436, device='cuda:0')
tensor(1.5965, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7927, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6579, device='cuda:0')
tensor(2.1254, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8100, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6908, device='cuda:0')
tensor(2.8089, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.9026, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6492, device='cuda:0')
tensor(2.6050, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5227, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6521, device='cuda:0')
tensor(2.6228, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2997, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6364, device='cuda:0')
tensor(2.7147, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2066, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6320, device='cuda:0')
tensor(2.2136, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5420, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6243, device='cuda:0')
tensor(2.4613, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2346, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6225, device='cuda:0')
tensor(2.2040, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3819, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6357, device='cuda:0')
tensor(2.3257, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5653, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6296, device='cuda:0')
tensor(2.2092, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3431, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6245, device='cuda:0')
tensor(2.2550, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0664, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6320, device='cuda:0')
tensor(1.9558, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4810, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6142, device='cuda:0')
tensor(2.6404, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6106, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6283, device='cuda:0')
tensor(2.1021, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5737, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6273, device='cuda:0')
tensor(2.5545, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2108, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6301, device='cuda:0')
tensor(2.6578, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9788, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6384, device='cuda:0')
tensor(2.6211, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5520, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6274, device='cuda:0')
tensor(2.4969, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4083, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6304, device='cuda:0')
tensor(2.3907, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9920, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6327, device='cuda:0')
tensor(2.5961, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4547, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6128, device='cuda:0')
tensor(2.9832, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5439, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6125, device='cuda:0')
tensor(2.7207, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7451, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6207, device='cuda:0')
tensor(2.5703, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6911, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6151, device='cuda:0')
tensor(2.6728, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4091, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6218, device='cuda:0')
tensor(2.5614, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3989, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6193, device='cuda:0')
tensor(2.2498, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4025, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6200, device='cuda:0')
tensor(2.3899, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3422, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6155, device='cuda:0')
tensor(2.1858, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1287, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6239, device='cuda:0')
tensor(2.5365, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6907, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6064, device='cuda:0')
tensor(2.8886, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8612, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6008, device='cuda:0')
tensor(1.9591, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8894, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6036, device='cuda:0')
tensor(2.7551, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5323, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6051, device='cuda:0')
tensor(2.4864, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2569, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6067, device='cuda:0')
tensor(1.9127, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2051, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6072, device='cuda:0')
tensor(2.2791, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2269, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6210, device='cuda:0')
tensor(2.5566, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9971, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6130, device='cuda:0')
tensor(2.1395, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6977, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6060, device='cuda:0')
tensor(2.6421, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5815, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5963, device='cuda:0')
tensor(2.6776, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3375, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5834, device='cuda:0')
tensor(2.3779, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5336, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5926, device='cuda:0')
tensor(2.7423, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2616, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5884, device='cuda:0')
tensor(2.5873, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8842, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6058, device='cuda:0')
tensor(2.1920, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6642, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6082, device='cuda:0')
tensor(2.3549, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1928, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6050, device='cuda:0')
tensor(2.4950, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8154, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5878, device='cuda:0')
tensor(2.4082, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2909, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5956, device='cuda:0')
tensor(2.5398, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4021, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5923, device='cuda:0')
tensor(2.5152, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0125, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5832, device='cuda:0')
tensor(2.1724, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5213, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5928, device='cuda:0')
tensor(2.5796, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7547, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5962, device='cuda:0')
tensor(2.8113, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7851, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5844, device='cuda:0')
tensor(2.8424, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5445, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5787, device='cuda:0')
tensor(2.2688, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1146, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5762, device='cuda:0')
tensor(2.2570, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4133, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5801, device='cuda:0')
tensor(2.4786, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.9541, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5724, device='cuda:0')
tensor(2.7678, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7993, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5797, device='cuda:0')
tensor(2.0466, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.9459, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5752, device='cuda:0')
tensor(2.5450, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6142, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5834, device='cuda:0')
tensor(2.1647, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2436, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5699, device='cuda:0')
tensor(2.4626, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5190, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5775, device='cuda:0')
tensor(2.4766, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3578, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5714, device='cuda:0')
tensor(2.3785, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8587, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5723, device='cuda:0')
tensor(2.5372, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4788, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5723, device='cuda:0')
tensor(2.6101, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9960, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5802, device='cuda:0')
tensor(2.2140, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6430, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5724, device='cuda:0')
tensor(2.2715, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6469, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5702, device='cuda:0')
tensor(2.5020, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3082, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5624, device='cuda:0')
tensor(2.5791, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7624, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5744, device='cuda:0')
tensor(2.3388, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7526, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5738, device='cuda:0')
tensor(2.6032, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5326, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5590, device='cuda:0')
tensor(2.6178, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6105, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5587, device='cuda:0')
tensor(2.4905, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3623, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5711, device='cuda:0')
tensor(2.5238, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4196, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5631, device='cuda:0')
tensor(2.8319, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6448, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5534, device='cuda:0')
tensor(2.4116, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1871, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5709, device='cuda:0')
tensor(2.1105, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2623, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5600, device='cuda:0')
tensor(2.3878, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5272, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5445, device='cuda:0')
tensor(2.4357, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5626, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5472, device='cuda:0')
tensor(2.1871, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8732, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5508, device='cuda:0')
tensor(2.3930, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2592, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5518, device='cuda:0')
tensor(1.9974, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2160, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5643, device='cuda:0')
tensor(2.7212, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6208, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5379, device='cuda:0')
tensor(1.9966, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6450, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5388, device='cuda:0')
tensor(2.5459, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4394, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5403, device='cuda:0')
tensor(2.6128, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5181, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5340, device='cuda:0')
tensor(2.5005, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5104, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5294, device='cuda:0')
tensor(2.4125, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2495, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5351, device='cuda:0')
tensor(3.0719, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4068, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5498, device='cuda:0')
tensor(2.0505, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2712, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5511, device='cuda:0')
tensor(2.8540, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7551, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5492, device='cuda:0')
tensor(2.6818, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8702, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5386, device='cuda:0')
tensor(2.6659, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1269, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5326, device='cuda:0')
tensor(2.7237, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3991, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5325, device='cuda:0')
tensor(2.3502, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5809, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5420, device='cuda:0')
tensor(2.4383, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4891, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5408, device='cuda:0')
tensor(2.1697, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.0435, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5483, device='cuda:0')
tensor(2.1851, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4778, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5447, device='cuda:0')
tensor(2.9362, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4374, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5294, device='cuda:0')
tensor(2.2073, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2958, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5338, device='cuda:0')
tensor(2.2069, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3806, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5262, device='cuda:0')
tensor(2.3792, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8010, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5254, device='cuda:0')
tensor(2.4736, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8523, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5288, device='cuda:0')
tensor(2.5240, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8167, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5234, device='cuda:0')
tensor(2.7046, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3985, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5115, device='cuda:0')
tensor(2.4468, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1958, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5157, device='cuda:0')
tensor(2.4683, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5733, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5225, device='cuda:0')
tensor(2.4806, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5186, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5068, device='cuda:0')
tensor(2.1632, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6155, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5170, device='cuda:0')
tensor(2.3650, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1960, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5134, device='cuda:0')
tensor(2.7623, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3500, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5275, device='cuda:0')
tensor(2.2889, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3094, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5158, device='cuda:0')
tensor(2.4669, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2379, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5315, device='cuda:0')
tensor(1.5919, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5571, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5114, device='cuda:0')
tensor(2.2606, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1641, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5176, device='cuda:0')
tensor(2.0324, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3804, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5140, device='cuda:0')
tensor(2.2663, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9834, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5100, device='cuda:0')
tensor(2.6039, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1311, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5216, device='cuda:0')
tensor(2.4065, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2058, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5201, device='cuda:0')
tensor(2.8211, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4647, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5131, device='cuda:0')
tensor(2.5853, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3775, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5049, device='cuda:0')
tensor(2.4017, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7786, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5076, device='cuda:0')
tensor(2.6461, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7289, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4969, device='cuda:0')
tensor(2.1351, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3883, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5048, device='cuda:0')
tensor(2.4891, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1485, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5084, device='cuda:0')
tensor(2.6522, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3659, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4982, device='cuda:0')
tensor(2.0181, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4727, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5062, device='cuda:0')
tensor(2.2254, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1209, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5034, device='cuda:0')
tensor(2.6966, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0892, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4968, device='cuda:0')
tensor(2.7153, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6027, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5002, device='cuda:0')
tensor(2.5503, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4206, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5200, device='cuda:0')
tensor(2.0993, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7926, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5174, device='cuda:0')
tensor(2.5057, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6738, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5259, device='cuda:0')
tensor(2.2983, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4287, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4977, device='cuda:0')
tensor(2.5379, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1078, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5049, device='cuda:0')
tensor(2.8177, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8120, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4959, device='cuda:0')
tensor(2.6301, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7063, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4873, device='cuda:0')
tensor(2.2364, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1013, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4918, device='cuda:0')
tensor(2.4522, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3802, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4877, device='cuda:0')
tensor(2.7400, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5866, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4890, device='cuda:0')
tensor(2.2987, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3660, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4859, device='cuda:0')
tensor(2.8620, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5821, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4858, device='cuda:0')


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(2.0978, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4442, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4830, device='cuda:0')
tensor(1.6399, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4342, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4881, device='cuda:0')
tensor(2.1769, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1317, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4895, device='cuda:0')
tensor(2.1814, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0230, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4869, device='cuda:0')
tensor(2.3434, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2837, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4941, device='cuda:0')
tensor(2.4081, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4052, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4996, device='cuda:0')
tensor(2.3521, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4213, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4986, device='cuda:0')
tensor(2.3731, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4252, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5009, device='cuda:0')
tensor(1.8628, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2392, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5089, device='cuda:0')
tensor(1.9825, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0286, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5064, device='cuda:0')
tensor(1.9701, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0366, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5016, device='cuda:0')
tensor(2.2337, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0203, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5106, device='cuda:0')
tensor(1.7280, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8462, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5201, device='cuda:0')
tensor(2.2261, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2845, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5089, device='cuda:0')
tensor(2.5572, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3240, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5164, device='cuda:0')
tensor(2.1741, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0767, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5181, device='cuda:0')
tensor(1.9122, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2149, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5294, device='cuda:0')
tensor(2.1280, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1765, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5147, device='cuda:0')
tensor(2.4565, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7756, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5074, device='cuda:0')
tensor(2.2676, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0072, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5067, device='cuda:0')
tensor(1.9584, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3794, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5197, device='cuda:0')
tensor(2.0931, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3305, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5229, device='cuda:0')
tensor(2.1616, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8615, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5103, device='cuda:0')
tensor(1.9529, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8301, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5249, device='cuda:0')
tensor(2.5321, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2648, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5337, device='cuda:0')
tensor(2.5257, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4765, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5360, device='cuda:0')
tensor(2.3673, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3739, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5193, device='cuda:0')
tensor(1.8850, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2390, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5278, device='cuda:0')
tensor(2.5822, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1976, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5417, device='cuda:0')
tensor(2.2688, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3728, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5238, device='cuda:0')
tensor(2.3675, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0111, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5180, device='cuda:0')
tensor(2.1050, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1822, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5169, device='cuda:0')
tensor(2.2176, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4320, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5185, device='cuda:0')
tensor(1.9900, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7373, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5177, device='cuda:0')
tensor(2.0295, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7044, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5183, device='cuda:0')
tensor(2.4666, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9805, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5247, device='cuda:0')
tensor(2.2717, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3174, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5101, device='cuda:0')
tensor(1.9138, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8695, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5275, device='cuda:0')
tensor(2.2103, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2646, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5211, device='cuda:0')
tensor(2.4400, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3848, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5124, device='cuda:0')
tensor(2.5743, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3171, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5143, device='cuda:0')
tensor(2.1017, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4874, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5111, device='cuda:0')
tensor(2.4026, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2840, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5192, device='cuda:0')
tensor(1.8876, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2780, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5098, device='cuda:0')
tensor(2.0695, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1688, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5123, device='cuda:0')
tensor(2.0851, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7542, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5048, device='cuda:0')
tensor(2.4123, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2432, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5107, device='cuda:0')
tensor(2.1317, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1652, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5186, device='cuda:0')
tensor(2.1185, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8830, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5051, device='cuda:0')
tensor(2.0112, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2675, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5070, device='cuda:0')
tensor(1.9745, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1241, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5079, device='cuda:0')
tensor(2.3773, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5633, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5188, device='cuda:0')
tensor(2.2668, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1123, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5115, device='cuda:0')
tensor(2.1971, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5792, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5103, device='cuda:0')
tensor(2.4739, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5612, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5147, device='cuda:0')
tensor(2.5501, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4313, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5021, device='cuda:0')
tensor(2.5355, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2334, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5126, device='cuda:0')
tensor(2.4688, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5839, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4979, device='cuda:0')
tensor(2.0475, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5235, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4978, device='cuda:0')
tensor(2.3231, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7419, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5004, device='cuda:0')
tensor(1.9651, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4811, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5031, device='cuda:0')
tensor(2.1155, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1920, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5069, device='cuda:0')
tensor(2.1166, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4619, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5094, device='cuda:0')
tensor(2.6063, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1256, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4996, device='cuda:0')
tensor(1.9522, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5091, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5078, device='cuda:0')
tensor(1.9942, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3465, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5028, device='cuda:0')
tensor(2.5365, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0257, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4984, device='cuda:0')
tensor(1.8864, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0760, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5120, device='cuda:0')
tensor(2.7496, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3855, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4970, device='cuda:0')
tensor(2.0098, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4236, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4994, device='cuda:0')
tensor(2.3658, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3827, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5040, device='cuda:0')
tensor(2.0433, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2710, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5023, device='cuda:0')
tensor(2.2186, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3048, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4947, device='cuda:0')
tensor(2.5423, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3339, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4909, device='cuda:0')
tensor(2.0427, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9925, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4885, device='cuda:0')
tensor(2.1332, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9319, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4922, device='cuda:0')
tensor(1.9752, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9945, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4933, device='cuda:0')
tensor(2.0821, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0902, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4905, device='cuda:0')
tensor(2.5199, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9144, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4894, device='cuda:0')
tensor(2.3426, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3392, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4926, device='cuda:0')
tensor(2.3341, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1644, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4852, device='cuda:0')
tensor(2.7210, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1110, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4927, device='cuda:0')
tensor(1.8398, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2083, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4906, device='cuda:0')
tensor(2.4018, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5132, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4904, device='cuda:0')
tensor(2.0872, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2285, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4864, device='cuda:0')
tensor(2.6400, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2102, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4894, device='cuda:0')
tensor(2.4570, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9573, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4817, device='cuda:0')
tensor(1.6570, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9533, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4998, device='cuda:0')
tensor(2.5027, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8700, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4845, device='cuda:0')
tensor(1.9655, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2138, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4912, device='cuda:0')
tensor(1.0938, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1796, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4890, device='cuda:0')
tensor(2.1181, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3801, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4848, device='cuda:0')
tensor(2.3915, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5719, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4924, device='cuda:0')
tensor(2.1869, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1884, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4811, device='cuda:0')
tensor(2.0527, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0774, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4863, device='cuda:0')
tensor(1.4291, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2728, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4802, device='cuda:0')
tensor(2.3787, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5793, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4679, device='cuda:0')
tensor(2.0879, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3078, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4799, device='cuda:0')
tensor(1.0946, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7109, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4790, device='cuda:0')
tensor(2.1878, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5474, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4795, device='cuda:0')
tensor(2.7830, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4292, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4803, device='cuda:0')
tensor(2.3271, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7149, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4766, device='cuda:0')
tensor(2.3787, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8713, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4741, device='cuda:0')
tensor(2.2282, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.6373, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4802, device='cuda:0')
tensor(2.2270, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0679, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4829, device='cuda:0')
tensor(2.1043, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5234, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4828, device='cuda:0')
tensor(2.3786, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1590, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4750, device='cuda:0')
tensor(2.2406, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9743, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4739, device='cuda:0')
tensor(2.1989, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3857, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4809, device='cuda:0')
tensor(0.8252, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2227, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4851, device='cuda:0')
tensor(2.2815, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4536, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4612, device='cuda:0')
tensor(2.0020, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0058, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4602, device='cuda:0')
tensor(1.9972, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0852, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4649, device='cuda:0')
tensor(2.1802, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3277, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4675, device='cuda:0')
tensor(2.5596, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4657, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4605, device='cuda:0')
tensor(2.0690, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5064, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4630, device='cuda:0')
tensor(2.0311, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4430, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4641, device='cuda:0')
tensor(2.0623, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0650, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4605, device='cuda:0')
tensor(2.0546, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.7000, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4735, device='cuda:0')
tensor(2.3526, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4049, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4591, device='cuda:0')
tensor(2.0611, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1986, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4629, device='cuda:0')
tensor(2.5375, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0383, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4697, device='cuda:0')
tensor(2.1093, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2616, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4669, device='cuda:0')
tensor(2.2267, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3592, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4579, device='cuda:0')
tensor(2.3035, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0796, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4676, device='cuda:0')
tensor(1.9041, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1993, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4659, device='cuda:0')
tensor(2.3250, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2692, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4638, device='cuda:0')
tensor(2.5797, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1103, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4581, device='cuda:0')
tensor(1.8860, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0727, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4612, device='cuda:0')
tensor(2.1303, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3041, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4549, device='cuda:0')
tensor(2.3586, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2504, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4606, device='cuda:0')
tensor(1.8313, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0719, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4551, device='cuda:0')
tensor(2.1813, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5422, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4591, device='cuda:0')
tensor(2.6367, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4621, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4540, device='cuda:0')
tensor(2.1020, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2044, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4504, device='cuda:0')
tensor(2.4505, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0709, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4502, device='cuda:0')
tensor(2.3578, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3839, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4428, device='cuda:0')
tensor(1.9943, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5760, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4532, device='cuda:0')
tensor(2.1700, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9581, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4491, device='cuda:0')
tensor(2.3733, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0446, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4460, device='cuda:0')
tensor(2.1163, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8376, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4491, device='cuda:0')
tensor(2.4663, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0326, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4423, device='cuda:0')
tensor(2.3333, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3788, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4457, device='cuda:0')
tensor(2.3161, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0305, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4458, device='cuda:0')
tensor(2.3991, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5425, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4457, device='cuda:0')
tensor(2.2820, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9294, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4511, device='cuda:0')
tensor(1.8312, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0163, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4565, device='cuda:0')
tensor(2.4760, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1627, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4526, device='cuda:0')


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(1.4949, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1524, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4627, device='cuda:0')
tensor(2.0772, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9511, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4608, device='cuda:0')
tensor(1.7418, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7767, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4736, device='cuda:0')
tensor(2.2468, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2395, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4697, device='cuda:0')
tensor(1.9666, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1157, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4719, device='cuda:0')
tensor(1.8663, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8097, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4914, device='cuda:0')
tensor(2.2389, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1066, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4816, device='cuda:0')
tensor(2.4242, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0410, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4944, device='cuda:0')
tensor(2.1341, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3273, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5098, device='cuda:0')
tensor(2.2152, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6041, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4855, device='cuda:0')
tensor(2.0013, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1131, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5044, device='cuda:0')
tensor(1.7820, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9619, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5018, device='cuda:0')
tensor(2.3429, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1422, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4970, device='cuda:0')
tensor(2.0308, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1490, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4970, device='cuda:0')
tensor(1.5905, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8080, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5034, device='cuda:0')
tensor(1.7612, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9637, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5020, device='cuda:0')
tensor(2.0456, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3066, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4997, device='cuda:0')
tensor(1.6577, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8673, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5061, device='cuda:0')
tensor(1.6864, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8700, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4970, device='cuda:0')
tensor(1.8199, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5718, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4995, device='cuda:0')
tensor(2.1006, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5537, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5171, device='cuda:0')
tensor(2.0274, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0074, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5037, device='cuda:0')
tensor(1.5345, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5018, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5106, device='cuda:0')
tensor(2.1881, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8315, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5026, device='cuda:0')
tensor(2.2764, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9544, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5144, device='cuda:0')
tensor(1.9156, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2722, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5067, device='cuda:0')
tensor(2.1074, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1297, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5073, device='cuda:0')
tensor(2.1144, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9360, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5065, device='cuda:0')
tensor(1.8338, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5715, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5082, device='cuda:0')
tensor(2.4404, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9243, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5151, device='cuda:0')
tensor(1.9548, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8561, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5172, device='cuda:0')
tensor(1.8024, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2818, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5142, device='cuda:0')
tensor(2.1374, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0274, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5135, device='cuda:0')
tensor(1.8048, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3509, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5148, device='cuda:0')
tensor(1.8047, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0224, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5081, device='cuda:0')
tensor(1.8475, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2424, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5149, device='cuda:0')
tensor(2.2586, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1223, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5128, device='cuda:0')
tensor(2.1634, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8787, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5188, device='cuda:0')
tensor(2.3623, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9340, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5145, device='cuda:0')
tensor(2.2556, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6627, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5185, device='cuda:0')
tensor(2.2615, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7874, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5114, device='cuda:0')
tensor(1.7736, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9379, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5098, device='cuda:0')
tensor(1.8402, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8089, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5081, device='cuda:0')
tensor(1.7554, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0058, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5102, device='cuda:0')
tensor(1.8870, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0664, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5133, device='cuda:0')
tensor(0.9976, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0339, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5143, device='cuda:0')
tensor(2.3889, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1657, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5096, device='cuda:0')
tensor(2.0132, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8643, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5170, device='cuda:0')
tensor(1.7349, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7870, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5169, device='cuda:0')
tensor(1.8727, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9268, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5184, device='cuda:0')
tensor(1.8238, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5669, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5186, device='cuda:0')
tensor(1.8553, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2651, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5130, device='cuda:0')
tensor(2.2631, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4920, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5214, device='cuda:0')
tensor(1.9336, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1030, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5164, device='cuda:0')
tensor(1.8246, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0328, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5179, device='cuda:0')
tensor(2.0972, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0640, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5026, device='cuda:0')
tensor(2.4136, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0083, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5117, device='cuda:0')
tensor(1.9064, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2555, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5118, device='cuda:0')
tensor(2.2013, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7214, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5157, device='cuda:0')
tensor(1.8606, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2656, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5086, device='cuda:0')
tensor(2.3426, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0410, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4985, device='cuda:0')
tensor(2.4266, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2500, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5079, device='cuda:0')
tensor(2.1961, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2606, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4996, device='cuda:0')
tensor(2.0353, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0717, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5019, device='cuda:0')
tensor(1.9386, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0785, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4986, device='cuda:0')
tensor(2.0951, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0837, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5076, device='cuda:0')
tensor(2.2325, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2250, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5023, device='cuda:0')
tensor(1.8646, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0721, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4952, device='cuda:0')
tensor(2.0843, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2527, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4950, device='cuda:0')
tensor(2.3381, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1666, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4922, device='cuda:0')
tensor(2.0683, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7145, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4890, device='cuda:0')
tensor(1.5971, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6647, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4913, device='cuda:0')
tensor(2.2331, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2519, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4887, device='cuda:0')
tensor(2.2113, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8070, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4920, device='cuda:0')
tensor(2.0980, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0338, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4896, device='cuda:0')
tensor(1.7362, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0132, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4895, device='cuda:0')
tensor(2.2063, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0436, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4933, device='cuda:0')
tensor(1.8221, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3580, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4942, device='cuda:0')
tensor(2.4482, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2911, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4897, device='cuda:0')
tensor(2.2409, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9906, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4859, device='cuda:0')
tensor(2.0162, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5117, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4881, device='cuda:0')
tensor(2.5286, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4697, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4857, device='cuda:0')
tensor(2.2673, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1890, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4862, device='cuda:0')
tensor(2.1917, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0695, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4877, device='cuda:0')
tensor(1.7999, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2038, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4990, device='cuda:0')
tensor(1.8644, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7845, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4936, device='cuda:0')
tensor(1.7648, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6810, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4872, device='cuda:0')
tensor(2.1679, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9553, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4848, device='cuda:0')
tensor(2.0086, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2041, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4876, device='cuda:0')
tensor(2.1792, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4598, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4872, device='cuda:0')
tensor(2.2542, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3340, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4887, device='cuda:0')
tensor(2.4214, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5326, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4848, device='cuda:0')
tensor(2.1695, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1342, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4910, device='cuda:0')
tensor(2.1552, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7948, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4809, device='cuda:0')
tensor(2.2317, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4167, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4979, device='cuda:0')
tensor(2.0842, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9375, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4820, device='cuda:0')
tensor(2.0684, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0661, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4826, device='cuda:0')
tensor(2.0521, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9828, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4741, device='cuda:0')
tensor(1.9241, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2785, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4802, device='cuda:0')
tensor(1.7559, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2579, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4771, device='cuda:0')
tensor(2.1348, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1823, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4842, device='cuda:0')
tensor(1.9468, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1940, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4868, device='cuda:0')
tensor(2.4661, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1796, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4846, device='cuda:0')
tensor(2.1075, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0222, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4823, device='cuda:0')
tensor(2.5625, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7579, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4833, device='cuda:0')
tensor(2.0121, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9716, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4870, device='cuda:0')
tensor(2.0384, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9085, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4830, device='cuda:0')
tensor(2.2079, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3796, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4817, device='cuda:0')
tensor(1.9295, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1131, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4793, device='cuda:0')
tensor(2.0813, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0154, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4757, device='cuda:0')
tensor(2.2957, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5242, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4695, device='cuda:0')
tensor(2.0965, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9206, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4770, device='cuda:0')
tensor(1.9465, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4369, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4754, device='cuda:0')
tensor(1.9367, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3333, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4715, device='cuda:0')
tensor(2.2910, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2768, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4753, device='cuda:0')
tensor(1.8778, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9974, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4722, device='cuda:0')
tensor(2.1347, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.5635, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4662, device='cuda:0')
tensor(2.2685, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9265, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4806, device='cuda:0')
tensor(1.8431, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8012, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4668, device='cuda:0')
tensor(1.5384, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0742, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4870, device='cuda:0')
tensor(2.2882, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8859, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4677, device='cuda:0')
tensor(2.2166, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9579, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4621, device='cuda:0')
tensor(2.3278, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4898, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4594, device='cuda:0')
tensor(2.2796, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4358, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4611, device='cuda:0')
tensor(2.2402, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9262, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4627, device='cuda:0')
tensor(1.7698, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8367, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4696, device='cuda:0')
tensor(2.3256, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2467, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4601, device='cuda:0')
tensor(2.4940, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2541, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4659, device='cuda:0')
tensor(1.9188, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.8732, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4705, device='cuda:0')
tensor(2.2901, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9774, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4666, device='cuda:0')
tensor(1.9801, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2377, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4642, device='cuda:0')
tensor(2.0294, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1447, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4683, device='cuda:0')
tensor(2.3078, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2985, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4645, device='cuda:0')
tensor(2.1082, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1453, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4596, device='cuda:0')
tensor(2.2266, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2386, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4599, device='cuda:0')
tensor(2.4650, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0278, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4589, device='cuda:0')
tensor(2.2383, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2757, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4690, device='cuda:0')
tensor(2.3004, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2443, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4599, device='cuda:0')
tensor(1.8520, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4249, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4588, device='cuda:0')
tensor(1.8946, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3708, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4524, device='cuda:0')
tensor(2.2630, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1257, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4566, device='cuda:0')
tensor(1.8336, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1904, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4556, device='cuda:0')
tensor(2.3322, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8997, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4567, device='cuda:0')
tensor(2.1945, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3603, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4571, device='cuda:0')
tensor(2.2695, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2671, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4523, device='cuda:0')
tensor(1.8711, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1485, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4517, device='cuda:0')
tensor(2.1652, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7728, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4508, device='cuda:0')
tensor(1.9682, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(1.4105, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4542, device='cuda:0')
tensor(1.9653, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0008, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4668, device='cuda:0')
tensor(1.8432, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8888, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4708, device='cuda:0')
tensor(1.7709, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7411, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4820, device='cuda:0')
tensor(1.7081, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0022, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4896, device='cuda:0')
tensor(1.8959, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4494, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5166, device='cuda:0')
tensor(1.6403, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5017, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5142, device='cuda:0')
tensor(1.5778, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9607, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5171, device='cuda:0')
tensor(1.7395, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8826, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5166, device='cuda:0')
tensor(1.8134, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8699, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5159, device='cuda:0')
tensor(2.0979, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0311, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5166, device='cuda:0')
tensor(2.0501, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8775, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5245, device='cuda:0')
tensor(1.5909, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8817, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5278, device='cuda:0')
tensor(1.5892, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9847, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5290, device='cuda:0')
tensor(1.6793, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8431, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5349, device='cuda:0')
tensor(1.4414, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7716, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5254, device='cuda:0')
tensor(2.0879, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7393, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5338, device='cuda:0')
tensor(1.6766, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9155, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5273, device='cuda:0')
tensor(2.1058, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9701, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5342, device='cuda:0')
tensor(1.9221, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6819, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5367, device='cuda:0')
tensor(2.0011, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8814, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5471, device='cuda:0')
tensor(1.9246, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6158, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5390, device='cuda:0')
tensor(1.9575, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9772, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5405, device='cuda:0')
tensor(1.9632, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7522, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5475, device='cuda:0')
tensor(1.9425, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6910, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5465, device='cuda:0')
tensor(2.1220, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8782, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5419, device='cuda:0')
tensor(1.9309, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0176, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5529, device='cuda:0')
tensor(1.6904, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1917, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5556, device='cuda:0')
tensor(1.5862, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2001, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5586, device='cuda:0')
tensor(1.5865, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8294, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5687, device='cuda:0')
tensor(1.7326, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0882, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5578, device='cuda:0')
tensor(2.2503, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9786, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5542, device='cuda:0')
tensor(2.1199, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9790, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5533, device='cuda:0')
tensor(2.0010, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9883, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5548, device='cuda:0')
tensor(2.0002, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0116, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5530, device='cuda:0')
tensor(1.5764, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7093, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5553, device='cuda:0')
tensor(1.5630, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8617, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5533, device='cuda:0')
tensor(1.9905, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9051, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5482, device='cuda:0')
tensor(2.2980, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0198, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5478, device='cuda:0')
tensor(1.8650, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9743, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5443, device='cuda:0')
tensor(1.9147, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8807, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5470, device='cuda:0')
tensor(2.0310, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0509, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5425, device='cuda:0')
tensor(1.9053, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1799, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5528, device='cuda:0')
tensor(1.9840, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1727, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5520, device='cuda:0')
tensor(2.2382, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1648, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5553, device='cuda:0')
tensor(1.8848, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6544, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5601, device='cuda:0')
tensor(1.7570, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9136, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5469, device='cuda:0')
tensor(2.0186, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9863, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5495, device='cuda:0')
tensor(1.8334, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0301, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5567, device='cuda:0')
tensor(1.6725, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8011, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5537, device='cuda:0')
tensor(2.0648, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1515, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5468, device='cuda:0')
tensor(1.8757, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1829, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5584, device='cuda:0')
tensor(1.5907, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9253, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5464, device='cuda:0')
tensor(1.8836, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6066, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5482, device='cuda:0')
tensor(2.5364, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6817, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5501, device='cuda:0')
tensor(2.1505, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2485, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5459, device='cuda:0')
tensor(1.7115, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9012, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5541, device='cuda:0')
tensor(2.1072, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7660, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5404, device='cuda:0')
tensor(1.6867, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4732, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5442, device='cuda:0')
tensor(1.9702, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8694, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5404, device='cuda:0')
tensor(1.7270, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9539, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5377, device='cuda:0')
tensor(2.0816, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8808, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5352, device='cuda:0')
tensor(2.1230, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9508, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5377, device='cuda:0')
tensor(2.0312, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9917, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5391, device='cuda:0')
tensor(1.9497, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1321, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5401, device='cuda:0')
tensor(1.9930, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0448, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5400, device='cuda:0')
tensor(1.5802, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0301, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5398, device='cuda:0')
tensor(1.9084, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1727, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5350, device='cuda:0')
tensor(2.1445, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0310, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5364, device='cuda:0')
tensor(1.7886, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0820, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5325, device='cuda:0')
tensor(2.1143, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2242, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5405, device='cuda:0')
tensor(1.7776, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7144, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5394, device='cuda:0')
tensor(1.9229, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8139, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5346, device='cuda:0')
tensor(1.7259, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0954, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5313, device='cuda:0')
tensor(2.4288, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0557, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5237, device='cuda:0')
tensor(1.9838, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3145, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5264, device='cuda:0')
tensor(1.8533, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0750, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5261, device='cuda:0')
tensor(1.6456, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0568, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5356, device='cuda:0')
tensor(1.9112, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0680, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5265, device='cuda:0')
tensor(1.8135, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9188, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5343, device='cuda:0')
tensor(1.9152, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1474, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5248, device='cuda:0')
tensor(1.7321, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9562, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5263, device='cuda:0')
tensor(1.8841, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1726, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5256, device='cuda:0')
tensor(1.8556, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8902, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5285, device='cuda:0')
tensor(1.9563, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1405, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5261, device='cuda:0')
tensor(2.0523, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8304, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5280, device='cuda:0')
tensor(1.7928, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5899, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5236, device='cuda:0')
tensor(2.0230, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1572, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5232, device='cuda:0')
tensor(1.4543, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0809, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5286, device='cuda:0')
tensor(1.8955, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8262, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5269, device='cuda:0')
tensor(1.7909, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3667, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5186, device='cuda:0')
tensor(1.8105, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9303, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5209, device='cuda:0')
tensor(1.8999, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2502, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5273, device='cuda:0')
tensor(1.8497, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8204, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5174, device='cuda:0')
tensor(2.2925, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1019, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5251, device='cuda:0')
tensor(2.0638, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1160, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5282, device='cuda:0')
tensor(1.8287, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1054, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5211, device='cuda:0')
tensor(1.9543, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8796, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5319, device='cuda:0')
tensor(1.9599, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9723, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5290, device='cuda:0')
tensor(2.2756, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9165, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5198, device='cuda:0')
tensor(2.0964, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8561, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5164, device='cuda:0')
tensor(1.7824, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8637, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5175, device='cuda:0')
tensor(1.9999, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1745, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5087, device='cuda:0')
tensor(1.8872, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2183, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5134, device='cuda:0')
tensor(2.0667, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3931, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5138, device='cuda:0')
tensor(2.0162, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9887, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5097, device='cuda:0')
tensor(2.3736, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9326, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5109, device='cuda:0')
tensor(1.9252, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1204, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5076, device='cuda:0')
tensor(1.8673, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2750, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5151, device='cuda:0')
tensor(2.0950, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7022, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5081, device='cuda:0')
tensor(2.0744, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8727, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5115, device='cuda:0')
tensor(1.8796, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0183, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5009, device='cuda:0')
tensor(2.1517, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7658, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5030, device='cuda:0')
tensor(2.0429, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1791, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5050, device='cuda:0')
tensor(1.7137, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8292, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4949, device='cuda:0')
tensor(1.7250, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7486, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4985, device='cuda:0')
tensor(2.0053, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3195, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4959, device='cuda:0')
tensor(2.1077, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0248, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5031, device='cuda:0')
tensor(1.8985, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7759, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5003, device='cuda:0')
tensor(2.1414, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7120, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4974, device='cuda:0')
tensor(1.8159, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1964, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5029, device='cuda:0')
tensor(1.9053, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2846, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5009, device='cuda:0')
tensor(1.7999, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8015, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5069, device='cuda:0')
tensor(1.7232, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3513, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5017, device='cuda:0')
tensor(2.2993, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0492, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5016, device='cuda:0')
tensor(2.1028, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2467, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5040, device='cuda:0')
tensor(1.8804, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7191, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5069, device='cuda:0')
tensor(1.9688, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1764, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5066, device='cuda:0')
tensor(1.7786, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8206, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5019, device='cuda:0')
tensor(2.1698, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0612, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5008, device='cuda:0')
tensor(1.8160, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3418, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4968, device='cuda:0')
tensor(1.8440, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9280, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4911, device='cuda:0')
tensor(2.3032, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7478, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4872, device='cuda:0')
tensor(2.0702, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3029, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4962, device='cuda:0')
tensor(0.7181, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2595, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4938, device='cuda:0')
tensor(2.3586, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9752, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4907, device='cuda:0')
tensor(1.7887, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3763, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4907, device='cuda:0')
tensor(1.7794, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9110, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4956, device='cuda:0')
tensor(1.7604, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0988, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4868, device='cuda:0')
tensor(2.1459, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2433, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4822, device='cuda:0')
tensor(2.3159, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8182, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4860, device='cuda:0')
tensor(2.4015, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0019, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4897, device='cuda:0')
tensor(1.9784, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2394, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4899, device='cuda:0')
tensor(2.0438, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0572, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4915, device='cuda:0')
tensor(2.2320, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9024, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4835, device='cuda:0')
tensor(1.9552, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0445, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4830, device='cuda:0')
tensor(1.9338, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2163, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4783, device='cuda:0')
tensor(1.9609, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8806, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4816, device='cuda:0')


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(1.4964, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4969, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4907, device='cuda:0')
tensor(1.6576, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2056, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.4983, device='cuda:0')
tensor(1.6062, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3573, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5237, device='cuda:0')
tensor(1.7299, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9927, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5364, device='cuda:0')
tensor(1.7829, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6924, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5440, device='cuda:0')
tensor(1.8107, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6690, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5429, device='cuda:0')
tensor(1.6614, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8243, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5584, device='cuda:0')
tensor(1.7845, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8157, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5506, device='cuda:0')
tensor(1.9851, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5284, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5652, device='cuda:0')
tensor(1.6995, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8273, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5624, device='cuda:0')
tensor(1.4362, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8368, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5725, device='cuda:0')
tensor(1.9023, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9440, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5699, device='cuda:0')
tensor(1.5797, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9242, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5802, device='cuda:0')
tensor(1.6769, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5899, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5892, device='cuda:0')
tensor(1.8042, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9968, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5815, device='cuda:0')
tensor(1.3373, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4617, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5889, device='cuda:0')
tensor(1.4547, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4930, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5988, device='cuda:0')
tensor(1.6958, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0682, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6086, device='cuda:0')
tensor(1.9514, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6472, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5962, device='cuda:0')
tensor(2.0227, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6862, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5993, device='cuda:0')
tensor(1.5870, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6485, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5955, device='cuda:0')
tensor(1.6418, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9709, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5948, device='cuda:0')
tensor(2.0522, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8411, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6062, device='cuda:0')
tensor(1.6931, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4961, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6031, device='cuda:0')
tensor(1.5035, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2523, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5967, device='cuda:0')
tensor(1.5182, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8366, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6136, device='cuda:0')
tensor(2.0266, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7598, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6042, device='cuda:0')
tensor(0.7587, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8928, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6115, device='cuda:0')
tensor(1.9265, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0611, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5988, device='cuda:0')
tensor(1.2479, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0012, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6113, device='cuda:0')
tensor(1.7682, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8357, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6020, device='cuda:0')
tensor(1.9122, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5607, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6114, device='cuda:0')
tensor(0.7748, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8458, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6085, device='cuda:0')
tensor(2.1555, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6489, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6115, device='cuda:0')
tensor(1.9852, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6455, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6107, device='cuda:0')
tensor(1.6906, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1075, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6077, device='cuda:0')
tensor(1.8921, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6396, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6039, device='cuda:0')
tensor(1.8173, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6876, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6057, device='cuda:0')
tensor(1.6984, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6773, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6007, device='cuda:0')
tensor(1.8495, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6775, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5972, device='cuda:0')
tensor(1.5846, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9391, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6035, device='cuda:0')
tensor(1.7190, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6444, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6024, device='cuda:0')
tensor(1.7553, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8351, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6059, device='cuda:0')
tensor(2.3927, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8281, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6138, device='cuda:0')
tensor(2.1790, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0570, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5972, device='cuda:0')
tensor(1.5353, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6301, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6041, device='cuda:0')
tensor(1.7432, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9799, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6012, device='cuda:0')
tensor(1.6305, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9892, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6060, device='cuda:0')
tensor(1.9844, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1999, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5986, device='cuda:0')
tensor(1.9526, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7169, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6176, device='cuda:0')
tensor(1.9889, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7284, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6004, device='cuda:0')
tensor(1.9691, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7431, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6050, device='cuda:0')
tensor(1.6636, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7669, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5983, device='cuda:0')
tensor(1.9507, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8900, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6076, device='cuda:0')
tensor(1.8044, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6624, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6007, device='cuda:0')
tensor(1.8791, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3758, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5966, device='cuda:0')
tensor(1.7744, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7899, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6123, device='cuda:0')
tensor(1.9722, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0406, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6022, device='cuda:0')
tensor(2.1526, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1676, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5953, device='cuda:0')
tensor(1.8492, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9040, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5976, device='cuda:0')
tensor(0.7032, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9578, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5979, device='cuda:0')
tensor(1.8006, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8804, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6005, device='cuda:0')
tensor(2.1275, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5455, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5941, device='cuda:0')
tensor(1.9523, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5218, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5975, device='cuda:0')
tensor(1.9898, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7478, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5969, device='cuda:0')
tensor(1.9104, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0654, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5927, device='cuda:0')
tensor(2.0739, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8125, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5840, device='cuda:0')
tensor(1.5906, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0407, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6056, device='cuda:0')
tensor(1.7659, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8954, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5914, device='cuda:0')
tensor(1.8986, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8344, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5980, device='cuda:0')
tensor(1.6336, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1041, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6107, device='cuda:0')
tensor(2.0498, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0455, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5880, device='cuda:0')
tensor(2.0118, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3649, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5843, device='cuda:0')
tensor(1.7817, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7147, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5802, device='cuda:0')
tensor(1.8611, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6872, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5823, device='cuda:0')
tensor(1.7563, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9962, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5783, device='cuda:0')
tensor(1.5973, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2191, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5882, device='cuda:0')
tensor(1.9072, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6880, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5837, device='cuda:0')
tensor(1.7294, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0046, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5814, device='cuda:0')
tensor(2.0332, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6223, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5834, device='cuda:0')
tensor(2.0164, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9726, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5850, device='cuda:0')
tensor(1.9201, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7901, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5921, device='cuda:0')
tensor(1.5730, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6646, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5786, device='cuda:0')
tensor(1.6350, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7433, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5765, device='cuda:0')
tensor(2.0539, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1551, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5817, device='cuda:0')
tensor(2.0224, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9194, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5728, device='cuda:0')
tensor(1.6976, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6627, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5828, device='cuda:0')
tensor(1.9384, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7741, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5750, device='cuda:0')
tensor(2.3927, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6773, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5745, device='cuda:0')
tensor(2.0330, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2107, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5747, device='cuda:0')
tensor(2.2496, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7754, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5808, device='cuda:0')
tensor(1.9239, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6003, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5731, device='cuda:0')
tensor(1.8717, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9482, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5888, device='cuda:0')
tensor(1.7504, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6689, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5744, device='cuda:0')
tensor(2.1677, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0322, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5803, device='cuda:0')
tensor(1.6906, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5532, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5780, device='cuda:0')
tensor(1.8085, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7250, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5638, device='cuda:0')
tensor(2.0067, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0850, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5670, device='cuda:0')
tensor(1.8978, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8377, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5729, device='cuda:0')
tensor(2.1885, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0745, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5666, device='cuda:0')
tensor(1.7220, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9427, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5707, device='cuda:0')
tensor(2.0489, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9944, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5747, device='cuda:0')
tensor(1.6945, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7745, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5720, device='cuda:0')
tensor(2.0663, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7925, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5659, device='cuda:0')
tensor(1.7126, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8928, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5713, device='cuda:0')
tensor(1.7235, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8282, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5639, device='cuda:0')
tensor(1.7822, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2111, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5590, device='cuda:0')
tensor(1.7608, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0395, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5609, device='cuda:0')
tensor(1.8917, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8002, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5653, device='cuda:0')
tensor(2.0490, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8642, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5620, device='cuda:0')
tensor(1.9356, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6680, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5585, device='cuda:0')
tensor(2.0101, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7179, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5593, device='cuda:0')
tensor(1.8804, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7174, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5687, device='cuda:0')
tensor(1.7026, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8255, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5641, device='cuda:0')
tensor(2.1097, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9535, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5683, device='cuda:0')
tensor(1.9236, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0041, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5712, device='cuda:0')
tensor(1.9489, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8983, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5557, device='cuda:0')
tensor(2.1948, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7850, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5595, device='cuda:0')
tensor(1.8487, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8605, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5636, device='cuda:0')
tensor(1.7890, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7313, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5612, device='cuda:0')
tensor(2.2359, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1359, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5604, device='cuda:0')
tensor(1.7699, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5921, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5627, device='cuda:0')
tensor(1.9529, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9268, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5696, device='cuda:0')
tensor(1.7001, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1582, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5611, device='cuda:0')
tensor(1.7695, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0388, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5558, device='cuda:0')
tensor(2.1516, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0416, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5600, device='cuda:0')
tensor(1.9101, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0445, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5554, device='cuda:0')
tensor(1.8285, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8979, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5571, device='cuda:0')
tensor(1.8142, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1367, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5569, device='cuda:0')
tensor(1.6032, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0587, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5648, device='cuda:0')
tensor(2.1690, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9720, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5447, device='cuda:0')
tensor(1.8908, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1614, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5460, device='cuda:0')
tensor(1.7036, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7942, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5557, device='cuda:0')
tensor(1.9100, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0511, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5524, device='cuda:0')
tensor(1.8323, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8329, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5515, device='cuda:0')
tensor(1.7589, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0514, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5427, device='cuda:0')
tensor(1.7283, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8876, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5620, device='cuda:0')
tensor(2.1709, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9738, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5504, device='cuda:0')
tensor(1.7984, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8257, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5383, device='cuda:0')
tensor(1.8808, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7660, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5424, device='cuda:0')
tensor(2.0499, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9067, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5400, device='cuda:0')
tensor(0.7595, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9164, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5518, device='cuda:0')
tensor(2.1790, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1911, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5427, device='cuda:0')
tensor(2.3075, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0730, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5431, device='cuda:0')
tensor(1.8796, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9213, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5368, device='cuda:0')
tensor(2.0598, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9213, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5375, device='cuda:0')
tensor(1.8785, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5860, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5417, device='cuda:0')
tensor(1.9641, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6956, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5363, device='cuda:0')


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(1.5863, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5619, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5759, device='cuda:0')
tensor(1.6006, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8706, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5832, device='cuda:0')
tensor(1.2426, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5398, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5998, device='cuda:0')
tensor(1.6663, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3781, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.5997, device='cuda:0')
tensor(1.5166, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5374, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6185, device='cuda:0')
tensor(1.4527, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7797, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6358, device='cuda:0')
tensor(1.9396, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5499, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6426, device='cuda:0')
tensor(1.6841, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4099, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6330, device='cuda:0')
tensor(1.6922, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8346, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6523, device='cuda:0')
tensor(1.8640, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2909, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6547, device='cuda:0')
tensor(1.6434, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6357, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6583, device='cuda:0')
tensor(1.4627, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7449, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6584, device='cuda:0')
tensor(1.8556, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5401, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6628, device='cuda:0')
tensor(1.4425, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4434, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6745, device='cuda:0')
tensor(1.5342, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9721, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6655, device='cuda:0')
tensor(0.8847, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5879, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6810, device='cuda:0')
tensor(1.3152, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6561, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6667, device='cuda:0')
tensor(1.5988, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6599, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6607, device='cuda:0')
tensor(1.3559, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6399, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6649, device='cuda:0')
tensor(1.9339, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5113, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6768, device='cuda:0')
tensor(1.6613, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4425, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6742, device='cuda:0')
tensor(1.6452, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6109, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6732, device='cuda:0')
tensor(1.5940, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5297, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6765, device='cuda:0')
tensor(1.7074, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7307, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6857, device='cuda:0')
tensor(1.3699, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5880, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6970, device='cuda:0')
tensor(1.6153, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8280, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6819, device='cuda:0')
tensor(1.8622, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6213, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6899, device='cuda:0')
tensor(1.5847, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6627, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6794, device='cuda:0')
tensor(1.5058, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8100, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6838, device='cuda:0')
tensor(2.0008, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5120, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6872, device='cuda:0')
tensor(1.3762, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6715, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6865, device='cuda:0')
tensor(1.3288, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6632, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6790, device='cuda:0')
tensor(1.5084, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5168, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6836, device='cuda:0')
tensor(1.7767, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0039, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6832, device='cuda:0')
tensor(1.4867, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8872, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6924, device='cuda:0')
tensor(1.6891, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7187, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6895, device='cuda:0')
tensor(1.7414, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7099, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6820, device='cuda:0')
tensor(1.4427, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7522, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6826, device='cuda:0')
tensor(1.7232, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5337, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6857, device='cuda:0')
tensor(1.6900, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0952, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6902, device='cuda:0')
tensor(1.8442, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8284, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6892, device='cuda:0')
tensor(1.7040, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5352, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6953, device='cuda:0')
tensor(1.5896, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7672, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6830, device='cuda:0')
tensor(1.6696, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0208, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6898, device='cuda:0')
tensor(1.7025, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6457, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6724, device='cuda:0')
tensor(1.4484, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6480, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6773, device='cuda:0')
tensor(1.6556, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6348, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6797, device='cuda:0')
tensor(1.6801, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0533, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6799, device='cuda:0')
tensor(1.7839, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4033, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6844, device='cuda:0')
tensor(1.9092, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7274, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6807, device='cuda:0')
tensor(1.4599, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7760, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6736, device='cuda:0')
tensor(1.8985, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5176, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6724, device='cuda:0')
tensor(1.7607, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8376, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6701, device='cuda:0')
tensor(1.6340, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7740, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6720, device='cuda:0')
tensor(1.7844, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6863, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6715, device='cuda:0')
tensor(1.7944, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4990, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6831, device='cuda:0')
tensor(1.6280, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7731, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6755, device='cuda:0')
tensor(1.8219, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9493, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6764, device='cuda:0')
tensor(1.6268, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7215, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6627, device='cuda:0')
tensor(1.8941, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6779, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6654, device='cuda:0')
tensor(1.5887, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7160, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6701, device='cuda:0')
tensor(1.4345, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7443, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6645, device='cuda:0')
tensor(1.6562, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8615, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6636, device='cuda:0')
tensor(1.7890, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6206, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6705, device='cuda:0')
tensor(1.5500, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9443, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6720, device='cuda:0')
tensor(1.8157, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7252, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6659, device='cuda:0')
tensor(1.7933, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6618, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6669, device='cuda:0')
tensor(1.6464, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5825, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6719, device='cuda:0')
tensor(1.8546, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0359, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6670, device='cuda:0')
tensor(1.8458, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8034, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6737, device='cuda:0')
tensor(1.6831, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7605, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6618, device='cuda:0')
tensor(1.7093, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9278, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6676, device='cuda:0')
tensor(1.5553, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5831, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6611, device='cuda:0')
tensor(1.3381, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6409, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6716, device='cuda:0')
tensor(1.6824, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8009, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6614, device='cuda:0')
tensor(1.6030, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6033, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6694, device='cuda:0')
tensor(1.6970, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6089, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6621, device='cuda:0')
tensor(1.7717, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6333, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6619, device='cuda:0')
tensor(1.7366, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6863, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6530, device='cuda:0')
tensor(2.0087, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7218, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6566, device='cuda:0')
tensor(1.8796, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9073, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6577, device='cuda:0')
tensor(1.5874, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9851, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6531, device='cuda:0')
tensor(1.9778, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4964, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6603, device='cuda:0')
tensor(1.7146, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7921, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6555, device='cuda:0')
tensor(1.5471, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0404, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6581, device='cuda:0')
tensor(1.7221, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6824, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6562, device='cuda:0')
tensor(1.6794, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6742, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6512, device='cuda:0')
tensor(1.6315, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8628, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6549, device='cuda:0')
tensor(1.5028, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8788, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6505, device='cuda:0')
tensor(2.1284, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8427, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6538, device='cuda:0')
tensor(1.5903, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8101, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6493, device='cuda:0')
tensor(1.5367, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8693, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6560, device='cuda:0')
tensor(2.1599, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4364, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6496, device='cuda:0')
tensor(1.8584, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8483, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6458, device='cuda:0')
tensor(1.3336, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9058, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6580, device='cuda:0')
tensor(2.2315, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8046, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6466, device='cuda:0')
tensor(1.8471, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6103, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6594, device='cuda:0')
tensor(1.8665, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6861, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6466, device='cuda:0')
tensor(1.4249, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0603, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6446, device='cuda:0')
tensor(1.8076, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9179, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6515, device='cuda:0')
tensor(2.1966, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9638, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6479, device='cuda:0')
tensor(1.7516, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8660, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6488, device='cuda:0')
tensor(2.0388, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5436, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6463, device='cuda:0')
tensor(1.8315, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6713, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6446, device='cuda:0')
tensor(1.9590, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9083, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6501, device='cuda:0')
tensor(1.9335, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9729, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6428, device='cuda:0')
tensor(1.6634, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1193, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6355, device='cuda:0')
tensor(1.5795, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7614, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6324, device='cuda:0')
tensor(1.7379, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8187, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6380, device='cuda:0')
tensor(1.6905, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8160, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6296, device='cuda:0')
tensor(1.9240, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7320, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6298, device='cuda:0')
tensor(1.7288, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0285, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6358, device='cuda:0')
tensor(1.7117, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7477, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6286, device='cuda:0')
tensor(1.6334, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1223, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6356, device='cuda:0')
tensor(1.7694, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8252, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6320, device='cuda:0')
tensor(1.8524, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9156, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6264, device='cuda:0')
tensor(1.5878, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5618, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6281, device='cuda:0')
tensor(1.6561, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9492, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6304, device='cuda:0')
tensor(2.2382, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6187, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6242, device='cuda:0')
tensor(1.8909, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5962, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6238, device='cuda:0')
tensor(1.6216, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7293, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6324, device='cuda:0')
tensor(1.5848, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6353, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6275, device='cuda:0')
tensor(1.9896, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7010, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6224, device='cuda:0')
tensor(1.5325, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6117, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6296, device='cuda:0')
tensor(2.0232, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6790, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6249, device='cuda:0')
tensor(1.9126, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8286, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6360, device='cuda:0')
tensor(1.9479, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8923, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6346, device='cuda:0')
tensor(2.0349, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0692, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6313, device='cuda:0')
tensor(1.5661, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6523, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6246, device='cuda:0')
tensor(1.6507, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8852, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6243, device='cuda:0')
tensor(1.5963, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6722, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6120, device='cuda:0')
tensor(1.8990, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5163, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6305, device='cuda:0')
tensor(1.7060, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8315, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6094, device='cuda:0')
tensor(1.6901, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7609, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6182, device='cuda:0')
tensor(1.4785, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0478, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6173, device='cuda:0')
tensor(1.8996, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8738, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6161, device='cuda:0')
tensor(1.8045, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6309, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6110, device='cuda:0')
tensor(1.8487, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6886, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6129, device='cuda:0')
tensor(1.9165, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0481, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6229, device='cuda:0')
tensor(2.0750, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8920, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6282, device='cuda:0')
tensor(1.6593, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0000, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6157, device='cuda:0')
tensor(1.8171, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8728, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6072, device='cuda:0')
tensor(1.9511, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9799, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6243, device='cuda:0')
tensor(1.8876, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8849, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6124, device='cuda:0')
tensor(1.7790, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8193, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6113, device='cuda:0')
tensor(1.9158, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6568, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6085, device='cuda:0')
tensor(2.0581, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2329, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6151, device='cuda:0')
tensor(2.0250, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(1.3680, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6088, device='cuda:0')
tensor(1.3985, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2862, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6565, device='cuda:0')
tensor(1.5804, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9184, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6668, device='cuda:0')
tensor(1.6793, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6150, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6924, device='cuda:0')
tensor(1.4779, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2798, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6974, device='cuda:0')
tensor(1.3638, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3498, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7066, device='cuda:0')
tensor(1.2613, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1291, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7121, device='cuda:0')
tensor(1.5397, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4157, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7325, device='cuda:0')
tensor(1.5280, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2693, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7310, device='cuda:0')
tensor(1.5774, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5644, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7493, device='cuda:0')
tensor(1.6214, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4210, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7488, device='cuda:0')
tensor(1.2430, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3512, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7707, device='cuda:0')
tensor(1.8719, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3840, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7501, device='cuda:0')
tensor(1.6621, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5560, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7626, device='cuda:0')
tensor(1.5946, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6869, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7422, device='cuda:0')
tensor(1.4625, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3571, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7632, device='cuda:0')
tensor(1.4917, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5175, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7538, device='cuda:0')
tensor(1.3970, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2968, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7662, device='cuda:0')
tensor(1.4762, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6259, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7606, device='cuda:0')
tensor(1.4032, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3751, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7536, device='cuda:0')
tensor(1.3985, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5797, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7580, device='cuda:0')
tensor(1.3815, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8408, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7585, device='cuda:0')
tensor(1.6317, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5214, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7582, device='cuda:0')
tensor(1.6560, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7111, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7543, device='cuda:0')
tensor(1.5527, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5640, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7750, device='cuda:0')
tensor(1.3422, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5353, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7626, device='cuda:0')
tensor(1.7771, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4565, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7707, device='cuda:0')
tensor(1.6040, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2676, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7693, device='cuda:0')
tensor(1.4200, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6755, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7729, device='cuda:0')
tensor(1.3268, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5692, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7822, device='cuda:0')
tensor(1.4559, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3153, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7753, device='cuda:0')
tensor(1.6227, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4467, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7771, device='cuda:0')
tensor(1.7402, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7657, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7779, device='cuda:0')
tensor(1.6318, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7310, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7651, device='cuda:0')
tensor(1.4082, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2697, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7709, device='cuda:0')
tensor(1.7302, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3286, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7604, device='cuda:0')
tensor(1.5317, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6065, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7748, device='cuda:0')
tensor(1.8785, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6180, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7724, device='cuda:0')
tensor(1.4180, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3812, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7770, device='cuda:0')
tensor(1.6565, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.4677, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7722, device='cuda:0')
tensor(1.6396, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6423, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7634, device='cuda:0')
tensor(1.4634, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4699, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7795, device='cuda:0')
tensor(1.5903, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7966, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7675, device='cuda:0')
tensor(1.5621, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6606, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7711, device='cuda:0')
tensor(1.5547, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9618, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7684, device='cuda:0')
tensor(1.5217, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4389, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7619, device='cuda:0')
tensor(1.6190, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5058, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7646, device='cuda:0')
tensor(1.6403, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5814, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7677, device='cuda:0')
tensor(1.6129, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4850, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7744, device='cuda:0')
tensor(1.6567, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4763, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7674, device='cuda:0')
tensor(1.5407, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4754, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7558, device='cuda:0')
tensor(1.6194, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7670, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7628, device='cuda:0')
tensor(1.5741, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7398, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7611, device='cuda:0')
tensor(1.5594, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4254, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7633, device='cuda:0')
tensor(1.5320, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6105, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7728, device='cuda:0')
tensor(1.5582, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5978, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7608, device='cuda:0')
tensor(1.5411, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6098, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7684, device='cuda:0')
tensor(1.5491, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5419, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7652, device='cuda:0')
tensor(1.3294, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6978, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7609, device='cuda:0')
tensor(1.7341, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5521, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7535, device='cuda:0')
tensor(1.6464, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7745, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7671, device='cuda:0')
tensor(1.5642, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3696, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7620, device='cuda:0')
tensor(1.5087, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6653, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7585, device='cuda:0')
tensor(1.7642, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5468, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7667, device='cuda:0')
tensor(1.9360, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7581, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7516, device='cuda:0')
tensor(1.7889, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3484, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7614, device='cuda:0')
tensor(1.6275, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5813, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7524, device='cuda:0')
tensor(1.7767, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6973, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7510, device='cuda:0')
tensor(1.4364, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6354, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7478, device='cuda:0')
tensor(1.8481, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4414, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7578, device='cuda:0')
tensor(1.6511, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8392, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7495, device='cuda:0')
tensor(1.7804, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6731, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7473, device='cuda:0')
tensor(1.4511, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3224, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7448, device='cuda:0')
tensor(1.6681, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5860, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7432, device='cuda:0')
tensor(1.9389, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4887, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7478, device='cuda:0')
tensor(1.5374, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8153, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7416, device='cuda:0')
tensor(1.7127, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5511, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7451, device='cuda:0')
tensor(1.8712, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4946, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7418, device='cuda:0')
tensor(1.9349, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7456, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7440, device='cuda:0')
tensor(1.8827, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4054, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7402, device='cuda:0')
tensor(1.7756, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8074, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7343, device='cuda:0')
tensor(1.8066, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5564, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7383, device='cuda:0')
tensor(1.6075, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5116, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7415, device='cuda:0')
tensor(1.8157, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7460, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7373, device='cuda:0')
tensor(1.6288, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8865, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7399, device='cuda:0')
tensor(1.3964, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3716, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7356, device='cuda:0')
tensor(1.9105, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7737, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7305, device='cuda:0')
tensor(1.6509, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7070, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7321, device='cuda:0')
tensor(2.0345, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5558, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7306, device='cuda:0')
tensor(2.0081, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8819, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7305, device='cuda:0')
tensor(1.2388, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6544, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7245, device='cuda:0')
tensor(1.3870, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5234, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7235, device='cuda:0')
tensor(1.5547, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9834, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7285, device='cuda:0')
tensor(1.1332, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6330, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7229, device='cuda:0')
tensor(1.4582, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4874, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7190, device='cuda:0')
tensor(1.9591, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8568, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7282, device='cuda:0')
tensor(1.6016, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4244, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7241, device='cuda:0')
tensor(1.1519, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6618, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7301, device='cuda:0')
tensor(1.8011, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5003, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7297, device='cuda:0')
tensor(1.6359, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3253, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7191, device='cuda:0')
tensor(1.5351, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6640, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7245, device='cuda:0')
tensor(1.7851, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4913, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7139, device='cuda:0')
tensor(1.4264, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9141, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7217, device='cuda:0')
tensor(1.5888, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6525, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7265, device='cuda:0')
tensor(1.7161, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7882, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7123, device='cuda:0')
tensor(1.7160, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7966, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7126, device='cuda:0')
tensor(1.5843, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9226, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7105, device='cuda:0')
tensor(1.5764, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9192, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7219, device='cuda:0')
tensor(2.0195, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9419, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7094, device='cuda:0')
tensor(1.5676, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7428, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7076, device='cuda:0')
tensor(1.7302, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4824, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7030, device='cuda:0')
tensor(1.9598, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7048, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7074, device='cuda:0')
tensor(1.7549, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8031, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7123, device='cuda:0')
tensor(1.8055, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2153, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7179, device='cuda:0')
tensor(1.6115, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9095, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7190, device='cuda:0')
tensor(1.6524, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.0472, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7121, device='cuda:0')
tensor(1.8542, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8622, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6992, device='cuda:0')
tensor(1.7263, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7738, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7056, device='cuda:0')
tensor(1.7933, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4779, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6929, device='cuda:0')
tensor(1.9869, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9987, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6987, device='cuda:0')
tensor(1.6131, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3987, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7065, device='cuda:0')
tensor(1.9816, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8464, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6974, device='cuda:0')
tensor(1.4734, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6285, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7035, device='cuda:0')
tensor(1.6712, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1768, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6992, device='cuda:0')
tensor(1.7420, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7247, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7042, device='cuda:0')
tensor(1.8046, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6834, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6956, device='cuda:0')
tensor(1.7440, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5849, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6983, device='cuda:0')
tensor(1.6901, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1155, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7041, device='cuda:0')
tensor(1.7544, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8088, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6980, device='cuda:0')
tensor(1.9885, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9780, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6867, device='cuda:0')
tensor(1.6661, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6901, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6938, device='cuda:0')
tensor(1.6251, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5923, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6933, device='cuda:0')
tensor(1.5083, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7841, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6975, device='cuda:0')
tensor(1.5097, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6712, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6867, device='cuda:0')
tensor(1.5518, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2516, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7089, device='cuda:0')
tensor(0.7281, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6746, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6975, device='cuda:0')
tensor(1.9952, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9171, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6979, device='cuda:0')
tensor(1.8372, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1142, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6994, device='cuda:0')
tensor(1.8575, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7740, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6873, device='cuda:0')
tensor(1.4397, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4396, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6900, device='cuda:0')
tensor(1.5721, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8844, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6913, device='cuda:0')
tensor(1.5994, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6986, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6920, device='cuda:0')
tensor(1.8146, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7222, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6871, device='cuda:0')
tensor(1.5062, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7997, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6856, device='cuda:0')
tensor(1.6980, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8908, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6819, device='cuda:0')
tensor(1.6101, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6307, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6877, device='cuda:0')
tensor(1.7912, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5011, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6871, device='cuda:0')
tensor(1.7520, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5529, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6807, device='cuda:0')
tensor(1.9656, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(0.8187, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.6864, device='cuda:0')
tensor(1.5201, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2496, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7442, device='cuda:0')
tensor(1.4736, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3041, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7568, device='cuda:0')
tensor(1.1554, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6821, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7931, device='cuda:0')
tensor(1.1710, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0974, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8032, device='cuda:0')
tensor(1.1677, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4205, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8222, device='cuda:0')
tensor(1.3877, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4271, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8122, device='cuda:0')
tensor(1.3904, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0038, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8306, device='cuda:0')
tensor(1.1148, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2261, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8196, device='cuda:0')
tensor(1.4663, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4720, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8299, device='cuda:0')
tensor(1.5188, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0959, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8222, device='cuda:0')
tensor(0.8848, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6184, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8490, device='cuda:0')
tensor(1.2797, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2921, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8377, device='cuda:0')
tensor(1.3066, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4676, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8531, device='cuda:0')
tensor(1.1919, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3514, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8522, device='cuda:0')
tensor(1.0205, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5259, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8613, device='cuda:0')
tensor(1.1632, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1299, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8556, device='cuda:0')
tensor(1.4941, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1502, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8667, device='cuda:0')
tensor(1.0916, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5907, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8658, device='cuda:0')
tensor(1.4639, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3214, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8728, device='cuda:0')
tensor(1.3562, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4035, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8775, device='cuda:0')
tensor(1.3564, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2664, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8630, device='cuda:0')
tensor(1.4940, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2615, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8735, device='cuda:0')
tensor(1.3678, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0728, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8571, device='cuda:0')
tensor(1.2254, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5119, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8637, device='cuda:0')
tensor(1.4674, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1473, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8680, device='cuda:0')
tensor(1.5871, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5093, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8641, device='cuda:0')
tensor(1.3982, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4991, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8512, device='cuda:0')
tensor(1.6944, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3097, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8387, device='cuda:0')
tensor(1.5393, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2791, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8646, device='cuda:0')
tensor(1.4793, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6616, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8588, device='cuda:0')
tensor(1.3881, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6236, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8687, device='cuda:0')
tensor(1.1870, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4464, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8540, device='cuda:0')
tensor(1.6925, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6595, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8567, device='cuda:0')
tensor(1.4705, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4108, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8576, device='cuda:0')
tensor(1.1936, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4889, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8654, device='cuda:0')
tensor(1.5670, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3087, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8645, device='cuda:0')
tensor(1.3272, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2908, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8571, device='cuda:0')
tensor(1.5966, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5072, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8505, device='cuda:0')
tensor(1.4126, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4882, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8582, device='cuda:0')
tensor(1.2729, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3942, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8612, device='cuda:0')
tensor(1.4918, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5528, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8504, device='cuda:0')
tensor(1.5045, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4802, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8568, device='cuda:0')
tensor(1.5987, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5353, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8574, device='cuda:0')
tensor(1.2722, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4496, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8592, device='cuda:0')
tensor(1.4459, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6221, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8539, device='cuda:0')
tensor(1.7419, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2863, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8465, device='cuda:0')
tensor(1.5357, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7514, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8594, device='cuda:0')
tensor(0.7701, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3584, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8347, device='cuda:0')
tensor(1.6115, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6451, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8567, device='cuda:0')
tensor(1.3541, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6990, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8517, device='cuda:0')
tensor(1.6074, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6357, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8547, device='cuda:0')
tensor(1.3862, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8105, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8519, device='cuda:0')
tensor(1.6140, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8943, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8433, device='cuda:0')
tensor(1.6732, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5349, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8387, device='cuda:0')
tensor(0.7096, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2986, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8483, device='cuda:0')
tensor(1.8347, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4133, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8386, device='cuda:0')
tensor(1.6951, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2581, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8428, device='cuda:0')
tensor(1.4655, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5056, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8456, device='cuda:0')
tensor(1.5964, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6999, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8448, device='cuda:0')
tensor(1.6919, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3535, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8383, device='cuda:0')
tensor(1.5109, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7433, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8384, device='cuda:0')
tensor(1.6116, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5328, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8421, device='cuda:0')
tensor(1.4690, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6375, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8476, device='cuda:0')
tensor(1.3998, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2862, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8449, device='cuda:0')
tensor(1.5161, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6113, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8397, device='cuda:0')
tensor(1.5705, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3842, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8370, device='cuda:0')
tensor(1.3999, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1662, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8367, device='cuda:0')
tensor(1.3382, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7291, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8360, device='cuda:0')
tensor(1.7578, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8775, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8412, device='cuda:0')
tensor(1.2584, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5943, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8346, device='cuda:0')
tensor(1.6046, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4322, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8505, device='cuda:0')
tensor(1.3830, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6793, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8291, device='cuda:0')
tensor(1.5111, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4973, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8318, device='cuda:0')
tensor(1.5173, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2517, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8348, device='cuda:0')
tensor(1.6139, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5520, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8332, device='cuda:0')
tensor(1.6440, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7447, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8228, device='cuda:0')
tensor(1.5109, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4400, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8279, device='cuda:0')
tensor(1.6314, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6636, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8222, device='cuda:0')
tensor(1.6155, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4681, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8246, device='cuda:0')
tensor(1.7268, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6069, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8183, device='cuda:0')
tensor(1.6452, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3845, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8220, device='cuda:0')
tensor(1.4782, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5719, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8247, device='cuda:0')
tensor(0.6734, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4735, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8157, device='cuda:0')
tensor(1.5746, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8124, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8228, device='cuda:0')
tensor(1.3433, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6286, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8163, device='cuda:0')
tensor(1.6920, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8018, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8236, device='cuda:0')
tensor(1.5259, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7059, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8215, device='cuda:0')
tensor(1.5180, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6073, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8308, device='cuda:0')
tensor(1.5694, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4844, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8201, device='cuda:0')
tensor(1.4510, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3610, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8148, device='cuda:0')
tensor(1.4442, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9431, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8167, device='cuda:0')
tensor(1.3733, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5573, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8112, device='cuda:0')
tensor(1.6712, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4959, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8228, device='cuda:0')
tensor(1.6964, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8623, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8154, device='cuda:0')
tensor(1.6660, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4154, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8238, device='cuda:0')
tensor(1.7377, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3357, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8073, device='cuda:0')
tensor(1.5961, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7641, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8072, device='cuda:0')
tensor(1.5653, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4896, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8098, device='cuda:0')
tensor(1.8788, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6639, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8164, device='cuda:0')
tensor(1.6125, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6651, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8098, device='cuda:0')
tensor(1.7852, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6124, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8080, device='cuda:0')
tensor(1.7716, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8848, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8002, device='cuda:0')
tensor(1.4940, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5220, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8178, device='cuda:0')
tensor(1.4067, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8589, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8128, device='cuda:0')
tensor(1.4746, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4613, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8048, device='cuda:0')
tensor(1.7648, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4712, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8007, device='cuda:0')
tensor(1.6657, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5427, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8154, device='cuda:0')
tensor(1.4330, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5722, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8078, device='cuda:0')
tensor(1.6804, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5978, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8152, device='cuda:0')
tensor(1.7738, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6636, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8112, device='cuda:0')
tensor(1.6387, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6769, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8023, device='cuda:0')
tensor(1.4155, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4800, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8017, device='cuda:0')
tensor(1.6897, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6163, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8022, device='cuda:0')
tensor(1.5100, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5040, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7916, device='cuda:0')
tensor(1.7362, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4368, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7902, device='cuda:0')
tensor(1.4982, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6906, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7831, device='cuda:0')
tensor(1.7751, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7038, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7899, device='cuda:0')
tensor(1.4951, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5888, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7936, device='cuda:0')
tensor(1.7401, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4805, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7952, device='cuda:0')
tensor(1.5142, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5097, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7884, device='cuda:0')
tensor(1.5480, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5680, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7992, device='cuda:0')
tensor(1.8369, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9120, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7881, device='cuda:0')
tensor(1.5500, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5911, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7875, device='cuda:0')
tensor(1.5032, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8024, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7860, device='cuda:0')
tensor(1.6259, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6794, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7825, device='cuda:0')
tensor(1.6546, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6925, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7895, device='cuda:0')
tensor(1.4722, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5874, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7875, device='cuda:0')
tensor(1.6405, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.9384, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7942, device='cuda:0')
tensor(1.4287, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7348, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7853, device='cuda:0')
tensor(1.8656, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5335, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7857, device='cuda:0')
tensor(1.3721, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5732, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7743, device='cuda:0')
tensor(1.7825, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7708, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7787, device='cuda:0')
tensor(1.4637, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5685, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7825, device='cuda:0')
tensor(1.8476, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6016, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7725, device='cuda:0')
tensor(1.7468, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4602, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7928, device='cuda:0')
tensor(1.5679, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4891, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7688, device='cuda:0')
tensor(1.8168, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6155, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7757, device='cuda:0')
tensor(0.7723, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4383, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7744, device='cuda:0')
tensor(1.3582, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7777, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7764, device='cuda:0')
tensor(1.6315, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6279, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7755, device='cuda:0')
tensor(1.6086, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7995, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7697, device='cuda:0')
tensor(1.7135, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6450, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7772, device='cuda:0')
tensor(1.6674, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5006, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7820, device='cuda:0')
tensor(1.8118, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6718, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7638, device='cuda:0')
tensor(1.7982, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4357, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7549, device='cuda:0')
tensor(1.4294, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3678, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7704, device='cuda:0')
tensor(1.8850, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6486, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7611, device='cuda:0')


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(1.5959, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0445, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.7936, device='cuda:0')
tensor(1.1170, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2575, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8316, device='cuda:0')
tensor(1.4658, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9082, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8343, device='cuda:0')
tensor(1.1458, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4773, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8795, device='cuda:0')
tensor(1.4863, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4140, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8737, device='cuda:0')
tensor(1.5708, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2143, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8954, device='cuda:0')
tensor(1.2790, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0713, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9120, device='cuda:0')
tensor(1.2519, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2696, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9124, device='cuda:0')
tensor(1.2295, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1572, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9188, device='cuda:0')
tensor(1.2762, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3782, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9241, device='cuda:0')
tensor(1.1152, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9157, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9290, device='cuda:0')
tensor(1.6896, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4141, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9217, device='cuda:0')
tensor(1.2586, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2054, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9427, device='cuda:0')
tensor(1.6268, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2424, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9330, device='cuda:0')
tensor(1.3799, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1229, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9453, device='cuda:0')
tensor(1.1942, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2731, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9492, device='cuda:0')
tensor(1.3913, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1670, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9562, device='cuda:0')
tensor(1.3320, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2713, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9735, device='cuda:0')
tensor(0.7533, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9765, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9686, device='cuda:0')
tensor(1.5105, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3995, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9625, device='cuda:0')
tensor(1.3093, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5023, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9701, device='cuda:0')
tensor(1.5453, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1772, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9686, device='cuda:0')
tensor(1.3144, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2935, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9546, device='cuda:0')
tensor(0.9999, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4163, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9612, device='cuda:0')
tensor(1.1899, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0798, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9584, device='cuda:0')
tensor(1.2952, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6809, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9598, device='cuda:0')
tensor(1.3994, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2979, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9666, device='cuda:0')
tensor(1.1828, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3981, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9587, device='cuda:0')
tensor(1.1874, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2993, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9627, device='cuda:0')
tensor(1.4740, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4601, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9673, device='cuda:0')
tensor(1.4453, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5064, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9472, device='cuda:0')
tensor(1.3830, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2512, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9603, device='cuda:0')
tensor(1.3685, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3716, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9594, device='cuda:0')
tensor(1.4432, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2292, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9598, device='cuda:0')
tensor(1.3579, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1610, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9482, device='cuda:0')
tensor(1.5085, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5859, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9648, device='cuda:0')
tensor(1.1983, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4206, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9532, device='cuda:0')
tensor(1.4607, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2071, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9597, device='cuda:0')
tensor(1.4277, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2646, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9684, device='cuda:0')
tensor(1.2320, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4375, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9557, device='cuda:0')
tensor(1.2615, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4930, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9678, device='cuda:0')
tensor(1.5057, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4274, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9530, device='cuda:0')
tensor(1.3695, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2337, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9529, device='cuda:0')
tensor(1.2394, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3970, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9512, device='cuda:0')
tensor(1.4873, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4629, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9643, device='cuda:0')
tensor(1.3932, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4514, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9557, device='cuda:0')
tensor(1.4659, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4936, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9573, device='cuda:0')
tensor(1.6238, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6920, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9479, device='cuda:0')
tensor(1.4695, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1309, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9596, device='cuda:0')
tensor(1.6300, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6495, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9388, device='cuda:0')
tensor(1.4986, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1825, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9437, device='cuda:0')
tensor(1.4611, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6118, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9488, device='cuda:0')
tensor(1.4447, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5466, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9396, device='cuda:0')
tensor(1.4726, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1607, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9497, device='cuda:0')
tensor(1.5507, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5572, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9446, device='cuda:0')
tensor(1.4228, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4519, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9410, device='cuda:0')
tensor(1.6584, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2882, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9392, device='cuda:0')
tensor(1.7031, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2971, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9432, device='cuda:0')
tensor(1.2134, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3410, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9295, device='cuda:0')
tensor(1.5772, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5118, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9361, device='cuda:0')
tensor(1.4063, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6652, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9475, device='cuda:0')
tensor(1.3677, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4986, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9340, device='cuda:0')
tensor(1.4301, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4186, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9323, device='cuda:0')
tensor(1.6457, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5497, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9325, device='cuda:0')
tensor(1.4165, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5641, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9383, device='cuda:0')
tensor(1.3864, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1938, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9293, device='cuda:0')
tensor(1.3478, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3581, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9371, device='cuda:0')
tensor(1.2452, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6396, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9319, device='cuda:0')
tensor(1.2801, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7555, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9288, device='cuda:0')
tensor(1.4489, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3704, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9320, device='cuda:0')
tensor(1.4332, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0887, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9301, device='cuda:0')
tensor(1.3365, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5980, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9287, device='cuda:0')
tensor(1.4120, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4035, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9246, device='cuda:0')
tensor(1.6096, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1082, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9400, device='cuda:0')
tensor(1.2609, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5122, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9315, device='cuda:0')
tensor(1.4348, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3632, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9327, device='cuda:0')
tensor(1.4117, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4596, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9347, device='cuda:0')
tensor(1.8291, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5400, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9196, device='cuda:0')
tensor(1.4677, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3910, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9117, device='cuda:0')
tensor(1.2807, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6771, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9240, device='cuda:0')
tensor(1.4161, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4680, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9091, device='cuda:0')
tensor(1.2438, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2378, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9113, device='cuda:0')
tensor(1.3683, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2866, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9087, device='cuda:0')
tensor(1.5993, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5087, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9130, device='cuda:0')
tensor(1.6469, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6125, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9114, device='cuda:0')
tensor(1.3847, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3467, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9042, device='cuda:0')
tensor(1.6349, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6403, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9099, device='cuda:0')
tensor(1.5713, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6579, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9118, device='cuda:0')
tensor(1.4693, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5929, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9197, device='cuda:0')
tensor(0.7117, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6318, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9184, device='cuda:0')
tensor(1.4360, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3428, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9058, device='cuda:0')
tensor(1.6214, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5287, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9086, device='cuda:0')
tensor(1.4782, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3954, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9148, device='cuda:0')
tensor(1.3796, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2334, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9136, device='cuda:0')
tensor(1.5531, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4642, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9059, device='cuda:0')
tensor(1.5134, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5241, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9042, device='cuda:0')
tensor(1.7614, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6969, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8981, device='cuda:0')
tensor(1.4894, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6205, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9059, device='cuda:0')
tensor(1.5061, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4898, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8852, device='cuda:0')
tensor(1.6029, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3522, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9066, device='cuda:0')
tensor(1.5828, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4866, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9061, device='cuda:0')
tensor(1.6375, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7106, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8962, device='cuda:0')
tensor(1.4112, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5995, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9006, device='cuda:0')
tensor(1.6549, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3804, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8967, device='cuda:0')
tensor(1.7282, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4584, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8852, device='cuda:0')
tensor(1.5135, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5851, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8894, device='cuda:0')
tensor(1.6186, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5208, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8914, device='cuda:0')
tensor(1.3232, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5075, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8879, device='cuda:0')
tensor(1.3828, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7095, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8819, device='cuda:0')
tensor(1.4048, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7928, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8966, device='cuda:0')
tensor(1.3417, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4348, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8729, device='cuda:0')
tensor(1.8501, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8396, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8757, device='cuda:0')
tensor(1.5339, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3580, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8774, device='cuda:0')
tensor(1.3415, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5490, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8935, device='cuda:0')
tensor(1.8063, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7098, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8892, device='cuda:0')
tensor(1.6701, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6985, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8794, device='cuda:0')
tensor(1.7280, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4220, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8755, device='cuda:0')
tensor(1.4792, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4789, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8825, device='cuda:0')
tensor(1.6758, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4683, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8847, device='cuda:0')
tensor(1.4162, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7490, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8876, device='cuda:0')
tensor(1.5684, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4951, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8848, device='cuda:0')
tensor(1.7387, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5347, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8890, device='cuda:0')
tensor(1.3889, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6584, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8723, device='cuda:0')
tensor(1.4755, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7122, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8755, device='cuda:0')
tensor(1.2712, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5037, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8872, device='cuda:0')
tensor(1.7061, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5197, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8715, device='cuda:0')
tensor(1.4889, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4350, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8804, device='cuda:0')
tensor(1.5392, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4806, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8722, device='cuda:0')
tensor(1.7728, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8241, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8717, device='cuda:0')
tensor(1.5992, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4106, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8615, device='cuda:0')
tensor(1.6503, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8235, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8673, device='cuda:0')
tensor(1.7667, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5622, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8682, device='cuda:0')
tensor(1.6637, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5327, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8655, device='cuda:0')
tensor(1.7905, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1492, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8675, device='cuda:0')
tensor(1.5781, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6479, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8624, device='cuda:0')
tensor(1.5315, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5583, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8610, device='cuda:0')
tensor(1.3152, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6648, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8597, device='cuda:0')
tensor(1.7182, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5984, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8569, device='cuda:0')
tensor(1.8854, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3946, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8548, device='cuda:0')
tensor(1.5726, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8062, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8519, device='cuda:0')
tensor(1.3674, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4656, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8579, device='cuda:0')
tensor(1.4672, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6830, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8526, device='cuda:0')
tensor(1.5736, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5757, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8447, device='cuda:0')
tensor(1.6806, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1772, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8580, device='cuda:0')
tensor(1.8496, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4854, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8605, device='cuda:0')
tensor(1.7059, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4567, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8586, device='cuda:0')
tensor(1.8072, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5272, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8660, device='cuda:0')
tensor(1.6841, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(1.3190, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.8537, device='cuda:0')
tensor(1.3821, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2012, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9070, device='cuda:0')
tensor(1.3202, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4537, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9331, device='cuda:0')
tensor(0.8347, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0936, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9727, device='cuda:0')
tensor(1.1134, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0794, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9750, device='cuda:0')
tensor(1.4195, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1368, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9933, device='cuda:0')
tensor(1.0639, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9097, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9943, device='cuda:0')
tensor(1.0235, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1571, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0018, device='cuda:0')
tensor(1.0208, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2217, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0122, device='cuda:0')
tensor(0.8717, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8033, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0315, device='cuda:0')
tensor(1.4011, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0933, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0149, device='cuda:0')
tensor(1.0014, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0166, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0218, device='cuda:0')
tensor(1.1921, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9003, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0416, device='cuda:0')
tensor(1.3446, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1298, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0315, device='cuda:0')
tensor(0.8355, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9781, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0466, device='cuda:0')
tensor(1.1139, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0074, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0371, device='cuda:0')
tensor(1.1230, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1760, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0501, device='cuda:0')
tensor(1.2746, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9802, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0477, device='cuda:0')
tensor(1.1001, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0288, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0659, device='cuda:0')
tensor(1.1483, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2001, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0560, device='cuda:0')
tensor(1.1946, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4052, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0566, device='cuda:0')
tensor(1.0288, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4131, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0438, device='cuda:0')
tensor(1.2347, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1299, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0665, device='cuda:0')
tensor(1.4047, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3377, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0459, device='cuda:0')
tensor(0.9499, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4194, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0689, device='cuda:0')
tensor(1.0819, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1919, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0543, device='cuda:0')
tensor(1.1794, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0961, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0600, device='cuda:0')
tensor(1.4971, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2717, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0514, device='cuda:0')
tensor(1.3535, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3176, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0605, device='cuda:0')
tensor(1.2914, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2094, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0674, device='cuda:0')
tensor(1.3678, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1157, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0577, device='cuda:0')
tensor(1.0579, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2968, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0608, device='cuda:0')
tensor(1.3008, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3320, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0594, device='cuda:0')
tensor(1.1844, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9704, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0577, device='cuda:0')
tensor(1.1368, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1764, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0788, device='cuda:0')
tensor(1.5147, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1692, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0815, device='cuda:0')
tensor(1.4655, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3110, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0626, device='cuda:0')
tensor(1.3047, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5415, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0517, device='cuda:0')
tensor(1.2704, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2440, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0633, device='cuda:0')
tensor(1.2955, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2209, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0751, device='cuda:0')
tensor(1.5164, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2405, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0632, device='cuda:0')
tensor(1.3437, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1528, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0606, device='cuda:0')
tensor(1.4031, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1513, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0623, device='cuda:0')
tensor(1.2859, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4618, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0553, device='cuda:0')
tensor(1.0619, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3250, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0504, device='cuda:0')
tensor(1.7398, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4314, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0486, device='cuda:0')
tensor(1.3874, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1800, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0572, device='cuda:0')
tensor(1.2995, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5044, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0446, device='cuda:0')
tensor(1.4826, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4463, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0576, device='cuda:0')
tensor(1.2954, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3720, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0623, device='cuda:0')
tensor(1.2759, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3282, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0436, device='cuda:0')
tensor(1.3981, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2997, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0442, device='cuda:0')
tensor(1.3058, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0732, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0490, device='cuda:0')
tensor(1.5075, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2375, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0470, device='cuda:0')
tensor(1.3696, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3995, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0372, device='cuda:0')
tensor(1.1447, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5231, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0398, device='cuda:0')
tensor(1.3594, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4203, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0406, device='cuda:0')
tensor(1.2170, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3151, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0357, device='cuda:0')
tensor(1.4641, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3196, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0335, device='cuda:0')
tensor(1.6091, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0685, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0328, device='cuda:0')
tensor(1.3608, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5890, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0393, device='cuda:0')
tensor(1.5288, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3530, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0424, device='cuda:0')
tensor(1.3921, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3894, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0577, device='cuda:0')
tensor(1.5364, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2099, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0397, device='cuda:0')
tensor(1.4899, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4780, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0373, device='cuda:0')
tensor(1.2189, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2572, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0301, device='cuda:0')
tensor(1.6310, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4441, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0356, device='cuda:0')
tensor(1.2609, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4865, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0305, device='cuda:0')
tensor(1.1797, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4231, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0291, device='cuda:0')
tensor(1.8220, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4764, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0232, device='cuda:0')
tensor(1.0837, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2510, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0306, device='cuda:0')
tensor(1.6036, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5474, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0299, device='cuda:0')
tensor(1.3101, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4701, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0213, device='cuda:0')
tensor(1.2183, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6412, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0352, device='cuda:0')
tensor(1.2442, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3674, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0235, device='cuda:0')
tensor(1.6726, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7445, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0211, device='cuda:0')
tensor(1.3368, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3815, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0172, device='cuda:0')
tensor(1.8414, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4627, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0094, device='cuda:0')
tensor(1.1567, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4636, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0345, device='cuda:0')
tensor(1.3436, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4977, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0123, device='cuda:0')
tensor(1.3849, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3047, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0110, device='cuda:0')
tensor(1.1912, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4609, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0196, device='cuda:0')
tensor(1.4055, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3467, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0205, device='cuda:0')
tensor(1.5612, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5425, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0234, device='cuda:0')
tensor(1.2778, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4470, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0150, device='cuda:0')
tensor(1.1990, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7366, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0057, device='cuda:0')
tensor(1.1883, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4003, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0113, device='cuda:0')
tensor(1.3506, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2263, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0211, device='cuda:0')
tensor(0.8061, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5093, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0046, device='cuda:0')
tensor(1.2681, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2597, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9961, device='cuda:0')
tensor(1.4093, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5498, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0080, device='cuda:0')
tensor(1.4466, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3900, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0166, device='cuda:0')
tensor(1.3081, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6699, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0026, device='cuda:0')
tensor(1.3654, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2608, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0011, device='cuda:0')
tensor(1.3161, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5463, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0013, device='cuda:0')
tensor(1.0891, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5739, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9996, device='cuda:0')
tensor(1.4149, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3674, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9994, device='cuda:0')
tensor(1.4404, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4748, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0035, device='cuda:0')
tensor(1.5295, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2449, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9992, device='cuda:0')
tensor(1.7671, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3550, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9893, device='cuda:0')
tensor(1.7643, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4390, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9793, device='cuda:0')
tensor(1.3225, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4145, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9940, device='cuda:0')
tensor(1.2983, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4148, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9905, device='cuda:0')
tensor(1.6122, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6010, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9903, device='cuda:0')
tensor(1.3106, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3686, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9909, device='cuda:0')
tensor(1.5520, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4932, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9807, device='cuda:0')
tensor(1.3607, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3882, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9865, device='cuda:0')
tensor(1.4899, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4997, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9936, device='cuda:0')
tensor(1.4202, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4622, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9808, device='cuda:0')
tensor(1.2754, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3697, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9867, device='cuda:0')
tensor(1.2296, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4883, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9819, device='cuda:0')
tensor(1.4711, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1475, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9834, device='cuda:0')
tensor(1.2480, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6560, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9829, device='cuda:0')
tensor(1.6174, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5746, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9843, device='cuda:0')
tensor(1.3970, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6522, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9861, device='cuda:0')
tensor(1.4619, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3198, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9783, device='cuda:0')
tensor(1.6619, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5898, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9716, device='cuda:0')
tensor(1.4735, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5173, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9733, device='cuda:0')
tensor(1.5174, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4023, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9685, device='cuda:0')
tensor(1.6168, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2994, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9762, device='cuda:0')
tensor(1.4311, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6731, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9669, device='cuda:0')
tensor(1.4517, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4162, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9732, device='cuda:0')
tensor(1.4246, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8302, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9742, device='cuda:0')
tensor(1.1757, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5229, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9610, device='cuda:0')
tensor(1.5755, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5392, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9649, device='cuda:0')
tensor(1.4038, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5411, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9663, device='cuda:0')
tensor(1.3920, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5111, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9538, device='cuda:0')
tensor(1.7603, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6005, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9641, device='cuda:0')
tensor(1.4277, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4772, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9569, device='cuda:0')
tensor(1.9820, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2812, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9564, device='cuda:0')
tensor(1.4147, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6145, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9626, device='cuda:0')
tensor(1.4354, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3713, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9563, device='cuda:0')
tensor(1.6968, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6011, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9557, device='cuda:0')
tensor(1.5221, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5131, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9642, device='cuda:0')
tensor(1.7035, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6155, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9455, device='cuda:0')
tensor(1.6199, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3741, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9445, device='cuda:0')
tensor(1.5792, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5721, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9493, device='cuda:0')
tensor(1.1787, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5858, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9557, device='cuda:0')
tensor(1.4360, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4644, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9553, device='cuda:0')
tensor(1.3351, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5594, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9490, device='cuda:0')
tensor(1.5701, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6285, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9419, device='cuda:0')
tensor(1.8435, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.8284, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9313, device='cuda:0')
tensor(1.6645, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4195, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9463, device='cuda:0')
tensor(1.3808, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4750, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9369, device='cuda:0')
tensor(1.5246, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3327, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9454, device='cuda:0')
tensor(1.2577, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5112, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9287, device='cuda:0')
tensor(1.3612, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3427, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9447, device='cuda:0')
tensor(1.2355, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3619, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9423, device='cuda:0')
tensor(1.4431, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(1.1670, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(2.9465, device='cuda:0')
tensor(1.4069, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9476, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0029, device='cuda:0')
tensor(1.2931, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8526, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0184, device='cuda:0')
tensor(0.9674, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2126, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0458, device='cuda:0')
tensor(1.2530, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7969, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0511, device='cuda:0')
tensor(1.0383, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0222, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0719, device='cuda:0')
tensor(1.1749, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7108, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0876, device='cuda:0')
tensor(1.1324, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9438, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0937, device='cuda:0')
tensor(0.7583, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9115, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1117, device='cuda:0')
tensor(1.0056, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3319, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1066, device='cuda:0')
tensor(1.1308, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9970, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1155, device='cuda:0')
tensor(1.0483, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1417, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1243, device='cuda:0')
tensor(1.2182, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1939, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1130, device='cuda:0')
tensor(0.9776, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9486, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1320, device='cuda:0')
tensor(1.2681, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4153, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1243, device='cuda:0')
tensor(1.2031, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2370, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1154, device='cuda:0')
tensor(1.2244, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9527, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1298, device='cuda:0')
tensor(1.1764, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1123, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1422, device='cuda:0')
tensor(1.0135, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8042, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1433, device='cuda:0')
tensor(1.4061, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0003, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1500, device='cuda:0')
tensor(1.1829, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2286, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1578, device='cuda:0')
tensor(1.2608, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1993, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1412, device='cuda:0')
tensor(0.9903, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8312, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1748, device='cuda:0')
tensor(1.2270, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2645, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1571, device='cuda:0')
tensor(1.1585, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9103, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1752, device='cuda:0')
tensor(1.3879, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2922, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1566, device='cuda:0')
tensor(1.0252, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9290, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1690, device='cuda:0')
tensor(1.2665, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4047, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1507, device='cuda:0')
tensor(0.9626, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1150, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1410, device='cuda:0')
tensor(1.2100, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0665, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1487, device='cuda:0')
tensor(1.1649, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4057, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1685, device='cuda:0')
tensor(1.1305, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0754, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1596, device='cuda:0')
tensor(1.3089, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3221, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1555, device='cuda:0')
tensor(1.2017, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1835, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1543, device='cuda:0')
tensor(1.5098, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9996, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1501, device='cuda:0')
tensor(1.3606, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3501, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1430, device='cuda:0')
tensor(1.1403, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1266, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1506, device='cuda:0')
tensor(1.3503, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3115, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1524, device='cuda:0')
tensor(1.0678, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0981, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1390, device='cuda:0')
tensor(1.6084, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2124, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1299, device='cuda:0')
tensor(1.2597, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2059, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1381, device='cuda:0')
tensor(1.4732, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1102, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1380, device='cuda:0')
tensor(1.1130, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2182, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1469, device='cuda:0')
tensor(1.3364, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0858, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1525, device='cuda:0')
tensor(1.2698, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1439, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1394, device='cuda:0')
tensor(1.2347, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2007, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1394, device='cuda:0')
tensor(1.3861, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1299, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1370, device='cuda:0')
tensor(1.1483, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2694, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1451, device='cuda:0')
tensor(0.9561, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0152, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1514, device='cuda:0')
tensor(1.4025, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4232, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1453, device='cuda:0')
tensor(1.2729, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0097, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1398, device='cuda:0')
tensor(1.1814, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3272, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1285, device='cuda:0')
tensor(1.1462, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0835, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1351, device='cuda:0')
tensor(1.3768, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3644, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1329, device='cuda:0')
tensor(1.0788, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6621, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1232, device='cuda:0')
tensor(0.9405, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4799, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1334, device='cuda:0')
tensor(1.4042, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2315, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1276, device='cuda:0')
tensor(1.0925, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4967, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1190, device='cuda:0')
tensor(1.4586, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3209, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1273, device='cuda:0')
tensor(1.3003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1616, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1160, device='cuda:0')
tensor(1.4724, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1001, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1293, device='cuda:0')
tensor(1.3694, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3770, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1206, device='cuda:0')
tensor(1.1656, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2345, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1323, device='cuda:0')
tensor(1.4521, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0550, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1179, device='cuda:0')
tensor(1.4756, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0235, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1198, device='cuda:0')
tensor(1.6076, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5302, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1254, device='cuda:0')
tensor(1.4410, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1616, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1018, device='cuda:0')
tensor(1.3697, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0376, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1091, device='cuda:0')
tensor(1.4949, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0773, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1118, device='cuda:0')
tensor(1.4126, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3215, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1105, device='cuda:0')
tensor(1.2599, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3190, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1229, device='cuda:0')
tensor(1.4345, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2809, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1185, device='cuda:0')
tensor(1.1907, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3974, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1074, device='cuda:0')
tensor(1.2999, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5264, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1029, device='cuda:0')
tensor(1.3947, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2042, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1059, device='cuda:0')
tensor(1.5085, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4543, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1036, device='cuda:0')
tensor(0.9339, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2435, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1006, device='cuda:0')
tensor(1.3991, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6589, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1078, device='cuda:0')
tensor(1.2842, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3774, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1377, device='cuda:0')
tensor(1.1069, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3298, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1121, device='cuda:0')
tensor(1.4149, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4215, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1064, device='cuda:0')
tensor(0.6699, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2018, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1093, device='cuda:0')
tensor(1.4121, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3156, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1126, device='cuda:0')
tensor(1.2342, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5360, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0929, device='cuda:0')
tensor(1.3307, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5438, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1069, device='cuda:0')
tensor(1.1392, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5510, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0846, device='cuda:0')
tensor(1.3270, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3892, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0925, device='cuda:0')
tensor(1.2280, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3568, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0898, device='cuda:0')
tensor(1.4177, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1259, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0951, device='cuda:0')
tensor(1.1223, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2193, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0821, device='cuda:0')
tensor(1.2623, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3006, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0870, device='cuda:0')
tensor(1.3519, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3965, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0880, device='cuda:0')
tensor(1.4180, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1332, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0908, device='cuda:0')
tensor(1.4506, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3939, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0907, device='cuda:0')
tensor(1.3571, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5296, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1005, device='cuda:0')
tensor(1.6002, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1837, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0875, device='cuda:0')
tensor(1.2338, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5424, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0740, device='cuda:0')
tensor(1.3654, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3905, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0813, device='cuda:0')
tensor(1.1648, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0908, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0823, device='cuda:0')
tensor(1.1772, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2822, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0807, device='cuda:0')
tensor(1.5549, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5114, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0787, device='cuda:0')
tensor(1.3312, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3599, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0742, device='cuda:0')
tensor(1.2922, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3135, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0687, device='cuda:0')
tensor(1.2483, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3532, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0625, device='cuda:0')
tensor(1.2758, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3268, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0698, device='cuda:0')
tensor(1.6902, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4870, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0618, device='cuda:0')
tensor(1.4403, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4049, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0727, device='cuda:0')
tensor(1.4853, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1762, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0474, device='cuda:0')
tensor(1.6084, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4765, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0575, device='cuda:0')
tensor(1.4869, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1761, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0627, device='cuda:0')
tensor(1.7082, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3502, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0784, device='cuda:0')
tensor(1.2182, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2440, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0610, device='cuda:0')
tensor(1.4137, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3419, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0677, device='cuda:0')
tensor(1.4894, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6633, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0628, device='cuda:0')
tensor(1.4197, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3849, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0519, device='cuda:0')
tensor(1.8599, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5065, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0681, device='cuda:0')
tensor(1.4128, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2275, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0558, device='cuda:0')
tensor(1.4003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5350, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0474, device='cuda:0')
tensor(1.4872, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3372, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0526, device='cuda:0')
tensor(1.2301, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4405, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0478, device='cuda:0')
tensor(1.3631, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3762, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0364, device='cuda:0')
tensor(1.1095, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5869, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0555, device='cuda:0')
tensor(1.5459, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5212, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0528, device='cuda:0')
tensor(1.5754, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1320, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0539, device='cuda:0')
tensor(1.3858, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5178, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0463, device='cuda:0')
tensor(1.5226, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5550, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0564, device='cuda:0')
tensor(1.5738, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5368, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0408, device='cuda:0')
tensor(1.2066, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2278, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0384, device='cuda:0')
tensor(1.4233, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4656, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0553, device='cuda:0')
tensor(1.4719, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2319, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0378, device='cuda:0')
tensor(1.3445, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2496, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0370, device='cuda:0')
tensor(1.4621, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3108, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0467, device='cuda:0')
tensor(1.4880, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2108, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0291, device='cuda:0')
tensor(1.5658, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5333, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0384, device='cuda:0')
tensor(1.5700, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3233, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0462, device='cuda:0')
tensor(1.2271, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2923, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0394, device='cuda:0')
tensor(1.5324, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2261, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0434, device='cuda:0')
tensor(1.3683, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5596, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0447, device='cuda:0')
tensor(1.4486, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3916, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0409, device='cuda:0')
tensor(1.3071, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4399, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0324, device='cuda:0')
tensor(1.4139, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7070, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0382, device='cuda:0')
tensor(1.3592, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2693, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0379, device='cuda:0')
tensor(1.3602, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3425, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0311, device='cuda:0')
tensor(1.2073, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5038, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0313, device='cuda:0')
tensor(1.6313, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2405, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0253, device='cuda:0')
tensor(1.6965, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2552, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0116, device='cuda:0')
tensor(1.3903, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.7303, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0244, device='cuda:0')
tensor(1.2992, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3325, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0153, device='cuda:0')


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(0.9496, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9327, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.0590, device='cuda:0')
tensor(0.5347, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1404, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1140, device='cuda:0')
tensor(1.1112, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8713, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1304, device='cuda:0')
tensor(0.6388, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1371, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1701, device='cuda:0')
tensor(1.0483, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0961, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1748, device='cuda:0')
tensor(1.0480, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4264, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1871, device='cuda:0')
tensor(1.2210, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2291, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1809, device='cuda:0')
tensor(0.9681, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3551, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1878, device='cuda:0')
tensor(1.2754, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8432, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1919, device='cuda:0')
tensor(0.9998, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1673, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2146, device='cuda:0')
tensor(1.0503, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7482, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2267, device='cuda:0')
tensor(1.0124, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8719, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2144, device='cuda:0')
tensor(0.9104, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1007, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2310, device='cuda:0')
tensor(1.1607, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2949, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2211, device='cuda:0')
tensor(0.5799, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1651, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2320, device='cuda:0')
tensor(1.0835, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2173, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2286, device='cuda:0')
tensor(1.0393, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9186, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2156, device='cuda:0')
tensor(1.0757, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3027, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2332, device='cuda:0')
tensor(1.3475, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3710, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2254, device='cuda:0')
tensor(1.4994, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1378, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2256, device='cuda:0')
tensor(1.2519, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1172, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2242, device='cuda:0')
tensor(1.1955, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3640, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2125, device='cuda:0')
tensor(1.4935, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1577, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2222, device='cuda:0')
tensor(1.0235, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1021, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2293, device='cuda:0')
tensor(0.9956, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2663, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2313, device='cuda:0')
tensor(0.8705, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7267, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2303, device='cuda:0')
tensor(0.9744, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2212, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2317, device='cuda:0')
tensor(1.3152, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3991, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2533, device='cuda:0')
tensor(1.2048, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0533, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2412, device='cuda:0')
tensor(1.1089, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9653, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2318, device='cuda:0')
tensor(1.2221, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2230, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2337, device='cuda:0')
tensor(0.9956, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0493, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2414, device='cuda:0')
tensor(1.4774, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9727, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2506, device='cuda:0')
tensor(0.9281, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2227, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2434, device='cuda:0')
tensor(1.1665, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5614, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2411, device='cuda:0')
tensor(1.0435, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0171, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2411, device='cuda:0')
tensor(1.3158, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1286, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2315, device='cuda:0')
tensor(1.2199, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2235, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2311, device='cuda:0')
tensor(0.6390, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2774, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2310, device='cuda:0')
tensor(0.6396, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1238, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2222, device='cuda:0')
tensor(1.2064, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2991, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2307, device='cuda:0')
tensor(1.4528, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9176, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2278, device='cuda:0')
tensor(1.3158, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0740, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2208, device='cuda:0')
tensor(1.1155, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3804, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2426, device='cuda:0')
tensor(1.1329, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1859, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2260, device='cuda:0')
tensor(1.2625, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3201, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2265, device='cuda:0')
tensor(1.1887, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3705, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2181, device='cuda:0')
tensor(1.0209, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1290, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2204, device='cuda:0')
tensor(1.1097, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3323, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2098, device='cuda:0')
tensor(1.1509, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3350, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2446, device='cuda:0')
tensor(1.1921, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2942, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2099, device='cuda:0')
tensor(1.2120, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5693, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2222, device='cuda:0')
tensor(0.9498, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2970, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2142, device='cuda:0')
tensor(1.2411, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8171, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2191, device='cuda:0')
tensor(1.1999, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3813, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2104, device='cuda:0')
tensor(1.3097, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3019, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2255, device='cuda:0')
tensor(1.1332, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1069, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2198, device='cuda:0')
tensor(1.0225, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9672, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2107, device='cuda:0')
tensor(1.0539, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2638, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2244, device='cuda:0')
tensor(1.1104, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2424, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2019, device='cuda:0')
tensor(1.0007, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1975, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2179, device='cuda:0')
tensor(1.2295, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4149, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2016, device='cuda:0')
tensor(0.9561, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3438, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2302, device='cuda:0')
tensor(0.9127, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4132, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2144, device='cuda:0')
tensor(1.2813, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1098, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2129, device='cuda:0')
tensor(1.5028, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0933, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2108, device='cuda:0')
tensor(1.4284, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9305, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2011, device='cuda:0')
tensor(1.0602, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2229, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2068, device='cuda:0')
tensor(1.0598, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3472, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2234, device='cuda:0')
tensor(1.3532, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3600, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2214, device='cuda:0')
tensor(1.2309, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4959, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2128, device='cuda:0')
tensor(1.2211, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0750, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2071, device='cuda:0')
tensor(1.1650, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2501, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1975, device='cuda:0')
tensor(1.2015, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0324, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2030, device='cuda:0')
tensor(1.5439, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0046, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2140, device='cuda:0')
tensor(1.2174, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3580, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1988, device='cuda:0')
tensor(1.2651, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3755, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1861, device='cuda:0')
tensor(1.2483, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3913, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2102, device='cuda:0')
tensor(1.3419, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4055, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1832, device='cuda:0')
tensor(1.4182, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9828, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1927, device='cuda:0')
tensor(1.1819, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3757, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1937, device='cuda:0')
tensor(1.0266, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0068, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2144, device='cuda:0')
tensor(1.0883, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4166, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1934, device='cuda:0')
tensor(1.1347, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3747, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1994, device='cuda:0')
tensor(1.3392, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4572, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1862, device='cuda:0')
tensor(0.9792, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6424, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2036, device='cuda:0')
tensor(0.7392, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5542, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1917, device='cuda:0')
tensor(1.3776, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2483, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1871, device='cuda:0')
tensor(1.0595, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9518, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1843, device='cuda:0')
tensor(1.1919, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1892, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1848, device='cuda:0')
tensor(1.5191, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2312, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1812, device='cuda:0')
tensor(1.2737, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4906, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1830, device='cuda:0')
tensor(1.2709, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2082, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1864, device='cuda:0')
tensor(1.3296, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3621, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1814, device='cuda:0')
tensor(1.2464, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1709, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1705, device='cuda:0')
tensor(1.2577, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0728, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1850, device='cuda:0')
tensor(1.2875, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3554, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1866, device='cuda:0')
tensor(1.5360, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5116, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1833, device='cuda:0')
tensor(1.3595, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2375, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1651, device='cuda:0')
tensor(1.2071, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2096, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1656, device='cuda:0')
tensor(1.3418, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5499, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1670, device='cuda:0')
tensor(1.3306, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4768, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1627, device='cuda:0')
tensor(1.4887, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4759, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1647, device='cuda:0')
tensor(1.2408, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4046, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1660, device='cuda:0')
tensor(1.5146, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3326, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1718, device='cuda:0')
tensor(1.3691, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0586, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1715, device='cuda:0')
tensor(1.1378, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2181, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1576, device='cuda:0')
tensor(1.1766, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4873, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1573, device='cuda:0')
tensor(1.3945, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3586, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1616, device='cuda:0')
tensor(1.2307, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1459, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1533, device='cuda:0')
tensor(1.3520, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2702, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1501, device='cuda:0')
tensor(1.3785, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3846, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1514, device='cuda:0')
tensor(1.4503, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1428, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1368, device='cuda:0')
tensor(1.3824, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3033, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1421, device='cuda:0')
tensor(1.2752, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5804, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1585, device='cuda:0')
tensor(1.2111, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4988, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1522, device='cuda:0')
tensor(1.5347, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2623, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1384, device='cuda:0')
tensor(1.2273, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6180, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1358, device='cuda:0')
tensor(1.4723, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2549, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1320, device='cuda:0')
tensor(1.2186, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1977, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1329, device='cuda:0')
tensor(1.3349, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3849, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1487, device='cuda:0')
tensor(1.2102, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5240, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1277, device='cuda:0')
tensor(1.2588, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1419, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1303, device='cuda:0')
tensor(1.4067, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4955, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1306, device='cuda:0')
tensor(1.5017, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4006, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1456, device='cuda:0')
tensor(1.2056, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2938, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1315, device='cuda:0')
tensor(1.4129, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2844, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1321, device='cuda:0')
tensor(1.4663, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1603, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1323, device='cuda:0')
tensor(1.5473, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5300, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1242, device='cuda:0')
tensor(1.2574, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4300, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1206, device='cuda:0')
tensor(1.1133, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4796, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1377, device='cuda:0')
tensor(1.4111, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3841, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1251, device='cuda:0')
tensor(1.6508, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6527, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1335, device='cuda:0')
tensor(1.2520, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2873, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1144, device='cuda:0')
tensor(1.4473, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2115, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1265, device='cuda:0')
tensor(1.2019, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1730, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1190, device='cuda:0')
tensor(1.4087, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5484, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1233, device='cuda:0')
tensor(1.5279, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3465, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1189, device='cuda:0')
tensor(1.2374, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4578, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1212, device='cuda:0')
tensor(1.2672, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6063, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1208, device='cuda:0')
tensor(1.1801, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5138, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1174, device='cuda:0')
tensor(1.4253, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2952, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1216, device='cuda:0')
tensor(1.2149, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1735, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1236, device='cuda:0')
tensor(1.3216, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4488, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1170, device='cuda:0')
tensor(1.2511, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2960, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1193, device='cuda:0')
tensor(1.4579, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5988, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1125, device='cuda:0')
tensor(1.5084, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2991, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1074, device='cuda:0')
tensor(1.3270, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1363, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1076, device='cuda:0')


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(1.1992, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9225, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1649, device='cuda:0')
tensor(0.7952, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8950, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1881, device='cuda:0')
tensor(1.0285, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9889, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2385, device='cuda:0')
tensor(0.9711, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1104, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2486, device='cuda:0')
tensor(1.2990, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1320, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2566, device='cuda:0')
tensor(1.2423, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8691, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2527, device='cuda:0')
tensor(0.6551, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0610, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2831, device='cuda:0')
tensor(0.7824, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8138, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2868, device='cuda:0')
tensor(1.2161, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7653, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3080, device='cuda:0')
tensor(1.2961, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1991, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3053, device='cuda:0')
tensor(0.9498, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7945, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3327, device='cuda:0')
tensor(1.0759, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2277, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3161, device='cuda:0')
tensor(0.8908, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0176, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3121, device='cuda:0')
tensor(1.2236, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1837, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3167, device='cuda:0')
tensor(0.8941, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7601, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3156, device='cuda:0')
tensor(0.7614, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8143, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3403, device='cuda:0')
tensor(1.0243, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6608, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3415, device='cuda:0')
tensor(0.5723, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9558, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3422, device='cuda:0')
tensor(0.9726, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2516, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3325, device='cuda:0')
tensor(0.9705, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0926, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3396, device='cuda:0')
tensor(0.8341, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0431, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3307, device='cuda:0')
tensor(0.7129, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8931, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3238, device='cuda:0')
tensor(0.7867, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1642, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3284, device='cuda:0')
tensor(0.6912, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9858, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3312, device='cuda:0')
tensor(1.1738, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0880, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3227, device='cuda:0')
tensor(0.9455, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2829, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3474, device='cuda:0')
tensor(1.2916, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7832, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3424, device='cuda:0')
tensor(1.1289, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9931, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3330, device='cuda:0')
tensor(0.8331, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2419, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3430, device='cuda:0')
tensor(0.9427, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1847, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3405, device='cuda:0')
tensor(0.8916, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9816, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3374, device='cuda:0')
tensor(0.9332, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1781, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3406, device='cuda:0')
tensor(0.9392, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0978, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3229, device='cuda:0')
tensor(1.1541, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9675, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3310, device='cuda:0')
tensor(0.7797, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0057, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3294, device='cuda:0')
tensor(1.0645, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2733, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3403, device='cuda:0')
tensor(1.1018, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2353, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3393, device='cuda:0')
tensor(0.9346, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0683, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3269, device='cuda:0')
tensor(1.0200, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3879, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3244, device='cuda:0')
tensor(1.1410, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1682, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3090, device='cuda:0')
tensor(1.2073, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2949, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3200, device='cuda:0')
tensor(0.8950, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7033, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3197, device='cuda:0')
tensor(0.9818, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0888, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3254, device='cuda:0')
tensor(1.0541, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1715, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3242, device='cuda:0')
tensor(1.0553, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1301, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3121, device='cuda:0')
tensor(0.8224, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3526, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3301, device='cuda:0')
tensor(0.9624, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1105, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3330, device='cuda:0')
tensor(0.9793, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1550, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3151, device='cuda:0')
tensor(1.2198, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2557, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3213, device='cuda:0')
tensor(0.4227, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3443, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3309, device='cuda:0')
tensor(1.3084, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9443, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3198, device='cuda:0')
tensor(0.9979, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3875, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3045, device='cuda:0')
tensor(0.5972, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0661, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3112, device='cuda:0')
tensor(1.1983, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1510, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3139, device='cuda:0')
tensor(1.0161, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2070, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3187, device='cuda:0')
tensor(1.2776, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0741, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3139, device='cuda:0')
tensor(1.1637, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3937, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3140, device='cuda:0')
tensor(1.1121, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0832, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3154, device='cuda:0')
tensor(1.0989, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9805, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3201, device='cuda:0')
tensor(1.1839, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3474, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3206, device='cuda:0')
tensor(1.2529, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4566, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3181, device='cuda:0')
tensor(1.1901, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1381, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3073, device='cuda:0')
tensor(1.2402, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0084, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2998, device='cuda:0')
tensor(1.2138, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8856, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3056, device='cuda:0')
tensor(1.2075, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9974, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3013, device='cuda:0')
tensor(1.1300, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2180, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3073, device='cuda:0')
tensor(1.2140, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5874, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2963, device='cuda:0')
tensor(1.1875, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9373, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2990, device='cuda:0')
tensor(1.2142, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3204, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2907, device='cuda:0')
tensor(0.6706, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1275, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2772, device='cuda:0')
tensor(1.3624, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1820, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2913, device='cuda:0')
tensor(1.1218, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2924, device='cuda:0')
tensor(1.3875, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0605, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2863, device='cuda:0')
tensor(0.9981, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2359, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2773, device='cuda:0')
tensor(1.0408, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1806, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2788, device='cuda:0')
tensor(0.8905, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4991, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2882, device='cuda:0')
tensor(1.1229, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2136, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2738, device='cuda:0')
tensor(1.2272, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4091, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2727, device='cuda:0')
tensor(1.2248, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9734, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2700, device='cuda:0')
tensor(1.4696, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2826, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2794, device='cuda:0')
tensor(1.0850, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4856, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2611, device='cuda:0')
tensor(1.4106, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1323, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2597, device='cuda:0')
tensor(1.2318, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1457, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2675, device='cuda:0')
tensor(1.0196, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4701, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2725, device='cuda:0')
tensor(1.1362, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.5442, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2647, device='cuda:0')
tensor(1.2496, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2035, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2699, device='cuda:0')
tensor(1.1626, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4410, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2556, device='cuda:0')
tensor(1.0706, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9724, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2670, device='cuda:0')
tensor(1.1866, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3419, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2709, device='cuda:0')
tensor(1.3720, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1366, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2594, device='cuda:0')
tensor(1.1951, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4041, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2642, device='cuda:0')
tensor(1.5108, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0474, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2530, device='cuda:0')
tensor(1.0446, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2666, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2698, device='cuda:0')
tensor(1.1573, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9943, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2643, device='cuda:0')
tensor(1.0833, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2979, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2634, device='cuda:0')
tensor(1.3253, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0020, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2642, device='cuda:0')
tensor(1.2329, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9680, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2770, device='cuda:0')
tensor(1.1430, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2335, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2581, device='cuda:0')
tensor(1.4272, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1378, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2427, device='cuda:0')
tensor(1.4780, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3983, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2442, device='cuda:0')
tensor(1.5031, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3368, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2578, device='cuda:0')
tensor(0.7313, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2428, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2444, device='cuda:0')
tensor(1.2803, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4047, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2594, device='cuda:0')
tensor(1.0957, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4361, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2403, device='cuda:0')
tensor(1.2921, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0167, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2422, device='cuda:0')
tensor(1.1669, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2161, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2509, device='cuda:0')
tensor(1.2406, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3056, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2515, device='cuda:0')
tensor(1.2763, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4843, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2429, device='cuda:0')
tensor(1.2241, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3647, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2370, device='cuda:0')
tensor(1.2426, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6119, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2364, device='cuda:0')
tensor(0.9516, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1385, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2585, device='cuda:0')
tensor(1.2496, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1137, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2471, device='cuda:0')
tensor(1.2409, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0138, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2270, device='cuda:0')
tensor(1.2860, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2140, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2367, device='cuda:0')
tensor(1.1521, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5497, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2289, device='cuda:0')
tensor(1.3520, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3189, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2164, device='cuda:0')
tensor(1.2488, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0851, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2374, device='cuda:0')
tensor(1.0703, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1508, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2334, device='cuda:0')
tensor(1.4124, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2523, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2247, device='cuda:0')
tensor(1.3298, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2154, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2317, device='cuda:0')
tensor(1.3850, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4456, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2268, device='cuda:0')
tensor(1.0799, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5103, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2375, device='cuda:0')
tensor(1.1906, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2347, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2206, device='cuda:0')
tensor(1.3808, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4479, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2269, device='cuda:0')
tensor(1.3717, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2236, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2256, device='cuda:0')
tensor(1.2504, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5567, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2155, device='cuda:0')
tensor(1.3466, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0454, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2144, device='cuda:0')
tensor(1.2311, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3386, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2144, device='cuda:0')
tensor(0.5671, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3807, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2249, device='cuda:0')
tensor(1.3519, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4319, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2180, device='cuda:0')
tensor(1.3946, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1600, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2170, device='cuda:0')
tensor(1.2190, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9616, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2065, device='cuda:0')
tensor(1.3043, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3416, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2062, device='cuda:0')
tensor(1.2333, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2030, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2065, device='cuda:0')
tensor(1.2131, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3918, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2015, device='cuda:0')
tensor(1.6417, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3574, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1967, device='cuda:0')
tensor(1.2867, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3930, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1991, device='cuda:0')
tensor(1.0648, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5615, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2015, device='cuda:0')
tensor(1.1572, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3098, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2048, device='cuda:0')
tensor(1.2467, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1418, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2156, device='cuda:0')
tensor(1.1221, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3748, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1932, device='cuda:0')
tensor(1.3287, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3623, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2002, device='cuda:0')
tensor(1.1404, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2478, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2010, device='cuda:0')
tensor(1.3356, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4626, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2068, device='cuda:0')
tensor(1.6305, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2532, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1979, device='cuda:0')
tensor(1.4240, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1821, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1880, device='cuda:0')
tensor(1.3624, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2009, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.1838, device='cuda:0')
tensor(1.4793, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(0.8863, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2077, device='cuda:0')
tensor(0.7926, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1320, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2630, device='cuda:0')
tensor(0.8857, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7233, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2980, device='cuda:0')
tensor(0.8147, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.4250, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3506, device='cuda:0')
tensor(0.6793, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6248, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3506, device='cuda:0')
tensor(0.9528, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9551, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3601, device='cuda:0')
tensor(1.1004, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8354, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3535, device='cuda:0')
tensor(0.7224, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0818, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3619, device='cuda:0')
tensor(0.9761, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9964, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3697, device='cuda:0')
tensor(1.0003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8962, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3717, device='cuda:0')
tensor(1.0902, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8349, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3687, device='cuda:0')
tensor(0.7978, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3299, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3935, device='cuda:0')
tensor(1.1304, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.5978, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3991, device='cuda:0')
tensor(1.1430, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0268, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3967, device='cuda:0')
tensor(1.2028, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0780, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4008, device='cuda:0')
tensor(1.2233, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8654, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4211, device='cuda:0')
tensor(0.8254, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0985, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4135, device='cuda:0')
tensor(0.9857, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9851, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4077, device='cuda:0')
tensor(1.0706, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8788, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4126, device='cuda:0')
tensor(0.8593, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0993, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4132, device='cuda:0')
tensor(0.8040, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1800, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4082, device='cuda:0')
tensor(0.8334, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8548, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4078, device='cuda:0')
tensor(0.7340, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8901, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4089, device='cuda:0')
tensor(1.1250, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8739, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4075, device='cuda:0')
tensor(0.8930, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0568, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4247, device='cuda:0')
tensor(0.9717, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0296, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4287, device='cuda:0')
tensor(1.3895, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9414, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4133, device='cuda:0')
tensor(1.1959, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1007, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4243, device='cuda:0')
tensor(1.2487, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7262, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3886, device='cuda:0')
tensor(1.0895, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0208, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4276, device='cuda:0')
tensor(0.9976, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8125, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4148, device='cuda:0')
tensor(1.2210, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9499, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4088, device='cuda:0')
tensor(0.7839, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7900, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4234, device='cuda:0')
tensor(0.7936, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9894, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4334, device='cuda:0')
tensor(1.2058, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1208, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4099, device='cuda:0')
tensor(1.1988, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9277, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4172, device='cuda:0')
tensor(0.8420, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1151, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4220, device='cuda:0')
tensor(0.9690, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0016, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4092, device='cuda:0')
tensor(1.2484, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0912, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3916, device='cuda:0')
tensor(1.1331, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9181, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4283, device='cuda:0')
tensor(1.0294, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8914, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4027, device='cuda:0')
tensor(0.8757, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7473, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4215, device='cuda:0')
tensor(0.8287, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2173, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4167, device='cuda:0')
tensor(1.1238, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0557, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4132, device='cuda:0')
tensor(1.1116, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0101, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4107, device='cuda:0')
tensor(0.8841, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9412, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4183, device='cuda:0')
tensor(1.0152, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0493, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4008, device='cuda:0')
tensor(1.0130, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2920, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4032, device='cuda:0')
tensor(0.9665, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0334, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3952, device='cuda:0')
tensor(0.8689, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9202, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4249, device='cuda:0')
tensor(1.1568, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8193, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4018, device='cuda:0')
tensor(0.9614, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0648, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4028, device='cuda:0')
tensor(1.3347, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5452, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4268, device='cuda:0')
tensor(0.5072, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0708, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3980, device='cuda:0')
tensor(1.1514, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1226, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4129, device='cuda:0')
tensor(1.0749, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0933, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4088, device='cuda:0')
tensor(1.2328, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1048, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3882, device='cuda:0')
tensor(1.0880, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8702, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4126, device='cuda:0')
tensor(1.2981, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9954, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4072, device='cuda:0')
tensor(1.2804, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3569, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4018, device='cuda:0')
tensor(1.1758, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0701, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3859, device='cuda:0')
tensor(1.2634, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3158, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3869, device='cuda:0')
tensor(1.0314, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9730, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3827, device='cuda:0')
tensor(1.1346, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0373, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3940, device='cuda:0')
tensor(1.0767, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0844, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3927, device='cuda:0')
tensor(1.0362, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0837, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3847, device='cuda:0')
tensor(1.2103, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2485, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3814, device='cuda:0')
tensor(1.3717, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3794, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3830, device='cuda:0')
tensor(1.2575, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8741, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3792, device='cuda:0')
tensor(1.2104, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2543, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3795, device='cuda:0')
tensor(1.1553, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9337, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3732, device='cuda:0')
tensor(1.1190, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0943, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3987, device='cuda:0')
tensor(0.9523, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0907, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3683, device='cuda:0')
tensor(1.0693, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8944, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3961, device='cuda:0')
tensor(1.0547, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0608, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3899, device='cuda:0')
tensor(1.1810, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8331, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3894, device='cuda:0')
tensor(1.1376, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2860, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3840, device='cuda:0')
tensor(1.1107, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8960, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3528, device='cuda:0')
tensor(1.3245, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2073, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3812, device='cuda:0')
tensor(0.9959, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0929, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3764, device='cuda:0')
tensor(1.0882, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0299, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3634, device='cuda:0')
tensor(1.2127, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0037, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3682, device='cuda:0')
tensor(1.1112, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0298, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3584, device='cuda:0')
tensor(1.1207, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0069, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3638, device='cuda:0')
tensor(1.1989, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0312, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3691, device='cuda:0')
tensor(1.1791, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2291, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3670, device='cuda:0')
tensor(1.0435, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0288, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3742, device='cuda:0')
tensor(1.2140, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2736, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3530, device='cuda:0')
tensor(1.3698, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4099, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3575, device='cuda:0')
tensor(1.2890, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1603, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3690, device='cuda:0')
tensor(1.2202, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1863, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3574, device='cuda:0')
tensor(1.2963, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0147, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3642, device='cuda:0')
tensor(1.0576, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7059, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3547, device='cuda:0')
tensor(1.3611, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9573, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3534, device='cuda:0')
tensor(1.1018, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1646, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3602, device='cuda:0')
tensor(1.3597, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0158, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3635, device='cuda:0')
tensor(1.1500, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9499, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3549, device='cuda:0')
tensor(1.1726, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9904, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3404, device='cuda:0')
tensor(1.5189, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3681, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3413, device='cuda:0')
tensor(1.2890, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2556, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3388, device='cuda:0')
tensor(1.3422, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0901, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3291, device='cuda:0')
tensor(1.1976, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1594, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3392, device='cuda:0')
tensor(1.4716, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5223, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3344, device='cuda:0')
tensor(1.3801, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3271, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3138, device='cuda:0')
tensor(1.0261, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2228, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3475, device='cuda:0')
tensor(1.0225, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4044, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3274, device='cuda:0')
tensor(1.2439, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0853, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3304, device='cuda:0')
tensor(1.0802, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2797, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3296, device='cuda:0')
tensor(1.2234, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2130, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3143, device='cuda:0')
tensor(1.0612, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1924, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3177, device='cuda:0')
tensor(1.1670, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0825, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3291, device='cuda:0')
tensor(1.2096, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1009, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3102, device='cuda:0')
tensor(1.3783, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1359, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3297, device='cuda:0')
tensor(1.0095, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5096, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3290, device='cuda:0')
tensor(1.0458, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0672, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3290, device='cuda:0')
tensor(0.9853, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0371, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3164, device='cuda:0')
tensor(1.2194, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5279, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3231, device='cuda:0')
tensor(1.3313, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3240, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2996, device='cuda:0')
tensor(1.3036, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3920, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2988, device='cuda:0')
tensor(1.3435, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2327, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3012, device='cuda:0')
tensor(1.0114, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1958, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3096, device='cuda:0')
tensor(1.4214, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2588, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3019, device='cuda:0')
tensor(1.0981, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2790, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2872, device='cuda:0')
tensor(1.0776, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9917, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2999, device='cuda:0')
tensor(1.1678, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1690, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2886, device='cuda:0')
tensor(1.5827, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1388, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2848, device='cuda:0')
tensor(0.9311, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2741, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2905, device='cuda:0')
tensor(1.1955, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2509, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2867, device='cuda:0')
tensor(1.3861, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4543, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3054, device='cuda:0')
tensor(1.3973, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0900, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2853, device='cuda:0')
tensor(1.3389, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2592, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2892, device='cuda:0')
tensor(0.9741, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4033, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2831, device='cuda:0')
tensor(1.4455, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2419, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2990, device='cuda:0')
tensor(1.3318, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4958, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2802, device='cuda:0')
tensor(1.4739, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4441, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3114, device='cuda:0')
tensor(1.1623, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1688, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2847, device='cuda:0')
tensor(1.0252, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4072, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2870, device='cuda:0')
tensor(1.1760, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.6316, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2834, device='cuda:0')
tensor(1.2572, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3582, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2826, device='cuda:0')
tensor(1.3069, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5080, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2843, device='cuda:0')
tensor(0.6281, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2931, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2783, device='cuda:0')
tensor(1.1177, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3665, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2746, device='cuda:0')
tensor(1.3062, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2185, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2708, device='cuda:0')
tensor(1.1833, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4430, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2796, device='cuda:0')
tensor(1.2995, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0180, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2730, device='cuda:0')
tensor(1.1399, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3035, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2544, device='cuda:0')
tensor(0.9563, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1844, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2723, device='cuda:0')
tensor(1.2674, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1505, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.2809, device='cuda:0')


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(0.6446, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9730, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3015, device='cuda:0')
tensor(0.7218, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7534, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3761, device='cuda:0')
tensor(1.2756, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7140, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3889, device='cuda:0')
tensor(0.8517, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7663, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4013, device='cuda:0')
tensor(1.0173, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9162, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4220, device='cuda:0')
tensor(0.8479, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1516, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4357, device='cuda:0')
tensor(0.7841, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6358, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4537, device='cuda:0')
tensor(1.1912, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.5591, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4376, device='cuda:0')
tensor(0.9056, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7502, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4631, device='cuda:0')
tensor(0.9796, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9985, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4547, device='cuda:0')
tensor(0.8143, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7586, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4562, device='cuda:0')
tensor(0.9071, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8714, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4719, device='cuda:0')
tensor(0.7957, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6550, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4599, device='cuda:0')
tensor(1.0654, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9008, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4831, device='cuda:0')
tensor(1.0304, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9364, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4782, device='cuda:0')
tensor(1.0673, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7854, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4665, device='cuda:0')
tensor(0.8751, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8058, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4758, device='cuda:0')
tensor(0.9918, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9990, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4791, device='cuda:0')
tensor(0.8981, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7546, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4905, device='cuda:0')
tensor(0.8007, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6036, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5023, device='cuda:0')
tensor(1.1362, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2166, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4865, device='cuda:0')
tensor(0.9362, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6589, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4978, device='cuda:0')
tensor(1.1541, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9714, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4781, device='cuda:0')
tensor(0.7152, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7801, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4937, device='cuda:0')
tensor(1.1144, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7620, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5165, device='cuda:0')
tensor(0.5707, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0317, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5210, device='cuda:0')
tensor(1.0803, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6891, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4787, device='cuda:0')
tensor(1.1698, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0033, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4966, device='cuda:0')
tensor(0.6743, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9385, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5121, device='cuda:0')
tensor(1.1871, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0972, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5046, device='cuda:0')
tensor(0.8580, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0976, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5005, device='cuda:0')
tensor(0.9946, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9783, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4696, device='cuda:0')
tensor(1.1951, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8273, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4870, device='cuda:0')
tensor(0.9941, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9035, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4978, device='cuda:0')
tensor(1.1647, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7224, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4892, device='cuda:0')
tensor(1.1202, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1401, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4883, device='cuda:0')
tensor(0.9034, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7865, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4823, device='cuda:0')
tensor(0.9683, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6412, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4999, device='cuda:0')
tensor(1.0664, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8162, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4897, device='cuda:0')
tensor(1.0202, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9160, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4837, device='cuda:0')
tensor(1.2674, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0136, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4705, device='cuda:0')
tensor(0.9574, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9091, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4974, device='cuda:0')
tensor(1.0199, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0345, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4877, device='cuda:0')
tensor(0.8500, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9639, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5040, device='cuda:0')
tensor(1.1287, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0184, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4931, device='cuda:0')
tensor(0.8022, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9958, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4848, device='cuda:0')
tensor(0.9585, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7815, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4811, device='cuda:0')
tensor(0.9225, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0434, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4987, device='cuda:0')
tensor(0.8185, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0802, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4811, device='cuda:0')
tensor(0.6083, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9283, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5098, device='cuda:0')
tensor(1.2553, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9525, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4792, device='cuda:0')
tensor(1.0553, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9729, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4836, device='cuda:0')
tensor(1.2178, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8332, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4841, device='cuda:0')
tensor(1.1483, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0735, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4824, device='cuda:0')
tensor(1.0960, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2436, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4735, device='cuda:0')
tensor(0.7657, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9983, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4745, device='cuda:0')
tensor(0.9660, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9994, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4847, device='cuda:0')
tensor(0.8697, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1857, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4695, device='cuda:0')
tensor(1.2292, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0549, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4565, device='cuda:0')
tensor(1.0213, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9838, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4584, device='cuda:0')
tensor(1.0469, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3537, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4751, device='cuda:0')
tensor(1.2371, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8678, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4745, device='cuda:0')
tensor(1.1414, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9712, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5036, device='cuda:0')
tensor(0.6528, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1096, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4618, device='cuda:0')
tensor(1.2486, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2179, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4715, device='cuda:0')
tensor(1.2130, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0023, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4561, device='cuda:0')
tensor(0.9616, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8331, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4699, device='cuda:0')
tensor(0.8857, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2032, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4614, device='cuda:0')
tensor(1.0655, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4032, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4712, device='cuda:0')
tensor(1.1383, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0378, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4536, device='cuda:0')
tensor(1.0090, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0811, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4606, device='cuda:0')
tensor(1.0924, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1945, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4453, device='cuda:0')
tensor(1.0868, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2629, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4474, device='cuda:0')
tensor(1.0343, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1888, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4466, device='cuda:0')
tensor(1.2706, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1638, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4606, device='cuda:0')
tensor(1.1354, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0427, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4596, device='cuda:0')
tensor(1.1229, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0817, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4375, device='cuda:0')
tensor(1.1234, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9383, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4504, device='cuda:0')
tensor(1.2701, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8535, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4361, device='cuda:0')
tensor(1.0911, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0108, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4426, device='cuda:0')
tensor(0.8859, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4307, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4266, device='cuda:0')
tensor(1.0150, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1170, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4377, device='cuda:0')
tensor(1.1307, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0620, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4271, device='cuda:0')
tensor(1.3235, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.3779, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4400, device='cuda:0')
tensor(1.1022, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1282, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4256, device='cuda:0')
tensor(1.2643, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0715, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4446, device='cuda:0')
tensor(1.3282, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1809, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4343, device='cuda:0')
tensor(1.0512, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9245, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4259, device='cuda:0')
tensor(0.8996, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2852, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4376, device='cuda:0')
tensor(0.9682, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2794, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4314, device='cuda:0')
tensor(0.9710, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1255, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4290, device='cuda:0')
tensor(1.2570, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1384, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4255, device='cuda:0')
tensor(1.2979, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2460, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4018, device='cuda:0')
tensor(1.0554, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2440, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4149, device='cuda:0')
tensor(0.5451, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8773, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4210, device='cuda:0')
tensor(0.9234, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1391, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4200, device='cuda:0')
tensor(1.2152, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0735, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4260, device='cuda:0')
tensor(1.1174, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0074, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4375, device='cuda:0')
tensor(0.9971, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1373, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3998, device='cuda:0')
tensor(1.0467, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3172, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4149, device='cuda:0')
tensor(1.3146, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1744, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3916, device='cuda:0')
tensor(1.2078, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0979, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4018, device='cuda:0')
tensor(1.1283, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0180, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3994, device='cuda:0')
tensor(1.1001, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1285, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4102, device='cuda:0')
tensor(1.0245, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3624, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4063, device='cuda:0')
tensor(1.1820, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1479, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4031, device='cuda:0')
tensor(1.1691, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1221, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4027, device='cuda:0')
tensor(1.3733, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2163, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3951, device='cuda:0')
tensor(1.2295, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0908, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3952, device='cuda:0')
tensor(1.2402, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4065, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3992, device='cuda:0')
tensor(1.3003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3403, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3955, device='cuda:0')
tensor(1.1287, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9688, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3953, device='cuda:0')
tensor(1.1391, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1222, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4008, device='cuda:0')
tensor(1.0100, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2811, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4055, device='cuda:0')
tensor(1.2881, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0437, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3891, device='cuda:0')
tensor(1.0632, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4143, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3792, device='cuda:0')
tensor(1.1498, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2653, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3732, device='cuda:0')
tensor(1.2604, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1512, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3704, device='cuda:0')
tensor(0.6783, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0764, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3833, device='cuda:0')
tensor(1.2715, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3883, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3777, device='cuda:0')
tensor(1.2141, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2347, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3680, device='cuda:0')
tensor(1.1697, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.5538, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3836, device='cuda:0')
tensor(1.0650, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9862, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3815, device='cuda:0')
tensor(1.3078, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0380, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3689, device='cuda:0')
tensor(1.3180, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0394, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3706, device='cuda:0')
tensor(0.8791, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3008, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3762, device='cuda:0')
tensor(1.3544, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3253, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3756, device='cuda:0')
tensor(1.0866, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2068, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3689, device='cuda:0')
tensor(1.2467, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1707, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3652, device='cuda:0')
tensor(1.0778, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3789, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3673, device='cuda:0')
tensor(1.0882, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1826, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3677, device='cuda:0')
tensor(1.0678, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2956, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3720, device='cuda:0')
tensor(1.2184, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0483, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3752, device='cuda:0')
tensor(1.3763, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2863, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3575, device='cuda:0')
tensor(1.4058, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2075, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3687, device='cuda:0')
tensor(1.5131, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3201, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3452, device='cuda:0')
tensor(1.1986, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0517, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3643, device='cuda:0')
tensor(1.2922, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0777, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3522, device='cuda:0')
tensor(1.1925, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1905, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3506, device='cuda:0')
tensor(1.3362, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3890, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3547, device='cuda:0')
tensor(1.1608, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4302, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3363, device='cuda:0')
tensor(0.9455, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3279, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3443, device='cuda:0')
tensor(1.2604, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2158, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3419, device='cuda:0')
tensor(0.9285, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3139, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3590, device='cuda:0')
tensor(1.2230, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2532, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3385, device='cuda:0')
tensor(1.4590, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3285, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3463, device='cuda:0')
tensor(1.3977, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1816, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3523, device='cuda:0')
tensor(1.2313, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1864, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3489, device='cuda:0')


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(0.7579, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9463, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.3788, device='cuda:0')
tensor(0.8052, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0094, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4538, device='cuda:0')
tensor(0.9454, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6748, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4703, device='cuda:0')
tensor(0.6930, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.5099, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4877, device='cuda:0')
tensor(0.4685, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0229, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5056, device='cuda:0')
tensor(1.0056, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6694, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4924, device='cuda:0')
tensor(0.3872, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1096, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5199, device='cuda:0')
tensor(1.0206, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9779, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5302, device='cuda:0')
tensor(0.7797, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0984, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5034, device='cuda:0')
tensor(0.8430, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6775, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5207, device='cuda:0')
tensor(0.7546, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8154, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5485, device='cuda:0')
tensor(0.6455, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.4764, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5476, device='cuda:0')
tensor(0.6391, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6571, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5702, device='cuda:0')
tensor(0.6469, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8749, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5806, device='cuda:0')
tensor(1.0088, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6791, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5674, device='cuda:0')
tensor(0.7841, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9782, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5618, device='cuda:0')
tensor(0.8727, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2165, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5758, device='cuda:0')
tensor(0.6986, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9858, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5661, device='cuda:0')
tensor(0.5745, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9902, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5748, device='cuda:0')
tensor(1.0016, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9345, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5733, device='cuda:0')
tensor(0.7447, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9273, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5694, device='cuda:0')
tensor(0.8874, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0944, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5785, device='cuda:0')
tensor(1.2333, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7800, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5596, device='cuda:0')
tensor(0.9652, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6483, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5587, device='cuda:0')
tensor(0.9261, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9481, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5604, device='cuda:0')
tensor(1.0819, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9021, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5570, device='cuda:0')
tensor(1.2179, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1062, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5624, device='cuda:0')
tensor(0.9506, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9636, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5680, device='cuda:0')
tensor(0.8939, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8861, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5710, device='cuda:0')
tensor(0.9158, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8162, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5718, device='cuda:0')
tensor(0.7977, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7310, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5771, device='cuda:0')
tensor(1.1667, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8371, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5790, device='cuda:0')
tensor(0.9156, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1558, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5773, device='cuda:0')
tensor(0.8057, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7615, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5854, device='cuda:0')
tensor(1.0337, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.4231, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5940, device='cuda:0')
tensor(1.1177, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0318, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5709, device='cuda:0')
tensor(1.0901, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7958, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5723, device='cuda:0')
tensor(0.7674, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7776, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5863, device='cuda:0')
tensor(0.9854, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9906, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5772, device='cuda:0')
tensor(0.8453, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7831, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5809, device='cuda:0')
tensor(1.2412, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7961, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6033, device='cuda:0')
tensor(0.9672, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0420, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5898, device='cuda:0')
tensor(1.0922, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8784, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5626, device='cuda:0')
tensor(0.9899, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9018, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5796, device='cuda:0')
tensor(1.0007, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8593, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5528, device='cuda:0')
tensor(0.7088, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9480, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5605, device='cuda:0')
tensor(1.1588, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7428, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5564, device='cuda:0')
tensor(0.7731, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1901, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5693, device='cuda:0')
tensor(0.9122, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8211, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5745, device='cuda:0')
tensor(0.8329, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2595, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5641, device='cuda:0')
tensor(0.9777, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8892, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5679, device='cuda:0')
tensor(0.9163, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0637, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5601, device='cuda:0')
tensor(0.9664, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7603, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5702, device='cuda:0')
tensor(0.8881, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8655, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5460, device='cuda:0')
tensor(1.0604, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9399, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5487, device='cuda:0')
tensor(0.8177, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2309, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5598, device='cuda:0')
tensor(0.9894, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9988, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5523, device='cuda:0')
tensor(0.8826, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1834, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5329, device='cuda:0')
tensor(0.5991, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1273, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5546, device='cuda:0')
tensor(0.8839, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7698, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5452, device='cuda:0')
tensor(1.0010, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0851, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5354, device='cuda:0')
tensor(0.9874, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9918, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5625, device='cuda:0')
tensor(1.0317, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8825, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5391, device='cuda:0')
tensor(1.0173, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2436, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5398, device='cuda:0')
tensor(1.0342, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0037, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5229, device='cuda:0')
tensor(1.2376, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0306, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5175, device='cuda:0')
tensor(1.0822, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1024, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5202, device='cuda:0')
tensor(1.0612, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9571, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5278, device='cuda:0')
tensor(0.9476, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0025, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5372, device='cuda:0')
tensor(1.1300, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0108, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5384, device='cuda:0')
tensor(0.9271, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0389, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5449, device='cuda:0')
tensor(1.0401, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3235, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5196, device='cuda:0')
tensor(1.1642, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9425, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5223, device='cuda:0')
tensor(1.1510, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8902, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5226, device='cuda:0')
tensor(1.0478, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2680, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5202, device='cuda:0')
tensor(1.2629, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0985, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5368, device='cuda:0')
tensor(0.9589, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9411, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5395, device='cuda:0')
tensor(1.0412, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1685, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5245, device='cuda:0')
tensor(0.9393, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0143, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5254, device='cuda:0')
tensor(0.9365, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9096, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5296, device='cuda:0')
tensor(1.0194, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9638, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5031, device='cuda:0')
tensor(1.1534, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0062, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5083, device='cuda:0')
tensor(1.0639, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0638, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5215, device='cuda:0')
tensor(1.1390, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9229, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5241, device='cuda:0')
tensor(1.0356, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1892, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5200, device='cuda:0')
tensor(1.1063, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.5587, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5144, device='cuda:0')
tensor(0.9815, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9726, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5062, device='cuda:0')
tensor(1.1232, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9570, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5133, device='cuda:0')
tensor(0.9277, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3544, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5069, device='cuda:0')
tensor(1.1130, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1744, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4962, device='cuda:0')
tensor(0.6503, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0510, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5075, device='cuda:0')
tensor(1.2158, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0685, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5060, device='cuda:0')
tensor(1.1882, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0890, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4923, device='cuda:0')
tensor(1.1519, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1584, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4960, device='cuda:0')
tensor(1.0254, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1132, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5150, device='cuda:0')
tensor(1.1426, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0355, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4967, device='cuda:0')
tensor(1.1351, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2037, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5059, device='cuda:0')
tensor(1.1246, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2886, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4906, device='cuda:0')
tensor(1.1839, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9648, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4928, device='cuda:0')
tensor(1.1609, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1663, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4987, device='cuda:0')
tensor(1.3423, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9786, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4991, device='cuda:0')
tensor(1.0982, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2600, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4844, device='cuda:0')
tensor(1.1936, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0907, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5002, device='cuda:0')
tensor(0.8400, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0845, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4845, device='cuda:0')
tensor(1.3101, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9558, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4846, device='cuda:0')
tensor(1.2370, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3000, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4694, device='cuda:0')
tensor(1.0274, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9598, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4694, device='cuda:0')
tensor(1.0525, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0405, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4697, device='cuda:0')
tensor(1.5366, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9417, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4642, device='cuda:0')
tensor(1.0191, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2032, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4691, device='cuda:0')
tensor(1.2602, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9865, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4703, device='cuda:0')
tensor(1.1462, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0076, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4661, device='cuda:0')
tensor(1.2888, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2304, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4687, device='cuda:0')
tensor(1.3812, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0741, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4667, device='cuda:0')
tensor(1.2001, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9610, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4632, device='cuda:0')
tensor(1.5007, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2724, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4596, device='cuda:0')
tensor(0.6104, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0931, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4669, device='cuda:0')
tensor(1.1993, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1774, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4535, device='cuda:0')
tensor(1.0457, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2755, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4747, device='cuda:0')
tensor(1.4494, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2296, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4602, device='cuda:0')
tensor(1.2052, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2520, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4548, device='cuda:0')
tensor(1.0053, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3229, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4495, device='cuda:0')
tensor(1.1017, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0127, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4640, device='cuda:0')
tensor(1.0865, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1932, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4571, device='cuda:0')
tensor(1.2496, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3642, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4451, device='cuda:0')
tensor(0.9993, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3654, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4418, device='cuda:0')
tensor(1.0110, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0915, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4577, device='cuda:0')
tensor(1.0535, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9754, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4426, device='cuda:0')
tensor(1.1070, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8979, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4674, device='cuda:0')
tensor(1.0617, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2720, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4466, device='cuda:0')
tensor(1.3658, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0850, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4517, device='cuda:0')
tensor(1.0065, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1693, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4450, device='cuda:0')
tensor(1.0068, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1000, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4449, device='cuda:0')
tensor(1.3178, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2272, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4421, device='cuda:0')
tensor(1.5799, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0021, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4286, device='cuda:0')
tensor(1.5661, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9829, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4440, device='cuda:0')
tensor(1.0008, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3253, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4404, device='cuda:0')
tensor(1.1001, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2709, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4341, device='cuda:0')
tensor(1.1957, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2310, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4410, device='cuda:0')
tensor(1.1249, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3375, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4392, device='cuda:0')
tensor(1.0629, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1943, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4330, device='cuda:0')
tensor(1.1632, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9604, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4361, device='cuda:0')
tensor(1.1881, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4190, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4178, device='cuda:0')
tensor(1.1581, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1535, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4212, device='cuda:0')
tensor(1.2287, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2781, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4163, device='cuda:0')
tensor(1.2110, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2935, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4203, device='cuda:0')
tensor(1.0081, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2212, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4204, device='cuda:0')
tensor(1.1839, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(0.5033, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4317, device='cuda:0')
tensor(0.3198, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9957, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4987, device='cuda:0')
tensor(0.7224, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7716, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5309, device='cuda:0')
tensor(0.5573, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.5625, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5438, device='cuda:0')
tensor(0.7867, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6727, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5674, device='cuda:0')
tensor(0.7260, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6640, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5967, device='cuda:0')
tensor(0.8266, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9936, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5972, device='cuda:0')
tensor(0.8250, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6862, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6081, device='cuda:0')
tensor(0.7854, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7428, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6169, device='cuda:0')
tensor(0.8299, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7796, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5973, device='cuda:0')
tensor(0.7907, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9114, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6150, device='cuda:0')
tensor(0.8150, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9637, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6289, device='cuda:0')
tensor(0.7552, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7421, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6344, device='cuda:0')
tensor(0.8947, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6959, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6660, device='cuda:0')
tensor(0.7647, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8653, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6435, device='cuda:0')
tensor(0.7348, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9966, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6462, device='cuda:0')
tensor(0.7018, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0396, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6538, device='cuda:0')
tensor(0.8043, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8287, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6250, device='cuda:0')
tensor(0.7086, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8052, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6446, device='cuda:0')
tensor(0.7065, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6617, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6525, device='cuda:0')
tensor(0.5985, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6342, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6527, device='cuda:0')
tensor(0.9052, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7300, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6423, device='cuda:0')
tensor(0.7792, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0949, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6602, device='cuda:0')
tensor(0.4697, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6981, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6642, device='cuda:0')
tensor(0.9146, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7100, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6651, device='cuda:0')
tensor(0.6651, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9769, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6450, device='cuda:0')
tensor(1.1362, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8870, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6509, device='cuda:0')
tensor(0.5315, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1332, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6617, device='cuda:0')
tensor(0.7563, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0217, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6427, device='cuda:0')
tensor(1.1215, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6198, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6576, device='cuda:0')
tensor(1.0107, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8489, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6570, device='cuda:0')
tensor(1.2024, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8806, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6410, device='cuda:0')
tensor(1.0104, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7557, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6387, device='cuda:0')
tensor(1.0786, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8117, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6487, device='cuda:0')
tensor(1.0748, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8910, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6575, device='cuda:0')
tensor(0.9837, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7483, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6478, device='cuda:0')
tensor(1.1208, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8849, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6677, device='cuda:0')
tensor(0.9562, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1194, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6728, device='cuda:0')
tensor(0.6810, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8902, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6649, device='cuda:0')
tensor(1.0334, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0304, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6447, device='cuda:0')
tensor(1.2250, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8102, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6443, device='cuda:0')
tensor(0.8015, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7806, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6549, device='cuda:0')
tensor(0.8141, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0195, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6678, device='cuda:0')
tensor(0.8485, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8736, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6469, device='cuda:0')
tensor(0.9448, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6939, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6412, device='cuda:0')
tensor(0.8828, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8804, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6400, device='cuda:0')
tensor(0.8425, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1965, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6426, device='cuda:0')
tensor(0.8098, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9250, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6291, device='cuda:0')
tensor(0.8856, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8287, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6362, device='cuda:0')
tensor(0.7955, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7516, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6256, device='cuda:0')
tensor(0.8421, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8844, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6574, device='cuda:0')
tensor(0.8318, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0387, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6360, device='cuda:0')
tensor(1.0104, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7999, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6228, device='cuda:0')
tensor(0.9338, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8882, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6406, device='cuda:0')
tensor(0.8909, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8065, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6417, device='cuda:0')
tensor(0.8671, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2304, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6503, device='cuda:0')
tensor(0.8545, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3062, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6114, device='cuda:0')
tensor(0.9542, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1121, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6244, device='cuda:0')
tensor(1.1075, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7834, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6098, device='cuda:0')
tensor(0.9692, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9384, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6108, device='cuda:0')
tensor(0.9753, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9544, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6249, device='cuda:0')
tensor(0.8750, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9107, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6203, device='cuda:0')
tensor(0.9267, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0327, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6231, device='cuda:0')
tensor(1.2492, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7717, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6219, device='cuda:0')
tensor(1.0290, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1214, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6172, device='cuda:0')
tensor(0.8560, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1141, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6214, device='cuda:0')
tensor(1.0104, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8168, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6174, device='cuda:0')
tensor(0.8715, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9992, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6105, device='cuda:0')
tensor(0.9388, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8123, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6169, device='cuda:0')
tensor(0.8929, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8916, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6005, device='cuda:0')
tensor(1.0888, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9626, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6164, device='cuda:0')
tensor(0.8291, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9557, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6099, device='cuda:0')
tensor(1.1022, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.4970, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5973, device='cuda:0')
tensor(1.0652, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0191, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6068, device='cuda:0')
tensor(1.0510, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0837, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5888, device='cuda:0')
tensor(1.0958, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1145, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6005, device='cuda:0')
tensor(0.9129, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0475, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5919, device='cuda:0')
tensor(0.8950, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8648, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6029, device='cuda:0')
tensor(1.1388, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0012, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5816, device='cuda:0')
tensor(1.3104, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9444, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6053, device='cuda:0')
tensor(0.9645, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0369, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6026, device='cuda:0')
tensor(1.0544, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7789, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5995, device='cuda:0')
tensor(1.0124, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2500, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6072, device='cuda:0')
tensor(0.9266, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8300, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5985, device='cuda:0')
tensor(0.9528, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2186, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5958, device='cuda:0')
tensor(1.1484, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9090, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5858, device='cuda:0')
tensor(1.1255, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9696, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5937, device='cuda:0')
tensor(1.1833, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0865, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5782, device='cuda:0')
tensor(0.7997, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9632, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5831, device='cuda:0')
tensor(0.9535, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8731, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5708, device='cuda:0')
tensor(1.0523, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0843, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5570, device='cuda:0')
tensor(1.1493, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1106, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5771, device='cuda:0')
tensor(0.8941, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2086, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5933, device='cuda:0')
tensor(1.1075, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9316, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5849, device='cuda:0')
tensor(0.9016, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8208, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5727, device='cuda:0')
tensor(1.1271, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1161, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5814, device='cuda:0')
tensor(1.2606, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2987, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5768, device='cuda:0')
tensor(0.9845, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7926, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5769, device='cuda:0')
tensor(1.2306, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0686, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5697, device='cuda:0')
tensor(1.0157, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0795, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5698, device='cuda:0')
tensor(1.1065, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0843, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5533, device='cuda:0')
tensor(1.0211, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9647, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5564, device='cuda:0')
tensor(1.0896, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1798, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5585, device='cuda:0')
tensor(1.1444, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1181, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5514, device='cuda:0')
tensor(1.2384, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1191, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5521, device='cuda:0')
tensor(0.9324, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9959, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5704, device='cuda:0')
tensor(1.0470, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1026, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5496, device='cuda:0')
tensor(1.3912, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0102, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5587, device='cuda:0')
tensor(0.9611, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0131, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5527, device='cuda:0')
tensor(1.1232, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0915, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5703, device='cuda:0')
tensor(0.9943, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0333, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5525, device='cuda:0')
tensor(1.0889, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1603, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5429, device='cuda:0')
tensor(0.9883, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0819, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5381, device='cuda:0')
tensor(0.9455, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1505, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5594, device='cuda:0')
tensor(1.1626, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9058, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5466, device='cuda:0')
tensor(1.1340, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9332, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5372, device='cuda:0')
tensor(1.1147, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9233, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5414, device='cuda:0')
tensor(1.2107, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9952, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5344, device='cuda:0')
tensor(1.2804, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9574, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5459, device='cuda:0')
tensor(0.9976, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2514, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5334, device='cuda:0')
tensor(1.2166, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1404, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5343, device='cuda:0')
tensor(1.0775, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0083, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5394, device='cuda:0')
tensor(1.2348, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9328, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5294, device='cuda:0')
tensor(1.0447, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0933, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5209, device='cuda:0')
tensor(0.9449, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8622, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5291, device='cuda:0')
tensor(0.8940, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0258, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5220, device='cuda:0')
tensor(1.0286, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0450, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5370, device='cuda:0')
tensor(1.2526, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9573, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5258, device='cuda:0')
tensor(1.0875, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2765, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5078, device='cuda:0')
tensor(1.2011, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.5543, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5158, device='cuda:0')
tensor(1.0635, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0689, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5115, device='cuda:0')
tensor(1.0999, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2766, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5062, device='cuda:0')
tensor(1.2422, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1971, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5062, device='cuda:0')
tensor(1.0983, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0223, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5245, device='cuda:0')
tensor(1.0439, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2506, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5109, device='cuda:0')
tensor(1.1080, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2765, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5056, device='cuda:0')
tensor(1.0408, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1937, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5021, device='cuda:0')
tensor(1.0724, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0559, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5160, device='cuda:0')
tensor(0.8413, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0656, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5034, device='cuda:0')
tensor(1.1589, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2458, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5105, device='cuda:0')
tensor(1.2600, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2076, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5040, device='cuda:0')
tensor(1.5144, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0582, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4953, device='cuda:0')
tensor(1.0759, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9665, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5096, device='cuda:0')
tensor(0.9831, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9097, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5019, device='cuda:0')
tensor(1.0134, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2706, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4967, device='cuda:0')
tensor(1.1754, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0625, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4930, device='cuda:0')
tensor(1.4382, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9635, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4844, device='cuda:0')
tensor(1.2366, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2560, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.4852, device='cuda:0')
tensor(1.0648, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/1477 [00:00<?, ?it/s]

tensor(0.9201, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5040, device='cuda:0')
tensor(0.7314, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6906, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5659, device='cuda:0')
tensor(0.8822, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8382, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.5990, device='cuda:0')
tensor(0.9995, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7263, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6189, device='cuda:0')
tensor(0.9108, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6685, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6497, device='cuda:0')
tensor(0.9823, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0080, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6515, device='cuda:0')
tensor(0.3762, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.4399, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6544, device='cuda:0')
tensor(0.9261, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8211, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6622, device='cuda:0')
tensor(0.8912, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6774, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6801, device='cuda:0')
tensor(0.8235, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8786, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6944, device='cuda:0')
tensor(0.8402, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8402, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6871, device='cuda:0')
tensor(0.7873, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6698, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7050, device='cuda:0')
tensor(0.7936, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.3996, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7018, device='cuda:0')
tensor(0.8337, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0615, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6991, device='cuda:0')
tensor(0.7500, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.5698, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7297, device='cuda:0')
tensor(0.9462, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8939, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7131, device='cuda:0')
tensor(1.0542, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0354, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7090, device='cuda:0')
tensor(0.5064, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6720, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7088, device='cuda:0')
tensor(0.7106, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9575, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7227, device='cuda:0')
tensor(0.5976, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.5302, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7199, device='cuda:0')
tensor(1.0269, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9477, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7307, device='cuda:0')
tensor(0.9386, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7263, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7224, device='cuda:0')
tensor(0.8492, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9061, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7213, device='cuda:0')
tensor(1.0341, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1440, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7306, device='cuda:0')
tensor(0.8169, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.5961, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7268, device='cuda:0')
tensor(0.8588, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7444, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7439, device='cuda:0')
tensor(1.0052, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9748, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7135, device='cuda:0')
tensor(0.7710, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1130, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7325, device='cuda:0')
tensor(0.8256, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7219, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7095, device='cuda:0')
tensor(0.7847, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7804, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7225, device='cuda:0')
tensor(0.9393, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8482, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7267, device='cuda:0')
tensor(0.9145, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7233, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7330, device='cuda:0')
tensor(0.8032, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7310, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7163, device='cuda:0')
tensor(0.8388, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7375, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7245, device='cuda:0')
tensor(0.9146, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7380, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7313, device='cuda:0')
tensor(0.9514, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8277, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7231, device='cuda:0')
tensor(0.7370, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8331, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7337, device='cuda:0')
tensor(0.8430, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9027, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7234, device='cuda:0')
tensor(0.8716, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9681, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7342, device='cuda:0')
tensor(0.5910, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6272, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7275, device='cuda:0')
tensor(0.6188, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9750, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7239, device='cuda:0')
tensor(0.7629, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0085, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7176, device='cuda:0')
tensor(0.7550, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0623, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7130, device='cuda:0')
tensor(0.8212, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9575, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7245, device='cuda:0')
tensor(1.0244, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1262, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7286, device='cuda:0')
tensor(0.8325, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6228, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6992, device='cuda:0')
tensor(0.8350, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8747, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7402, device='cuda:0')
tensor(0.9632, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8098, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7127, device='cuda:0')
tensor(0.4057, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8515, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7278, device='cuda:0')
tensor(1.0469, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8225, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7060, device='cuda:0')
tensor(0.6441, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9168, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6940, device='cuda:0')
tensor(1.1208, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0202, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6985, device='cuda:0')
tensor(0.8427, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0708, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7031, device='cuda:0')
tensor(0.8511, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1151, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7116, device='cuda:0')
tensor(0.9864, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0003, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7102, device='cuda:0')
tensor(1.0375, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0124, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6935, device='cuda:0')
tensor(0.6422, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7715, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6971, device='cuda:0')
tensor(0.9074, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7945, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6926, device='cuda:0')
tensor(1.0613, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1405, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7049, device='cuda:0')
tensor(1.0936, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0010, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6901, device='cuda:0')
tensor(0.8044, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0982, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7057, device='cuda:0')
tensor(0.9984, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8874, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6935, device='cuda:0')
tensor(0.9660, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0863, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6866, device='cuda:0')
tensor(0.6859, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6271, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6901, device='cuda:0')
tensor(0.5577, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9530, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6959, device='cuda:0')
tensor(0.9123, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9145, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6716, device='cuda:0')
tensor(0.9541, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9279, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6768, device='cuda:0')
tensor(1.0940, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7562, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6753, device='cuda:0')
tensor(1.0730, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8344, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6729, device='cuda:0')
tensor(0.8598, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7812, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6755, device='cuda:0')
tensor(0.8825, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9081, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6797, device='cuda:0')
tensor(0.7145, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1736, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6848, device='cuda:0')
tensor(0.9130, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8421, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6731, device='cuda:0')
tensor(0.9026, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.5670, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6843, device='cuda:0')
tensor(0.7217, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9397, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6750, device='cuda:0')
tensor(1.0377, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9206, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6719, device='cuda:0')
tensor(0.9046, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0149, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6579, device='cuda:0')
tensor(0.7807, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0840, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6577, device='cuda:0')
tensor(0.7384, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1311, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6621, device='cuda:0')
tensor(0.9370, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6995, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6698, device='cuda:0')
tensor(1.1086, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8300, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6732, device='cuda:0')
tensor(0.9201, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8624, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6620, device='cuda:0')
tensor(1.1320, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9625, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6559, device='cuda:0')
tensor(0.9168, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8818, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6415, device='cuda:0')
tensor(1.0224, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0756, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6756, device='cuda:0')
tensor(0.9237, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9431, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6606, device='cuda:0')
tensor(0.8720, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1398, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6559, device='cuda:0')
tensor(1.1765, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8087, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6456, device='cuda:0')
tensor(1.0475, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9467, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6432, device='cuda:0')
tensor(1.0584, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0292, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6436, device='cuda:0')
tensor(1.1030, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8967, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6416, device='cuda:0')
tensor(1.2405, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3507, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6399, device='cuda:0')
tensor(0.7532, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1999, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6509, device='cuda:0')
tensor(1.0320, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0260, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6511, device='cuda:0')
tensor(0.9307, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1641, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7735, device='cuda:0')
tensor(0.7753, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8021, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7772, device='cuda:0')
tensor(0.6943, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6438, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7608, device='cuda:0')
tensor(0.7781, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9772, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.8030, device='cuda:0')
tensor(0.7725, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7744, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7838, device='cuda:0')
tensor(0.7670, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1745, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7901, device='cuda:0')
tensor(0.9281, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0161, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7925, device='cuda:0')
tensor(0.7948, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7838, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7807, device='cuda:0')
tensor(1.0726, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7360, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7645, device='cuda:0')
tensor(0.7363, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9991, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7708, device='cuda:0')
tensor(1.0863, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2500, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7702, device='cuda:0')
tensor(0.8018, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8485, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7613, device='cuda:0')
tensor(1.1235, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9857, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7481, device='cuda:0')
tensor(0.9696, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7218, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7557, device='cuda:0')
tensor(1.0541, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0992, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7459, device='cuda:0')
tensor(0.9253, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0392, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7492, device='cuda:0')
tensor(0.7531, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2839, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7486, device='cuda:0')
tensor(1.0861, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8615, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7418, device='cuda:0')
tensor(0.8998, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0305, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7410, device='cuda:0')
tensor(0.9228, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0770, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7408, device='cuda:0')
tensor(0.9633, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8009, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7245, device='cuda:0')
tensor(1.1003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6935, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7463, device='cuda:0')
tensor(0.9973, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0159, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7393, device='cuda:0')
tensor(0.7604, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8954, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7512, device='cuda:0')
tensor(0.8755, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7974, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7294, device='cuda:0')
tensor(0.9791, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9748, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7378, device='cuda:0')
tensor(0.8896, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7719, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7280, device='cuda:0')
tensor(1.1420, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9305, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7359, device='cuda:0')
tensor(0.6856, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1924, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7494, device='cuda:0')
tensor(0.9601, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7917, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7334, device='cuda:0')
tensor(1.0690, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8903, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7218, device='cuda:0')
tensor(0.8863, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9231, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7342, device='cuda:0')
tensor(0.4704, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9023, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7302, device='cuda:0')
tensor(0.8821, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7838, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7234, device='cuda:0')
tensor(1.1874, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9406, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7263, device='cuda:0')
tensor(0.9152, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9168, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7220, device='cuda:0')
tensor(0.8074, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8574, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7402, device='cuda:0')
tensor(0.5574, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9053, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7306, device='cuda:0')
tensor(0.9599, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8385, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7182, device='cuda:0')
tensor(0.9058, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8528, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7179, device='cuda:0')
tensor(1.1306, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8795, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7085, device='cuda:0')
tensor(1.0764, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2436, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7389, device='cuda:0')
tensor(0.8784, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8391, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7067, device='cuda:0')
tensor(0.7664, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8881, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7136, device='cuda:0')
tensor(1.1168, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0122, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7074, device='cuda:0')
tensor(1.1384, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0430, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7082, device='cuda:0')
tensor(1.2430, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8085, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6954, device='cuda:0')
tensor(1.0281, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1926, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7103, device='cuda:0')
tensor(1.0372, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1151, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6967, device='cuda:0')
tensor(1.1568, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9767, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7154, device='cuda:0')
tensor(0.8418, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8763, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7057, device='cuda:0')
tensor(0.8576, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0139, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6857, device='cuda:0')
tensor(0.8431, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1834, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6861, device='cuda:0')
tensor(0.9550, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1277, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6957, device='cuda:0')
tensor(0.8900, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9315, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.7085, device='cuda:0')
tensor(1.1383, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0855, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6915, device='cuda:0')
tensor(0.9006, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0612, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6790, device='cuda:0')
tensor(0.9853, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9606, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6903, device='cuda:0')
tensor(0.9789, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8692, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6992, device='cuda:0')
tensor(0.8919, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9158, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6790, device='cuda:0')
tensor(0.8804, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0948, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6653, device='cuda:0')
tensor(1.2650, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0937, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6760, device='cuda:0')
tensor(0.9479, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0465, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6630, device='cuda:0')
tensor(1.1561, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.2882, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6667, device='cuda:0')
tensor(1.1000, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.1695, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

testloss tensor(3.6607, device='cuda:0')
tensor(0.9313, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.0652, device='cuda:0', grad_fn=<NllLossBackward0>)


  0%|          | 0/78 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



# Test out model

In [5]:
# Load checkpoint - replace the checkpoint path with that of the desired checkpoint.
checkpoint_path = f"{YOUR_BASE_PATH}/checkpoints/3b7b75ed-a2c7-495d-b61e-72560e8a23df/checkpoint.pt"
model = MambaAudioModel(vocab_size).to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved from a different device than the current one can still be loaded here.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))

layers 6


<All keys matched successfully>

In [21]:
# Generate new audio
def unconditional_generation(model, max_new_tokens=1999, start_token=10, output_path=None):
    """Sample a token sequence from `model`, starting from one seed token, and save it.

    Args:
        model: Callable that takes a ``(1, T)`` tensor of token ids and returns a
            ``(logits, loss)`` pair; only the logits of the last time step are used.
        max_new_tokens: Number of tokens to sample after the seed token.
        start_token: Token id used as the one-element starting sequence.
        output_path: Destination passed to `save_to_file`. Defaults to
            ``{YOUR_BASE_PATH}/alice_in_wonderland_overfit_4quantizers_6.wav``.

    Returns:
        None. The generated sequence is handed to `save_to_file` (defined elsewhere
        in this notebook) together with the output path.
    """
    if output_path is None:
        # Resolved at call time so later changes to YOUR_BASE_PATH are picked up.
        output_path = f"{YOUR_BASE_PATH}/alice_in_wonderland_overfit_4quantizers_6.wav"

    idx = torch.tensor([[start_token]]).to(device)

    # Inference only: no_grad keeps autograd from recording a graph for every
    # forward pass, which is wasted memory when no backward pass follows.
    with torch.no_grad():
        for _ in tqdm(range(max_new_tokens)):
            # Feed at most the last `block_size` tokens back into the model.
            idx_cond = idx[:, -block_size:]
            logits, _ = model(idx_cond)
            # Distribution over the next token, taken from the final time step.
            probs = F.softmax(logits[:, -1, :], dim=-1)
            next_index = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_index), dim=1)

    save_to_file(idx, output_path)

# Sample from the checkpoint loaded above; the .wav destination is set inside the function.
unconditional_generation(model)

100%|██████████| 1999/1999 [00:16<00:00, 124.08it/s]


In [16]:
def produce_wav(filename, model, example):
    """Continue the first half of `example` with sampled tokens and save both clips.

    The first ``len // 2`` tokens of `example` are used as the prompt; the model then
    samples as many new tokens as were cut off, so the result has the same length
    as `example`.

    Args:
        filename: Stem of the output files; ``_test.wav`` (prompt + generated tokens)
            and ``_input.wav`` (the untouched example) are appended to it.
        model: Callable that takes a ``(1, T)`` tensor of token ids and returns a
            ``(logits, loss)`` pair; only the logits of the last time step are used.
        example: 1-D tensor of token ids (it is sliced, reshaped to ``(1, n)`` and
            unsqueezed below).

    Raises:
        ValueError: If `example` is too short to provide at least one prompt token.
    """
    prompt_len = example.shape[-1] // 2
    if prompt_len == 0:
        raise ValueError("example must contain at least two tokens to form a prompt")
    max_new_tokens = example.shape[-1] - prompt_len

    idx = example[:prompt_len].reshape(1, prompt_len).to(device)

    # Inference only: no_grad keeps autograd from recording a graph for every
    # forward pass, which is wasted memory when no backward pass follows.
    with torch.no_grad():
        for _ in tqdm(range(max_new_tokens)):
            # Feed at most the last `block_size` tokens back into the model.
            idx_cond = idx[:, -block_size:]
            logits, _ = model(idx_cond)
            # Distribution over the next token, taken from the final time step.
            probs = F.softmax(logits[:, -1, :], dim=-1)
            next_index = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_index), dim=1)

    print(idx.shape, idx)
    # Sanity check kept disabled, as in the original cell:
    # assert(torch.allclose(idx[0][:firstbit], example[:firstbit]))

    # Write under YOUR_BASE_PATH, like the other cells, instead of a hard-coded folder.
    save_to_file(idx, f"{YOUR_BASE_PATH}/{filename}_test.wav")
    save_to_file(example.unsqueeze(0), f"{YOUR_BASE_PATH}/{filename}_input.wav")

    
# Prompt the model with entry 10 of the 'test' split (first element of its 'tokens' field).
# NOTE(review): the recorded run of this cell raised
#   KeyError: "Column test not in the dataset ..."
# so `audio_dataset` was bound to the raw audio dataset (columns 'original_sampling_rate',
# 'audio_array') at that point. Presumably the tokenized dataset with a 'test' split has to be
# rebuilt or bound to this name before running - confirm against the earlier cells.
produce_wav('alice_in_wonderland_overfit_4quantizers', model, audio_dataset['test'][10]['tokens'][0])

KeyError: "Column test not in the dataset. Current columns in the dataset: ['original_sampling_rate', 'audio_array']"