## Посимвольная языковая модель.

В первом задании Вам нужно написать и обучить посимвольную нейронную языковую модель для вычисления вероятностей буквенных последовательностей (то есть слов). Такие модели используются в задачах словоизменения и распознавания/порождения звучащей речи. Для обучения модели используйте данные для русского языка из [репозитория](https://github.com/sigmorphon/conll2018/tree/master/task1/surprise).

**В процессе написания Вам нужно решить следующие проблемы:**
    
* как будет выглядеть обучающая выборка; что будет являться признаками, и что - метками классов.
* как сделать так, чтобы модель при предсказании символа учитывала все предыдущие символы слова.
* какие специальные символы нужно использовать.
* как передавать в модель текущее состояние рекуррентной сети

**Результаты:**

* предобработчик данных,
* генератор обучающих данных (батчей),
* обученная модель
* перплексия модели на настроечной выборке
* посимвольные вероятности слов в контрольной выборке

**Дополнительно:**

* дополнительный вход модели (часть речи слова, другие морфологические признаки), влияет ли его добавление на перплексию
* сравнение различных архитектур нейронной сети (FC, RNN, LSTM, QRNN, ...)

Подумайте, какие вспомогательные токены могут быть вам полезны. Выдайте им индексы от `0` до `len(AUXILIARY) - 1`

**План**
- Данные
    - Признаки: набор символов токена, заканчивается токеном END
    - Метки класса: набор символов того же токена, начинается с токена BEGIN
- Для учета всех предыдущих символов, при предсказании следующего символа, дополнительно мы должны передавать на вход предыдущий токен
- Специальные символы
    - BEGIN, END, MASK, UNK
- (???) Как передавать в модель текущее состояние рекуррентной сети

In [41]:
# it is better to do all imports at the first cell
from pathlib import Path
from itertools import islice
from operator import itemgetter
from functools import partial
from argparse import Namespace

from tqdm import tqdm_notebook
from tensorboardX import SummaryWriter

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [42]:
# Uncomment to download data
# !wget https://github.com/sigmorphon/conll2018/blob/master/task1/surprise/russian-train-high
# !wget https://github.com/sigmorphon/conll2018/blob/master/task1/surprise/russian-dev
# !wget https://github.com/sigmorphon/conll2018/blob/master/task1/surprise/russian-test

In [43]:
DATA_PATH = Path('./data')
MODELS_PATH = Path('./models')

In [44]:
file_paths = {'train': DATA_PATH/'russian-train-high',
              'dev': DATA_PATH/'russian-dev',
              'test': DATA_PATH/'russian-test'}

In [5]:
class Vocabulary:
    def __init__(self, token_to_idx=None):
        # Initialize mapping (token -> idx) if empty
        if token_to_idx is None:
            token_to_idx = {}
        
        # Generate 2 mappings (tokens -> idx, idx -> token)
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
    
    def add_token(self, token):
        if token in self._token_to_idx:
            # get index of token if it is already exists in vocabulary
            index = self._token_to_idx[token]
        else:
            # for new token, append it to mapping with new index
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        
        # return index of token
        return index
    
    def lookup_token(self, token):
        # return index by token
        return self._token_to_idx[token]
    
    def lookup_index(self, index):
        # return token by index
        return self._idx_to_token[index]
    
    def __len__(self):
        # override len function to get vocabulary size more easily
        return len(self._token_to_idx)

In [6]:
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None,
                 unk_token='<UNK>',
                 mask_token='<MASK>',
                 begin_token='<BEGIN>',
                 end_token='<END>'):
        super().__init__(token_to_idx)
        
        # Save special token symbols
        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_token = begin_token
        self._end_token = end_token
        
        # Get and save indices for special token symbols
        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)        
        self.begin_index = self.add_token(self._begin_token)        
        self.end_index = self.add_token(self._end_token)
    
    def lookup_token(self, token):
        # Override method to use <UNK> index 
        # if the token is not in vocabulary
        return self._token_to_idx.get(token, self.unk_index)

In [7]:
class CharLMVectorizer:
    def __init__(self, char_vocab):
        # Save character vocabulary
        self.char_vocab = char_vocab
        
    def vectorize(self, word, vector_length=-1):
        # Wrap word with <BEGIN> and <END> tokens
        indices = [self.char_vocab.begin_index]
        indices.extend(self.char_vocab.lookup_token(token) for token in word)
        indices.append(self.char_vocab.end_index)
        
        
#         # If vector_length is provided (eval mode) as argument - use it
#         # Otherwise, alculate length for source and target vectors (train mode):
#         # length of word + 1 special token (<BEGIN> or <END>)
#         # if vector_length == -1:
#         #    vector_length = len(word) + 1
#         vector_length = len(word) + 1
        
#         # Create padded version of the source vector
#         # <BEGIN> <char1> ... <charN> <MASK> ... <MASK>
#         # where N - length of original word
#         source_vector = np.full(vector_length, self.char_vocab.mask_index,
#                                  dtype=np.int64)
#         source_indices = indices[:-1]
#         source_vector[:len(source_indices)] = source_indices
        
#         # Create padded version of the target vector
#         # <char1> ... <charN> <END> <MASK> ... <MASK>
#         # where N - length of original word
#         target_vector = np.full(vector_length, self.char_vocab.mask_index,
#                                 dtype=np.int64)
#         target_indices = indices[1:]
#         target_vector[:len(target_indices)] = target_indices

        source_vector = indices[:-1]
        target_vector = indices[1:]
        
        length = len(source_vector)
        
        # Return padded versions of the source and target vectors
        return {'source_vector': source_vector, 
                'target_vector': target_vector,
                'length': length}
    
    @classmethod
    def from_dataframe(cls, full_df, data_type):
        # Create sequence vocabulary
        char_vocab = SequenceVocabulary()
        
        # Get dataframe subset to built vocabulary
        target_df = full_df[full_df['data_type'].isin(data_type)]
        
        # Add tokens to vocabulary from train dataset
        for _, row in target_df.iterrows():
            for char in row['word']:
                char_vocab.add_token(char)
            
        return cls(char_vocab)

In [8]:
class CharLMDataset(Dataset):
    def __init__(self, full_df, vectorizer):
        # Save original dataset (train/dev/test)
        self.full_df = full_df
        
        # Save vectorizer
        self._vectorizer = vectorizer
        
        # Calculate maximum word length (including BOS/EOS tokens) 
        # across whole dataset for further padding (not efficient)
        # self._max_seq_length = max(map(len, self.full_df['word'])) + 1
        
        # Save train/dev/test datasets separately
        # and save its sizes (number of rows)
        self.train_df = self.full_df[self.full_df['data_type'] == 'train']
        self.train_size = len(self.train_df)
        
        self.dev_df = self.full_df[self.full_df['data_type'] == 'dev']
        self.dev_size = len(self.dev_df)
        
        self.test_df = self.full_df[self.full_df['data_type'] == 'test']
        self.test_size = len(self.test_df)

        # Store information about datasets in dictionary
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'dev': (self.dev_df, self.dev_size),
                             'test': (self.test_df, self.test_size)}
        
        # Set train data as default
        self.set_data_type('train')
    
    @classmethod
    def read_dataset(cls, file_path, data_type):
        # Read specific file and save its data type (train/dev/test)
        df = pd.read_csv(file_path, sep='\t', 
                         header=None, names=['word'], 
                         usecols=[0])
        df['data_type'] = data_type
        
        # Return dataframe with data and its type
        return df
    
    @classmethod
    def load_dataset(cls, files_path):
        dfs_list = []
        
        # Read all datasets specified in files_path
        for data_type, file_path in file_paths.items():
            df = cls.read_dataset(file_path, data_type)
            dfs_list.append(df)
        
        # Concatenate all datasets
        full_df = pd.concat(dfs_list, axis=0, ignore_index=True)
        
        # Return concatenated dataframe with specified data types
        return full_df
    
    @classmethod
    def make_vectorizer(cls, files_path):
        # Load all data from files specified in files_path
        full_df = cls.load_dataset(files_path)
        
        # Create CharLMDataset class using full dataset and vectorizer
        return cls(full_df, CharLMVectorizer.from_dataframe(full_df, 
                                                            data_type=['train']))
    
    def get_vectorizer(self):
        # Return vectorizer related to Dataset
        return self._vectorizer
    
    def set_data_type(self, data_type='train'):
        # Set type, data, and its size as current dataset
        self._target_type = data_type
        self._target_df, self._target_size = self._lookup_dict[data_type] 
        
    def __len__(self):
        # Return length of the current dataset
        return self._target_size
    
    def __getitem__(self, index):
        # Get example by index from the current dataset
        row = self._target_df.iloc[index]
        
        # Generate source and target vector, and its length from the example
        # with padding to maximum word size (including BOS/EOS tokens)
        vector_dict = self._vectorizer.vectorize(row['word'])#, self._max_seq_length)
        
        # Return generated vectors with its length
        return vector_dict
    
    def get_num_batches(self, batch_size):
        # Calculate the number of full batches
        # for tracking progress in tqdm
        return len(self) // batch_size

In [9]:
lm_dataset = CharLMDataset.make_vectorizer(file_paths)
print(lm_dataset._target_type)
lm_dataset.full_df['data_type'].value_counts()

train


train    10000
test      1000
dev       1000
Name: data_type, dtype: int64

In [10]:
print(lm_dataset.get_vectorizer().char_vocab._token_to_idx)
print(len(lm_dataset.get_vectorizer().char_vocab._token_to_idx))
print(lm_dataset.get_vectorizer().char_vocab._idx_to_token)
print(len(lm_dataset.get_vectorizer().char_vocab._idx_to_token))

{'<MASK>': 0, '<UNK>': 1, '<BEGIN>': 2, '<END>': 3, 'в': 4, 'а': 5, 'л': 6, 'о': 7, 'н': 8, 'с': 9, 'к': 10, 'и': 11, 'й': 12, 'е': 13, 'з': 14, 'ч': 15, 'ы': 16, 'т': 17, 'р': 18, 'ё': 19, 'п': 20, 'ь': 21, 'г': 22, 'б': 23, 'ю': 24, 'я': 25, 'д': 26, 'у': 27, 'ш': 28, 'м': 29, 'х': 30, 'ж': 31, 'ц': 32, ' ': 33, 'щ': 34, '-': 35, 'ф': 36, 'э': 37, 'ъ': 38, 'С': 39, 'Ш': 40, 'И': 41, 'З': 42, 'А': 43, 'Г': 44, 'Э': 45, 'Л': 46, 'Ф': 47, 'В': 48, 'П': 49, 'М': 50, 'Р': 51, 'Б': 52, 'Х': 53, 'Н': 54, 'Е': 55}
56
{0: '<MASK>', 1: '<UNK>', 2: '<BEGIN>', 3: '<END>', 4: 'в', 5: 'а', 6: 'л', 7: 'о', 8: 'н', 9: 'с', 10: 'к', 11: 'и', 12: 'й', 13: 'е', 14: 'з', 15: 'ч', 16: 'ы', 17: 'т', 18: 'р', 19: 'ё', 20: 'п', 21: 'ь', 22: 'г', 23: 'б', 24: 'ю', 25: 'я', 26: 'д', 27: 'у', 28: 'ш', 29: 'м', 30: 'х', 31: 'ж', 32: 'ц', 33: ' ', 34: 'щ', 35: '-', 36: 'ф', 37: 'э', 38: 'ъ', 39: 'С', 40: 'Ш', 41: 'И', 42: 'З', 43: 'А', 44: 'Г', 45: 'Э', 46: 'Л', 47: 'Ф', 48: 'В', 49: 'П', 50: 'М', 51: 'Р', 52: '

In [11]:
lm_dataset[1]

{'source_vector': [2, 8, 13, 14, 5, 10, 7, 8, 15, 13, 8, 8, 16, 12],
 'target_vector': [8, 13, 14, 5, 10, 7, 8, 15, 13, 8, 8, 16, 12, 3],
 'length': 14}

In [12]:
lm_dataset.get_vectorizer().vectorize('НеЙрОсЕтЬ')

{'source_vector': [2, 54, 13, 1, 18, 1, 9, 55, 17, 1],
 'target_vector': [54, 13, 1, 18, 1, 9, 55, 17, 1, 3],
 'length': 10}

In [13]:
dd = [{'source_vector': np.array([ 2,  4,  5,  6,  6,  7,  8,  9, 10, 11, 12]), 
       'target_vector': np.array([ 4,  5,  6,  6,  7,  8,  9, 10, 11, 12,  3]), 'length': 11}, 
      {'source_vector': np.array([ 2,  8, 13, 14,  5, 10,  7,  8, 15, 13,  8,  8, 16, 12]), 
       'target_vector': np.array([ 8, 13, 14,  5, 10,  7,  8, 15, 13,  8,  8, 16, 12,  3]), 'length': 14}]

In [14]:
def pad_sequence(elem, item_name, max_length, value=0):
    data = elem[item_name]
    data_len = elem['length']
    data = np.pad(data, (0, max_length - data_len), 
                  mode='constant', constant_values=value)
    return data

In [15]:
l = [x for x in range(3)]
torch.tensor(l)

tensor([0, 1, 2])

In [25]:
def collate_fn(batch):
    get_length_item = itemgetter('length')
    
    batch_lengths = torch.tensor(list(map(get_length_item, batch)))
    max_batch_length = torch.max(batch_lengths)
    
    padded_source_batch = partial(pad_sequence, item_name='source_vector', 
                                  max_length=max_batch_length, value=0)
    padded_source_batch = list(map(padded_source_batch, batch))
    padded_source_batch = np.vstack(padded_source_batch)
    padded_source_batch = torch.from_numpy(padded_source_batch)
    
    padded_target_batch = partial(pad_sequence, item_name='target_vector', 
                                  max_length=max_batch_length, value=0)
    padded_target_batch = list(map(padded_target_batch, batch))
    padded_target_batch = np.vstack(padded_target_batch)
    padded_target_batch = torch.from_numpy(padded_target_batch)
    
    return {'source_batch': padded_source_batch, 
            'target_batch': padded_target_batch,
            'batch_lengths': batch_lengths}

In [26]:
def generate_batches(dataset, batch_size, collate_fn,
                     shuffle=True, drop_last=True,
                     device='cpu'):
    data_loader = DataLoader(dataset=dataset, batch_size=batch_size,
                             shuffle=shuffle, drop_last=drop_last,
                             collate_fn=collate_fn)
    
    for data_dict in data_loader:
        lengths = data_dict['batch_lengths'].numpy()
        sort_idx = lengths.argsort()[::-1].tolist()
        
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name][sort_idx].to(device)
        yield out_data_dict

In [27]:
for batch in islice(generate_batches(lm_dataset, batch_size=3, 
                                     shuffle=False, collate_fn=collate_fn), 1):
    print(batch)
    x_source = batch['source_batch']
    lengths = batch['batch_lengths']
    print(x_source, lengths)

{'source_batch': tensor([[ 2,  8, 13, 14,  5, 10,  7,  8, 15, 13,  8,  8, 16, 12],
        [ 2, 11,  9, 17, 18, 19, 20, 16,  4,  5, 17, 21,  0,  0],
        [ 2,  4,  5,  6,  6,  7,  8,  9, 10, 11, 12,  0,  0,  0]]), 'target_batch': tensor([[ 8, 13, 14,  5, 10,  7,  8, 15, 13,  8,  8, 16, 12,  3],
        [11,  9, 17, 18, 19, 20, 16,  4,  5, 17, 21,  3,  0,  0],
        [ 4,  5,  6,  6,  7,  8,  9, 10, 11, 12,  3,  0,  0,  0]]), 'batch_lengths': tensor([14, 12, 11])}
tensor([[ 2,  8, 13, 14,  5, 10,  7,  8, 15, 13,  8,  8, 16, 12],
        [ 2, 11,  9, 17, 18, 19, 20, 16,  4,  5, 17, 21,  0,  0],
        [ 2,  4,  5,  6,  6,  7,  8,  9, 10, 11, 12,  0,  0,  0]]) tensor([14, 12, 11])


In [45]:
class CharLMModel(nn.Module):
    def __init__(self, num_embeddings, embedding_size,
                 hidden_size, num_classes):
        super().__init__()
        
        self.embedding = nn.Embedding(num_embeddings, embedding_size, 
                                      padding_idx=0)
        self.rnn = nn.GRU(embedding_size, hidden_size, 
                          bidirectional=False, batch_first=True)
        
        self.fc1 = nn.Linear(in_features=hidden_size,
                             out_features=num_classes)
        
    def forward(self, x_source, x_lengths, apply_softmax=False):
        x_embedded = self.embedding(x_source)
        x_packed = pack_padded_sequence(x_embedded, x_lengths.detach().cpu().numpy(),
                                        batch_first=True)
        x_rnn_out, x_rnn_h = self.rnn(x_packed)
#         x_rnn_h = x_rnn_h.permute(1, 0, 2)
#         x_rnn_h = x_rnn_h.reshape(x_rnn_h.shape[0], -1)
        x_unpacked, _ = pad_packed_sequence(x_rnn_out, batch_first=True)
        y_out = self.fc1(x_unpacked)
        
        if apply_softmax:
            y_out = F.softmax(y_out, dim=2)
        
        return y_out # x_unpacked #, x_rnn_h

In [46]:
vectorizer = lm_dataset.get_vectorizer()
vocab_size = len(vectorizer.char_vocab)

model = CharLMModel(num_embeddings=vocab_size,
                    embedding_size=20,
                    hidden_size=10,
                    num_classes=vocab_size)

In [47]:
y_out = model(x_source, lengths)
y_out.shape

torch.Size([3, 14, 56])

Setting all possible random states to fixed number

In [49]:
def set_seeds(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

Create namespace with all parameters for training (specified values were used for the final model)

In [65]:
args = Namespace(
    file_paths = {'train': DATA_PATH/'russian-train-high',
                  'dev': DATA_PATH/'russian-dev',
                  'test': DATA_PATH/'russian-test'},
    model_state_path = MODELS_PATH/'charLMModel.pth',
    
    embedding_size = 50,
    hidden_size = 10,
    
    seed = 42,
    
    num_epochs = 3,
    batch_size = 1024,
    learning_rate = 0.03,
    save_iterations = 1e8,
    early_stopping_criteria = 1e8,
    factor=1e8,
    patience=1e8,
    
    cuda=True
)

Create functions for creating and updating necessary parameters while training

In [66]:
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': [], 
            'epoch_idx': 0,
            'batch_idx': 0,
            'loss': [],
            'model_file_name': args.model_state_path}

def update_train_state(args, model, train_state):
    if train_state['batch_idx'] == 0:
        train_state['stop_early'] = False
        torch.save(model.state_dict(), train_state['model_file_name'])
    else:
        loss = train_state['loss'][-1]

        if loss < train_state['early_stopping_best_val']:
            train_state['early_stopping_best_val'] = loss
            train_state['early_stopping_step'] = 0
            
            if train_state['batch_idx'] % args.save_iterations == 0:
                torch.save(model.state_dict(), train_state['model_file_name'])
        else:
            train_state['early_stopping_step'] += 1 
    
        train_state['stop_early'] = train_state['early_stopping_step'] >= args.early_stopping_criteria
    return train_state

Check if we can use GPU or CPU

In [67]:
if not torch.cuda.is_available():
    args.cuda=False
    
print(f'Using CUDA: {args.cuda}')
args.device = torch.device('cuda' if args.cuda else 'cpu')

Using CUDA: False


### Main Training Cycle

In [69]:
set_seeds(args.seed)

lm_dataset = CharLMDataset.make_vectorizer(args.file_paths)

vectorizer = lm_dataset.get_vectorizer()
mask_index = vectorizer.char_vocab.mask_index
vocab_size = len(vectorizer.char_vocab)

model = CharLMModel(num_embeddings=vocab_size,
                    embedding_size=args.embedding_size,
                    hidden_size=args.hidden_size,
                    num_classes=vocab_size)
model = model.to(args.device)

optimizer = optim.Adam(params=model.parameters(),
                      lr=args.learning_rate)

sequence_loss = nn.CrossEntropyLoss(ignore_index=mask_index)

# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
#                                                  mode='min',
#                                                  factor=args.factor,
#                                                  patience=args.patience)

epoch_bar = tqdm_notebook(desc='Epochs', 
                          total=args.num_epochs,
                          position=0)

lm_dataset.set_data_type('train')
train_bar = tqdm_notebook(desc='Train data',
                          total=lm_dataset.get_num_batches(args.batch_size), 
                          position=1)

lm_dataset.set_data_type('dev')
val_bar = tqdm_notebook(desc='Dev data',
                        total=lm_dataset.get_num_batches(args.batch_size), 
                        position=1)

train_state = make_train_state(args)

writer = SummaryWriter(log_dir='logs', comment='task_1')

try:
    for epoch_index in range(1, args.num_epochs + 1):
        train_state['epoch_index'] = epoch_index
        
        lm_dataset.set_data_type('train')
        batch_generator = generate_batches(dataset=lm_dataset, 
                                           batch_size=args.batch_size,
                                           collate_fn=collate_fn,
                                           shuffle=False,
                                           drop_last=True,
                                           device=args.device)
        
        running_loss = 0.0
        running_acc = 0.0
        model.train()
        
        for batch_idx, batch_dict in enumerate(batch_generator, 1):
            optimizer.zero_grad()

            y_pred = model(batch['source_batch'], 
                           batch['batch_lengths'])
            y_pred = y_pred.reshape(-1, y_pred.shape[2])
            
            y_true = batch['target_batch']
            y_true = y_true.reshape(-1)
            
            loss = sequence_loss(y_pred, y_true)

#             loss = loss / args.batch_size
            loss_value = loss.item()
            running_loss += (loss_value - running_loss) / (batch_idx)
            loss.backward()
            
            print(loss)

            learning_rate = optimizer.param_groups[0]['lr']

            train_state['batch_idx'] = batch_idx
            train_state['loss'].append(running_loss)
            train_state['learning_rate'].append(learning_rate)

            train_state = update_train_state(args=args,
                                             model=model,
                                             train_state=train_state)

            train_params = dict(loss=running_loss,
                                lr=learning_rate,
                                early_step=train_state['early_stopping_step'],
                                early_best=train_state['early_stopping_best_val'])
            train_bar.set_postfix(train_params)
            train_bar.update()

            optimizer.step()
#             scheduler.step(train_state['loss'][-1])
            
#             iter_idx += 1

            if train_state['stop_early']:
                break
        
        train_bar.n = 0
        epoch_bar.update()
        
#     torch.save(classifier.state_dict(), str(train_state['model_file_name']) + '_last')
except KeyboardInterrupt:
    print('Exit training')

HBox(children=(IntProgress(value=0, description='Epochs', max=3, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='Train data', max=9, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=1, bar_style='info', description='Dev data', max=1, style=ProgressStyle(descr…

tensor(4.1607, grad_fn=<NllLossBackward>)
tensor(3.7125, grad_fn=<NllLossBackward>)
tensor(3.4096, grad_fn=<NllLossBackward>)
tensor(3.1547, grad_fn=<NllLossBackward>)
tensor(2.9171, grad_fn=<NllLossBackward>)
tensor(2.6945, grad_fn=<NllLossBackward>)
tensor(2.4886, grad_fn=<NllLossBackward>)
tensor(2.3034, grad_fn=<NllLossBackward>)
tensor(2.1382, grad_fn=<NllLossBackward>)
tensor(1.9875, grad_fn=<NllLossBackward>)
tensor(1.8530, grad_fn=<NllLossBackward>)
tensor(1.7300, grad_fn=<NllLossBackward>)
tensor(1.6163, grad_fn=<NllLossBackward>)
tensor(1.5115, grad_fn=<NllLossBackward>)
tensor(1.4154, grad_fn=<NllLossBackward>)
tensor(1.3254, grad_fn=<NllLossBackward>)
tensor(1.2427, grad_fn=<NllLossBackward>)
tensor(1.1640, grad_fn=<NllLossBackward>)
tensor(1.0937, grad_fn=<NllLossBackward>)
tensor(1.0260, grad_fn=<NllLossBackward>)
tensor(0.9648, grad_fn=<NllLossBackward>)
tensor(0.9075, grad_fn=<NllLossBackward>)
tensor(0.8568, grad_fn=<NllLossBackward>)
tensor(0.8102, grad_fn=<NllLossBac

CrossEntropyLoss()