https://www.notion.so/c2cc4e28d7ad46de912c9996f52276da

In [1]:
import os
import pandas as pd
import numpy as np
import youtokentome as yttm
from tqdm.notebook import tqdm

import torch
from torch import nn, Tensor
from torch.optim.lr_scheduler import StepLR

from models import LstmClassificator, LstmPackedClassificator, DoubleLstmClassificator
from train_model import train_model, run_model, save_model
from data_utils import create_tokenizer, TextDataset, TextWithLengthDataset, SplitTextDataset, Dataloaders

# Target vocabulary size; also selects the tokenizer file name below.
vocab_size = 184 # 2 ** 7 # 184
# Ids of the special tokens. They are handed to `create_tokenizer`, and
# PAD_TOKEN is additionally used as the padding value for loaders and the model.
PAD_TOKEN = 0
UNK_TOKEN = 1
BOS_TOKEN = 2
EOS_TOKEN = 3

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Batch size for the train/val loaders.
BATCH_SIZE = 32 # 64

# Choose dataset
data_path = 'data'
train_data_path = os.path.join(data_path, 'train_data.tsv')
test_data_path = os.path.join(data_path, 'test_data.tsv')
# Tokenizer model path.
tokenizer_path = os.path.join('tokenizer', f'v{vocab_size}.tokenizer')

# Dataset and model implementations used by the cells below.
# NOTE(review): presumably these two must be a matching pair (both work with
# sequence lengths) — confirm in data_utils / models before swapping one.
DatasetClass = TextWithLengthDataset
ModelClass = LstmPackedClassificator

print(f'Vocab size: {vocab_size}')

Vocab size: 184


In [2]:
# Report whether CUDA is usable and, if so, which device index is current.
# `torch.cuda.current_device()` raises on a machine without CUDA, so it is only
# queried when a GPU is available; on CPU-only machines the second item is None.
cuda_available = torch.cuda.is_available()
cuda_available, (torch.cuda.current_device() if cuda_available else None)

(True, 0)

## Load Data

In [3]:
# Both splits are tab-separated files whose first column is used as the index.
train_data, test_data = (
    pd.read_csv(path, sep='\t', index_col=0)
    for path in (train_data_path, test_data_path)
)

  mask |= (ar1 == a)


In [4]:
# Sanity check: number of training rows flagged as duplicates of another row.
train_data.duplicated().sum()

0

In [5]:
# Index labels of the positive and the negative rows, to see how unbalanced
# the target column is.
true_inds, false_inds = (
    train_data.index[train_data.answer == flag] for flag in (True, False)
)
len(true_inds), len(false_inds)

(182876, 3800327)

In [6]:
# Text given to `create_tokenizer`: the Russian and English name columns of the
# train split followed by those of the test split.
data_for_tokenizer = [
    frame[column]
    for frame in (train_data, test_data)
    for column in ('ru_name', 'eng_name')
]
tokens_ids = dict(pad_id=PAD_TOKEN, unk_id=UNK_TOKEN,
                  bos_id=BOS_TOKEN, eos_id=EOS_TOKEN)
tokenizer = create_tokenizer(tokenizer_path, data_for_tokenizer, vocab_size, tokens_ids)
# The helper inputs are not needed once the tokenizer exists.
del data_for_tokenizer, tokens_ids

Loading pretrained tokenizer...


In [7]:
# Fraction of rows assigned to the training split.
ratio = 0.8
# Fixed seed for the split. The same `msk` is reused further down to build the
# full hold-out set, so it has to come out identical after every kernel restart;
# a dedicated RandomState also leaves the global NumPy random state untouched.
SPLIT_SEED = 42
# TODO: Distribute true/false evenly?
msk = np.random.RandomState(SPLIT_SEED).rand(len(train_data)) < ratio

# `sample_size` values passed to the dataset class for each split.
train_sample_size = 130000 # len(true_inds) // 2
test_sample_size  = 36000 # len(true_inds) // 4


datasets = {
    'train': DatasetClass(train_data[msk],  tokenizer, sample_size=train_sample_size),
    'val':   DatasetClass(train_data[~msk], tokenizer, sample_size=test_sample_size),
}

print('Train:', len(datasets['train']),
      '\nVal:', len(datasets['val']))

Train: 260000 
Val: 72000


In [8]:
# Class balance of the frame stored on one of the datasets.
dataset_name = 'val'
split_frame = datasets[dataset_name].data
dataset_true_inds, dataset_false_inds = (
    split_frame.index[split_frame.answer == flag] for flag in (True, False)
)
len(dataset_true_inds), len(dataset_false_inds)

(36316, 760075)

In [9]:
# Eyeball one random training item: decode its first element (the token ids)
# back into subword strings.
train_set = datasets['train']
idx = np.random.randint(len(train_set))
train_set.decode(train_set[idx][0])

['<BOS>',
 '▁общество',
 '▁с',
 '▁ограниченной',
 '▁ответственностью',
 '▁"',
 'в',
 'о',
 'ст',
 'о',
 'ч',
 'н',
 'а',
 'я',
 '▁',
 'р',
 'е',
 'м',
 'о',
 'н',
 'т',
 'но',
 '-',
 'ст',
 'ро',
 'и',
 'т',
 'е',
 'л',
 'ь',
 'н',
 'а',
 'я',
 '▁',
 'ко',
 'м',
 'п',
 'ани',
 'я',
 '"',
 '<EOS>',
 '▁',
 'e',
 'a',
 'st',
 '▁',
 'r',
 'e',
 'p',
 'a',
 'i',
 'r',
 '▁',
 'b',
 'u',
 'i',
 'l',
 'd',
 'in',
 'n',
 'g',
 '▁company',
 '▁limited',
 '<EOS>']

In [10]:
# Loader settings shared by every split in `datasets`.
loader_options = dict(
    pad_token=PAD_TOKEN,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
)
data_iterators = Dataloaders(datasets, **loader_options)

## LSTM

In [11]:
from models import ModuleWithWeightsInit, PoolingModule, Attn
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [12]:
# Layer sizes are named once so that the checkpoint file name below is derived
# from the values the network is actually built with, instead of repeating them
# as literals that can silently drift apart.
emb_size = 64
hidden_size = 128

model = ModelClass(vocab_size=tokenizer.vocab_size(),
                   emb_size=emb_size,
                   hidden_size=hidden_size,
                   num_classes=2,
                   num_layers=2,
                   padding_idx=PAD_TOKEN,
                   dropout=0.1,
                   fc_dims=1024,
                   pool_attn=False,
                   pool_max=True,
                   pool_min=True,
                   )
model = model.to(device)
# No metric history yet; `train_model` receives this value via its `stats` argument.
stats = None
# With the current sizes this is still 'packed_184_64_128.model' under 'models'.
model_save_path = os.path.join('models', f'packed_{vocab_size}_{emb_size}_{hidden_size}.model')

In [13]:
# model, stats = torch.load(model_save_path)

In [14]:
# Optimisation setup consumed by the `train_model` call in the next cell.
n_epochs = 1
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# StepLR multiplies the learning rate by `gamma` once every `step_size` scheduler steps.
# NOTE(review): how often train_model steps the scheduler is not visible here — confirm.
scheduler = StepLR(optimizer, step_size=2, gamma=0.95)

In [15]:
# Train for `n_epochs`. The current `stats` is passed back in, so re-running
# this cell hands the earlier history to train_model.
# NOTE(review): presumably train_model appends to that history — confirm in train_model.py.
stats = train_model(model, n_epochs, data_iterators, criterion, optimizer,
                    scheduler=scheduler, stats=stats)

------------ Epoch 0; lr: 0.00100 ------------
Resample data from datasets...


HBox(children=(FloatProgress(value=0.0, description='Train Epoch #0 ', max=8125.0, style=ProgressStyle(descrip…


Train Acc: 0.8818
Train Loss: 0.2881


HBox(children=(FloatProgress(value=0.0, description='Val Epoch #0 ', max=2250.0, style=ProgressStyle(descripti…


Val Acc: 0.9307
Val Loss: 0.1936


In [16]:
# Show the accuracy/loss history returned by train_model for each phase.
stats

{'train': {'acc': [0.8817538461538461], 'loss': [0.28805430213717315]},
 'val': {'acc': [0.9306805555555555], 'loss': [0.19356126191218695]}}

In [None]:
import matplotlib.pyplot as plt

# One figure per tracked metric: validation as a solid blue line, training as
# a dashed green line.
for metric, title in (('acc', 'Accuracy'), ('loss', 'Loss')):
    plt.title(title)
    plt.plot(stats['val'][metric],   '-b',  label='Val')
    plt.plot(stats['train'][metric], '--g', label='Train')
    plt.legend()
    plt.show()

## Test on full validation set

In [None]:
# Evaluate on the held-out rows (`~msk`). Unlike the 'val' dataset built earlier,
# no `sample_size` is passed here.
# NOTE(review): the effect of `normalize=False` is defined in data_utils — confirm.
test_dataset = DatasetClass(train_data[~msk], tokenizer, normalize=False)
test_iterator = Dataloaders({'test':test_dataset}, pad_token=PAD_TOKEN,
                                             batch_size=128, shuffle=False, num_workers=0)
# Passed positionally to run_model as its train/eval switch.
is_train_phase = False
desc = "Test model "
test_results = run_model(model, test_iterator['test'], is_train_phase, criterion, optimizer, desc=desc)

# max 98.5

## Make predictions

In [19]:
def predict(model, data_iterator, desc=""):
    """Run the model in inference mode over the given data and collect its outputs.

    :param model: model to run; it is switched to eval mode and left in it
    :param data_iterator: iterable of batches, each a sequence of tensors that
        is unpacked into ``model(*batch)``
    :param desc: description for the status printing
    :returns: list of predictions, obtained by extending a flat list with every
        batch output (one entry per element along the output's first dimension)
    """
    # Get the device from the model. `.device` is valid for CPU and CUDA
    # parameters alike, whereas `.get_device()` yields -1 for CPU tensors,
    # which is not a usable target for `.to()`.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    # Put the model in eval mode.
    model.eval()
    predictions = []
    # Inference needs neither gradients nor an optimizer, so nothing outside
    # the function's arguments is touched.
    with torch.no_grad():
        for data in tqdm(data_iterator, desc=desc, position=0, leave=True):
            # Build a new list instead of assigning into `data`, so batches
            # delivered as tuples work as well.
            batch = [tensor.to(device, non_blocking=True) for tensor in data]
            out = model(*batch)
            predictions.extend(out)
    return predictions

In [20]:
from torch.utils.data import Dataset
from data_utils import pad_sequence

class TestTextDataset(Dataset):
    """Dataset over unlabeled name pairs, used for making predictions.

    Every item is built from the `ru_name` and `eng_name` columns of one row:
    the Russian name encoded with ``bos=True``, a separator token, and the
    English name encoded with ``eos=True``. No target is returned.
    """

    __output_types = {'id': yttm.OutputType.ID,
                      'subword': yttm.OutputType.SUBWORD}

    def __init__(self, data, tokenizer):
        """
        :param data: frame-like object with `ru_name` and `eng_name` columns
        :param tokenizer: tokenizer providing `encode` and `id_to_subword`
        """
        self.data = data
        self.tokenizer = tokenizer
        # TODO: Change `eos` token to `sep` token
        # The separator is the first id produced for an empty string encoded
        # with `eos=True` (presumably the EOS id — verify against the tokenizer).
        self._sep_token = self.tokenize("", eos=True)[0]

    def tokenize(self, sentence, output_type='id', **kwargs):
        """Tokenize a sentence, or every sentence of an iterable.

        :param sentence: a string, or an iterable of strings (handled recursively)
        :param output_type: either 'id' or 'subword' for corresponding output
        :param kwargs: forwarded to the tokenizer's `encode` (e.g. `bos`, `eos`)
        :return: tokenized sentence, or a list with one result per sentence
        """
        if not isinstance(sentence, str):
            return [self.tokenize(sent, output_type, **kwargs) for sent in sentence]
        # Text is lower-cased and stripped before encoding.
        return self.tokenizer.encode(sentence.lower().strip(),
                                     output_type=self.__output_types[output_type], **kwargs)

    def decode(self, tokens):
        """Map each token id in `tokens` to its subword string."""
        return [self.tokenizer.id_to_subword(token) for token in tokens]

    @staticmethod
    def _my_collate(batch, pad_token):
        """Turn a list of `(src, src_len)` items into padded batch tensors.

        :param batch: items as returned by `__getitem__`
        :param pad_token: value used to pad the sequences
        :return: `[src, src_lens]`, both integer (long) tensors
        """
        src, src_lens = zip(*batch)
        # Build integer tensors directly instead of going through float32.
        src = [torch.tensor(s, dtype=torch.long) for s in src]
        # The output dtype of data_utils.pad_sequence is not visible from here,
        # so the explicit cast is kept.
        src = pad_sequence(src, batch_first=True, padding_value=pad_token).long()
        src_lens = torch.tensor(src_lens, dtype=torch.long)
        return [src, src_lens]

    def __len__(self):
        """Number of rows in the underlying data."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return `(token_ids, length)` for the row at position `idx`."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        ru  = self.tokenize(self.data['ru_name'].iloc[idx],  bos=True)
        eng = self.tokenize(self.data['eng_name'].iloc[idx], eos=True)
        src = ru + [self._sep_token] + eng
        return src, len(src)

In [None]:
# Wrap the unlabeled test frame in a loader and run the model over it.
# `shuffle=False` matters here: the predictions are later written out as a
# plain list with no row key.
# NOTE(review): that the loader yields rows in `test_data` order is assumed — confirm in Dataloaders.
test_dataset = {'test': TestTextDataset(test_data,  tokenizer)}
test_iterator = Dataloaders(test_dataset, pad_token=PAD_TOKEN,
                                          batch_size=128, shuffle=False, num_workers=0)

res = predict(model, test_iterator['test'], desc='Predict')

In [None]:
# Reduce every per-sample output to the index of its largest entry.
res_test = [scores.topk(1).indices.squeeze().item() for scores in tqdm(res)]

In [69]:
# Single-column frame holding the predicted class of every test row.
result = pd.DataFrame({'answer': res_test})

In [71]:
# Write the predictions as a tab-separated file. The file name previously
# started with a stray space (' result.tsv'), which produced an awkwardly named
# file that is easy to miss.
result.to_csv('result.tsv', sep='\t')