In [1]:
from allennlp.data.dataset_readers import DatasetReader, Seq2SeqDatasetReader
from allennlp.data.tokenizers import Tokenizer, WordTokenizer, CharacterTokenizer
from allennlp.data.fields import LabelField, TextField, Field
from allennlp.data import Instance
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.iterators import BucketIterator

from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask
from allennlp.models import Model
from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder

# preprocessing
from allennlp.data.dataset_readers.seq2seq import Seq2SeqDatasetReader
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers.character_tokenizer import CharacterTokenizer
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

# encoder
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.modules.seq2seq_encoders import StackedSelfAttentionEncoder

# model
from allennlp.nn.activations import Activation
from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq
from allennlp.training.trainer import Trainer
from allennlp.data.iterators import BucketIterator

# attention
from allennlp.modules.attention import AdditiveAttention
from allennlp.modules.matrix_attention import BilinearMatrixAttention, DotProductMatrixAttention

# other packages
from overrides import overrides
import pandas as pd
from collections import namedtuple
from sklearn.model_selection import train_test_split
import itertools
from allennlp.predictors import SimpleSeq2SeqPredictor

# pytorch
import torch
import torch.nn as nn
import torch.optim as optim




In [2]:
torch.cuda.is_available()

True

In [3]:
Config = namedtuple('Config', [
                    'lazy',
                    'max_vocab_size', 
                    'batch_size', 
                    'epochs', 
                    'max_seq_len', 
                    'IN_EMBEDDING_DIM', 
                    'HIDDEN_DIM', 
                    'OUT_EMBEDDING_DIM', 
                    'CUDA_DEVICE',
                    'n_samples'
])

# create config file

In [4]:
config = Config(False,
               10000,
               64,
               10,
               100,
               64,
               32,
               64,
               0 if torch.cuda.is_available() else -1, 
               60000)

# prepare data

In [None]:
raw_dataset = pd.read_csv('data/ru_train.csv')

In [None]:
raw_dataset['before'] = raw_dataset['before'].astype(str)
raw_dataset['after'] = raw_dataset['after'].astype(str)

In [None]:
d = raw_dataset['class'].value_counts().to_dict()

In [None]:
for i in d:
    if d[i] > config.n_samples:
        d[i] = config.n_samples

In [None]:
d

In [None]:
from imblearn.under_sampling import RandomUnderSampler


rus = RandomUnderSampler(sampling_strategy=d, random_state=0)

In [None]:
raw_dataset_resampled, _ = rus.fit_resample(raw_dataset, raw_dataset['class'])

In [None]:
raw_dataset_resampled['class'].value_counts()

In [None]:
df = raw_dataset_resampled[['before', 'after']]

In [None]:
df_train, df_test = train_test_split(df, test_size=0.1)

In [None]:
df_train.to_csv('train_dataset.tsv', index=False, header=False, sep='\t')
df_test.to_csv('test_dataset.tsv', index=False, header=False, sep='\t')

# create reader

In [5]:
reader = Seq2SeqDatasetReader(
    source_tokenizer = CharacterTokenizer(),
    target_tokenizer = CharacterTokenizer(),
    source_token_indexers={'tokens': SingleIdTokenIndexer()},
    target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')},
    lazy=config.lazy
)

train_dataset = reader.read('train_dataset.tsv',)
validation_dataset = reader.read('test_dataset.tsv')

431636it [00:25, 16993.75it/s]
47960it [00:01, 32467.59it/s]


# prepare vocabulary

In [6]:
vocab = Vocabulary.from_instances(train_dataset,
                                  min_count={'tokens': 3, 'target_tokens': 3}
                                 )

100%|██████████| 431636/431636 [00:06<00:00, 68458.98it/s]


In [7]:
in_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                         embedding_dim=config.IN_EMBEDDING_DIM
                        )

In [8]:
source_embedder = BasicTextFieldEmbedder({"tokens": in_embedding})

In [9]:
iterator = BucketIterator(batch_size=config.batch_size, 
                          sorting_keys=[("source_tokens", "num_tokens")],
                         )

In [10]:
iterator.index_with(vocab)

# prepare model

In [11]:
## get simple encoder
encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(config.IN_EMBEDDING_DIM, 
                                              config.HIDDEN_DIM, 
                                              batch_first=True))

attention = AdditiveAttention(config.HIDDEN_DIM, config.HIDDEN_DIM)

In [12]:
max_decoding_steps = 100

model = SimpleSeq2Seq(vocab, 
                      source_embedder, 
                      encoder, 
                      max_decoding_steps,
                      target_embedding_dim=config.OUT_EMBEDDING_DIM,
                      target_namespace='target_tokens',
                      beam_size=8,
                      use_bleu=True,
                      attention=attention,
                      scheduled_sampling_ratio = 0.15)

In [13]:
if torch.cuda.is_available():
    model.cuda(config.CUDA_DEVICE)

In [14]:
optimizer = optim.Adam(model.parameters())

# training

In [15]:
torch.cuda.is_available()

True

In [16]:
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  num_epochs=1,
                  cuda_device=config.CUDA_DEVICE,
                  patience=2)

In [17]:
print(f'Will train for {config.epochs} epochs')
for i in range(config.epochs):
    print(f'Epoch: {i+1}')
    trainer.train()

  0%|          | 0/6745 [00:00<?, ?it/s]

Will train for 5 epochs
Epoch: 1


loss: 1.6567 ||: 100%|██████████| 6745/6745 [08:25<00:00, 13.33it/s] 
BLEU: 0.4146, loss: 0.7449 ||: 100%|██████████| 750/750 [05:28<00:00,  2.28it/s]
  0%|          | 0/6745 [00:00<?, ?it/s]

Epoch: 2


loss: 0.5141 ||: 100%|██████████| 6745/6745 [08:16<00:00, 13.58it/s]
BLEU: 0.6049, loss: 0.2693 ||: 100%|██████████| 750/750 [05:13<00:00,  2.40it/s]
  0%|          | 0/6745 [00:00<?, ?it/s]

Epoch: 3


loss: 0.2538 ||: 100%|██████████| 6745/6745 [08:11<00:00, 13.73it/s]
BLEU: 0.7464, loss: 0.1764 ||: 100%|██████████| 750/750 [04:48<00:00,  2.60it/s]
  0%|          | 0/6745 [00:00<?, ?it/s]

Epoch: 4


loss: 0.1868 ||: 100%|██████████| 6745/6745 [08:08<00:00, 13.81it/s]
BLEU: 0.7865, loss: 0.1441 ||: 100%|██████████| 750/750 [04:31<00:00,  2.77it/s]
  0%|          | 0/6745 [00:00<?, ?it/s]

Epoch: 5


loss: 0.1557 ||: 100%|██████████| 6745/6745 [08:08<00:00, 13.81it/s]
BLEU: 0.8343, loss: 0.1104 ||: 100%|██████████| 750/750 [04:30<00:00,  2.78it/s]
