In [2]:
from allennlp.data.dataset_readers import DatasetReader, Seq2SeqDatasetReader
from allennlp.data.tokenizers import Tokenizer, WordTokenizer, CharacterTokenizer
from allennlp.data.fields import LabelField, TextField, Field
from allennlp.data import Instance
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.iterators import BucketIterator

from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask
from allennlp.models import Model
from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder

# preprocessing 
from allennlp.data.dataset_readers.seq2seq import Seq2SeqDatasetReader
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers.character_tokenizer import CharacterTokenizer
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

# encoder
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.modules.seq2seq_encoders import StackedSelfAttentionEncoder

# model
from allennlp.nn.activations import Activation
from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq
from allennlp.training.trainer import Trainer
from allennlp.data.iterators import BucketIterator

# attention
from allennlp.modules.attention import LinearAttention, BilinearAttention, DotProductAttention

# other packages
from overrides import overrides
import pandas as pd
from collections import namedtuple
from sklearn.model_selection import train_test_split
import itertools
from allennlp.predictors import SimpleSeq2SeqPredictor

# pytorch
import torch
import torch.nn as nn
import torch.optim as optim




In [3]:
# Report whether PyTorch can see a CUDA device; the config cell uses the same check
# to choose CUDA_DEVICE.
torch.cuda.is_available()

True

In [4]:
# Immutable container for the run settings. Field order matters to any caller
# that constructs a Config positionally.
Config = namedtuple(
    'Config',
    (
        'lazy',               # forwarded to the dataset reader's `lazy` flag
        'max_vocab_size',
        'batch_size',         # batch size given to the BucketIterator
        'lr',                 # learning-rate setting
        'epochs',             # number of passes made by the training loop
        'max_seq_len',
        'IN_EMBEDDING_DIM',   # source embedding size, also the encoder input size
        'HIDDEN_DIM',         # encoder hidden size
        'OUT_EMBEDDING_DIM',  # target embedding size used by the seq2seq model
        'CUDA_DEVICE',        # GPU index, or -1 when running on CPU
    ),
)

In [5]:
# Settings for this run, spelled out by field name so no value can land in the
# wrong slot.
config = Config(
    lazy=False,
    max_vocab_size=10000,
    batch_size=32,
    lr=10e-4,  # equals 1e-3
    epochs=10,
    max_seq_len=128,
    IN_EMBEDDING_DIM=64,
    HIDDEN_DIM=32,
    OUT_EMBEDDING_DIM=32,
    # First GPU when CUDA is available, otherwise -1 (CPU).
    CUDA_DEVICE=0 if torch.cuda.is_available() else -1,
)

In [6]:
# Character-level reader for source/target pairs. Source characters use the
# indexer's default namespace; target characters are indexed under
# 'target_tokens' so the two sides get separate vocabularies.
reader = Seq2SeqDatasetReader(
    source_tokenizer=CharacterTokenizer(),
    source_token_indexers=dict(tokens=SingleIdTokenIndexer()),
    target_tokenizer=CharacterTokenizer(),
    target_token_indexers=dict(tokens=SingleIdTokenIndexer(namespace='target_tokens')),
    lazy=config.lazy,
)

In [7]:
# Load the raw training table; the path is relative to the notebook's working directory.
raw_dataset = pd.read_csv('data/ru_train.csv')

In [8]:
# Cast both text columns to str (in place on raw_dataset) before they are exported
# for the dataset reader.
# NOTE(review): astype(str) turns missing values into the literal string 'nan' —
# confirm that is acceptable for this dataset.
raw_dataset['before'] = raw_dataset['before'].astype(str)
raw_dataset['after'] = raw_dataset['after'].astype(str)

In [10]:
# Preview the two columns used downstream (pandas truncates the display of large frames).
raw_dataset[['before', 'after']]

Unnamed: 0,before,after
0,По,По
1,состоянию,состоянию
2,на,на
3,1862 год,тысяча восемьсот шестьдесят второй год
4,.,.
...,...,...
10574511,в,в
10574512,одиннадцати,одиннадцати
10574513,странах,странах
10574514,мира,мира


In [11]:
# Disabled alternative, kept for reference: join the tokens of each sentence into
# one source/target pair instead of training on individual tokens.
# df = raw_dataset.groupby(['sentence_id']).agg({
#     'before': lambda x: ' '.join(x),
#     'after': lambda x: ' '.join(x)
# })

# Use the row-level before/after pairs as they are.
df = raw_dataset[['before', 'after']]

In [12]:
# Shuffle-split into train/test using scikit-learn's default proportions. The fixed
# seed makes the split, and the TSV files derived from it, reproducible across
# kernel restarts.
df_train, df_test = train_test_split(df, random_state=42)

In [13]:
# Sanity-check the size of the training split as (rows, columns).
df_train.shape

(7930887, 2)

In [14]:
# Export capped subsets of each split as tab-separated files with no header row
# and no index column, ready for the dataset reader.
df_train.head(1_000_000).to_csv('train_dataset.tsv', sep='\t', header=False, index=False)
df_test.head(10_000).to_csv('test_dataset.tsv', sep='\t', header=False, index=False)

In [15]:
# Parse the exported TSV files into instances with the character-level reader.
train_dataset = reader.read('train_dataset.tsv')
validation_dataset = reader.read('test_dataset.tsv')

1000000it [00:36, 27454.74it/s]
10000it [00:00, 49268.99it/s]


# prepare vocabulary

In [16]:
# Build the vocabulary from the training instances only, with a minimum count of
# three for both the source ('tokens') and target ('target_tokens') namespaces.
vocab = Vocabulary.from_instances(
    train_dataset,
    min_count=dict(tokens=3, target_tokens=3),
)

100%|██████████| 1000000/1000000 [00:08<00:00, 123118.50it/s]


In [17]:
# Vocabulary built from the validation instances with the same minimum counts.
# NOTE(review): none of the later cells visible here reference vocab_test; the
# iterator and model are both given `vocab`. Confirm whether it is still needed.
vocab_test = Vocabulary.from_instances(
    validation_dataset,
    min_count=dict(tokens=3, target_tokens=3),
)

100%|██████████| 10000/10000 [00:00<00:00, 99276.29it/s]


In [18]:
# Source-side character embedding table, sized to the 'tokens' namespace.
in_embedding = Embedding(
    embedding_dim=config.IN_EMBEDDING_DIM,
    num_embeddings=vocab.get_vocab_size('tokens'),
)

In [19]:
# Wrap the embedding under the "tokens" key, matching the key used by the reader's
# source token indexers.
source_embedder = BasicTextFieldEmbedder(token_embedders={"tokens": in_embedding})

In [20]:
# Batch instances after sorting them by the number of source tokens.
iterator = BucketIterator(
    sorting_keys=[("source_tokens", "num_tokens")],
    batch_size=config.batch_size,
)

In [21]:
# Attach the training vocabulary to the iterator.
iterator.index_with(vocab)

# prepare model

In [22]:
# Encoder: one stacked self-attention layer over the embedded source characters.
# Its input size must match the source embedding size.
encoder = StackedSelfAttentionEncoder(
    num_layers=1,
    num_attention_heads=8,
    input_dim=config.IN_EMBEDDING_DIM,
    hidden_dim=config.HIDDEN_DIM,
    projection_dim=64,
    feedforward_hidden_dim=64,
)

# Attention module passed to the seq2seq model below.
attention = DotProductAttention()

In [23]:
# Upper bound on the number of characters the decoder may emit per prediction.
max_decoding_steps = 100

# Assemble the seq2seq model from the embedder, encoder and attention built above.
# Targets are read from the 'target_tokens' namespace, and BLEU is tracked as a metric.
model = SimpleSeq2Seq(
    vocab=vocab,
    source_embedder=source_embedder,
    encoder=encoder,
    max_decoding_steps=max_decoding_steps,
    attention=attention,
    target_namespace='target_tokens',
    target_embedding_dim=config.OUT_EMBEDDING_DIM,
    scheduled_sampling_ratio=0.15,
    use_bleu=True,
)

In [24]:
# Move the model to the GPU chosen in the config. Keying on CUDA_DEVICE (-1 means
# CPU) keeps this cell consistent with the trainer's cuda_device, and lets a CPU
# run be forced from the config even when a GPU is present.
if config.CUDA_DEVICE >= 0:
    model.cuda(config.CUDA_DEVICE)

In [25]:
# Display the module tree to check the layer sizes.
model

SimpleSeq2Seq(
  (_source_embedder): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (_encoder): StackedSelfAttentionEncoder(
    (feedforward_0): FeedForward(
      (_linear_layers): ModuleList(
        (0): Linear(in_features=64, out_features=64, bias=True)
        (1): Linear(in_features=64, out_features=32, bias=True)
      )
      (_dropout): ModuleList(
        (0): Dropout(p=0.1, inplace=False)
        (1): Dropout(p=0.1, inplace=False)
      )
    )
    (feedforward_layer_norm_0): LayerNorm()
    (self_attention_0): MultiHeadSelfAttention(
      (_combined_projection): Linear(in_features=32, out_features=192, bias=True)
      (_output_projection): Linear(in_features=64, out_features=32, bias=True)
      (_attention_dropout): Dropout(p=0.1, inplace=False)
    )
    (layer_norm_0): LayerNorm()
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (_attention): DotProductAttention()
  (_target_embedder): Embedding()
  (_decoder_cell): LSTMCell(64, 32)
  (_outpu

In [26]:
# Adam over all model parameters. The learning rate comes from the config, so
# changing config.lr actually takes effect instead of silently using Adam's default.
optimizer = optim.Adam(model.parameters(), lr=config.lr)

In [27]:
# training

In [28]:
# Re-check CUDA visibility right before building the trainer (repeats the check made
# near the top of the notebook).
torch.cuda.is_available()

True

In [29]:
# Each trainer.train() call runs exactly one epoch (num_epochs=1); the loop in the
# next cell decides how many epochs are run in total. No patience is set, so
# early stopping stays disabled.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    cuda_device=config.CUDA_DEVICE,
    num_epochs=1,
)

You provided a validation dataset but patience was set to None, meaning that early stopping is disabled


In [None]:
# The trainer runs one epoch per train() call, so this loop sets the total number
# of epochs and prints sample predictions after each one.
print(f'Will train for {config.epochs} epochs')

# The predictor only wraps `model` and `reader`, which are the same objects in every
# epoch, so it is built once rather than on each iteration.
predictor = SimpleSeq2SeqPredictor(model, reader)

for i in range(config.epochs):
    print(f'Epoch: {i+1}')
    trainer.train()

    # Spot-check every 10th of the first 100 validation instances:
    # input characters, reference output, and the model's prediction.
    for instance in itertools.islice(validation_dataset, 0, 100, 10):
        print('SOURCE:', instance.fields['source_tokens'].tokens)
        print('GOLD:', instance.fields['target_tokens'].tokens)
        print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])

  0%|          | 0/31250 [00:00<?, ?it/s]

Will train for 10 epochs
Epoch: 1


loss: 1.1055 ||: 100%|██████████| 31250/31250 [22:48<00:00, 22.84it/s]
BLEU: 0.4600, loss: 0.2235 ||: 100%|██████████| 313/313 [01:01<00:00,  5.07it/s]


SOURCE: [@start@, к, о, д, е, к, с, @end@]
GOLD: [@start@, к, о, д, е, к, с, @end@]
PRED: ['к', 'о', 'д', 'е', 'к', 'с']
SOURCE: [@start@, 1, 4, @end@]
GOLD: [@start@, ч, е, т, ы, р, н, а, д, ц, а, т, ь, @end@]
PRED: ['т', 'ы', 'с', 'я', 'ч', 'а', ' ', 'д', 'е', 'в', 'я', 'т', 'ь', 'с', 'о', 'т', ' ', 'д', 'е', 'в', 'я', 'т', 'ь', 'с', 'о', 'т', ' ', 'д', 'е', 'в', 'я', 'т', 'ь', 'с', 'о', 'т', ' ', 'д', 'е', 'в', 'я', 'т', 'ь', 'с', 'о', 'т', ' ', 'д', 'е', 'в', 'я', 'т', 'ь', 'с', 'о', 'т', ' ', 'д', 'е', 'в', 'я', 'т', 'ь', 'с', 'о', 'т', ' ', 'д', 'е', 'в', 'я', 'т', 'ь', 'с', 'о', 'т', ' ', 'д', 'е', 'в', 'я', 'т', 'ь', 'с', 'о', 'т', ' ', 'д', 'е', 'в', 'я', 'т', 'ь', 'с', 'о', 'т', ' ', 'д', 'е', 'в']
SOURCE: [@start@, о, р, г, а, н, и, з, а, ц, и, й, @end@]
GOLD: [@start@, о, р, г, а, н, и, з, а, ц, и, й, @end@]
PRED: ['о', 'р', 'г', 'а', 'н', 'и', 'з', 'а', 'ц', 'и', 'й']
SOURCE: [@start@, W, h, i, t, b, u, r, n, @end@]
GOLD: [@start@, у, _, t, r, a, n, s,  , и, _, t, r, a, n,

  0%|          | 0/31250 [00:00<?, ?it/s]

PRED: ['е', 'с', 'т', 'ь']
SOURCE: [@start@, н, е, с, к, о, л, ь, к, о, @end@]
GOLD: [@start@, н, е, с, к, о, л, ь, к, о, @end@]
PRED: ['н', 'е', 'с', 'к', 'о', 'л', 'ь', 'к', 'о', 'к']
SOURCE: [@start@, -, @end@]
GOLD: [@start@, -, @end@]
PRED: ['-']
SOURCE: [@start@, с, о, @end@]
GOLD: [@start@, с, о, @end@]
PRED: ['с', 'о']
SOURCE: [@start@, о, б, ъ, е, к, т, о, м, @end@]
GOLD: [@start@, о, б, ъ, е, к, т, о, м, @end@]
PRED: ['о', 'б', 'ъ', 'е', 'к', 'т', 'о', 'м']
SOURCE: [@start@, п, о, @end@]
GOLD: [@start@, п, о, @end@]
PRED: ['п', 'о']
Epoch: 2


loss: 0.4500 ||:   1%|          | 298/31250 [00:30<28:40, 17.99it/s]  