In [1]:
from collections import namedtuple
# other packages
import pandas as pd
from allennlp.predictors import SimpleSeq2SeqPredictor, Seq2SeqPredictor
import tqdm
# pytorch
import torch
import torch.optim as optim
# preprocessing
from imblearn.under_sampling import RandomUnderSampler
from allennlp.data.dataset_readers.seq2seq import Seq2SeqDatasetReader
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers.character_tokenizer import CharacterTokenizer
from allennlp.data.vocabulary import Vocabulary
# model
from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq
# attention
from allennlp.modules.attention import BilinearAttention
# encoder
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.training.trainer import Trainer
from sklearn.model_selection import train_test_split

In [2]:
torch.cuda.is_available()

True

In [3]:
Config = namedtuple('Config', [
                    'lazy',
                    'max_vocab_size', 
                    'batch_size', 
                    'epochs', 
                    'max_seq_len', 
                    'IN_EMBEDDING_DIM', 
                    'HIDDEN_DIM', 
                    'OUT_EMBEDDING_DIM', 
                    'CUDA_DEVICE',
                    'n_samples'
])

# create config file

In [4]:
config = Config(False,
               10000,
               64,
               10,
               100,
               64,
               32,
               64,
               0 if torch.cuda.is_available() else -1, 
               60000)

# prepare data

In [5]:
raw_dataset = pd.read_csv('data/ru_train.csv')

raw_dataset['before'] = raw_dataset['before'].astype(str)
raw_dataset['after'] = raw_dataset['after'].astype(str)

d = raw_dataset['class'].value_counts().to_dict()

for i in d:
    if d[i] > config.n_samples:
        d[i] = config.n_samples

In [9]:
d

{'PLAIN': 60000,
 'PUNCT': 60000,
 'CARDINAL': 60000,
 'LETTERS': 60000,
 'DATE': 60000,
 'VERBATIM': 60000,
 'ORDINAL': 46738,
 'MEASURE': 40534,
 'TELEPHONE': 10088,
 'DECIMAL': 7297,
 'ELECTRONIC': 5832,
 'MONEY': 2690,
 'FRACTION': 2460,
 'DIGIT': 2012,
 'TIME': 1945}

In [10]:
rus = RandomUnderSampler(sampling_strategy=d, random_state=0)

raw_dataset_resampled, _ = rus.fit_resample(raw_dataset, raw_dataset['class'])

In [12]:
raw_dataset_resampled['class'].value_counts()

DATE          60000
CARDINAL      60000
VERBATIM      60000
PLAIN         60000
LETTERS       60000
PUNCT         60000
ORDINAL       46738
MEASURE       40534
TELEPHONE     10088
DECIMAL        7297
ELECTRONIC     5832
MONEY          2690
FRACTION       2460
DIGIT          2012
TIME           1945
Name: class, dtype: int64

In [13]:
df = raw_dataset_resampled[['before', 'after']]

df_train, df_test = train_test_split(df, test_size=0.1)

df_train.to_csv('train_dataset.tsv', index=False, header=False, sep='\t')
df_test.to_csv('test_dataset.tsv', index=False, header=False, sep='\t')

# create reader

In [16]:
reader = Seq2SeqDatasetReader(
    source_tokenizer = CharacterTokenizer(),
    target_tokenizer = CharacterTokenizer(),
    source_token_indexers={'tokens': SingleIdTokenIndexer()},
    target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')},
    lazy=config.lazy
)

train_dataset = reader.read('train_dataset.tsv',)
validation_dataset = reader.read('test_dataset.tsv')

431636it [00:25, 17171.00it/s]
47960it [00:01, 32205.01it/s]


# prepare vocabulary

In [None]:
vocab = Vocabulary.from_instances(train_dataset,
                                  min_count={'tokens': 3, 'target_tokens': 3}
                                 )

in_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                         embedding_dim=config.IN_EMBEDDING_DIM
                        )

source_embedder = BasicTextFieldEmbedder({"tokens": in_embedding})

iterator = BucketIterator(batch_size=config.batch_size, 
                          sorting_keys=[("source_tokens", "num_tokens")],
                         )

iterator.index_with(vocab)

# prepare model

In [22]:
## get simple encoder
encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(config.IN_EMBEDDING_DIM, 
                                              config.HIDDEN_DIM, 
                                              batch_first=True))

attention = BilinearAttention(config.HIDDEN_DIM, config.HIDDEN_DIM)

max_decoding_steps = 100

model = SimpleSeq2Seq(vocab, 
                      source_embedder, 
                      encoder, 
                      max_decoding_steps,
                      target_embedding_dim=config.OUT_EMBEDDING_DIM,
                      target_namespace='target_tokens',
                      beam_size=8,
                      use_bleu=True,
                      attention=attention,
                      scheduled_sampling_ratio = 0.15)

if torch.cuda.is_available():
    model.cuda(config.CUDA_DEVICE)
    
optimizer = optim.Adam(model.parameters())

# training

In [27]:
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  num_epochs=1,
                  cuda_device=config.CUDA_DEVICE,
                  patience=2)

In [28]:
print(f'Will train for {config.epochs} epochs')
for i in range(config.epochs):
    print(f'Epoch: {i+1}')
    trainer.train()

  0%|          | 0/6745 [00:00<?, ?it/s]

Will train for 10 epochs
Epoch: 1


loss: 1.6752 ||: 100%|██████████| 6745/6745 [07:50<00:00, 14.33it/s]
BLEU: 0.2784, loss: 0.7575 ||: 100%|██████████| 750/750 [05:26<00:00,  2.30it/s]
  0%|          | 0/6745 [00:00<?, ?it/s]

Epoch: 2


loss: 0.5723 ||: 100%|██████████| 6745/6745 [07:40<00:00, 14.64it/s]
BLEU: 0.4893, loss: 0.3247 ||: 100%|██████████| 750/750 [05:05<00:00,  2.46it/s]
  0%|          | 0/6745 [00:00<?, ?it/s]

Epoch: 3


loss: 0.3164 ||: 100%|██████████| 6745/6745 [07:41<00:00, 14.62it/s]
BLEU: 0.5930, loss: 0.2103 ||: 100%|██████████| 750/750 [04:54<00:00,  2.54it/s]
  0%|          | 0/6745 [00:00<?, ?it/s]

Epoch: 4


loss: 0.2314 ||: 100%|██████████| 6745/6745 [07:36<00:00, 14.78it/s]
BLEU: 0.5848, loss: 0.1750 ||: 100%|██████████| 750/750 [04:54<00:00,  2.54it/s]
  0%|          | 0/6745 [00:00<?, ?it/s]

Epoch: 5


loss: 0.1910 ||: 100%|██████████| 6745/6745 [07:31<00:00, 14.95it/s]
BLEU: 0.7051, loss: 0.1449 ||: 100%|██████████| 750/750 [05:02<00:00,  2.48it/s]
  0%|          | 0/6745 [00:00<?, ?it/s]

Epoch: 6


loss: 0.1669 ||: 100%|██████████| 6745/6745 [07:31<00:00, 14.95it/s]
BLEU: 0.7473, loss: 0.1249 ||: 100%|██████████| 750/750 [04:55<00:00,  2.54it/s]
  0%|          | 0/6745 [00:00<?, ?it/s]

Epoch: 7


loss: 0.1517 ||: 100%|██████████| 6745/6745 [07:31<00:00, 14.95it/s]
BLEU: 0.7219, loss: 0.1176 ||: 100%|██████████| 750/750 [04:45<00:00,  2.63it/s]
  0%|          | 0/6745 [00:00<?, ?it/s]

Epoch: 8


loss: 0.1398 ||: 100%|██████████| 6745/6745 [07:35<00:00, 14.81it/s]
BLEU: 0.7397, loss: 0.1148 ||: 100%|██████████| 750/750 [04:35<00:00,  2.72it/s]
  0%|          | 0/6745 [00:00<?, ?it/s]

Epoch: 9


loss: 0.1319 ||: 100%|██████████| 6745/6745 [07:31<00:00, 14.92it/s]
BLEU: 0.7870, loss: 0.1009 ||: 100%|██████████| 750/750 [04:40<00:00,  2.67it/s]
  0%|          | 0/6745 [00:00<?, ?it/s]

Epoch: 10


loss: 0.1242 ||: 100%|██████████| 6745/6745 [07:44<00:00, 14.52it/s]
BLEU: 0.8116, loss: 0.0984 ||: 100%|██████████| 750/750 [04:57<00:00,  2.52it/s]


In [29]:
with open("BilinearAttention.th", 'wb') as f:
    torch.save(model.state_dict(), f)

In [83]:
tqdm.tqdm.pandas()

predictor = SimpleSeq2SeqPredictor(model, reader)

  from pandas import Panel


In [77]:
''.join(predictor.predict('1984')['predicted_tokens'])

'тысяча девятьсот восемьдесят четыре'

In [35]:
del raw_dataset, df, train_dataset, trainer

In [86]:
kaggle_test = pd.read_csv('data/ru_test_2.csv')
kaggle_test['before'] = kaggle_test['before'].astype(str)

In [87]:
small_kaggle = pd.DataFrame(kaggle_test['before'].unique(), columns=['before'])

In [88]:
small_kaggle['after'] = small_kaggle['before'].progress_apply(lambda x: ''.join(predictor.predict(x)['predicted_tokens']))

100%|██████████| 175991/175991 [1:16:09<00:00, 38.51it/s]


In [90]:
small_kaggle['after'] = small_kaggle['after'].progress_apply(lambda x: x.lower())

100%|██████████| 175991/175991 [00:00<00:00, 740641.65it/s]


In [None]:
small_kaggle = pd.DataFrame(small_kaggle)

In [None]:
small_kaggle = small_kaggle.set_index('before')

In [None]:
small_kaggle

In [None]:
kaggle_test = kaggle_test.join(small_kaggle, on='before')

In [None]:
kaggle_test

In [None]:
kaggle_test['id'] = kaggle_test['sentence_id'].astype(str)+'_'+kaggle_test['token_id'].astype(str)

In [None]:
kaggle_test[['id', 'after']].to_csv('sub.csv', index=False)

In [95]:
small_kaggle = pd.DataFrame(small_kaggle)

In [97]:
small_kaggle = small_kaggle.set_index('before')

In [101]:
small_kaggle

Unnamed: 0_level_0,after
before,Unnamed: 1_level_1
Эта,эта
книга,книга
",",","
отличающаяся,отличающаяся
«,«
...,...
Лечче,лечче
Апулия,апулия
бессмысленные,бессмысленные
Лоурел,лоурел


In [103]:
kaggle_test = kaggle_test.join(small_kaggle, on='before')

In [104]:
kaggle_test

Unnamed: 0,sentence_id,token_id,before,after
0,0,0,Эта,эта
1,0,1,книга,книга
2,0,2,",",","
3,0,3,отличающаяся,отличающаяся
4,0,4,«,«
...,...,...,...,...
989875,69999,17,убедил,убедил
989876,69999,18,его,его
989877,69999,19,выполнить,выполнить
989878,69999,20,приказ,приказ


In [105]:
kaggle_test['id'] = kaggle_test['sentence_id'].astype(str)+'_'+kaggle_test['token_id'].astype(str)

In [108]:
kaggle_test[['id', 'after']].to_csv('sub.csv', index=False)