In [1]:
import numpy as np
import random
import torch
import torch.nn as nn
from torch.optim import Adam

In [2]:
# Seed every RNG the notebook imports so that Restart & Run All reproduces the same run.
SEED = 0
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)  # numpy is imported above but its global RNG was previously left unseeded

In [3]:
!gdown --fuzzy https://drive.google.com/file/d/1k1quangHHsq25rJOTLShK3H0hN1mVqp9/view?usp=sharing -O train.csv
!gdown --fuzzy https://drive.google.com/file/d/1NsJXT6eBrXDKXKggfWjwcYuoU84CII3g/view?usp=drive_link -O test.csv

Downloading...
From: https://drive.google.com/uc?id=1k1quangHHsq25rJOTLShK3H0hN1mVqp9
To: /kaggle/working/train.csv
100%|██████████████████████████████████████| 38.0k/38.0k [00:00<00:00, 57.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1NsJXT6eBrXDKXKggfWjwcYuoU84CII3g
To: /kaggle/working/test.csv
100%|████████████████████████████████████████| 134k/134k [00:00<00:00, 77.9MB/s]


In [4]:
import pandas as pd


# Training rows are kept as a plain ndarray; the test set stays a DataFrame.
train_dataset = pd.read_csv('train.csv').to_numpy()
test_dataset = pd.read_csv('test.csv')

In [5]:
# Display the training array: each row is a (raw date text, normalised date string) pair.
train_dataset

array([['den tjugofjärde 05 2049', '24-05-2049'],
       ['15/11/77', '15-11-2077'],
       ["sipsa'e 02 2049", '14-02-2049'],
       ...,
       ['le neuf mars 2007', '09-03-2007'],
       ['am vier und zwanzigsten juni 2007', '24-06-2007'],
       ['sechster juni 2007', '06-06-2007']], dtype=object)

In [6]:
# Display the test frame; the raw date strings are in its 'data' column.
test_dataset

Unnamed: 0,id,data
0,0,24 января 2007
1,1,le six mars 2049
2,2,le dix 05 2077
3,3,27 июня 2049
4,4,08 гыйнварда 2077
...,...,...
4671,4671,am fünfzehnten januar 2049
4672,4672,тугызынчы 05 2049
4673,4673,der achzehnte 02 2007
4674,4674,vierzehnter 12 2049


In [7]:
# Length of the longest raw input string in the training rows, plus one
# (presumably the extra slot is for the appended EOS token -- TODO confirm).
MAX_LENGTH = 1 + max(len(row[0]) for row in train_dataset)

MAX_LENGTH

41

In [8]:
SOS_token = 0
EOS_token = 1


class Lang:
    """Symbol vocabulary that maps each symbol to an integer id and back.

    Ids 0 and 1 are pre-assigned to the 'SOS' and 'EOS' markers; any other
    symbol receives the next free id the first time it is added.
    """

    def __init__(self, name):
        self.name = name
        # Both lookup tables start out holding only the two reserved markers.
        self.word2index = {'SOS': 0, 'EOS': 1}
        self.index2word = {0: 'SOS', 1: 'EOS'}

    @property
    def n_words(self) -> int:
        """Number of symbols currently registered, reserved markers included."""
        return len(self.index2word)

    def add_sentence(self, sentence):
        """Register every element produced by iterating over ``sentence``."""
        for symbol in sentence:
            self.add_word(symbol)

    def add_word(self, word):
        """Give ``word`` the next free id unless it is already registered."""
        if word in self.word2index:
            return
        next_id = self.n_words
        self.word2index[word] = next_id
        self.index2word[next_id] = word

In [9]:
input_lang = Lang('human')
output_lang = Lang('iso')

# Each training row is (raw text, normalised date); feed each side to its own vocabulary.
for source_text, target_text in train_dataset:
    input_lang.add_sentence(source_text)
    output_lang.add_sentence(target_text)

# Report the size of each vocabulary (reserved markers included).
for lang in (input_lang, output_lang):
    print(lang.name, lang.n_words)

human 82
iso 13


In [10]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [11]:
class Encoder(nn.Module):
    """Single-step GRU encoder: embeds one token id and advances the hidden state."""

    def __init__(self, input_size, hidden_size):
        """
        Args:
            input_size: number of rows in the embedding table (vocabulary size).
            hidden_size: width of both the embedding vectors and the GRU state.
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, x, hidden):
        """Run one GRU step.

        Args:
            x: tensor holding a single token id.
            hidden: previous GRU state of shape (1, 1, hidden_size).

        Returns:
            The ``(output, hidden)`` pair produced by the GRU, each of shape
            (1, 1, hidden_size).
        """
        # Reshape to (seq_len=1, batch=1, hidden_size), the default nn.GRU input layout.
        embedded = self.embedding(x).view(1, 1, -1)
        return self.gru(embedded, hidden)

    def init_hidden(self):
        """Return an all-zero initial state matching the module's own weights.

        The state is created on the device and with the dtype of the embedding
        weights, so it stays valid after ``.to(...)`` / ``.double()`` and does
        not depend on a module-level ``device`` global.
        """
        weight = self.embedding.weight
        return torch.zeros(1, 1, self.hidden_size, device=weight.device, dtype=weight.dtype)

In [12]:
class Decoder(nn.Module):
    """Single-step GRU decoder that emits log-probabilities over the output vocabulary."""

    def __init__(self, hidden_size, output_size):
        """
        Args:
            hidden_size: width of the embedding vectors and of the GRU state.
            output_size: number of output classes (output vocabulary size).
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        """Run one decoding step.

        Args:
            x: tensor holding the id of the previously produced token.
            hidden: previous GRU state of shape (1, 1, hidden_size).

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` has shape
            (1, output_size) and ``hidden`` is the updated GRU state.
        """
        # Reshape to (seq_len=1, batch=1, hidden_size) and apply ReLU before the GRU.
        embedded = self.relu(self.embedding(x).view(1, 1, -1))
        output, hidden = self.gru(embedded, hidden)
        # output[0] drops the length-1 sequence axis, leaving (1, hidden_size) for the projection.
        return self.softmax(self.out(output[0])), hidden

    def init_hidden(self):
        """Return an all-zero initial state matching the module's own weights.

        The state is created on the device and with the dtype of the embedding
        weights, so it does not depend on a module-level ``device`` global.
        """
        weight = self.embedding.weight
        return torch.zeros(1, 1, self.hidden_size, device=weight.device, dtype=weight.dtype)

In [13]:
def sentence2idx(lang, sentence):
    """Look up the vocabulary id of every symbol of ``sentence``, in order.

    Raises:
        KeyError: if a symbol is missing from ``lang.word2index``.
    """
    lookup = lang.word2index
    ids = []
    for symbol in sentence:
        ids.append(lookup[symbol])
    return ids


def sentence2tensor(lang, sentence):
    """Encode ``sentence`` as a column of vocabulary ids terminated by ``EOS_token``.

    Returns:
        A ``torch.long`` tensor of shape (len(sentence) + 1, 1) placed on ``device``.
    """
    ids = [*sentence2idx(lang, sentence), EOS_token]
    column = torch.tensor(ids, dtype=torch.long, device=device)
    return column.view(-1, 1)


def pair2tensor(x):
    """Convert one (source text, target text) pair into ``(input_tensor, target_tensor)``.

    The source side is encoded with ``input_lang`` and the target side with
    ``output_lang``.
    """
    source_text, target_text = x[0], x[1]
    return (
        sentence2tensor(input_lang, source_text),
        sentence2tensor(output_lang, target_text),
    )

In [14]:
def train_single(
        input_tensor, target_tensor,
        encoder, decoder,
        encoder_optimizer, decoder_optimizer,
        criterion,
        teacher_forcing_ratio=0.0
):
    """Run one optimisation step on a single (input, target) sequence pair.

    Args:
        input_tensor: sequence of encoder inputs, consumed one element per step.
        target_tensor: sequence of target ids, one element per decoding step;
            must not be empty.
        encoder: module called as ``encoder(step, hidden)``; must provide
            ``init_hidden()``.
        decoder: module called as ``decoder(prev_token, hidden)``.
        encoder_optimizer: optimizer for the encoder, stepped once.
        decoder_optimizer: optimizer for the decoder, stepped once.
        criterion: loss called as ``criterion(decoder_output, target_step)``.
        teacher_forcing_ratio: probability of feeding the ground-truth token
            (instead of the decoder's own prediction) as the next decoder
            input for this sample. The default ``0.0`` always decodes
            free-running, which is what the function did before this
            parameter existed.

    Returns:
        The summed per-step loss divided by ``len(target_tensor)``.

    Raises:
        ValueError: if ``target_tensor`` is empty.
    """
    if len(target_tensor) == 0:
        raise ValueError('target_tensor must contain at least one step')

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_hidden = encoder.init_hidden()
    for elem in input_tensor:
        # Only the final hidden state is used; per-step encoder outputs are discarded.
        _, encoder_hidden = encoder(elem, encoder_hidden)

    # Create the SOS token on the device where the hidden state lives.
    decoder_input = torch.tensor([[SOS_token]], device=encoder_hidden.device)
    decoder_hidden = encoder_hidden

    # Short-circuit so the default ratio of 0.0 never consumes the `random` stream.
    use_teacher_forcing = (
        teacher_forcing_ratio > 0 and random.random() < teacher_forcing_ratio
    )

    loss = 0
    for elem in target_tensor:
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss = loss + criterion(decoder_output, elem)

        if use_teacher_forcing:
            # Feed the ground-truth token as the next input.
            decoder_input = elem
            continue

        # Free-running: feed back the decoder's own best guess, cut off from the graph.
        _, topi = decoder_output.detach().topk(1)
        decoder_input = topi.squeeze()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    # NOTE: the divisor is the full target length even when decoding stopped early on EOS.
    return loss.item() / len(target_tensor)

In [15]:
def train(encoder, decoder, n_epochs=5, print_every=100, lr=1e-3):
    """Train ``encoder`` and ``decoder`` on every pair of ``train_dataset``.

    Each pair is processed on its own via ``train_single``, in dataset order,
    with a separate Adam optimizer per network and an ``nn.NLLLoss`` criterion.

    Args:
        encoder: encoder module to optimise.
        decoder: decoder module to optimise.
        n_epochs: number of full passes over the training pairs.
        print_every: report the running mean loss every this many pairs.
        lr: learning rate used for both Adam optimizers (default matches the
            previously hard-coded ``1e-3``).
    """
    encoder.train()
    decoder.train()

    encoder_optimizer = Adam(encoder.parameters(), lr=lr)
    decoder_optimizer = Adam(decoder.parameters(), lr=lr)

    criterion = nn.NLLLoss()

    # The tensor form of the dataset does not change between epochs, so build it once.
    training_pairs = [
        pair2tensor(x) for x in train_dataset
    ]
    n_pairs = len(training_pairs)

    for epoch in range(n_epochs):
        print_loss_total = 0

        print(f'Epoch [{epoch + 1:02d}/{n_epochs:02d}]')

        # Count from 1 so the progress percentage reflects the pairs actually processed.
        for i, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
            loss = train_single(
                input_tensor, target_tensor,
                encoder, decoder,
                encoder_optimizer, decoder_optimizer,
                criterion
            )
            print_loss_total += loss

            if i % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print(f'Training ({i / n_pairs * 100:.1f}%) loss: {print_loss_avg:.4f}')

In [16]:
# One width shared by the embeddings and GRU states of both networks.
HIDDEN_SIZE = 128

# The encoder is built before the decoder so parameter initialisation order is unchanged.
encoder_model = Encoder(input_lang.n_words, HIDDEN_SIZE).to(device)
decoder_model = Decoder(HIDDEN_SIZE, output_lang.n_words).to(device)

train(encoder_model, decoder_model, n_epochs=40)

Epoch [01/40]
Training (9.0%) loss: 1.8857
Training (18.2%) loss: 1.3371
Training (27.3%) loss: 0.7938
Training (36.4%) loss: 0.6853
Training (45.6%) loss: 0.6303
Training (54.7%) loss: 0.6155
Training (63.8%) loss: 0.5840
Training (73.0%) loss: 0.5784
Training (82.1%) loss: 0.5457
Training (91.2%) loss: 0.5270
Epoch [02/40]
Training (9.0%) loss: 0.5198
Training (18.2%) loss: 0.5283
Training (27.3%) loss: 0.5029
Training (36.4%) loss: 0.4867
Training (45.6%) loss: 0.4639
Training (54.7%) loss: 0.4775
Training (63.8%) loss: 0.4447
Training (73.0%) loss: 0.4359
Training (82.1%) loss: 0.4154
Training (91.2%) loss: 0.4034
Epoch [03/40]
Training (9.0%) loss: 0.4006
Training (18.2%) loss: 0.4036
Training (27.3%) loss: 0.3850
Training (36.4%) loss: 0.3510
Training (45.6%) loss: 0.3396
Training (54.7%) loss: 0.3440
Training (63.8%) loss: 0.3211
Training (73.0%) loss: 0.3268
Training (82.1%) loss: 0.3101
Training (91.2%) loss: 0.3001
Epoch [04/40]
Training (9.0%) loss: 0.2832
Training (18.2%) l

In [17]:
@torch.no_grad()
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode ``sentence`` with the given encoder/decoder pair.

    Both modules are switched to eval mode. The sentence is encoded one step
    at a time, then the decoder is fed its own best guess at every step,
    starting from ``SOS_token``.

    Args:
        encoder: encoder module providing ``init_hidden()``.
        decoder: decoder module.
        sentence: raw input string, encoded with ``input_lang``.
        max_length: maximum number of decoding steps.

    Returns:
        List of decoded output symbols. If the decoder emits ``EOS_token``
        within ``max_length`` steps, its ``index2word`` entry is the last item.
    """
    encoder.eval()
    decoder.eval()

    encoded_input = sentence2tensor(input_lang, sentence)
    state = encoder.init_hidden()
    for step_input in encoded_input:
        _, state = encoder(step_input, state)

    prev_token = torch.tensor([[SOS_token]], device=device)
    symbols = []

    for _step in range(max_length):
        log_probs, state = decoder(prev_token, state)
        best = log_probs.topk(1)[1]
        token_id = best.item()

        symbols.append(output_lang.index2word[token_id])
        if token_id == EOS_token:
            break

        # Feed the chosen token back in as the next decoder input.
        prev_token = best.squeeze().detach()

    return symbols


def predict_(encoder, decoder, dataset, max_chars=10):
    """Decode every sentence in ``dataset`` and return the predicted symbols.

    Args:
        encoder: trained encoder module.
        decoder: trained decoder module.
        dataset: iterable of raw input strings.
        max_chars: maximum number of symbols kept per prediction. The default
            of 10 matches the previously hard-coded slice length.

    Returns:
        A list with one list of output symbols per input sentence. The
        end-of-sequence marker is never included.
    """
    eos_word = output_lang.index2word[EOS_token]
    result = []

    for sentence in dataset:
        decoded = evaluate(encoder, decoder, sentence)
        # evaluate() appends the EOS marker when the decoder emits it. Drop it so
        # the literal marker text cannot end up inside a label when decoding
        # stops before max_chars symbols.
        symbols = [symbol for symbol in decoded if symbol != eos_word]
        result.append(symbols[:max_chars])

    return result

In [18]:
# Re-read the test set from disk (the same file was already loaded into this name earlier).
test_dataset = pd.read_csv('test.csv')

In [19]:
# Decode every entry of the 'data' column with the trained encoder/decoder.
test_prediction = predict_(encoder_model, decoder_model, test_dataset['data'])

In [20]:
# Collapse each list of decoded symbols into a single string.
test_prediction = list(map(''.join, test_prediction))

In [21]:
# Store the predictions in a new 'label' column (this modifies test_dataset in place).
test_dataset['label'] = test_prediction

In [22]:
# Write only the id/label columns. `index` is documented as a bool, so pass False, not None.
test_dataset[['id', 'label']].to_csv('submission.csv', index=False)