In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Select the compute device once for the whole notebook: Apple MPS is
# preferred, then CUDA, and the CPU is the fallback when neither is available.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

In [4]:
# Reserved vocabulary indices. They match the "SOS" and "EOS" entries that
# every Lang instance starts with. Note that 0 is also the fill value of the
# zero-initialized id arrays built in get_dataloader, so padding positions
# carry the SOS index.
SOS_token = 0
EOS_token = 1

class Lang:
    """Vocabulary of one language: word <-> index maps plus word frequencies.

    Indices 0, 1 and 2 are reserved for the special tokens "SOS", "EOS" and
    "unk"; every other word receives the next free index when first added.

    Attributes:
        name: Label of the language (e.g. "eng").
        word2index: Maps each known word to its integer index.
        word2count: Maps each word seen by addWord to its occurrence count.
        index2word: Inverse of word2index.
        n_words: Number of entries in the index maps, special tokens included.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {"SOS": 0, "EOS": 1, "unk": 2}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2: "unk"}
        self.n_words = 3  # Count SOS, EOS and unk

    def addSentence(self, sentence):
        """Add every space-separated word of `sentence` to the vocabulary."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Register one occurrence of `word`, assigning an index if it is new."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1
        # The reserved tokens are present in word2index from the start but have
        # no count entry, so a corpus word such as "unk" must not assume one.
        self.word2count[word] = self.word2count.get(word, 0) + 1

    def trim_vocab(self, min_occurance):
        """Drop words seen fewer than `min_occurance` times and re-index the rest.

        The counts of dropped words are added to the "unk" count. The reserved
        tokens are never dropped and always keep indices 0, 1 and 2; the
        surviving words are renumbered contiguously from 3.
        """
        reserved = ("SOS", "EOS", "unk")
        words_to_delete = [
            word for word, count in self.word2count.items()
            if count < min_occurance and word not in reserved
        ]

        self.word2count.setdefault("unk", 0)

        for word in words_to_delete:
            self.word2count["unk"] += self.word2count.pop(word)
            del self.word2index[word]

        self.index2word = {0: "SOS", 1: "EOS", 2: "unk"}
        self.n_words = 3  # Count SOS, EOS and unk

        for word in self.word2count:
            # Reserved tokens keep their fixed indices even if they were counted.
            if word not in reserved:
                self.word2index[word] = self.n_words
                self.index2word[self.n_words] = word
                self.n_words += 1

In [5]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip combining marks (accents) from `s`.

    The string is decomposed with NFD normalization and every character in
    Unicode category 'Mn' is dropped; all other characters are kept as they are.
    """
    kept = []
    for char in unicodedata.normalize('NFD', s):
        if unicodedata.category(char) == 'Mn':
            continue
        kept.append(char)
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Lowercase, trim and de-accent `s`, keeping only letters, '!' and '?'.

    A space is first inserted before '.', '!' and '?'. Then every run of
    characters other than ASCII letters, '!' and '?' is collapsed into a single
    space, which also removes the '.' that was just padded. The result is
    stripped of surrounding whitespace.
    """
    text = s.lower().strip()
    text = unicodeToAscii(text)
    text = re.sub(r"([.!?])", r" \1", text)
    text = re.sub(r"[^a-zA-Z!?]+", r" ", text)
    return text.strip()

In [6]:
def readLangs(lang1, lang2, reverse=False):
    """Read the tab-separated corpus file and return empty vocabularies plus pairs.

    Args:
        lang1: First language code; with lang2 it selects the file
            'data/tatoeba/<lang1>-<lang2>.txt'.
        lang2: Second language code.
        reverse: When true, each pair is reversed and the input/output roles
            of the two languages are swapped.

    Returns:
        (input_lang, output_lang, pairs): two new, still empty Lang objects and
        a list with one entry per file line, each entry being the list of that
        line's tab-separated fields after normalizeString.
    """
    print("Reading lines...")

    # Read the file and split into lines; the context manager closes the
    # handle, which the previous open(...).read() chain left to the GC.
    path = 'data/tatoeba/%s-%s.txt' % (lang1, lang2)
    with open(path, encoding='utf-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in line.split('\t')] for line in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [7]:
# Since there are a lot of example sentences and we want to train something quickly,
# we'll trim the data set to only relatively short and simple sentences.
# Each sentence must have fewer than MAX_LENGTH (10) tokens, ending punctuation included,
# which leaves room for the EOS token, and the English side must start with one of the
# prefixes below ("i am", "he is", etc.; apostrophes were replaced by spaces during normalization).
# Width of the padded id arrays and number of free-running decoder steps.
MAX_LENGTH = 10

# Sentence openers accepted by filterPair. They are lowercased once here so
# they can be matched against normalized (lowercase) text, and stored as a
# tuple because str.startswith accepts a tuple of candidates.
eng_prefixes = tuple(
    prefix.lower()
    for prefix in (
        "i am ", "i m ",
        "he is", "he s ",
        "she is", "she s ",
        "you are", "you re ",
        "we are", "we re ",
        "they are", "they re ",
        "I don t", "Do you", "I want", "Are you", "I have", "I think",
        "I can t", "I was", "He is", "I m not", "This is", "I just", "I didn t",
        "I am", "I thought", "I know", "Tom is", "I had", "Did you", "Have you",
        "Can you", "He was", "You don t", "I d like", "It was", "You should",
        "Would you", "I like", "It is", "She is", "You can t", "He has",
        "What do", "If you", "I need", "No one", "You are", "You have",
        "I feel", "I really", "Why don t", "I hope", "I will", "We have",
        "You re not", "You re very", "She was", "I love", "You must", "I can",
    )
)

def filterPair(p):
    """Return True when both sides of pair `p` are short and p[1] has an accepted opener.

    Both sentences must split into fewer than MAX_LENGTH space-separated
    tokens, and p[1] must start with one of eng_prefixes.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    """Return a new list with only the pairs accepted by filterPair, in order."""
    return list(filter(filterPair, pairs))

In [8]:
# Read text file and split into lines, split lines into pairs
# Normalize text, filter by length and content
# Make word lists from sentences in pairs
def _replace_oov(sentence, lang):
    """Return `sentence` with every word absent from lang.word2index replaced by 'unk'."""
    return " ".join(
        word if word in lang.word2index else 'unk'
        for word in sentence.split(' ')
    )


def prepareData(lang1, lang2, reverse=False, min_count=2):
    """Load, filter and index the corpus, masking rare words with 'unk'.

    Args:
        lang1: First language code, forwarded to readLangs.
        lang2: Second language code, forwarded to readLangs.
        reverse: Forwarded to readLangs; swaps the input/output roles.
        min_count: Words seen fewer than this many times in the filtered pairs
            are removed from the vocabularies. Defaults to 2, the threshold
            that was previously hard-coded.

    Returns:
        (input_lang, output_lang, pairs): the two populated and trimmed Lang
        objects and a list of [input_sentence, output_sentence] lists in which
        every out-of-vocabulary word has been replaced by 'unk'.
    """
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))

    # trim down the data according to specification above
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))

    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])

    # remove all words with a frequency below a threshold
    print("Mark all OOV with 'unk' for all lines")
    input_lang.trim_vocab(min_count)
    output_lang.trim_vocab(min_count)

    # update pairs with 'unk'
    new_pairs = [
        [_replace_oov(pair[0], input_lang), _replace_oov(pair[1], output_lang)]
        for pair in pairs
    ]

    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)

    return input_lang, output_lang, new_pairs

# Build the vocabularies and sentence pairs for French -> English: reverse=True
# makes 'fra' the input language and 'eng' the output language.
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)

Reading lines...
Read 135842 sentence pairs
Trimmed to 37298 sentence pairs
Counting words...
Mark all OOV with 'unk' for all lines
Counted words:
fra 5314
eng 3755


In [12]:
class EncoderRNN(nn.Module):
    """Embedding + dropout + single batch-first GRU encoder.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Width of both the embeddings and the GRU state.
        dropout_p: Dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        """Encode a batch of token ids; returns the GRU's (output, hidden) pair."""
        token_vectors = self.embedding(input)
        return self.gru(self.dropout(token_vectors))

In [13]:
class BahdanauAttention(nn.Module):
    """Additive attention: score = Va(tanh(Wa(query) + Ua(keys)))."""

    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Attend over `keys` with `query`.

        As called from AttnDecoderRNN, `query` is (batch, 1, hidden) and
        `keys` is (batch, src_len, hidden).

        Returns:
            (context, weights): the weighted sum of `keys` obtained with a
            batched matrix product, and the softmax-normalized attention
            weights with the source positions on the last axis.
        """
        projected_query = self.Wa(query)
        projected_keys = self.Ua(keys)
        energies = self.Va(torch.tanh(projected_query + projected_keys))

        # Drop the trailing size-1 axis and add one at position 1, so the
        # source positions end up on the last axis for softmax and bmm.
        scores = energies.squeeze(2)
        scores = scores.unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        return torch.bmm(weights, keys), weights

class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step.

    Args:
        hidden_size: Width of the embeddings, attention and GRU state.
        output_size: Number of output classes (target vocabulary size).
        dropout_p: Dropout probability applied to the input embeddings.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode a whole batch, with or without teacher forcing.

        Args:
            encoder_outputs: Encoder outputs; axis 0 is the batch.
            encoder_hidden: Final encoder state, used as the initial decoder state.
            target_tensor: Optional (batch, steps) target ids. When given, the
                target is fed as the next input (teacher forcing) and one step
                is run per target column. When omitted, the decoder feeds back
                its own argmax prediction for MAX_LENGTH steps.

        Returns:
            (decoder_outputs, decoder_hidden, attentions): per-step
            log-probabilities concatenated along axis 1, the last decoder
            state, and the per-step attention weights concatenated along axis 1.
        """
        batch_size = encoder_outputs.size(0)
        # Allocate the start tokens on the same device as the encoder outputs
        # instead of the notebook-global `device`, so the module also works
        # when the model lives on a different device.
        decoder_input = torch.full(
            (batch_size, 1), SOS_token, dtype=torch.long, device=encoder_outputs.device
        )
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        # With teacher forcing the number of steps follows the target width, so
        # targets narrower than MAX_LENGTH no longer raise an IndexError; for
        # MAX_LENGTH-wide targets this is the same number of steps as before.
        n_steps = MAX_LENGTH if target_tensor is None else target_tensor.size(1)

        for i in range(n_steps):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions

    def forward_step(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        The current hidden state, permuted to put the batch on axis 0, is the
        attention query; the resulting context is concatenated with the input
        embedding and fed to the GRU, whose output is projected to vocabulary
        scores.

        Returns:
            (output, hidden, attn_weights): unnormalized scores for this step,
            the new GRU state, and the attention weights.
        """
        embedded = self.dropout(self.embedding(input))

        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

In [30]:
def indexesFromSentence(lang, sentence):
    """Convert a space-separated sentence into a list of vocabulary indices.

    Words missing from `lang.word2index` are mapped to the index of the "unk"
    entry when the vocabulary has one, so free-form input with unseen words
    can still be encoded.

    Raises:
        KeyError: If a word is unknown and the vocabulary has no "unk" entry.
    """
    word2index = lang.word2index
    unk_index = word2index.get("unk")

    indexes = []
    for word in sentence.split(' '):
        if word in word2index:
            indexes.append(word2index[word])
        elif unk_index is not None:
            indexes.append(unk_index)
        else:
            raise KeyError(word)
    return indexes

def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a (1, n_tokens + 1) long tensor ending in EOS_token.

    The tensor is created on the notebook-global `device`.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    return torch.tensor([token_ids], dtype=torch.long, device=device)

def tensorsFromPair(pair):
    """Encode pair[0] with the global input_lang and pair[1] with the global output_lang."""
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )

def _random_loader(dataset, batch_size):
    """Wrap `dataset` in a DataLoader that draws its samples in random order."""
    return DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)


def get_dataloader(batch_size, seed=None):
    """Prepare the corpus and return vocabularies plus train/dev/test loaders.

    The pairs are shuffled, encoded into zero-padded (n, MAX_LENGTH) id arrays
    with EOS_token appended to each sentence, and split by position: the first
    10% is the test set, the next 10% the dev set and the remaining 80% the
    training set.

    Args:
        batch_size: Batch size of all three loaders.
        seed: Optional seed for the shuffle that decides the split. With the
            default None the global `random` state is used, as before. The
            seed does not affect the order in which the loaders draw batches.

    Returns:
        (input_lang, output_lang, train_dataloader, dev_dataloader, test_dataloader)
    """
    input_lang, output_lang, pairs = prepareData('eng', 'fra', True)

    n = len(pairs)
    n_test = int(0.1 * n)  # end of the test slice
    n_dev = int(0.2 * n)   # end of the dev slice, start of the training slice
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)

    if seed is None:
        random.shuffle(pairs)
    else:
        # A private generator keeps the split reproducible without touching
        # the global random state.
        random.Random(seed).shuffle(pairs)

    for idx, (inp, tgt) in enumerate(pairs):
        inp_ids = indexesFromSentence(input_lang, inp)
        tgt_ids = indexesFromSentence(output_lang, tgt)
        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    data = TensorDataset(torch.LongTensor(input_ids).to(device),
                         torch.LongTensor(target_ids).to(device))

    train_data = torch.utils.data.Subset(data, range(n_dev, n))
    dev_data = torch.utils.data.Subset(data, range(n_test, n_dev))
    test_data = torch.utils.data.Subset(data, range(n_test))

    train_dataloader = _random_loader(train_data, batch_size)
    dev_dataloader = _random_loader(dev_data, batch_size)
    test_dataloader = _random_loader(test_data, batch_size)
    return input_lang, output_lang, train_dataloader, dev_dataloader, test_dataloader

In [23]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):
    """Run one teacher-forced training pass over `dataloader`.

    For every (input, target) batch both optimizers are zeroed, the batch is
    encoded and decoded with the target as decoder input, the loss between the
    flattened decoder outputs and the flattened targets is back-propagated,
    and both optimizers take a step.

    Returns:
        The sum of the per-batch losses divided by len(dataloader).
    """
    optimizers = (encoder_optimizer, decoder_optimizer)
    batch_losses = []

    for input_tensor, target_tensor in dataloader:
        for optimizer in optimizers:
            optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        log_probs = decoder(encoder_outputs, encoder_hidden, target_tensor)[0]

        # Flatten to (batch * steps, vocab) against (batch * steps,) targets.
        flat_log_probs = log_probs.view(-1, log_probs.size(-1))
        loss = criterion(flat_log_probs, target_tensor.view(-1))
        loss.backward()

        for optimizer in optimizers:
            optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(dataloader)

In [16]:
import time
import math

def asMinutes(s):
    """Format a duration of `s` seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def timeSince(since, percent):
    """Return '<elapsed> (- <remaining>)' for work started at time `since`.

    `percent` is the completed fraction of the work. The total duration is
    extrapolated as elapsed / percent, and the remaining time is that total
    minus the elapsed time.
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [34]:
def train(train_dataloader, val_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    """Train the encoder/decoder pair and report train and validation loss.

    Each epoch runs train_epoch on `train_dataloader` and then computes the
    teacher-forced loss on `val_dataloader` in eval mode without gradients.
    Every `print_every` epochs, one line is printed with the elapsed and
    estimated remaining time, the epoch, the progress percentage, and the
    train and validation losses averaged over those epochs. Every `plot_every`
    epochs the averaged train loss is recorded, and the recorded values are
    passed to showPlot at the end.

    Args:
        train_dataloader: Iterable of (input, target) training batches.
        val_dataloader: Iterable of (input, target) validation batches.
        encoder: Encoder module.
        decoder: Decoder module.
        n_epochs: Number of epochs to run.
        learning_rate: Learning rate of both Adam optimizers.
        print_every: Logging interval in epochs.
        plot_every: Plot-point interval in epochs.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    vloss_avg = 0  # Sum of per-epoch validation means, reset every print_every
    for epoch in range(1, n_epochs + 1):
        encoder.train(True)
        decoder.train(True)
        train_loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += train_loss
        plot_loss_total += train_loss

        # calculate loss in dev set
        encoder.eval()
        decoder.eval()

        # Disable gradient computation and reduce memory consumption.
        running_vloss = 0.0
        n_val_batches = 0
        with torch.no_grad():
            for input_tensor, target_tensor in val_dataloader:
                encoder_outputs, encoder_hidden = encoder(input_tensor)
                decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

                val_loss = criterion(
                    decoder_outputs.view(-1, decoder_outputs.size(-1)),
                    target_tensor.view(-1)
                )
                # Accumulate a Python float rather than a tensor.
                running_vloss += val_loss.item()
                n_val_batches += 1

        # Count batches explicitly: the loop index is undefined when the
        # validation loader is empty, in which case the mean is reported as NaN.
        vloss_avg += (running_vloss / n_val_batches) if n_val_batches else float('nan')
        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0

            print_vloss_avg = vloss_avg / print_every
            vloss_avg = 0
            print('%s (%d %d%%) %.4f %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg, print_vloss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [18]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    """Plot `points` as a line with y-axis ticks every 0.2.

    Only one figure is created per call; the earlier extra plt.figure() call
    left an unused empty figure open each time. The figure is neither shown
    nor saved here.
    """
    _, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [35]:
# Model width (embedding and GRU state size) and mini-batch size.
hidden_size = 128
batch_size = 32

# Prepare the corpus again and build the train / dev / test loaders; this
# rebinds the global input_lang and output_lang set by the earlier prepareData call.
input_lang, output_lang, train_dataloader, val_dataloader, test_dataloader = get_dataloader(batch_size)

# The vocabulary sizes give the encoder's embedding rows and the decoder's output classes.
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)

# Train for 80 epochs, logging the averaged train / dev loss every 5 epochs.
# NOTE(review): in the recorded run below the dev loss bottoms out around
# epoch 20 and rises afterwards while the train loss keeps falling, which
# suggests overfitting; consider early stopping or fewer epochs.
train(train_dataloader, val_dataloader, encoder, decoder, 80, print_every=5, plot_every=5)

Reading lines...
Read 135842 sentence pairs
Trimmed to 37298 sentence pairs
Counting words...
Mark all OOV with 'unk' for all lines
Counted words:
fra 5314
eng 3755
2m 17s (- 34m 22s) (5 6%) 1.3996 1.2843
4m 37s (- 32m 19s) (10 12%) 0.5692 0.8367
6m 57s (- 30m 8s) (15 18%) 0.3421 0.7534
9m 16s (- 27m 50s) (20 25%) 0.2426 0.7482
11m 35s (- 25m 29s) (25 31%) 0.1878 0.7622
13m 52s (- 23m 7s) (30 37%) 0.1542 0.7814
16m 11s (- 20m 48s) (35 43%) 0.1319 0.8022
18m 30s (- 18m 30s) (40 50%) 0.1165 0.8263
20m 48s (- 16m 10s) (45 56%) 0.1047 0.8469
23m 6s (- 13m 51s) (50 62%) 0.0963 0.8718
25m 24s (- 11m 32s) (55 68%) 0.0890 0.8919
27m 43s (- 9m 14s) (60 75%) 0.0839 0.9150
30m 1s (- 6m 55s) (65 81%) 0.0789 0.9261
32m 20s (- 4m 37s) (70 87%) 0.0746 0.9419
34m 38s (- 2m 18s) (75 93%) 0.0719 0.9575
36m 56s (- 0m 0s) (80 100%) 0.0692 0.9735


In [36]:
def evaluate_on_test_set(test_dataloader, encoder, decoder, criterion):
    """Return the mean teacher-forced loss of the model over `test_dataloader`.

    Both modules are switched to eval mode and left in it. Gradients are
    disabled while the per-batch losses are computed; the result is their sum
    divided by len(test_dataloader).
    """
    for module in (encoder, decoder):
        module.eval()

    batch_losses = []
    with torch.no_grad():
        for input_tensor, target_tensor in test_dataloader:
            encoder_outputs, encoder_hidden = encoder(input_tensor)
            log_probs = decoder(encoder_outputs, encoder_hidden, target_tensor)[0]

            flat_log_probs = log_probs.view(-1, log_probs.size(-1))
            batch_loss = criterion(flat_log_probs, target_tensor.view(-1))
            batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(test_dataloader)
    
# Score the trained model on the test loader: teacher-forced NLL averaged over batches.
test_loss = evaluate_on_test_set(test_dataloader, encoder, decoder, nn.NLLLoss())

In [37]:
# Display the mean test loss computed by evaluate_on_test_set.
test_loss

0.9593960790552645

In [38]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    """Translate one sentence by greedy decoding.

    The sentence is encoded with `input_lang`, decoded without teacher forcing,
    and the highest-scoring id at each step is mapped back to a word with
    `output_lang`. Decoding output is cut at the first EOS_token, which is
    reported as '<EOS>'.

    Returns:
        (decoded_words, decoder_attn): the list of output words and the
        attention weights returned by the decoder.
    """
    decoded_words = []
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        best_ids = decoder_outputs.topk(1)[1].squeeze()

        for token_id in best_ids.tolist():
            if token_id == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[token_id])
    return decoded_words, decoder_attn

In [39]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print `n` randomly chosen pairs together with the model's translation.

    Pairs are drawn from the global `pairs` list and encoded with the global
    input_lang / output_lang. For each one the source ('>'), the reference
    ('=') and the model output ('<') are printed, followed by an empty line.
    """
    for _ in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0], input_lang, output_lang)[0]
        print('<', ' '.join(output_words))
        print('')

In [53]:
# Put both networks in evaluation mode before sampling translations.
# evaluateRandomly draws from the global `pairs` list.
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)

> je ne vais pas encore abandonner
= i m not giving up yet
< i m not giving up yet <EOS>

> que pensez vous de mon costume ?
= what do you think of my costume ?
< what do you think of my costume ? <EOS>

> elle etait clouee au lit hier
= she was sick in bed yesterday
< she was sick in bed for tomorrow night <EOS>

> il fut entendu en train de chanter la chanson
= he was heard singing the song
< he was heard singing the song <EOS>

> vous interessez vous aux langues etrangeres ?
= are you interested in foreign languages ?
< are you interested in foreign languages ? <EOS>

> nous en sommes toutes responsables
= we re all to blame for that
< we re all to blame for that <EOS>

> je veux chanter une chanson
= i want to sing a song
< i want to sing a song a song <EOS>

> j espere qu il va reussir
= i hope that he will succeed
< i hope he will succeed <EOS>

> j aimerais que tu rencontres ma femme
= i d like you to meet my wife
< i d like you to meet my wife <EOS>

> il a les yeux unk
= he ha

In [57]:
# Probe the model with an ad-hoc input fragment and show the decoded sentence.
encoder.eval()
decoder.eval()
output_words, _ = evaluate(encoder, decoder, "le sous les", input_lang, output_lang)
output_sentence = ' '.join(output_words)
output_sentence

'i have the study in the trouble <EOS>'

In [44]:
# Draw one random pair from the global `pairs` list and display its input-language side.
pair = random.choice(pairs)
pair[0]

'je n en suis pas certain'