In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker
import numpy as np
import random
import time
import os
import pandas as pd
import math
import bcolz
import pickle
import re

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
print("Pytorch: {}".format(torch.__version__))

Pytorch: 0.2.0_4


# Data loading

In [3]:
all_data_org = pd.read_csv('data/en_train_org.csv')

In [4]:
all_data_org[pd.isnull(all_data_org['before'])][:3]

Unnamed: 0,sentence_id,token_id,class,before,after
616107,49226,17,LETTERS,,n a
684691,54634,1,PLAIN,,
965529,76612,7,PLAIN,,


In [5]:
all_data = all_data_org.dropna()
print("Data rows: {},  (dropped none rows: {})".format(len(all_data), len(all_data_org)-len(all_data)))
all_data = all_data[all_data['class'] != 'VERBATIM']
print("Data rows: {},  (dropped rows: {})".format(len(all_data), len(all_data_org)-len(all_data)))
all_data = all_data.reset_index(drop=True)

Data rows: 9918390,  (dropped none rows: 51)
Data rows: 9840282,  (dropped rows: 78159)


Note we dropped VERBATIM class. Thats because it had so many weird characters.

In [6]:
all_data_sentence_index = all_data.set_index('sentence_id')

### More balanced sample

In [7]:
all_data.groupby("class")["class"].count()

class
ADDRESS           522
CARDINAL       133744
DATE           258348
DECIMAL          9821
DIGIT            5442
ELECTRONIC       5162
FRACTION         1196
LETTERS        152790
MEASURE         14783
MONEY            6128
ORDINAL         12703
PLAIN         7353647
PUNCT         1880507
TELEPHONE        4024
TIME             1465
Name: class, dtype: int64

In [8]:
all_data_classes = list(all_data.groupby('class'))

In [9]:
def data_balance_randomize_classes(max_len=10000):
    global data_balanced_classes
    data_balanced_classes = pd.concat([v.sample(min(max_len, len(v))) for k, v in all_data_classes])

In [10]:
%time
data_balance_randomize_classes()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 3.58 µs


In [11]:
data_balanced_classes.groupby("class")["class"].count()

class
ADDRESS         522
CARDINAL      10000
DATE          10000
DECIMAL        9821
DIGIT          5442
ELECTRONIC     5162
FRACTION       1196
LETTERS       10000
MEASURE       10000
MONEY          6128
ORDINAL       10000
PLAIN         10000
PUNCT         10000
TELEPHONE      4024
TIME           1465
Name: class, dtype: int64

In [12]:
data_balanced_classes.sample(10)

Unnamed: 0,sentence_id,token_id,class,before,after
4015740,310950,3,PUNCT,.,.
7501902,573374,9,DIGIT,24706,two four seven o six
7295163,557828,16,DECIMAL,2.5,two point five
5579598,429216,10,CARDINAL,2013,two thousand thirteen
6556192,502427,2,PLAIN,Hush,Hush
81065,6558,4,DECIMAL,38 million,thirty eight million
8402486,640852,7,PLAIN,Sylvania,Sylvania
2973606,231678,4,DATE,1943,nineteen forty three
672866,54165,10,CARDINAL,1,one
4302562,332666,5,TIME,7:18 p.m.,seven eighteen p m


### Word vectors

In [13]:
def load_glove(name):
    with open(name, 'r') as f: lines = [line.split() for line in f]
    words = [d[0] for d in lines]
    vecs = np.stack(np.array(d[1:], dtype=np.float32) for d in lines)
    wordidx = {o:i for i,o in enumerate(words)}
    return vecs, words, wordidx

In [14]:
wv_vecs, wv_words, wv_idx = load_glove('/home/ohu/koodi/data/glove_wordvec/glove.6B.50d.txt')

In [15]:
re_apos = re.compile(r"(\w)'s\b")         # make 's a separate word
re_mw_punc = re.compile(r"(\w[’'])(\w)")  # other ' in a word creates 2 words
re_punc = re.compile("([\"().,;:/_?!—])") # add spaces around punctuation
re_mult_space = re.compile(r"  *")        # replace multiple spaces with just one

def simple_tokeniser(sent):
    sent = re_apos.sub(r"\1 's", sent)
    sent = re_mw_punc.sub(r"\1 \2", sent)
    sent = re_punc.sub(r" \1 ", sent).replace('-', ' - ')
    sent = re_punc.sub(r" \1 ", sent)
    sent = re_mult_space.sub(' ', sent)
    return sent.lower().split()
simple_tokeniser("asdf's   asdf   -testaaa")

['asdf', "'s", 'asdf', '-', 'testaaa']

arr = [simple_tokeniser(s_)[0] for s_ in list(all_data.sample(1000)['before'])]
[s in wv_idx for s in arr].count(True) / 1000

In [16]:
def get_random_sample():
    sample_row = data_balanced_classes.iloc[random.randint(1, len(data_balanced_classes)-1)]
    sentence_id = sample_row['class']

    #rows = all_data[all_data['sentence_id']==sample_row['sentence_id']]
    rows = all_data_sentence_index.loc[sample_row['sentence_id']]
    befores = rows.before.values
        
    token_id_idx = list(rows['token_id']).index(sample_row['token_id'])
    befores[token_id_idx] = '*****'
    str_list = simple_tokeniser(' '.join(befores))
    
    word_vect = np.zeros((1, len(str_list), wv_vecs.shape[1]), dtype=np.float32)
    # var = np.zeros((1, len(str_list), wv_vecs.shape[1]+1))
    for i, w in enumerate(str_list):
        if w=='*****':
            word_vect[0][i] = np.zeros((1, wv_vecs.shape[1]))
        else:
            try:
                word_vect[0][i] = wv_vecs[wv_idx[w]]
            except KeyError:
                word_vect[0][i] = np.random.rand(1, wv_vecs.shape[1])
    return sample_row['before'], sample_row['after'], sample_row['class'], word_vect

s_bef, s_aft, s_class, s_word_v = get_random_sample()
print(s_class, ':', s_bef, '->', s_aft, '('+str(len(s_aft))+')', ':', s_word_v.shape, type(s_word_v[0,0,0]))

In [17]:
%%timeit
get_random_sample()

492 µs ± 3.73 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Categories and Chars

In [18]:
categories_all = all_data["class"].unique()
print(categories_all)
print(len(categories_all))
categories_index = dict((c, i) for i, c in enumerate(categories_all))

['PLAIN' 'PUNCT' 'DATE' 'LETTERS' 'CARDINAL' 'DECIMAL' 'MEASURE' 'MONEY'
 'ORDINAL' 'TIME' 'ELECTRONIC' 'DIGIT' 'FRACTION' 'TELEPHONE' 'ADDRESS']
15


In [19]:
letters_before_all = sorted(list(set(''.join(all_data['before']))))
print(len(letters_before_all))
letters_after_all = sorted(list(set(''.join(all_data['after']))))
print(len(letters_after_all))

112
85


In [20]:
letters_all = ['<EOS>', '<SOS>'] + sorted(list(set(letters_before_all + letters_after_all)))
letters_all_index = dict((c, i) for i, c in enumerate(letters_all))
print(''.join(letters_all))
print(len(letters_all))

<EOS><SOS> !"#$%&'()+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz|~¡£¥ª«²³µº»¼½¾¿éɒʻˈΩμ—€⅓⅔⅛⅝⅞
114


In [21]:
def onehot_char(char):
    tensor = torch.zeros(1, len(letters_all))
    tensor[0, letters_all_index[char]] = 1
    return tensor
onehot_char('<EOS>')[0, 0] == 1

sos_variable = Variable(onehot_char('<SOS>')).cuda()

In [22]:
def string_to_tensor(line, include_eos=False):
    tensor_length = len(line)+1 if include_eos else len(line)
    tensor = torch.zeros(1, tensor_length, len(letters_all))
    for li, letter in enumerate(line):
        tensor[0, li, letters_all_index[letter]] = 1
    if include_eos:
        tensor[0, -1, letters_all_index['<EOS>']] = 1
    return tensor
print(string_to_tensor('wordup').size())
print(string_to_tensor('wordup', include_eos=True).size())

torch.Size([1, 6, 114])
torch.Size([1, 7, 114])


# Models

In [23]:
use_cuda = True

### Encoder

In [24]:
class EncoderRNN(nn.Module):
    def __init__(self, wordvect_size, chars_input_size, words_hidden_size, chars_hidden_size,
                 words_layers=1, chars_layers=1):
        super(EncoderRNN, self).__init__()

        #self.train_iterations = 0
        #self.train_history = []
        
        self.words_layers = words_layers
        self.chars_layers = chars_layers
        self.words_hidden_size = words_hidden_size
        self.chars_hidden_size = chars_hidden_size
        # self.output_size = output_size

        self.rnn_words = nn.LSTM(wordvect_size, words_hidden_size // 2, words_layers,
                                 batch_first=True, bidirectional=True)

        self.rnn_chars = nn.LSTM(chars_input_size, chars_hidden_size // 2, chars_layers,
                                batch_first=True, bidirectional=True)
        
        # self.lin_output = nn.Linear(hidden_size*2, output_size)

    def forward(self, word_vectors, string_tensor, hidden = None, init_hidden = True):
        if init_hidden:
            hidden_words, hidden_chars = self.init_hidden()
        
        all_outputs_words, hidden_words = self.rnn_words(word_vectors, hidden_words)
        output_words = all_outputs_words[:, -1]
        
        all_outputs_chars, hidden_chars = self.rnn_chars(string_tensor, hidden_chars)
        output_chars = all_outputs_chars[:, -1]
        
        output = torch.cat((output_words, output_chars), 1)
        
        # output = self.lin_output(output)
        # output = F.log_softmax(output)
        return output

    def init_hidden(self):
        var1_1 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var1_2 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var2_1 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        var2_2 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        if use_cuda:
            var1_1 = var1_1.cuda(); var1_2 = var1_2.cuda()
            var2_1 = var2_1.cuda(); var2_2 = var2_2.cuda()
        return ((var1_1, var1_2), (var2_1, var2_2))

In [27]:
def test_encoder():
    s_bef, s_aft, s_class, s_word_vs = get_random_sample()
    s_string = string_to_tensor(s_bef)
    encoder_rnn = EncoderRNN(wordvect_size=s_word_vs.shape[-1], chars_input_size=len(letters_all),
                                      words_hidden_size=64, chars_hidden_size=128,
                                      words_layers=1, chars_layers=1).cuda()
    print('Word vect size:', s_word_vs.shape, '. String vector size:', s_string.size())
    output_encoded = encoder_rnn(Variable(torch.from_numpy(s_word_vs)).cuda(), Variable(s_string).cuda())
    print('Output:', output_encoded.size())
    return encoder_rnn, output_encoded;
encoder_rnn, output_encoded = test_encoder()

Word vect size: (1, 29, 50) . String vector size: torch.Size([1, 4, 114])
Output: torch.Size([1, 192])


### Decoder

In [29]:
class DecoderRNN(nn.Module):
    def __init__(self, chars_input_size, hidden_size, output_size, n_layers=1):
        super(DecoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        
        self.rnn = nn.GRU(chars_input_size, hidden_size, n_layers,
                                 batch_first=True, bidirectional=False)
                         # LSTM would require own hidden included
        
        self.lin_out = nn.Linear(hidden_size, output_size)
        #self.softmax = nn.LogSoftmax()

    def forward(self, char, hidden):
        #char = char.view(1,1,-1)
        #hidden = hidden.view(1,1,-1)
        output, hidden = self.rnn(char, hidden)
        output = output[:, -1] # view(1,-1)
        output = self.lin_out(output)
        output = F.log_softmax(output)
        return output, hidden


In [31]:
decoder_rnn = DecoderRNN(chars_input_size=len(letters_all), hidden_size=output_encoded.size()[-1],
                         output_size=len(letters_all)).cuda()
tmp_a, tmp_b = decoder_rnn(sos_variable.view(1,1,-1), output_encoded.view(1,1,-1))
tmp_a.topk(1)

(Variable containing:
 -4.6277
 [torch.cuda.FloatTensor of size 1x1 (GPU 0)], Variable containing:
  98
 [torch.cuda.LongTensor of size 1x1 (GPU 0)])

In [32]:
decoder_input = Variable(string_to_tensor(letters_all[tmp_a.topk(1)[1].data[0][0]])).cuda()
tmp = decoder_rnn(decoder_input, tmp_b)
[t.size() for t in tmp]

[torch.Size([1, 114]), torch.Size([1, 1, 192])]

# Training etc

### Accuracy

### Training

In [33]:
def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [175]:
def train(s_bef, s_aft, s_word_vs, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_function,
          use_teacher_forcing, max_length=20):

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0

    s_bef_string = string_to_tensor(s_bef, include_eos=False)
    target_arr = list(s_aft) + ['<EOS>']

    encoder_output = encoder(Variable(torch.from_numpy(s_word_vs)).cuda(), Variable(s_bef_string).cuda())
    encoder_output = encoder_output.view(1,1,-1)
    decoder_hidden = encoder_output

    decoder_input = sos_variable.view(1,1,-1).cuda()

    decoded_chars_arr = []
    for i in range(len(target_arr)):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

        decoder_target = Variable(torch.LongTensor([letters_all_index[target_arr[i]]])).cuda()
        loss += loss_function(decoder_output, decoder_target)

        topv, topi = decoder_output.data.topk(1)
        char_index = topi[0][0]
        char = letters_all[char_index] # Use own prediction as next char
        decoded_chars_arr.append(char)
        
        if use_teacher_forcing:
            char = target_arr[i] # replace input with right target char
        else:
            # use output char normally as input char
            if char == '<EOS>':
                print("BREAKINg eos")
                break

        decoder_input = Variable(string_to_tensor(char)).cuda()
        

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return((loss.data[0] / len(target_arr)), ''.join(decoded_chars_arr))
#print(train())

In [35]:
model_train_iterations = 0
model_train_history = []

In [36]:
%%timeit
data_balance_randomize_classes()

314 ms ± 2.58 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [85]:
def train_iterations(n_iters=100000, lr=0.001, teacher_forcing_ratio=0.5,
                     print_every=5000, plot_every=1000):
    global model_train_iterations
    global model_train_history
    start = time.time()
    decoder_rnn.train()
    encoder_rnn.train()

    current_loss = 0
    current_loss_iter = 0

    encoder_optimizer = torch.optim.Adam(encoder_rnn.parameters(), lr=lr)
    decoder_optimizer = torch.optim.Adam(decoder_rnn.parameters(), lr=lr)
    loss_function = nn.NLLLoss()
    
    for iteration in range(1, n_iters + 1):
        model_train_iterations += 1

        use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
        s_bef, s_aft, s_class, s_word_vs = get_random_sample()
        #print(s_class, ':', s_bef, '->', s_aft, '('+str(len(s_aft))+')')
        loss, result = train(s_bef=s_bef, s_aft=s_aft, s_word_vs=s_word_vs,
                             encoder=encoder_rnn, decoder=decoder_rnn,
                             encoder_optimizer=encoder_optimizer, decoder_optimizer=decoder_optimizer,
                             loss_function=nn.NLLLoss(), use_teacher_forcing=use_teacher_forcing,
                             max_length=40 )

        current_loss += loss
        current_loss_iter += 1

        # Print iter number, loss, name and guess
        if iteration % print_every == 0:
            teacher_forcing_str = ""
            if use_teacher_forcing:
                teacher_forcing_str = "(forcing)"
            print("{:>6d} {:>4.0%} ({:>8}) {:>7.3f}   | {:>6.2f}: {} -> {} {} ({})".format(
                      model_train_iterations, iteration/n_iters, timeSince(start),
                      current_loss/current_loss_iter, loss,
                      s_bef, teacher_forcing_str, result, s_aft))

        # Add current loss avg to list of losses
        if iteration % plot_every == 0:
            model_train_history.append((current_loss / plot_every, lr))
            current_loss = 0
            current_loss_iter = 0
            
        if model_train_iterations % 25000 == 0:
            data_balance_randomize_classes()
            
        if model_train_iterations % 25000 == 0:
            save_model()
    
    # test_model_accuracy(model, n_sample=10000)

In [40]:
train_iterations(n_iters=10, print_every=5)

     6  50% (   0m 0s)   4.700   |   4.67: 3 -> (forcing) .gngn (three)
    11 100% (   0m 0s)   4.680   |   4.64: XII -> (forcing)   eeee (twelve)


In [86]:
train_iterations(n_iters=2000, teacher_forcing_ratio=0.5, print_every=500)

  3500  25% (  0m 14s)   2.149   |   3.13: 2/3 ->  three nine (two thirds)
  4000  50% (  0m 29s)   2.160   |   2.52: 57 ->  o e t e t e (fifty seven)
  4500  75% (  0m 42s)   2.168   |   2.95: 3 ->  six t (three)
  5000 100% (  0m 57s)   2.151   |   2.78: 16118 ->  theeteen nineteen nin (one six one one eight)


In [87]:
train_iterations(n_iters=(30000-model_train_iterations), teacher_forcing_ratio=0.2, lr=0.0001)

 10000  20% (  2m 31s)   2.033   |   0.01: . ->  . (.)
 15000  40% (   5m 1s)   2.028   |   1.50: 8/21 -> (forcing) eight eho ty fivt   (eight twenty firsts)
 20000  60% (  7m 26s)   1.976   |   2.56: HDLs ->  a a a d (h d l's)
 25000  80% (  9m 50s)   1.920   |   0.88: $47,500 -> (forcing) fouty fiven  h usand tive eundred tollars (forty seven thousand five hundred dollars)
Saving: data/models/gen_1_rnn_25000
 30000 100% ( 12m 13s)   1.796   |   0.92: 0.92% -> (forcing) foro point oineteho torcent (zero point nine two percent)


In [88]:
train_iterations(n_iters=20000)

 35000  25% (  2m 29s)   1.547   |   7.01: - ->  — (-)
 40000  50% (  4m 57s)   1.432   |   2.09: of ->  fo (of)
 45000  75% (  7m 22s)   1.236   |   2.56: $325,000 ->  three hundred fifty thousand six hundred s (three hundred twenty five thousand dollars)
 50000 100% (  9m 47s)   1.098   |   0.48: 1965 -> (forcing) nineteen fixty sive (nineteen sixty five)
Saving: data/models/gen_1_rnn_50000


In [89]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.4)

 55000  10% (  2m 26s)   1.198   |   0.03: 2 -> (forcing) two (two)
 60000  20% (  4m 53s)   1.159   |   1.01: 100.00 ->  one hundred twenty tw (one hundred point o o)
 65000  30% (  7m 17s)   1.054   |   1.71: Idir ->  ind six (i d i r)
 70000  40% (  9m 47s)   1.026   |   1.20: F. A. ->  f o (f a)
 75000  50% ( 12m 18s)   0.954   |   0.46: 533.7 -> (forcing) five hundred thirty ehree point tiven (five hundred thirty three point seven)
Saving: data/models/gen_1_rnn_75000
 80000  60% ( 14m 44s)   0.894   |   0.55: 194 km2 -> (forcing) nie hundred sinety fiur pquare kilometers (one hundred ninety four square kilometers)
 85000  70% ( 17m 10s)   0.862   |   0.41: 539 ->  five hundred thirty nine (five hundred thirty nine)
 90000  80% ( 19m 35s)   0.818   |   2.77: Kevin ->  k e v (Kevin)
 95000  90% (  22m 1s)   0.824   |   0.26: 50.1% -> (forcing) fifty point one percent (fifty point one percent)
100000 100% ( 24m 32s)   0.813   |   0.00: " ->  " (")
Saving: data/models/gen_1_rnn_100000

In [90]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.2)

105000  10% (  2m 25s)   0.831   |   0.00: . -> (forcing) . (.)
110000  20% (  4m 52s)   0.770   |   0.07: 30.5 -> (forcing) thirty point five (thirty point five)
115000  30% (  7m 15s)   0.794   |   1.47: February 1921 ->  february twenteenteennenenen (february nineteen twenty one)
120000  40% (  9m 41s)   0.740   |   2.11: Six ->  S i (Six)
125000  50% (  12m 7s)   0.759   |   0.05: I ->  the first (the first)
Saving: data/models/gen_1_rnn_125000
130000  60% ( 14m 37s)   0.731   |   0.00: — ->  — (—)
135000  70% (  17m 3s)   0.756   |   0.50: 35.06 -> (forcing) thirty five point sneie (thirty five point o six)
140000  80% ( 19m 23s)   0.725   |   0.16: 364 ->  three hundred sixty four (three hundred sixty four)
145000  90% ( 21m 52s)   0.759   |   2.56: 85931 ->  eight five  ine e eeee eeeeeeeeeeeeeeeeeeeee (eighty five thousand nine hundred thirty one)
150000 100% ( 24m 21s)   0.722   |   0.04: ISBN -> (forcing) i s b n (i s b n)
Saving: data/models/gen_1_rnn_150000


In [91]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.4)

155000  10% (  2m 22s)   0.599   |   0.35: £800,000 -> (forcing) eight hundred mhousand dounds (eight hundred thousand pounds)
160000  20% (  4m 45s)   0.626   |   0.00: ) ->  ) ())
165000  30% (  7m 11s)   0.563   |   1.55: 1000 GHz -> (forcing) one hhousand thrhs   h (one thousand gigahertz)
170000  40% (  9m 35s)   0.560   |   1.76: VIII's ->  the sixthhht (the eighth's)
175000  50% ( 11m 56s)   0.513   |   0.00: ) ->  ) ())
Saving: data/models/gen_1_rnn_175000
180000  60% ( 14m 21s)   0.519   |   0.42: 14 August -> (forcing) the fourthenth of jugust (the fourteenth of august)
185000  70% ( 16m 47s)   0.560   |   0.01: to -> (forcing) to (to)
190000  80% ( 19m 12s)   0.535   |   1.39: 8.00 PM ->  eight poi (eight p m)
195000  90% ( 21m 38s)   0.531   |   1.24: .2005 ->  point two thousand (point two o o five)
200000 100% (  24m 5s)   0.548   |   0.00: 5 -> (forcing) five (five)
Saving: data/models/gen_1_rnn_200000


In [92]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.2, lr=0.0005)

205000  10% (  2m 24s)   0.537   |   1.06: Government ->  Govermener (Government)
210000  20% (  4m 47s)   0.545   |   0.87: QELP ->  p e e l (q e l p)
215000  30% (  7m 11s)   0.532   |   0.37: 2010 ->  twenty ten (twenty ten)
220000  40% (  9m 35s)   0.482   |   0.00: , ->  , (,)
225000  50% ( 11m 58s)   0.525   |   0.50: 1,500th ->  one thousand five hundred f (one thousand five hundredth)
Saving: data/models/gen_1_rnn_225000
230000  60% ( 14m 25s)   0.503   |   1.14: Dii ->  dii i (d i i)
235000  70% ( 16m 48s)   0.445   |   0.16: 10,500 ft -> (forcing) ten thousand five hundred feet (ten thousand five hundred feet)
240000  80% ( 19m 11s)   0.474   |   0.42: same ->  same (same)
245000  90% ( 21m 40s)   0.543   |   0.04: $18,000 ->  eighteen thousand dollars (eighteen thousand dollars)
250000 100% (  24m 4s)   0.483   |   0.04: $60,000 ->  sixty thousand dollars (sixty thousand dollars)
Saving: data/models/gen_1_rnn_250000


In [93]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.2, lr=0.001)

255000  10% (  2m 25s)   0.587   |   0.20: 68.5 percent -> (forcing) sixty eight point five percent (sixty eight point five percent)
260000  20% (  4m 48s)   0.532   |   0.09: £2m ->  two million pounds (two million pounds)
265000  30% (  7m 18s)   0.584   |   0.00: ) -> (forcing) ) ())
270000  40% (  9m 47s)   0.554   |   0.20: 4-4-0 -> (forcing) four sil four sil o (four sil four sil o)
275000  50% ( 12m 13s)   0.554   |   0.00: . -> (forcing) . (.)
Saving: data/models/gen_1_rnn_275000
280000  60% ( 14m 35s)   0.585   |   1.52: 0-521-29626-9 ->  o sil five two one two one sil o e                   (o sil five two one sil two nine six two six sil nine)
285000  70% (  17m 5s)   0.563   |   1.61: BJTs ->  b b s s (b j t's)
290000  80% ( 19m 30s)   0.525   |   1.27: $73,750 ->  seventy three thousand five hundred fifty  illasd  (seventy three thousand seven hundred fifty dollars)
295000  90% ( 21m 59s)   0.534   |   2.68: DM 23 billion ->  thent  nint   ninte    nne e  e   (twenty three 

In [94]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.4, lr=0.0001)

305000  10% (  2m 27s)   0.400   |   0.93: 2027 -> (forcing) two ty teelveeseven (twenty twenty seven)
310000  20% (  4m 49s)   0.414   |   0.69: 29 January 1622 -> (forcing) the twenty sinth of junu ry tixtyen thenty fhe (the twenty ninth of january sixteen twenty two)
315000  30% (  7m 10s)   0.375   |   0.66: HQE ->  h e e (h q e)
320000  40% (  9m 36s)   0.397   |   0.33: year -> (forcing) year (year)
325000  50% ( 11m 56s)   0.393   |   0.00: 3rd -> (forcing) third (third)
Saving: data/models/gen_1_rnn_325000
330000  60% ( 14m 24s)   0.344   |   0.04: S. J. -> (forcing) s j (s j)
335000  70% ( 16m 49s)   0.370   |   0.02: KM ->  k m (k m)
340000  80% ( 19m 14s)   0.350   |   0.01: 2.23 ->  two point two three (two point two three)
345000  90% ( 21m 43s)   0.358   |   0.04: are -> (forcing) are (are)
350000 100% (  24m 5s)   0.379   |   1.72: $12,058 ->  twelve thousand five hundred dollar (twelve thousand fifty eight dollars)
Saving: data/models/gen_1_rnn_350000


In [95]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.1, lr=0.001)

355000  10% (  2m 26s)   0.504   |   0.01: 40.3 ->  forty point three (forty point three)
360000  20% (  4m 54s)   0.535   |   0.00: . ->  . (.)
365000  30% (  7m 22s)   0.552   |   0.00: . ->  . (.)
370000  40% (  9m 45s)   0.532   |   1.35: $42,344 ->  forty two thousand five hundred threty three dollar (forty two thousand three hundred forty four dollars)
375000  50% ( 12m 16s)   0.503   |   0.00: :: ->  :: (::)
Saving: data/models/gen_1_rnn_375000
380000  60% ( 14m 46s)   0.532   |   0.61: 2,591 m ->  two thousand five hundred ninety six   eeee (two thousand five hundred ninety one meters)
385000  70% ( 17m 14s)   0.496   |   0.82: 95 ->  nine fifeee (ninety five)
390000  80% ( 19m 43s)   0.545   |   1.61: 0-15-100389-0 ->  o sil one five oie                                (o sil one five sil one o o three eight nine sil o)
395000  90% ( 22m 10s)   0.543   |   0.00: 53% ->  fifty three percent (fifty three percent)
400000 100% ( 24m 37s)   0.545   |   0.55: listing -> (forcing) lis

In [96]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.3, lr=0.0001)

405000  10% (  2m 26s)   0.427   |   0.00: ( ->  ( (()
410000  20% (   5m 0s)   0.351   |   0.64: 0-374-52700-8 -> (forcing) o sil three seven four oil  ive  wo siven sisnsil oight (o sil three seven four sil five two seven o o sil eight)
415000  30% (  7m 26s)   0.385   |   0.03: 200 m -> (forcing) two hundred meters (two hundred meters)
420000  40% (  9m 48s)   0.348   |   0.02: 1960 -> (forcing) nineteen sixty (nineteen sixty)
425000  50% ( 12m 15s)   0.417   |   1.44: officially ->  offocailll (officially)
Saving: data/models/gen_1_rnn_425000
430000  60% ( 14m 43s)   0.382   |   0.02: 87.0 ->  eighty seven point zero (eighty seven point zero)
435000  70% (  17m 9s)   0.362   |   0.00: 5th ->  fifth (fifth)
440000  80% ( 19m 32s)   0.365   |   0.00: on -> (forcing) on (on)
445000  90% (  22m 0s)   0.357   |   0.02: 1980 ->  nineteen eighty (nineteen eighty)
450000 100% ( 24m 29s)   0.343   |   0.79: 0.26 kg ->  zero point two six kilometer (zero point two six kilograms)
Saving: data

In [97]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.0, lr=0.0001)

455000  10% (  2m 26s)   0.394   |   0.17: $7,266 ->  seven thousand two hundred sixty six dollars (seven thousand two hundred sixty six dollars)
460000  20% (  4m 53s)   0.439   |   0.78: 0 ->  z (o)
465000  30% (  7m 19s)   0.396   |   0.00: " ->  " (")
470000  40% (  9m 47s)   0.324   |   1.51: Snorri ->  Sonrrr (Snorri)
475000  50% ( 12m 13s)   0.427   |   0.07: Biolib.cz ->  b i o l i b dot c z (b i o l i b dot c z)
Saving: data/models/gen_1_rnn_475000
480000  60% ( 14m 37s)   0.360   |   0.01: 1.34 ->  one point three four (one point three four)
485000  70% (  17m 4s)   0.405   |   0.85: Cyclingnews.com ->  c y c n i n c i n i d dot c o m (c y c l i n g n e w s dot c o m)
490000  80% ( 19m 31s)   0.417   |   0.05: October 2009 ->  october two thousand nine (october two thousand nine)
495000  90% ( 21m 57s)   0.341   |   0.11: 159 km ->  one hundred fifty nine kilometers (one hundred fifty nine kilometers)
500000 100% ( 24m 29s)   0.381   |   0.06: $6,900 ->  six thousand nine hun

In [98]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.0, lr=0.0001)

505000  10% (  2m 32s)   0.377   |   0.02: £2,000 ->  two thousand pounds (two thousand pounds)
510000  20% (  4m 53s)   0.359   |   0.00: 9 ->  nine (nine)
515000  30% (  7m 15s)   0.354   |   0.71: gorge ->  gogge (gorge)
520000  40% (  9m 42s)   0.359   |   0.01: $200 million ->  two hundred million dollars (two hundred million dollars)
525000  50% (  12m 3s)   0.377   |   0.00: 95% ->  ninety five percent (ninety five percent)
Saving: data/models/gen_1_rnn_525000
530000  60% ( 14m 27s)   0.397   |   0.00: ! ->  ! (!)
535000  70% (  17m 0s)   0.357   |   0.02: 1972 ->  nineteen seventy two (nineteen seventy two)
540000  80% ( 19m 29s)   0.365   |   0.07: 2014 ->  twenty fourteen (twenty fourteen)
545000  90% ( 21m 55s)   0.350   |   0.00: 19th ->  nineteenth (nineteenth)
550000 100% ( 24m 23s)   0.395   |   0.01: 36.3 ->  thirty six point three (thirty six point three)
Saving: data/models/gen_1_rnn_550000


In [99]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.0, lr=0.0001)

555000  10% (  2m 22s)   0.345   |   0.55: $19,087 ->  nineteen thousand eighty sived llaasss (nineteen thousand eighty seven dollars)
560000  20% (  4m 48s)   0.355   |   0.00: . ->  . (.)
565000  30% (  7m 21s)   0.410   |   0.00: : ->  : (:)
570000  40% (  9m 48s)   0.382   |   0.10: 1993 ->  nineteen ninety three (nineteen ninety three)
575000  50% ( 12m 36s)   0.329   |   0.01: 9pm ->  nine p m (nine p m)
Saving: data/models/gen_1_rnn_575000
580000  60% ( 51m 43s)   0.356   |   0.01: 126 ->  one hundred twenty six (one hundred twenty six)
585000  70% (  54m 7s)   0.363   |   0.13: FT.com ->  f t dot c o m (f t dot c o m)
590000  80% ( 56m 30s)   0.360   |   0.00: . ->  . (.)
595000  90% ( 58m 55s)   0.383   |   0.08: 12 June 2012 ->  the twelfth of june twenty twelve (the twelfth of june twenty twelve)
600000 100% ( 61m 25s)   0.380   |   1.06: JAZU ->  j j j a (j a z u)
Saving: data/models/gen_1_rnn_600000


# Model testing

### Results eval

In [181]:
def evaluate(encoder, decoder, s_bef, s_word_vs, max_length=20):
    s_string = string_to_tensor(s_bef, include_eos=True)

    encoder_output = encoder_rnn(Variable(torch.from_numpy(s_word_vs)).cuda(), Variable(s_string).cuda())
    encoder_output = encoder_output.view(1,1,-1)
    decoder_hidden = encoder_output
    
    decoder_input = sos_variable.view(1,1,-1).cuda()

    decoded_chars = []
    #for _ in range(max_length):
    for _ in range(max_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        topv, topi = decoder_output.data.topk(1)
        char_index = topi[0][0]
        char = letters_all[char_index]
        decoded_chars.append(char)
        if char == '<EOS>':
            break
        
        decoder_input = Variable(string_to_tensor(char)).cuda()

    return ''.join(decoded_chars)

#evaluate(encoder_rnn, decoder_rnn, s_bef)

In [193]:
def test_model_accuracy(n_sample=10000, print_wrongs=False):
    data_balance_randomize_classes()
    n_correct = 0
    for iteration in range(n_sample):
        s_bef, s_aft, s_class, s_word_vs = get_random_sample()
        output = evaluate(encoder_rnn, decoder_rnn, s_bef, s_word_vs)
        
        output = output[0:len(s_aft)]
        if s_aft == output:
            n_correct += 1
        else:
            if print_wrongs:
                print("{:<20} -> {} \n{:<20} != {}".format(s_bef, output, '', s_aft))
                

    print("Accuracy: {:>4.2%} ({:>8d}/{:>8d})".format(
            n_correct/n_sample, n_correct, n_sample))

test_model_accuracy(20, print_wrongs=True)

$18,865              -> eighteen thousand ei 
                     != eighteen thousand eight hundred sixty five dollars
754-1985             -> seven five five fie  
                     != seven five four sil one nine eight five
Cornwall             -> Conrllll 
                     != Cornwall
espn                 -> espnnnt 
                     != e s p n
NUTV                 -> n u t t 
                     != n u t v
23 August 2012       -> the twenty third of  
                     != the twenty third of august twenty twelve
298                  -> two hundred ni 
                     != two nine eight
2.38                 -> two point three eigh 
                     != two point three eight
$18,125              -> eighteen thousand tw 
                     != eighteen thousand one hundred twenty five dollars
Accuracy: 55.00% (      11/      20)


In [187]:
data_balance_randomize_classes(10000)
for _ in range(30):
    s_bef, s_aft, s_class, s_word_vs = get_random_sample()
    s_result = evaluate(encoder_rnn, decoder_rnn, s_bef, s_word_vs, max_length=50)   
    
    s_result = s_result[0:len(s_aft)]
    
    print("{:<15} -> {:<20} ({:<10} - {})".format(
        s_bef, s_result, s_aft, s_class))

print()
print("Clipped results because mistake in training")

1,003 m         -> one thousand thirty meter (one thousand three meters - MEASURE)
500 lb          -> five hundred cennis  (five hundred pounds - MEASURE)
07              -> o seven              (o seven    - DIGIT)
Wug.za.net      -> z u g e dot e e e e e e (w u g dot z a dot n e t - ELECTRONIC)
football        -> foobblll             (football   - PLAIN)
1,300 ft        -> one thousand three hundred feet (one thousand three hundred feet - MEASURE)
ESPN.com        -> e s p n dot c o m    (e s p n dot c o m - ELECTRONIC)
connected       -> connentee            (connected  - PLAIN)
2008            -> two thousand eight   (two thousand eight - DATE)
1,153.1         -> one thousand one hundred tinety five ppont  nn (one thousand one hundred fifty three point one - DECIMAL)
II              -> two second           (the second - ORDINAL)
)               -> )                    ()          - PUNCT)
$50,673         -> fifty thousand sex hundred seventy shree dollars (fifty thousand six hundred

In [None]:
data_balance_randomize_classes(100000)
data_balanced_classes.groupby("class")["class"].count()

In [None]:
# data_balanced_classes.sample(10)

In [None]:
all_losses = [arr[0] for arr in model.train_history]
plt.figure()
plt.plot(all_losses)

# Saving the model

In [80]:
def save_model():
    saved_model_path = 'data/models/gen_1_rnn_' + str(model_train_iterations)
    print("Saving:", saved_model_path)
    torch.save(decoder_rnn.state_dict(), saved_model_path+'_decoder')
    torch.save(encoder_rnn.state_dict(), saved_model_path+'_encoder')