In [1]:
%matplotlib inline
import importlib
from pytorch_utils_oh_2 import *

Pytorch utils oh: pytorch_utils_oh_2.py
Pytorch: 0.2.0_4


In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
MODEL_SAVE_PATH = 'numbers_gen_6_2_layer_output_lstm'

In [4]:
import pytorch_utils_oh_2; importlib.reload(pytorch_utils_oh_2); from pytorch_utils_oh_2 import *;

Pytorch utils oh: pytorch_utils_oh_2.py
Pytorch: 0.2.0_4


# Data loading

In [5]:
all_data = pickle.load(open("data/en_train_fixed_1.pkl", "rb" ))
all_data_sentence_index = all_data.set_index('sentence_id')

In [6]:
all_data.sample(2)

Unnamed: 0,sentence_id,token_id,class,before,after,class_org
7313013,554751,11,NOT_CHANGED,;,;,PUNCT
4630324,354644,19,NOT_CHANGED,settlement,settlement,PLAIN


In [7]:
categories_all = all_data["class"].unique()
print(categories_all)
print(len(categories_all))
categories_index = dict((c, i) for i, c in enumerate(categories_all))

['NOT_CHANGED' 'NUMBERS' 'LETTERS' 'PLAIN' 'VERBATIM' 'ELECTRONIC']
6


### Utils stuff

In [8]:
chars_normal, chars_normal_index = load_characters_pkl('data/en_features/chars_normal.pkl')
print(''.join(chars_normal))

<SOS><EOS>☒ !"#$%&'(),-./0123456789:;ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~£¥ª²³µº¼½¾éɒʻˈΩμ—€⅓⅔⅛


In [9]:
common_words, common_words_index = load_common_words_10k()
len(common_words)
common_words[0:10]

8192

['<EOS>', '<SOS>', '<UNK>', '<0000>', '<SAMPLE>', '.', ',', 'the', '"', 'of']

### More balanced sample

In [10]:
number_data = all_data[all_data['class'] == 'NUMBERS']
print("Data rows: {},  (dropped rows: {})".format(len(number_data), len(all_data)-len(number_data)))
number_data = number_data.reset_index(drop=True)

Data rows: 448172,  (dropped rows: 9470020)


In [11]:
balanced_data = number_data

balanced_data_length = len(balanced_data)
def balanced_data_sample_row():
    global balanced_data_last_sample
    balanced_data_last_sample = balanced_data.iloc[random.randint(1, balanced_data_length-1)]
    return balanced_data_last_sample

In [12]:
 balanced_data_sample_row()

sentence_id                                 450963
token_id                                         4
class                                      NUMBERS
before                              2 October 2011
after          the second of october twenty eleven
class_org                                     DATE
Name: 268840, dtype: object

### Number words

In [13]:
arr = list(set(list(number_data['after'])))
arr = [s.split(' ') for s in arr]
arr = np.concatenate(arr)
arr = sorted(list(set(arr)))
number_words = [EOS_TOKEN, SOS_TOKEN, UNKNOWN_WORD_TOKEN, NUMBER_WORD_TOKEN, SAMPLE_WORD_TOKEN] + arr
number_words_index = dict((c, i) for i, c in enumerate(number_words))
len(number_words)

511

In [14]:
def number_words_to_tensor(words, include_eos=True):
    return words_to_tensor(words, words_lookup_index=number_words_index, include_eos=include_eos)
number_words_to_tensor(['one', 'first']).shape

torch.Size([1, 3, 511])

In [15]:
number_words_index['first']

133

In [16]:
number_words_onehot_sos = number_words_to_tensor([SOS_TOKEN], include_eos=False)
#number_words_onehot_sos = Variable(torch.from_numpy(number_words_onehot_sos)).cuda()
number_words_onehot_sos.size()

torch.Size([1, 1, 511])

### Samples

In [17]:
def get_random_sample():
    sample_row = balanced_data_sample_row()
    sentence_id = sample_row['class']

    rows = all_data_sentence_index.loc[sample_row['sentence_id']]
    befores = list(rows.before)
        
    token_id_idx = list(rows['token_id']).index(sample_row['token_id'])
    befores[token_id_idx] = SAMPLE_WORD_TOKEN
    
    return sample_row['before'], sample_row['after'], sample_row['class'], befores
            
def tmp():
    s_bef, s_aft, s_class, s_sentence = get_random_sample()
    print(s_class, ':', s_bef, '->', s_aft)
    print(' '.join(s_sentence))
    print(s_sentence)
    print(words_to_tensor(list(s_sentence), common_words_index).shape)
    print(string_to_tensor(s_bef, chars_normal_index).shape)
    print(number_words_to_tensor(s_aft.split(' ')).shape)
tmp()

NUMBERS : 725 -> seven hundred twenty five
During the five day procedure <SAMPLE> landings were made while 94 pilots were qualified .
['During', 'the', 'five', 'day', 'procedure', '<SAMPLE>', 'landings', 'were', 'made', 'while', '94', 'pilots', 'were', 'qualified', '.']
torch.Size([1, 16, 8192])
torch.Size([1, 4, 104])
torch.Size([1, 5, 511])


# Model functions

In [18]:
use_cuda = True

### Encoder

In [19]:
class EncoderRNN(nn.Module):
    def __init__(self, words_input_size, chars_input_size, words_hidden_size, chars_hidden_size,
                 words_layers=1, chars_layers=1):
        super(EncoderRNN, self).__init__()
        
        self.words_layers = words_layers
        self.chars_layers = chars_layers
        self.words_hidden_size = words_hidden_size
        self.chars_hidden_size = chars_hidden_size

        self.rnn_words = nn.LSTM(words_input_size, words_hidden_size // 2, words_layers,
                                 batch_first=True, bidirectional=True)

        self.rnn_chars = nn.LSTM(chars_input_size, chars_hidden_size // 2, chars_layers,
                                batch_first=True, bidirectional=True)
        
    def forward(self, word_vectors, string_tensor, hidden = None, init_hidden = True):
        if init_hidden:
            hidden_words, hidden_chars = self.init_hidden()
        
        all_outputs_words, hidden_words = self.rnn_words(word_vectors, hidden_words)
        output_words = all_outputs_words[:, -1]
        
        all_outputs_chars, hidden_chars = self.rnn_chars(string_tensor, hidden_chars)
        output_chars = all_outputs_chars[:, -1]
        
        output = torch.cat((output_words, output_chars), 1)
        
        return output

    def init_hidden(self):
        var1_1 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var1_2 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var2_1 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        var2_2 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        
        var1_1 = var1_1.cuda(); var1_2 = var1_2.cuda()
        var2_1 = var2_1.cuda(); var2_2 = var2_2.cuda()
        return ((var1_1, var1_2), (var2_1, var2_2))

In [20]:
encoder_rnn = EncoderRNN(words_input_size=len(common_words), chars_input_size=len(chars_normal),
                         words_hidden_size=128, chars_hidden_size=128,
                         words_layers=2, chars_layers=2).cuda()
encoder_rnn

EncoderRNN (
  (rnn_words): LSTM(8192, 64, num_layers=2, batch_first=True, bidirectional=True)
  (rnn_chars): LSTM(104, 64, num_layers=2, batch_first=True, bidirectional=True)
)

In [21]:
def test_encoder_single_sample():
    s_bef, s_aft, s_class, s_sentence = get_random_sample()
    
    words_t = words_to_tensor(list(s_sentence), common_words_index)
    words_t = Variable(words_t).cuda()
    
    string_t = string_to_tensor(s_bef, chars_normal_index)
    string_t = Variable(string_t).cuda()
    
    return encoder_rnn(words_t, string_t)
    
encoder_output = test_encoder_single_sample()
encoder_output.size()

torch.Size([1, 256])

### Decoder

In [22]:
class DecoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(DecoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        
        self.rnn = nn.LSTM(input_size, hidden_size, n_layers,
                                 batch_first=True, bidirectional=False)
                         # LSTM would require own hidden included
        
        self.lin_out = nn.Linear(hidden_size, input_size)
        #self.softmax = nn.LogSoftmax()

    def forward(self, char, hidden):
        #char = char.view(1,1,-1)
        #hidden = hidden.view(1,1,-1)
        output, hidden = self.rnn(char, hidden)
        output = output[:, -1] # view(1,-1)
        output = self.lin_out(output)
        output = F.log_softmax(output)
        return output, hidden
    
    def init_rest_hidden(self, input_var):
        hid_var_1 = Variable(torch.zeros(self.n_layers - 1, 1, self.hidden_size)).cuda()
        hid_var_2 = Variable(torch.zeros(self.n_layers, 1, self.hidden_size)).cuda()
        res_1 = torch.cat((input_var, hid_var_1), 0)
        return res_1, hid_var_2
        

decoder_rnn = DecoderRNN(input_size=len(number_words), hidden_size=encoder_output.size()[-1], n_layers=2)
decoder_rnn = decoder_rnn.cuda()
decoder_rnn

DecoderRNN (
  (rnn): LSTM(511, 256, num_layers=2, batch_first=True)
  (lin_out): Linear (256 -> 511)
)

In [23]:
[a.size() for a in decoder_rnn.init_rest_hidden(encoder_output.view(1,1,-1))]

[torch.Size([2, 1, 256]), torch.Size([2, 1, 256])]

In [24]:
tmp_hiddens = decoder_rnn.init_rest_hidden(encoder_output.view(1,1,-1))
tmp_a, tmp_b = decoder_rnn(Variable(number_words_onehot_sos).cuda(), tmp_hiddens)
print(tmp_a.size())
print([a.size() for a in tmp_b])
print(tmp_a.topk(1)[1])
print(number_words[tmp_a.topk(1)[1].data[0][0]])

torch.Size([1, 511])
[torch.Size([2, 1, 256]), torch.Size([2, 1, 256])]
Variable containing:
 462
[torch.cuda.LongTensor of size 1x1 (GPU 0)]

toronto


In [25]:
tmp_a, tmp_b = decoder_rnn(Variable(number_words_onehot_sos).cuda(), tmp_b)
print(tmp_a.size())
print(tmp_a.topk(1)[1])
print(number_words[tmp_a.topk(1)[1].data[0][0]])

torch.Size([1, 511])
Variable containing:
 462
[torch.cuda.LongTensor of size 1x1 (GPU 0)]

toronto


# Training etc

### Accuracy

In [26]:
def test_model_single_sample(model=None):
    s_bef, s_aft, s_class, s_sentence = sample = get_random_sample()
        
    words_t = words_to_tensor(list(s_sentence), common_words_index)
    words_t = Variable(words_t).cuda()
    
    string_t = string_to_tensor(s_bef, chars_normal_index)
    string_t = Variable(string_t).cuda()
    
    encoder_output = encoder_rnn(words_t, string_t)
    
    encoder_output = encoder_output.view(1,1,-1)
    
    decoder_hidden = decoder_rnn.init_rest_hidden(encoder_output)
    decoder_input = Variable(number_words_onehot_sos).cuda()

    decoded_output = []
    max_length = 20
    for _ in range(max_length):
        decoder_output, decoder_hidden = decoder_rnn(decoder_input, decoder_hidden)
        #return decoder_output

        topv, topi = decoder_output.data.topk(1)
        word_index = topi[0][0]
        word = number_words[word_index] # Use own prediction as next input
                
        if word == EOS_TOKEN:
            break

        decoded_output.append(word)
        
        decoder_input = number_words_to_tensor([word], include_eos=False)
        decoder_input = Variable(decoder_input).cuda()
    
    output = ' '.join(decoded_output)
    return output, output, s_aft, sample
    
tmp = test_model_single_sample(None)
tmp

('toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto',
 'toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto',
 'twelve',
 ('12',
  'twelve',
  'NUMBERS',
  ['Ynkepenee',
   '12',
   '30, ',
   'Yngelpenne',
   '12',
   '35,',
   ',',
   'Ynkepenne',
   '12',
   '41, ',
   'Ingelpenne',
   '12',
   '41,',
   ',',
   'Hingepenna',
   '12',
   '42, ',
   'Ingepepenn',
   '<SAMPLE>',
   '42,',
   ',',
   'Ingelpenn',
   '12',
   '52, ',
   'Enkepenne',
   '12',
   '82,',
   ',',
   'Inckepene',
   '1292',
   '.']))

In [27]:
def print_local_wrong_predictions(max_results=10):
    arr = get_some_wrong_predictions(None, test_model_single_sample, max_iterations=10000, max_results=max_results)
    for sample, predict, output in arr:
        s_bef, s_aft, s_class, s_sentence = sample
        print("{:<14} => {:<14} || {} \n{:>17} {}".format(s_bef, predict, s_aft, '', ' '.join(s_sentence), ))


In [28]:
print_local_wrong_predictions(2)

11             => toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto || eleven 
                  " Dog Hears Senate Howl ; Blind Owner Sees to It , " New York Times ( January 6, 1950 ) : <SAMPLE> .
1662           => toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto toronto || sixteen sixty two 
                  " Register of Salomon La Chair , Notary Public at New Amsterdam 1661 - <SAMPLE> " .


In [29]:
%%time
test_model_accuracy(encoder_rnn, test_model_single_sample)

Accuracy: 0.00% (       0/   10000)
CPU times: user 14min 15s, sys: 16.5 s, total: 14min 32s
Wall time: 6min 6s


0.0

### Training

In [30]:
def train(s_bef, s_aft, s_sentence, encoder_optimizer, decoder_optimizer, loss_function,
          use_teacher_forcing, max_length=20):
    
    words_t = words_to_tensor(list(s_sentence), common_words_index)
    words_t = Variable(words_t).cuda()
    
    string_t = string_to_tensor(s_bef, chars_normal_index)
    string_t = Variable(string_t).cuda()
    
    encoder_output = encoder_rnn(words_t, string_t)
    encoder_output = encoder_output.view(1,1,-1)
    
    decoder_hidden = decoder_rnn.init_rest_hidden(encoder_output)
    decoder_input = Variable(number_words_onehot_sos).cuda()
    
    ###
    
    target_arr = s_aft.split(' ') + [EOS_TOKEN]
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0
    
    decoded_output = []
    for i in range(len(target_arr)):
        decoder_output, decoder_hidden = decoder_rnn(decoder_input, decoder_hidden)

        decoder_target_i = number_words_index[target_arr[i]]
        decoder_target_i = Variable(torch.LongTensor([decoder_target_i])).cuda()
        loss += loss_function(decoder_output, decoder_target_i)
        
        topv, topi = decoder_output.data.topk(1)
        word_index = topi[0][0]
        word = number_words[word_index] # Use own prediction as next input
        decoded_output.append(word)
        
        if use_teacher_forcing:
            word = target_arr[i] # replace input with right target
        else:
            # use output normally as input 
            if word == EOS_TOKEN:
                break
                
        decoder_input = number_words_to_tensor([word], include_eos=False)
        decoder_input = Variable(decoder_input).cuda()
        
    if decoded_output[-1] == EOS_TOKEN:
        decoded_output = decoded_output[:-1]
        
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return ' '.join(decoded_output), (loss.data[0] / len(target_arr))


In [31]:
def train_iterations(n_iters=100000, lr=0.001, teacher_forcing_ratio=0.5,
                     print_every=10000, plot_every=1000):

    start = time.time()
    
    decoder_rnn.train()
    encoder_rnn.train()

    current_loss = 0
    current_loss_iter = 0

    encoder_optimizer = torch.optim.Adam(encoder_rnn.parameters(), lr=lr)
    decoder_optimizer = torch.optim.Adam(decoder_rnn.parameters(), lr=lr)
    loss_function = nn.NLLLoss()
    
    for iteration in range(1, n_iters + 1):
        model_training.iterations += 1
        
        use_teacher_forcing = random.random() < teacher_forcing_ratio
        
        s_bef, s_aft, s_class, s_sentence = get_random_sample()
        
        result, loss = train(s_bef=s_bef, s_aft=s_aft, s_sentence=s_sentence,
                             encoder_optimizer=encoder_optimizer, decoder_optimizer=decoder_optimizer,
                             loss_function=nn.NLLLoss(), use_teacher_forcing=use_teacher_forcing,
                             max_length=40 )
        
        current_loss += loss
        current_loss_iter += 1

        # Print iter number, loss, name and guess
        if iteration % print_every == 0:
            teacher_forcing_str = ""
            if use_teacher_forcing:
                teacher_forcing_str = "(forcing)"
            correct = '✓' if result == s_aft else "✗: {}".format(s_aft)
            
            print("{:>6d} {:>4.0%} ({:>8}) {:>7.3f}   | {:>6.2f}: {} -> {} ({}) {}".format(
                      model_training.iterations, iteration/n_iters, time_since(start),
                      current_loss/current_loss_iter, loss,
                      s_bef, result, correct, teacher_forcing_str))

        # Add current loss avg to list of losses
        if iteration % plot_every == 0:
            model_training.losses.append(current_loss / plot_every)
            model_training.learning_rates.append(lr)
            current_loss = 0
            current_loss_iter = 0
            
        if model_training.iterations % 50000 == 0 or model_training.iterations == 10:
            model_training.save_models()
            acc = test_model_accuracy(encoder_rnn, test_model_single_sample)
            model_training.accuracy.append(acc)
    
    # test_model_accuracy(model, n_sample=10000)

In [32]:
model_training = ModelTraining(MODEL_SAVE_PATH, [encoder_rnn, decoder_rnn])

Save path: data/models/numbers_gen_6_2_layer_output_lstm


In [33]:
train_iterations(n_iters=50, print_every=9, lr=0.0001)

     9  18% (   0m 0s)   6.233   |   6.21: 2 -> toronto toronto (✗: two) 
Saved model to data/models/numbers_gen_6_2_layer_output_lstm/10_(EncoderRNN/DecoderRNN)
Accuracy: 0.00% (       0/   10000)
    18  36% (  6m 12s)   6.230   |   6.23: 2006 -> toronto toronto toronto toronto (✗: two thousand six) (forcing)
    27  54% (  6m 12s)   6.226   |   6.21: March 22, 2011 -> two two two two two two (✗: march twenty second twenty eleven) (forcing)
    36  72% (  6m 13s)   6.160   |   6.19: 1984 -> two two two (✗: nineteen eighty four) (forcing)
    45  90% (  6m 13s)   6.085   |   6.18: 3 -> two two (✗: three) (forcing)


In [34]:
train_iterations(n_iters=(1000-model_training.iterations), print_every=500, lr=0.0001)

   550  53% (  0m 14s)   2.853   |   3.90: December 2011 -> two <EOS> <EOS> (✗: december twenty eleven) (forcing)


In [35]:
train_iterations(n_iters=9000, lr=0.0001, print_every=1000)

  2000  11% (  0m 33s)   2.531   |   2.87: 89 -> two (✗: eighty nine) 
  3000  22% (   1m 7s)   2.575   |   2.23: 50 -> two (✗: fifty) 
  4000  33% (  1m 41s)   2.487   |   1.65: 5 -> two (✗: five) (forcing)
  5000  44% (  2m 13s)   2.405   |   2.18: 29 August 2013 -> the twenty of of twenty twenty (✗: the twenty ninth of august twenty thirteen) 
  6000  56% (  2m 46s)   2.356   |   2.57: 33 -> one <EOS> (✗: thirty three) (forcing)
  7000  67% (  3m 18s)   2.322   |   1.84: 3,666.3 -> the twenty thousand (✗: three thousand six hundred sixty six point three) 
  8000  78% (  3m 54s)   2.238   |   3.13: May 31, 2013 -> the twenty thousand <EOS> <EOS> (✗: may thirty first twenty thirteen) (forcing)
  9000  89% (  4m 29s)   2.240   |   1.11: 1 -> one (✓) 
 10000 100% (   5m 4s)   2.203   |   1.15: II -> two (✓) 


In [36]:
train_iterations(n_iters=90000, print_every=10000)

 20000  11% (  5m 41s)   1.518   |   2.46: 18 November 1902 -> the twenty of of twenty twenty (✗: the eighteenth of november nineteen o two) 
 30000  22% ( 11m 27s)   1.200   |   0.02: 2 -> two (✓) 
 40000  33% ( 16m 52s)   0.918   |   0.18: 48 -> forty eight (✓) 
 50000  44% ( 22m 29s)   0.737   |   0.47: 247 -> two hundred forty nine (✗: two hundred forty seven) 
Saved model to data/models/numbers_gen_6_2_layer_output_lstm/50000_(EncoderRNN/DecoderRNN)
Accuracy: 52.82% (    5282/   10000)
 60000  56% ( 30m 15s)   0.572   |   1.50: 1454.5/mi² -> one hundred three hundred fifty point point five per square kilometers (✗: one thousand four hundred fifty four point five per square miles) (forcing)
 70000  67% (  36m 4s)   0.515   |   0.72: 17 April 1973 -> the nineteenth of july nineteen ninety three (✗: the seventeenth of april nineteen seventy three) 
 80000  78% ( 41m 51s)   0.479   |   0.00: 14 -> fourteen (✓) (forcing)
 90000  89% ( 47m 40s)   0.425   |   0.00: 4 -> four (✓) (forcing

In [37]:
1+1

2

In [38]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0.5, lr=0.001)

110000   3% (  5m 42s)   0.353   |   0.00: 2008 -> two thousand eight (✓) 
120000   7% ( 10m 47s)   0.355   |   0.03: II -> two (✓) (forcing)
130000  10% ( 14m 50s)   0.333   |   0.05: 5.3% -> five point three percent (✓) 
140000  13% ( 18m 54s)   0.333   |   0.00: 220 -> two hundred twenty (✓) (forcing)
150000  17% ( 22m 55s)   0.298   |   0.02: 2006 -> two thousand six (✓) 
Saved model to data/models/numbers_gen_6_2_layer_output_lstm/150000_(EncoderRNN/DecoderRNN)
Accuracy: 72.76% (    7276/   10000)
160000  20% ( 28m 16s)   0.273   |   0.01: 2013 -> twenty thirteen (✓) 
170000  23% ( 32m 20s)   0.298   |   0.29: September 16, 2010 -> september sixteenth twenty ten (✓) 
180000  27% ( 36m 23s)   0.216   |   0.01: 1982 -> nineteen eighty two (✓) (forcing)
190000  30% ( 40m 27s)   0.234   |   0.48: 0.03 -> zero point five three (✗: zero point o three) (forcing)
200000  33% ( 43m 21s)   0.265   |   0.00: 3 -> three (✓) 
Saved model to data/models/numbers_gen_6_2_layer_output_lstm/200000_

In [39]:
print_local_wrong_predictions()

2010-06-21     => the twenty first of february twenty ten || the twenty first of june twenty ten 
                  Archived from the original on <SAMPLE> .
XVII           => twenty         || seventeen 
                  " Book IV , Chapter <SAMPLE> : Of Reason " .
1999           => nineteen ninety nine || one thousand nine hundred ninety nine 
                  Between <SAMPLE> - 2000 , Farouq , under the mandatory National Youth Service Corps ( NYSC ) Scheme served with the Nigeria National Assembly .
$35.01         => thirty five thousand two hundred one dollars || thirty five dollars and one cent 
                  Shares sharply rose 19% by the end of the trading day to <SAMPLE> a share , up from $26 .
Fri 29 Nov 2002 => friday the twenty first of february two thousand two || friday the twenty ninth of november two thousand two 
                  " Official Results <SAMPLE> " ( PDF ) .
198 cm         => one hundred ninety eight feet || one hundred ninety eight centimeters 
      

In [40]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0.1, lr=0.001)

410000   3% (  2m 47s)   0.115   |   0.00: 59% -> fifty nine percent (✓) 
420000   7% (  5m 36s)   0.100   |   0.00: 26 -> twenty six (✓) 
430000  10% (  8m 24s)   0.150   |   0.06: III -> three (✓) 
440000  13% ( 11m 14s)   0.129   |   0.00: 301 -> three hundred one (✓) 
450000  17% (  14m 2s)   0.149   |   0.00: 1.9 mi -> one point nine miles (✓) 
Saved model to data/models/numbers_gen_6_2_layer_output_lstm/450000_(EncoderRNN/DecoderRNN)
Accuracy: 92.24% (    9224/   10000)
460000  20% ( 17m 46s)   0.128   |   0.00: 4 -> four (✓) 
470000  23% ( 20m 35s)   0.132   |   0.00: December 23, 2014 -> december twenty third twenty fourteen (✓) 
480000  27% ( 23m 24s)   0.129   |   0.00: 1659 -> sixteen fifty nine (✓) (forcing)
490000  30% ( 26m 12s)   0.141   |   3.11: 0 -> zero (✗: o) 
500000  33% (  29m 2s)   0.123   |   0.00: 90 -> ninety (✓) 
Saved model to data/models/numbers_gen_6_2_layer_output_lstm/500000_(EncoderRNN/DecoderRNN)
Accuracy: 91.25% (    9125/   10000)
510000  37% ( 32m 4

In [41]:
print_local_wrong_predictions()

0.19%          => zero point one seven percent || zero point one nine percent 
                  It got 8393 votes ( <SAMPLE> of the statewide vote ) , but was not close to winning any seat .
1912/3         => nineteen thousand two hundred hours || one thousand nine hundred twelve thirds 
                  This was mainly intended for goods , but a passenger service did run on the branch until <SAMPLE> .
$49,808        => forty nine thousand six hundred eight dollars || forty nine thousand eight hundred eight dollars 
                  The median income for a household in the village was <SAMPLE> , and the median income for a family was $56,375 .
64,000,000 m3  => sixty four thousand square miles || sixty four million cubic meters 
                  The project diverts and delivers an average of 52,000 acre feet ( <SAMPLE> ) of water a year .
343/344        => three hundred forty three thousand four hectares || three hundred forty three three hundred forty fourths 
                  <S

In [42]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0.5, lr=0.0001)

710000   3% (  2m 48s)   0.094   |   0.00: 41st -> forty first (✓) 
720000   7% (  5m 38s)   0.102   |   0.00: 168 -> one hundred sixty eight (✓) (forcing)
730000  10% (  8m 27s)   0.073   |   0.00: 11 -> eleven (✓) (forcing)
740000  13% ( 11m 18s)   0.068   |   0.01: 35 -> thirty five (✓) 
750000  17% (  14m 7s)   0.056   |   0.00: 300,000 -> three hundred thousand (✓) (forcing)
Saved model to data/models/numbers_gen_6_2_layer_output_lstm/750000_(EncoderRNN/DecoderRNN)
Accuracy: 94.76% (    9476/   10000)
760000  20% ( 17m 51s)   0.063   |   0.00: 69 -> sixty nine (✓) (forcing)
770000  23% ( 20m 40s)   0.073   |   0.00: 1982 -> nineteen eighty two (✓) 
780000  27% ( 23m 30s)   0.079   |   0.00: 1st -> first (✓) (forcing)
790000  30% ( 26m 20s)   0.078   |   0.00: 4 April 2011 -> the fourth of april twenty eleven (✓) (forcing)
800000  33% (  29m 9s)   0.072   |   0.16: 2011 -> twenty eleven (✓) (forcing)
Saved model to data/models/numbers_gen_6_2_layer_output_lstm/800000_(EncoderRNN/De

In [43]:
print_local_wrong_predictions()

US$14.5        => fourteen dollars || fourteen dollars and fifty cents 
                  Early estimates placed insured losses from the earthquake alone at <SAMPLE> to $34.6 billion .
0-06-438580-9  => o sil o six sil o o o o five o sil nine || o sil o six sil four three eight five eight o sil nine 
                  ISBN <SAMPLE> Vitruvius , Translation : Morris Hicky Morgan ( 1960 ) .
6256-6259      => six two five five sil two two five nine || six two five six sil six two five nine 
                  Score , Edition Peters <SAMPLE> .
2001-10-28     => the twenty eighth of december two thousand one || the twenty eighth of october two thousand one 
                  Patrick Marber Biography ( 1964 - ) Marowitz , Charles ( <SAMPLE> ) .
1391           => thirteen ninety one || one three nine one 
                  It meets a roundabout for Swineshead and passes through Drayton , meets the <SAMPLE> as Abbey Lane , then Drayton Road .
95.45%         => ninety six point two three percent 

In [44]:
train_iterations(n_iters=100000, print_every=10000, teacher_forcing_ratio=0, lr=0.001)

1010000  10% (  2m 50s)   0.133   |   0.00: 11 March 2016 -> the eleventh of march twenty sixteen (✓) 
1020000  20% (  5m 38s)   0.127   |   0.00: 1986 -> nineteen eighty six (✓) 
1030000  30% (  8m 28s)   0.106   |   0.03: $399 -> three hundred ninety nine dollars (✓) 
1040000  40% ( 11m 17s)   0.107   |   0.00: 2001 -> two thousand one (✓) 
1050000  50% (  14m 9s)   0.111   |   0.00: 720 -> seven hundred twenty (✓) 
Saved model to data/models/numbers_gen_6_2_layer_output_lstm/1050000_(EncoderRNN/DecoderRNN)
Accuracy: 92.81% (    9281/   10000)
1060000  60% ( 17m 53s)   0.132   |   0.00: 7.49 -> seven point four nine (✓) 
1070000  70% ( 20m 44s)   0.118   |   0.00: December 6, 2010 -> december sixth twenty ten (✓) 
1080000  80% ( 23m 34s)   0.152   |   0.01: 2015 -> twenty fifteen (✓) 
1090000  90% ( 26m 23s)   0.117   |   0.00: 50,000 -> fifty thousand (✓) 
1100000 100% ( 29m 14s)   0.131   |   0.00: 1987 -> nineteen eighty seven (✓) 
Saved model to data/models/numbers_gen_6_2_layer_

In [45]:
print_local_wrong_predictions()

2012-07-15     => the fifteenth of march twenty twelve || the fifteenth of july twenty twelve 
                  Retrieved on <SAMPLE> .
169/4522       => one hundred sixty nine thousand two hundred || one hundred sixty nine four thousand five hundred twenty seconds 
                  45 RTR War Diary , Oct 1942 , The National Archives , file WO <SAMPLE> .
2012-03-04     => the fourth of july twenty twelve || the fourth of march twenty twelve 
                  Archived from the original on <SAMPLE> .
19278          => nineteen thousand two hundred eighty eight || nineteen thousand two hundred seventy eight 
                  5 ; Issue <SAMPLE> ; col EThe Times , Thursday, May 06, 1847 ; pg .
2007-04-12     => the twelfth of july two thousand seven || the twelfth of april two thousand seven 
                  Ross , Barbara ; Tracy Connor ( <SAMPLE> ) .
05-2010        => the fifth of march twenty ten || o five sil two o one o 
                  There have been 6 rounds of surveys to da

In [46]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0, lr=0.0001)

1110000   3% (  2m 48s)   0.087   |   0.00: 57 -> fifty seven (✓) 
1120000   7% (  5m 37s)   0.089   |   0.01: 2007 -> two thousand seven (✓) 
1130000  10% (  8m 25s)   0.088   |   0.00: 42 -> forty two (✓) 
1140000  13% ( 11m 15s)   0.072   |   0.00: 5 April 2008 -> the fifth of april two thousand eight (✓) 
1150000  17% (  14m 4s)   0.076   |   0.00: 21 -> twenty one (✓) 
Saved model to data/models/numbers_gen_6_2_layer_output_lstm/1150000_(EncoderRNN/DecoderRNN)
Accuracy: 94.96% (    9496/   10000)
1160000  20% ( 17m 48s)   0.077   |   0.01: 2012 -> twenty twelve (✓) 
1170000  23% ( 20m 39s)   0.094   |   0.00: 2000 -> two thousand (✓) 
1180000  27% ( 23m 29s)   0.084   |   0.00: 67 -> sixty seven (✓) 
1190000  30% ( 26m 18s)   0.079   |   0.00: 14 February 2015 -> the fourteenth of february twenty fifteen (✓) 
1200000  33% (  29m 7s)   0.083   |   0.00: 1 -> one (✓) 
Saved model to data/models/numbers_gen_6_2_layer_output_lstm/1200000_(EncoderRNN/DecoderRNN)
Accuracy: 95.02% (    9

In [47]:
print_local_wrong_predictions()

5,724.7        => five thousand seven hundred twenty nine point seven || five thousand seven hundred twenty four point seven 
                  The population density was <SAMPLE> people per square mile ( 2,214.2/km2 ) .
(2009) 103     => two o o nine sil o o o three || two o o nine sil one o three 
                  Heredity <SAMPLE> , pp .
0 906 245 06 0 => o sil nine o o o o o o o o sil o sil o || o sil nine o six sil two four five sil o six sil o 
                  ISBN <SAMPLE> .
0-8020-8191-62011 => o sil eight o o o sil o o one one sil || o sil eight o two o sil eight one nine one sil six two o one one 
                  Toronto : University of Toronto Press , ISBN <SAMPLE> : Directions Home : Approaches to African Canadian Literature .
Tuesday, October 5, 2004 => tuesday november fifth two thousand four || tuesday october fifth two thousand four 
                  John Scheinman , " ABC Will Broadcast Belmont Stakes Starting in 2006 , " The Washington Post , <SAMPLE> .
10/03/20

In [48]:
train_iterations(n_iters=200000, print_every=10000, teacher_forcing_ratio=0, lr=0.0001)

1410000   5% (  3m 49s)   0.080   |   0.00: 1981 -> nineteen eighty one (✓) 
1420000  10% (  7m 40s)   0.071   |   0.00: 8 Oct 2011 -> the eighth of october twenty eleven (✓) 
1430000  15% ( 11m 33s)   0.058   |   0.00: 550 -> five hundred fifty (✓) 
1440000  20% ( 15m 24s)   0.065   |   0.00: 0.8% -> zero point eight percent (✓) 
1450000  25% (  19m 8s)   0.066   |   0.00: 1863 -> eighteen sixty three (✓) 
Saved model to data/models/numbers_gen_6_2_layer_output_lstm/1450000_(EncoderRNN/DecoderRNN)
Accuracy: 96.16% (    9616/   10000)
1460000  30% ( 24m 15s)   0.049   |   0.00: 56 -> fifty six (✓) 
1470000  35% (  28m 5s)   0.057   |   0.00: 100 -> one hundred (✓) 
1480000  40% ( 31m 54s)   0.080   |   0.01: 7/20 -> seven twentieths (✓) 
1490000  45% ( 35m 40s)   0.066   |   0.00: 1966 -> nineteen sixty six (✓) 
1500000  50% ( 39m 29s)   0.058   |   0.00: 24 -> twenty four (✓) 
Saved model to data/models/numbers_gen_6_2_layer_output_lstm/1500000_(EncoderRNN/DecoderRNN)
Accuracy: 96.35%

In [None]:
print_local_wrong_predictions()

1992           => nineteen ninety two || one thousand nine hundred ninety two 
                  131 Idaho Blue Book , 1991 - <SAMPLE> Edition , pg .
3,378.4/km²    => three thousand three hundred seventy eight point seven per square kilometers || three thousand three hundred seventy eight point four per square kilometers 
                  The population density was 8,996.8 people per square mile ( <SAMPLE> ) .
XXXV           => twenty five    || thirty five 
                  The Book Review <SAMPLE> ( 11 ) .
121968         => one hundred twenty thousand nine hundred sixty eight || one hundred twenty one thousand nine hundred sixty eight 
                  Congo Premier League : <SAMPLE> , 1978 , 1979 , 1980 , 1985 , 1987 , 1989 , 1993 , 1994 , 2000 , 2001 , 2006 .
01             => o one          || one 
                  The 2000 - <SAMPLE> season was again tough for Berg .
0-374-93239-5  => o sil three seven seven sil four nine one nine nine sil five || o sil three seven four sil 

In [None]:
train_iterations(n_iters=200000, print_every=10000, teacher_forcing_ratio=0, lr=0.0001)

1610000   5% (  3m 49s)   0.066   |   0.00: 1907 -> nineteen o seven (✓) 
1620000  10% (  7m 35s)   0.059   |   0.00: January 2009 -> january two thousand nine (✓) 
1630000  15% ( 11m 23s)   0.057   |   0.00: 52 -> fifty two (✓) 
1640000  20% ( 15m 15s)   0.046   |   0.00: 2015 -> twenty fifteen (✓) 
1650000  25% (  19m 7s)   0.070   |   0.00: 1:54 -> one fifty four (✓) 
Saved model to data/models/numbers_gen_6_2_layer_output_lstm/1650000_(EncoderRNN/DecoderRNN)
Accuracy: 96.38% (    9638/   10000)
1660000  30% ( 24m 13s)   0.054   |   0.00: March 20, 2009 -> march twentieth two thousand nine (✓) 
1670000  35% ( 27m 58s)   0.070   |   0.00: 1980 -> nineteen eighty (✓) 
1680000  40% ( 32m 21s)   0.065   |   0.00: December 16, 2013 -> december sixteenth twenty thirteen (✓) 


In [54]:
print_local_wrong_predictions()

34's           => thirty fives   || thirty fours 
                  West Philadelphia Streetcar Suburb Historic District " Studio <SAMPLE> Eponymous Trolley , or , A Short History of Route 34 " .
III            => three          || the third 
                  George <SAMPLE> , however , did not share the same view .
140            => one hundred forty || one four o 
                  Enneasartorite is a very rare mineral with formula Tl 6Pb 32 As 70 S <SAMPLE> .
2012           => twenty twelve  || two thousand twelve 
                  The Orange Drive Miami Beach Music Festival 2011 - <SAMPLE> will be held December 30, 31 , January 1 .
411's          => four one       || four elevens 
                  " <SAMPLE> WWE Raw Report 11.16 .09 " .
4463           => four thousand four hundred sixty three || four four six three 
                  Accident description for CCCP- L <SAMPLE> at the Aviation Safety Network .
2490           => two thousand four hundred ninety || two four nine o 
 

In [None]:
train_iterations(n_iters=400000, print_every=10000, teacher_forcing_ratio=0, lr=0.0001)

2010000   2% (  3m 23s)   0.061   |   0.00: 1991-10-16 -> the sixteenth of october nineteen ninety one (✓) 
2020000   5% (  6m 43s)   0.061   |   0.00: 6 -> six (✓) 
2030000   8% (  10m 0s)   0.072   |   0.03: 149.5/km² -> one hundred forty nine point five per square kilometers (✓) 
2040000  10% (  13m 7s)   0.066   |   0.00: 1958 -> nineteen fifty eight (✓) 
2050000  12% ( 16m 18s)   0.046   |   0.00: January 2012 -> january twenty twelve (✓) 
Saved model to data/models/numbers_gen_6_2_layer_output_lstm/2050000_(EncoderRNN/DecoderRNN)
Accuracy: 96.21% (    9621/   10000)
2060000  15% ( 20m 21s)   0.048   |   0.00: 23 -> twenty three (✓) 
2070000  18% ( 23m 20s)   0.075   |   0.00: February 1506 -> february fifteen o six (✓) 
2080000  20% ( 26m 18s)   0.045   |   0.00: 1954 -> nineteen fifty four (✓) 
2090000  22% ( 29m 12s)   0.073   |   0.00: January 2001 -> january two thousand one (✓) 
2100000  25% ( 32m 11s)   0.063   |   0.00: 1 -> one (✓) 
Saved model to data/models/numbers_gen_

In [None]:
print_local_wrong_predictions()