In [17]:
%matplotlib inline
import importlib
import pytorch_utils_oh_1; importlib.reload(pytorch_utils_oh_1); from pytorch_utils_oh_1 import *;

Pytorch utils oh: pytorch_utils_oh_1.py
Pytorch: 0.2.0_4
Loadig pytorch_utils_oh defaults


# Data loading

In [2]:
all_data_org = pd.read_csv('data/en_train_org.csv')

In [3]:
all_data_org[pd.isnull(all_data_org['before'])][:3]
all_data = all_data_org.dropna()
print("Data rows: {},  (dropped none rows: {})".format(len(all_data), len(all_data_org)-len(all_data)))

Data rows: 9918390,  (dropped none rows: 51)


In [4]:
all_data = all_data[all_data['class'] != 'VERBATIM']
print("Data rows: {},  (dropped (verbatim) rows: {})".format(len(all_data), len(all_data_org)-len(all_data)))
all_data = all_data.reset_index(drop=True)
# Note we dropped VERBATIM class. Thats because it had so many weird characters.

Data rows: 9840282,  (dropped (verbatim) rows: 78159)


In [5]:
all_data_sentence_index = all_data.set_index('sentence_id')

In [6]:
number_classes = ['DATE','CARDINAL','MEASURE','ORDINAL','DECIMAL','MONEY', 'DIGIT', 'TELEPHONE', 'TIME', 'FRACTION', 'ADDRESS']

In [7]:
number_data = all_data[all_data['class'].isin(number_classes)]
print("Data rows: {},  (dropped rows: {})".format(len(number_data), len(all_data_org)-len(number_data)))
number_data = number_data.reset_index(drop=True)

Data rows: 448176,  (dropped rows: 9470265)


### More balanced sample

In [15]:
balanced_data_classes_select = list(number_data.groupby('class'))
def balanced_data_randomize(max_len=10000):
    global balanced_data
    balanced_data = pd.concat([v.sample(min(max_len, len(v))) for k, v in balanced_data_classes_select])
balanced_data_randomize()

### Letters all

In [9]:
tmp = sorted(list(set(''.join(number_data['before']))))
letters_all = ['<EOS>', '<SOS>'] + sorted(list(set(tmp)))
letters_all_index = dict((c, i) for i, c in enumerate(letters_all))
print(''.join(letters_all))
print(len(letters_all))

<EOS><SOS> "$%'(),-./0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz£¥ª²³µº¼½¾Ωμ€⅓⅔⅛⅝⅞
94


### Number words

In [10]:
arr = list(set(list(number_data['after'])))
arr = [s.split(' ') for s in arr]
arr = np.concatenate(arr)
arr = list(set(arr))
number_words = ['<EOS>', '<SOS>'] + arr
number_words_index = dict((c, i) for i, c in enumerate(number_words))
len(number_words)

546

In [11]:
def number_words_to_tensor(words, include_eos=True):
    return words_to_tensor(words, words_lookup_index=number_words_index, include_eos=include_eos)
number_words_to_tensor(['one', 'first']).shape

(1, 3, 546)

In [12]:
number_words_onehot_sos = number_words_to_tensor([SOS_TOKEN], include_eos=False)
number_words_onehot_sos = Variable(torch.from_numpy(number_words_onehot_sos)).cuda()
number_words_onehot_sos.size()

torch.Size([1, 1, 546])

### Sample

In [18]:
def get_random_sample():
    sample_row = balanced_data.iloc[random.randint(1, len(balanced_data)-1)]
    sentence_id = sample_row['class']

    rows = all_data_sentence_index.loc[sample_row['sentence_id']]
    befores = rows.before.values
        
    token_id_idx = list(rows['token_id']).index(sample_row['token_id'])
    befores[token_id_idx] = SAMPLE_WORD_TOKEN

    return sample_row['before'], sample_row['after'], sample_row['class'], befores
            
def tmp():
    s_bef, s_aft, s_class, s_sentence = get_random_sample()
    print(s_class, ':', s_bef, '->', s_aft)
    print(' '.join(s_sentence))
    print(sentence_word_vectorize(s_sentence).shape)
    print(number_words_to_tensor(s_aft.split(' ')).shape)
tmp()

ORDINAL : 1st -> first
Hei 2 Fun 1 Lyun 1 Oi 3 ( ; Like To Love ) is Candy Lo's <SAMPLE> compilation album .
(1, 21, 50)
(1, 2, 546)


In [19]:
%%timeit
get_random_sample()

414 µs ± 3.49 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


# Model functions

In [20]:
use_cuda = True

### Encoder

In [21]:
class EncoderRNN(nn.Module):
    def __init__(self, wordvect_size, chars_input_size, words_hidden_size, chars_hidden_size,
                 words_layers=1, chars_layers=1):
        super(EncoderRNN, self).__init__()
        
        self.words_layers = words_layers
        self.chars_layers = chars_layers
        self.words_hidden_size = words_hidden_size
        self.chars_hidden_size = chars_hidden_size

        self.rnn_words = nn.LSTM(wordvect_size, words_hidden_size // 2, words_layers,
                                 batch_first=True, bidirectional=True)

        self.rnn_chars = nn.LSTM(chars_input_size, chars_hidden_size // 2, chars_layers,
                                batch_first=True, bidirectional=True)
        
    def forward(self, word_vectors, string_tensor, hidden = None, init_hidden = True):
        if init_hidden:
            hidden_words, hidden_chars = self.init_hidden()
        
        all_outputs_words, hidden_words = self.rnn_words(word_vectors, hidden_words)
        output_words = all_outputs_words[:, -1]
        
        all_outputs_chars, hidden_chars = self.rnn_chars(string_tensor, hidden_chars)
        output_chars = all_outputs_chars[:, -1]
        
        output = torch.cat((output_words, output_chars), 1)
        
        return output

    def init_hidden(self):
        var1_1 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var1_2 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var2_1 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        var2_2 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        if use_cuda:
            var1_1 = var1_1.cuda(); var1_2 = var1_2.cuda()
            var2_1 = var2_1.cuda(); var2_2 = var2_2.cuda()
        return ((var1_1, var1_2), (var2_1, var2_2))

In [22]:
def get_encoder(debug=False):
    # s_bef, s_aft, s_class, s_word_vs = get_random_sample()
    s_bef, s_aft, s_class, s_sentence = get_random_sample()
    
    s_word_vs = sentence_word_vectorize(s_sentence)
    s_string = string_to_tensor(s_bef, letters_all_index)
    target = number_words_to_tensor(s_aft.split(' '))
    
    encoder_rnn = EncoderRNN(wordvect_size=s_word_vs.shape[-1], chars_input_size=len(letters_all),
                                      words_hidden_size=128, chars_hidden_size=128,
                                      words_layers=2, chars_layers=2).cuda()
    
    output_encoded = encoder_rnn(Variable(torch.from_numpy(s_word_vs)).cuda(), Variable(s_string).cuda())
    if debug:
        print('Word vect size:', s_word_vs.shape, '. String vector size:', s_string.size())
        print('Output:', output_encoded.size())
    return encoder_rnn, output_encoded;
encoder_rnn, output_encoded = get_encoder(debug=True)

Word vect size: (1, 15, 50) . String vector size: torch.Size([1, 3, 94])
Output: torch.Size([1, 256])


### Decoder

In [26]:
class DecoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(DecoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        
        self.rnn = nn.GRU(input_size, hidden_size, n_layers,
                                 batch_first=True, bidirectional=False)
                         # LSTM would require own hidden included
        
        self.lin_out = nn.Linear(hidden_size, input_size)
        #self.softmax = nn.LogSoftmax()

    def forward(self, char, hidden):
        #char = char.view(1,1,-1)
        #hidden = hidden.view(1,1,-1)
        output, hidden = self.rnn(char, hidden)
        output = output[:, -1] # view(1,-1)
        output = self.lin_out(output)
        output = F.log_softmax(output)
        return output, hidden

decoder_rnn = DecoderRNN(input_size=len(number_words), hidden_size=output_encoded.size()[-1], n_layers=1)
decoder_rnn = decoder_rnn.cuda()
decoder_rnn

DecoderRNN (
  (rnn): GRU(546, 256, batch_first=True)
  (lin_out): Linear (256 -> 546)
)

In [27]:
tmp_a, tmp_b = decoder_rnn(number_words_onehot_sos, output_encoded.view(1,1,-1))
print(tmp_a.size())
print(tmp_a.topk(1)[0])

torch.Size([1, 546])
Variable containing:
-6.1919
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]



In [28]:
tmp_input = number_words_to_tensor([number_words[tmp_a.topk(1)[1].data[0][0]]])
tmp_input = Variable(torch.from_numpy(tmp_input)).cuda()
tmp = decoder_rnn(tmp_input, tmp_b)
[t.size() for t in tmp]

[torch.Size([1, 546]), torch.Size([1, 1, 256])]

# Training etc

### Accuracy

In [30]:
def evaluate(encoder, decoder, s_bef, s_sentence, max_length=20):
    encoder.eval()
    decoder.eval()
    s_word_vs = sentence_word_vectorize(s_sentence)
    s_bef_string = string_to_tensor(s_bef, letters_all_index, include_eos=True)

    encoder_output = encoder(Variable(torch.from_numpy(s_word_vs)).cuda(), Variable(s_bef_string).cuda())
    encoder_output = encoder_output.view(1,1,-1)
    
    decoder_hidden = encoder_output
    decoder_input = number_words_onehot_sos

    decoded_output = []
    for _ in range(max_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

        topv, topi = decoder_output.data.topk(1)
        word_index = topi[0][0]
        word = number_words[word_index] # Use own prediction as next input
                
        if word == '<EOS>':
            break

        decoded_output.append(word)
        
        decoder_input = number_words_to_tensor([word], include_eos=False)
        decoder_input = Variable(torch.from_numpy(decoder_input)).cuda()

    return ' '.join(decoded_output)

evaluate(encoder_rnn, decoder_rnn, '12th', 'he was <SAMPLE>')

'hundredth yen kilobytes seventh hundredth kilobytes m m m costa costa costa 100 sime 100 sime 100 sime sime 100'

In [31]:
def test_model_accuracy(n_sample=10000, print_wrongs=False):
    balanced_data_randomize()
    n_correct = 0
    for iteration in range(n_sample):
        s_bef, s_aft, s_class, s_sentence = get_random_sample()
        output = evaluate(encoder_rnn, decoder_rnn, s_bef, s_sentence)
        if s_aft == output:
            n_correct += 1
        else:
            if print_wrongs:
                print("{:<20} -> {} \n{:<20} != {}".format(s_bef, output, '', s_aft))
                

    print("Accuracy: {:>4.2%} ({:>8d}/{:>8d})".format(
            n_correct/n_sample, n_correct, n_sample))

test_model_accuracy(4, print_wrongs=True)

25.7 km              -> philippine yen kilobytes seventh 100 kilobytes m m m m costa costa costa 100 sime 100 sime 100 sime sime 
                     != twenty five point seven kilometers
US$1000              -> philippine philippine senior kilobytes yen kilobytes m m m m costa costa costa 100 sime 100 sime 100 sime sime 
                     != one thousand dollars
NZ$1 billion         -> philippine philippine senior kilobytes yen kilobytes m m m m costa costa costa 100 sime 100 sime 100 sime sime 
                     != one billion dollars
16th                 -> philippine to philippine senior to philippine to philippine senior to philippine to philippine senior to philippine to philippine senior to 
                     != sixteenth
Accuracy: 0.00% (       0/       4)


In [43]:
%%timeit
test_model_accuracy

17.5 ns ± 0.79 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


### Training

In [33]:
def train(s_bef, s_aft, s_sentence, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_function,
          use_teacher_forcing, max_length=20):

    s_word_vs = sentence_word_vectorize(s_sentence)
    s_bef_string = string_to_tensor(s_bef, letters_all_index, include_eos=True)
    target_arr = s_aft.split(' ') + ['<EOS>']
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0

    encoder_output = encoder(Variable(torch.from_numpy(s_word_vs)).cuda(), Variable(s_bef_string).cuda())
    encoder_output = encoder_output.view(1,1,-1)
    decoder_hidden = encoder_output

    decoder_input = number_words_onehot_sos

    decoded_output = []
    for i in range(len(target_arr)):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

        decoder_target = number_words_index[target_arr[i]]
        #print(decoder_target)
        decoder_target = Variable(torch.LongTensor([decoder_target])).cuda()
        
        # import IPython; IPython.core.debugger.set_trace()
        
        loss += loss_function(decoder_output, decoder_target)

        topv, topi = decoder_output.data.topk(1)
        word_index = topi[0][0]
        word = number_words[word_index] # Use own prediction as next input
        decoded_output.append(word)
        
        if use_teacher_forcing:
            word = target_arr[i] # replace input with right target
        else:
            # use output normally as input 
            if word == '<EOS>':
                break

        decoder_input = number_words_to_tensor([word], include_eos=False)
        decoder_input = Variable(torch.from_numpy(decoder_input)).cuda()

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return((loss.data[0] / len(target_arr)), ' '.join(decoded_output))
#print(train())

In [34]:
model_train_iterations = 0
model_train_history = []

In [47]:
def train_iterations(n_iters=100000, lr=0.001, teacher_forcing_ratio=0.5,
                     print_every=10000, plot_every=1000):
    global model_train_iterations
    global model_train_history
    start = time.time()
    decoder_rnn.train()
    encoder_rnn.train()

    current_loss = 0
    current_loss_iter = 0

    encoder_optimizer = torch.optim.Adam(encoder_rnn.parameters(), lr=lr)
    decoder_optimizer = torch.optim.Adam(decoder_rnn.parameters(), lr=lr)
    loss_function = nn.NLLLoss()
    
    for iteration in range(1, n_iters + 1):
        model_train_iterations += 1

        use_teacher_forcing = random.random() < teacher_forcing_ratio
        
        s_bef, s_aft, s_class, s_sentence = get_random_sample()
        
        loss, result = train(s_bef=s_bef, s_aft=s_aft, s_sentence=s_sentence,
                             encoder=encoder_rnn, decoder=decoder_rnn,
                             encoder_optimizer=encoder_optimizer, decoder_optimizer=decoder_optimizer,
                             loss_function=nn.NLLLoss(), use_teacher_forcing=use_teacher_forcing,
                             max_length=40 )

        current_loss += loss
        current_loss_iter += 1

        # Print iter number, loss, name and guess
        if iteration % print_every == 0:
            teacher_forcing_str = ""
            if use_teacher_forcing:
                teacher_forcing_str = "(forcing)"
            print("{:>6d} {:>4.0%} ({:>8}) {:>7.3f}   | {:>6.2f}: {} -> {} {} ({})".format(
                      model_train_iterations, iteration/n_iters, time_since(start),
                      current_loss/current_loss_iter, loss,
                      s_bef, teacher_forcing_str, result, s_aft))

        # Add current loss avg to list of losses
        if iteration % plot_every == 0:
            model_train_history.append((current_loss / plot_every, lr))
            current_loss = 0
            current_loss_iter = 0
            
        if model_train_iterations % 10000 == 0:
            balanced_data_randomize()
            
        if model_train_iterations % 50000 == 0:
            test_model_accuracy()
            save_model('numbers_gen_2', encoder_rnn, decoder_rnn, model_train_iterations)
    
    # test_model_accuracy(model, n_sample=10000)

In [38]:
train_iterations(n_iters=10, print_every=5, teacher_forcing_ratio=1)

     6  50% (   0m 0s)   6.278   |   6.25: May 17, 1982 -> (forcing) philippine philippine <EOS> <EOS> <EOS> <EOS> (may seventeenth nineteen eighty two)
    11 100% (   0m 0s)   6.241   |   6.16: $1,004 -> (forcing) <EOS> <EOS> <EOS> <EOS> <EOS> (one thousand four dollars)


In [46]:
train_iterations(n_iters=(1401-model_train_iterations), print_every=100, teacher_forcing_ratio=1)

Accuracy: 3.66% (     366/   10000)
Saving: data/models/numbers_gen_2/1350 (encoder/decoder)
Accuracy: 2.64% (     264/   10000)
Saving: data/models/numbers_gen_2/1400 (encoder/decoder)
  1401 100% (  1m 40s)   2.522   |   3.25: 1625 -> (forcing) two point two <EOS> (sixteen twenty five)


In [48]:
train_iterations(n_iters=(10000-model_train_iterations), print_every=1000, teacher_forcing_ratio=0.5)

  2401  12% (  0m 13s)   2.579   |   3.04: 8 km ->  one hundred <EOS> (eight kilometers)
  3401  23% (  0m 27s)   2.350   |   2.55: 58 (1875) 8 -> (forcing) o sil sil sil sil sil sil sil sil sil (five eight sil one eight seven five sil eight)
  4401  35% (  0m 42s)   2.143   |   1.55: 55% -> (forcing) twenty percent percent <EOS> (fifty five percent)
  5401  47% (  0m 56s)   1.985   |   1.34: 2006 -> (forcing) two thousand <EOS> <EOS> (two thousand six)
  6401  58% (  1m 11s)   1.840   |   2.52: C5 -> (forcing) sixty <EOS> <EOS> (c five)
  7401  70% (  1m 25s)   1.714   |   1.34: 15th ->  thirteenth <EOS> (fifteenth)
  8401  81% (  1m 40s)   1.602   |   2.88: 28" -> (forcing) twenty pounds <EOS> <EOS> (twenty eight inches)
  9401  93% (  1m 54s)   1.468   |   1.02: .69 -> (forcing) point nine nine <EOS> (point six nine)


In [49]:
test_model_accuracy(20, print_wrongs=True)

1980s                -> nineteen sixties 
                     != nineteen eighties
319                  -> three hundred eighty 
                     != three one nine
0.80 km              -> zero point five kilometers 
                     != zero point eight o kilometers
October 10, 1864     -> july twenty nineteen ninety 
                     != october tenth eighteen sixty four
607                  -> five hundred 
                     != six o seven
7th                  -> fourth 
                     != seventh
$19,674              -> one thousand thousand hundred hundred eighty dollars 
                     != nineteen thousand six hundred seventy four dollars
$46,953.01           -> forty thousand thousand hundred hundred eighty three dollars 
                     != forty six thousand nine hundred fifty three dollars and one cent
1.2 m                -> one point zero meters 
                     != one point two meters
€100 million         -> one hundred fifty million dollar

In [50]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.5)

 20000  20% (  2m 23s)   0.815   |   0.15: 49th ->  forty ninth <EOS> (forty ninth)
 30000  40% (  4m 53s)   0.569   |   0.08: $250,000 ->  two hundred fifty thousand dollars <EOS> (two hundred fifty thousand dollars)
 40000  60% (  7m 22s)   0.501   |   0.06: 3.10 ->  three point one o <EOS> (three point one o)
 50000  80% (  9m 46s)   0.377   |   0.58: 109th -> (forcing) one hundred nine <EOS> (one hundred ninth)
Accuracy: 71.90% (    7190/   10000)
Saving: data/models/numbers_gen_2/50000 (encoder/decoder)
 60000 100% ( 12m 56s)   0.362   |   0.51: IV -> (forcing) the <EOS> <EOS> (the fourth)


In [51]:
test_model_accuracy(10, print_wrongs=True)

0.39 km/km2          -> zero point three nine square kilometers 
                     != zero point three nine kilometers per square kilometer
131 1770             -> one hundred one one seven sil 
                     != one three one sil one seven seven o
157 thousand         -> one hundred seventy seven percent 
                     != one hundred fifty seven thousand
Accuracy: 70.00% (       7/      10)


In [52]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.4)

 70000  20% (  2m 22s)   0.347   |   0.64: 152 -> (forcing) one hundred two <EOS> (one five two)
 80000  40% (  4m 44s)   0.299   |   0.01: 76 -> (forcing) seventy six <EOS> (seventy six)
 90000  60% (   7m 6s)   0.291   |   0.00: 8th -> (forcing) eighth <EOS> (eighth)
100000  80% (  9m 29s)   0.291   |   0.06: 14.6 million ->  fourteen point six million <EOS> (fourteen point six million)
Accuracy: 81.33% (    8133/   10000)
Saving: data/models/numbers_gen_2/100000 (encoder/decoder)
110000 100% ( 12m 39s)   0.288   |   0.00: 3rd ->  third <EOS> (third)


In [53]:
test_model_accuracy(10, print_wrongs=True)

Accuracy: 100.00% (      10/      10)


In [54]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.3)

120000  20% (  2m 21s)   0.243   |   0.00: 124.0 -> (forcing) one hundred twenty four point zero <EOS> (one hundred twenty four point zero)
130000  40% (  4m 43s)   0.247   |   0.87: 60100 ->  six thousand one hundred <EOS> (sixty thousand one hundred)
140000  60% (   7m 4s)   0.249   |   1.40: 2012/34 -> (forcing) two thousand twelve thousand thousand <EOS> (two thousand twelve thirty fourths)
150000  80% (  9m 26s)   0.244   |   0.01: 17 ->  seventeen <EOS> (seventeen)
Accuracy: 82.89% (    8289/   10000)
Saving: data/models/numbers_gen_2/150000 (encoder/decoder)
160000 100% ( 12m 34s)   0.225   |   0.34: 885 lb ->  eight hundred fifty eight pounds <EOS> (eight hundred eighty five pounds)


In [55]:
test_model_accuracy(20, print_wrongs=True)

70,000 Hz            -> seventy thousand 
                     != seventy thousand hertz
26                   -> two six 
                     != twenty six
0-312-26385-6        -> o sil three one two sil two six five six five sil six 
                     != o sil three one two sil two six three eight five sil six
Accuracy: 85.00% (      17/      20)


In [56]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.2)

170000  20% (  2m 21s)   0.249   |   1.22: 0845 424 2424 -> (forcing) o four four sil four four four four two two four sil <EOS> <EOS> (o eight four five sil four two four sil two four two four)
180000  40% (  4m 44s)   0.253   |   0.04: 9.3 mi ->  nine point three miles <EOS> (nine point three miles)
190000  60% (   7m 5s)   0.221   |   0.58: 3-8138-0175-6 -> (forcing) three sil eight eight three eight sil one one eight five sil six <EOS> (three sil eight one three eight sil o one seven five sil six)
200000  80% (  9m 26s)   0.212   |   0.28: 222 ->  two two twenty <EOS> (two two two)
Accuracy: 84.24% (    8424/   10000)
Saving: data/models/numbers_gen_2/200000 (encoder/decoder)
210000 100% ( 12m 34s)   0.227   |   0.13: I-64 ->  i sixty four <EOS> (i sixty four)


In [57]:
test_model_accuracy(20, print_wrongs=True)

0-9614392-2          -> o sil nine six four one three sil sil two two 
                     != o sil nine six one four three nine two sil two
0-340-22084-8        -> o sil three four o sil o o four four eight sil eight 
                     != o sil three four o sil two two o eight four sil eight
8 000 000m           -> eight million sil 
                     != eight million meters
Accuracy: 85.00% (      17/      20)


In [58]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.2, lr=0.0001)

220000  20% (  2m 21s)   0.166   |   0.03: $45,216 ->  forty five thousand two hundred sixteen dollars <EOS> (forty five thousand two hundred sixteen dollars)
230000  40% (  4m 42s)   0.155   |   0.04: $1.04 billion -> (forcing) one point o four billion dollars <EOS> (one point o four billion dollars)
240000  60% (   7m 3s)   0.129   |   0.00: 11th ->  eleventh <EOS> (eleventh)
250000  80% (  9m 25s)   0.170   |   0.00: 3rd -> (forcing) third <EOS> (third)
Accuracy: 88.98% (    8898/   10000)
Saving: data/models/numbers_gen_2/250000 (encoder/decoder)
260000 100% ( 12m 34s)   0.131   |   0.00: 3 ->  three <EOS> (three)


In [59]:
test_model_accuracy(20, print_wrongs=True)

978-2-343-06151-1    -> nine seven eight sil three sil four three two sil sil one one five one sil one 
                     != nine seven eight sil two sil three four three sil o six one five one sil one
/day                 -> per 
                     != per day
Accuracy: 90.00% (      18/      20)


In [60]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.4, lr=0.0001)

270000  20% (  2m 20s)   0.119   |   0.20: May 13, 1909 -> (forcing) may thirteenth nineteen ninety nine <EOS> (may thirteenth nineteen o nine)
280000  40% (  4m 41s)   0.104   |   0.00: .38 ->  point three eight <EOS> (point three eight)
290000  60% (   7m 2s)   0.142   |   0.00: 4th -> (forcing) fourth <EOS> (fourth)
300000  80% (  9m 22s)   0.101   |   0.24: 20 -> (forcing) twenty <EOS> (twenty)
Accuracy: 90.02% (    9002/   10000)
Saving: data/models/numbers_gen_2/300000 (encoder/decoder)
310000 100% ( 12m 29s)   0.142   |   0.04: 97 -> (forcing) nine seven <EOS> (nine seven)


In [61]:
test_model_accuracy(20, print_wrongs=True)

II                   -> the second 
                     != two
Accuracy: 95.00% (      19/      20)


In [62]:
train_iterations(n_iters=50000, teacher_forcing_ratio=0.2, lr=0.001)

320000  20% (  2m 19s)   0.169   |   0.00: 16 -> (forcing) sixteen <EOS> (sixteen)
330000  40% (  4m 38s)   0.231   |   0.01: 299 ft ->  two hundred ninety nine feet <EOS> (two hundred ninety nine feet)
340000  60% (  6m 58s)   0.215   |   0.01: $185,000 ->  one hundred eighty five thousand dollars <EOS> (one hundred eighty five thousand dollars)
350000  80% (  9m 17s)   0.200   |   0.04: 20:00 ->  twenty hundred <EOS> (twenty hundred)
Accuracy: 86.30% (    8630/   10000)
Saving: data/models/numbers_gen_2/350000 (encoder/decoder)
360000 100% ( 12m 24s)   0.201   |   0.00: 2000 -> (forcing) two thousand <EOS> (two thousand)


In [63]:
test_model_accuracy(20, print_wrongs=True)

$2.00                -> two thousand dollars 
                     != two dollars
Accuracy: 95.00% (      19/      20)


In [64]:
train_iterations(n_iters=100000, teacher_forcing_ratio=0.1, lr=0.0001)

370000  10% (  2m 20s)   0.128   |   4.40: 28 February 1919 ->  the nineteenth of february nineteen nineteen <EOS> (the twenty eighth of february nineteen nineteen)
380000  20% (  4m 40s)   0.170   |   0.00: 42% ->  forty two percent <EOS> (forty two percent)
390000  30% (   7m 0s)   0.121   |   0.00: 1.22 -> (forcing) one point two two <EOS> (one point two two)
400000  40% (  9m 20s)   0.120   |   0.00: 2nd ->  second <EOS> (second)
Accuracy: 90.58% (    9058/   10000)
Saving: data/models/numbers_gen_2/400000 (encoder/decoder)
410000  50% ( 12m 27s)   0.131   |   0.00: 6th ->  sixth <EOS> (sixth)
420000  60% ( 14m 49s)   0.121   |   0.01: March 2001 ->  march two thousand one <EOS> (march two thousand one)
430000  70% (  17m 9s)   0.138   |   0.34: 0-385-08861-2 ->  o sil three eight five sil eight eight eight six one sil two <EOS> (o sil three eight five sil o eight eight six one sil two)
440000  80% ( 19m 30s)   0.103   |   0.82: 978-0-300-17382-6 ->  nine seven eight sil o sil thre

In [65]:
test_model_accuracy(20, print_wrongs=True)

1987                 -> nineteen eighty seven 
                     != one nine eight seven
Accuracy: 95.00% (      19/      20)


In [66]:
train_iterations(n_iters=200000, teacher_forcing_ratio=0.1, lr=0.0001)

470000   5% (  2m 20s)   0.095   |   0.00: 12.3 ->  twelve point three <EOS> (twelve point three)
480000  10% (  4m 39s)   0.127   |   0.00: 2.50 ->  two point five o <EOS> (two point five o)
490000  15% (   7m 0s)   0.102   |   0.00: March 28, 2014 ->  march twenty eighth twenty fourteen <EOS> (march twenty eighth twenty fourteen)
500000  20% (  9m 21s)   0.105   |   0.00: 10,000 m ->  ten thousand meters <EOS> (ten thousand meters)
Accuracy: 90.80% (    9080/   10000)
Saving: data/models/numbers_gen_2/500000 (encoder/decoder)
510000  25% ( 12m 28s)   0.098   |   0.01: 909.6/km² ->  nine hundred nine point six per square kilometers <EOS> (nine hundred nine point six per square kilometers)
520000  30% ( 14m 50s)   0.131   |   0.01: .292 ->  point two nine two <EOS> (point two nine two)
530000  35% ( 17m 10s)   0.113   |   0.00: 13.9% ->  thirteen point nine percent <EOS> (thirteen point nine percent)
540000  40% ( 19m 32s)   0.100   |   0.00: 2nd ->  second <EOS> (second)
550000  45% (

In [67]:
test_model_accuracy(20, print_wrongs=True)

Accuracy: 100.00% (      20/      20)


In [68]:
train_iterations(n_iters=200000, teacher_forcing_ratio=0.1, lr=0.0001)

670000   5% (  2m 21s)   0.090   |   0.47: 0-8108-6829-6 ->  o sil eight one o eight sil two eight eight six sil six <EOS> (o sil eight one o eight sil six eight two nine sil six)
680000  10% (  4m 42s)   0.089   |   0.00: 2nd ->  second <EOS> (second)
690000  15% (   7m 2s)   0.135   |   0.00: C4  ->  c four <EOS> (c four)
700000  20% (  9m 22s)   0.085   |   0.00: 04 ->  o four <EOS> (o four)
Accuracy: 92.72% (    9272/   10000)
Saving: data/models/numbers_gen_2/700000 (encoder/decoder)
710000  25% ( 12m 29s)   0.070   |   0.00: 6 ->  six <EOS> (six)
720000  30% ( 14m 50s)   0.076   |   0.00: 1 ->  one <EOS> (one)
730000  35% ( 17m 10s)   0.115   |   0.00: 157 ->  one hundred fifty seven <EOS> (one hundred fifty seven)
740000  40% ( 19m 30s)   0.088   |   0.00: 15/km² ->  fifteen per square kilometers <EOS> (fifteen per square kilometers)
750000  45% ( 21m 49s)   0.066   |   0.00: 44th ->  forty fourth <EOS> (forty fourth)
Accuracy: 92.71% (    9271/   10000)
Saving: data/models/numb

In [69]:
test_model_accuracy(20, print_wrongs=True)

2325                 -> two three two five 
                     != two thousand three hundred twenty five
Accuracy: 95.00% (      19/      20)


In [70]:
train_iterations(n_iters=200000, teacher_forcing_ratio=0.1, lr=0.0001)

870000   5% (  2m 20s)   0.116   |   0.00: 19.1% ->  nineteen point one percent <EOS> (nineteen point one percent)
880000  10% (  4m 40s)   0.110   |   0.00: 100 ->  one hundred <EOS> (one hundred)
890000  15% (   7m 1s)   0.091   |   0.65: 978-3-000-42033-7 ->  nine seven eight sil o sil o o sil o o four four three three sil seven <EOS> (nine seven eight sil three sil o o o sil four two o three three sil seven)
900000  20% (  9m 20s)   0.093   |   0.00: 100 ->  one hundred <EOS> (one hundred)
Accuracy: 93.38% (    9338/   10000)
Saving: data/models/numbers_gen_2/900000 (encoder/decoder)
910000  25% ( 12m 28s)   0.065   |   0.00: 1.55 million ->  one point five five million <EOS> (one point five five million)
920000  30% ( 14m 48s)   0.098   |   0.08: 3269 ->  three two six nine <EOS> (three two six nine)
930000  35% (  17m 8s)   0.072   |   0.01: 80% ->  eighty percent <EOS> (eighty percent)
940000  40% ( 19m 27s)   0.110   |   3.91: 208Pb ->  two hundred eight pounds <EOS> (two hundr

In [71]:
test_model_accuracy(20, print_wrongs=True)

Accuracy: 100.00% (      20/      20)


In [72]:
train_iterations(n_iters=200000, teacher_forcing_ratio=0, lr=0.0001)

1070000   5% (  2m 20s)   0.072   |   0.00: 35mm ->  thirty five millimeters <EOS> (thirty five millimeters)
1080000  10% (  4m 40s)   0.113   |   0.01: $228,792 ->  two hundred twenty eight thousand seven hundred ninety two dollars <EOS> (two hundred twenty eight thousand seven hundred ninety two dollars)
1090000  15% (   7m 1s)   0.073   |   0.00: 1,400 m2 ->  one thousand four hundred square meters <EOS> (one thousand four hundred square meters)
1100000  20% (  9m 21s)   0.070   |   0.00: .3 ->  point three <EOS> (point three)
Accuracy: 94.48% (    9448/   10000)
Saving: data/models/numbers_gen_2/1100000 (encoder/decoder)
1110000  25% ( 12m 28s)   0.068   |   0.00: 1996 ->  nineteen ninety six <EOS> (nineteen ninety six)
1120000  30% ( 14m 49s)   0.080   |   0.00: 1 ->  one <EOS> (one)
1130000  35% (  17m 9s)   0.066   |   0.00: 44 ->  forty four <EOS> (forty four)
1140000  40% ( 19m 30s)   0.045   |   0.00: 14:10 ->  fourteen ten <EOS> (fourteen ten)
1150000  45% ( 21m 49s)   0.085

In [73]:
test_model_accuracy(20, print_wrongs=True)

Accuracy: 100.00% (      20/      20)


In [74]:
train_iterations(n_iters=400000, teacher_forcing_ratio=0, lr=0.0001)

1270000   2% (  2m 19s)   0.070   |   0.04: 0503040 ->  o five o three o four o <EOS> (o five o three o four o)
1280000   5% (  4m 40s)   0.064   |   0.00: 350 ->  three hundred fifty <EOS> (three hundred fifty)
1290000   8% (   7m 0s)   0.067   |   0.00: 29% ->  twenty nine percent <EOS> (twenty nine percent)
1300000  10% (  9m 21s)   0.070   |   0.00: 1 ->  one <EOS> (one)
Accuracy: 94.86% (    9486/   10000)
Saving: data/models/numbers_gen_2/1300000 (encoder/decoder)
1310000  12% ( 12m 29s)   0.075   |   0.01: 23 June 1972 ->  the twenty third of june nineteen seventy two <EOS> (the twenty third of june nineteen seventy two)
1320000  15% ( 14m 48s)   0.067   |   0.00: $8 Million ->  eight million dollars <EOS> (eight million dollars)
1330000  18% ( 17m 10s)   0.088   |   0.00: £1.1 million ->  one point one million pounds <EOS> (one point one million pounds)
1340000  20% ( 19m 31s)   0.069   |   0.00: 1 ->  one <EOS> (one)
1350000  22% ( 21m 52s)   0.071   |   0.00: 28.1/km2 ->  twe

# 

In [75]:
1+1

2

In [76]:
test_model_accuracy(20, print_wrongs=True)

Accuracy: 100.00% (      20/      20)


In [77]:
train_iterations(n_iters=600000, teacher_forcing_ratio=0, lr=0.0001)

1670000   2% (  2m 20s)   0.066   |   0.00: 83 ->  eighty three <EOS> (eighty three)
1680000   3% (  4m 41s)   0.055   |   0.01: 1997 ->  one nine nine seven <EOS> (one nine nine seven)
1690000   5% (   7m 2s)   0.045   |   0.00: 18 ->  eighteen <EOS> (eighteen)
1700000   7% (  9m 22s)   0.056   |   0.00: 1991 ->  nineteen ninety one <EOS> (nineteen ninety one)
Accuracy: 96.22% (    9622/   10000)
Saving: data/models/numbers_gen_2/1700000 (encoder/decoder)
1710000   8% ( 12m 27s)   0.044   |   0.00: 10.1017 ->  ten point one o one seven <EOS> (ten point one o one seven)
1720000  10% ( 14m 48s)   0.054   |   0.00: July 13, 2012 ->  july thirteenth twenty twelve <EOS> (july thirteenth twenty twelve)
1730000  12% (  17m 8s)   0.046   |   0.07: 1-56316-115 ->  one sil five six three one six sil one one five <EOS> (one sil five six three one six sil one one five)
1740000  13% ( 19m 30s)   0.043   |   0.01: 1990 ->  one nine nine o <EOS> (one nine nine o)
1750000  15% ( 21m 51s)   0.062   | 

In [78]:
test_model_accuracy(20, print_wrongs=True)

Accuracy: 100.00% (      20/      20)


In [79]:
train_iterations(n_iters=600000, teacher_forcing_ratio=0, lr=0.0001)

2270000   2% (  2m 22s)   0.047   |   0.00: 1,500th ->  one thousand five hundredth <EOS> (one thousand five hundredth)
2280000   3% (  4m 43s)   0.030   |   0.00: 1966 ->  nineteen sixty six <EOS> (nineteen sixty six)
2290000   5% (   7m 4s)   0.056   |   0.00: (2)4 ->  two sil four <EOS> (two sil four)
2300000   7% (  9m 25s)   0.059   |   0.00: 68.209 ->  sixty eight point two o nine <EOS> (sixty eight point two o nine)
Accuracy: 96.92% (    9692/   10000)
Saving: data/models/numbers_gen_2/2300000 (encoder/decoder)
2310000   8% ( 12m 33s)   0.056   |   0.00: 153.106 km² ->  one hundred fifty three point one o six square kilometers <EOS> (one hundred fifty three point one o six square kilometers)
2320000  10% ( 14m 55s)   0.041   |   0.02: 75 ->  seven five <EOS> (seven five)
2330000  12% ( 17m 15s)   0.031   |   0.00: 37 ->  thirty seven <EOS> (thirty seven)
2340000  13% ( 19m 36s)   0.037   |   0.00: 17.2 ->  seventeen point two <EOS> (seventeen point two)
2350000  15% ( 21m 58s)  

In [80]:
test_model_accuracy(20, print_wrongs=True)

Accuracy: 100.00% (      20/      20)


In [81]:
train_iterations(n_iters=600000, teacher_forcing_ratio=0, lr=0.0001)

2870000   2% (  2m 21s)   0.051   |   0.00: 48.9% ->  forty eight point nine percent <EOS> (forty eight point nine percent)
2880000   3% (  4m 42s)   0.036   |   0.00: 7/26 ->  seven twenty sixths <EOS> (seven twenty sixths)
2890000   5% (   7m 3s)   0.046   |   0.00: 1881 ->  eighteen eighty one <EOS> (eighteen eighty one)
2900000   7% (  9m 24s)   0.037   |   0.00: 3.8 ->  three point eight <EOS> (three point eight)
Accuracy: 98.01% (    9801/   10000)
Saving: data/models/numbers_gen_2/2900000 (encoder/decoder)
2910000   8% ( 12m 32s)   0.046   |   0.00: 12 ->  twelve <EOS> (twelve)
2920000  10% ( 14m 52s)   0.033   |   0.00: I ->  the first <EOS> (the first)
2930000  12% ( 17m 14s)   0.044   |   0.00: 227th ->  two hundred twenty seventh <EOS> (two hundred twenty seventh)
2940000  13% ( 19m 34s)   0.046   |   0.00: 1340.6 ->  one thousand three hundred forty point six <EOS> (one thousand three hundred forty point six)
2950000  15% ( 21m 55s)   0.028   |   0.00: 8 April 2006 ->  the 

In [117]:
test_model_accuracy(1000, print_wrongs=True)

02                   -> o two 
                     != two
16                   -> sixteen 
                     != one six
1-4012-2733-3        -> one sil four o one two sil two seven three three sil 
                     != one sil four o one two sil two seven three three sil three
VII's                -> the sil 
                     != the seventh's
14 3 2 9 15-26 11    -> one four sil three sil two sil nine sil two sil five sil two one one one 
                     != one four sil three sil two sil nine sil one five sil two six sil one one
CRC2142              -> two thousand one hundred forty two thousand four 
                     != two thousand one hundred forty two costa rican colons
367                  -> three hundred sixty seven 
                     != three six seven
1956                 -> nineteen fifty six six 
                     != one nine five six
31.70 USD            -> thirty one united states dollars and seventy 
                     != thirty one united stat

In [116]:
all_data[all_data['sentence_id'].isin(all_data[all_data['after'] == "two o one o"].sample(1)['sentence_id'])]

Unnamed: 0,sentence_id,token_id,class,before,after
9216905,701386,0,PLAIN,South,South
9216906,701386,1,PLAIN,Africa,Africa
9216907,701386,2,PLAIN,the,the
9216908,701386,3,PLAIN,focus,focus
9216909,701386,4,PLAIN,at,at
9216910,701386,5,PLAIN,London,London
9216911,701386,6,PLAIN,Book,Book
9216912,701386,7,PLAIN,Fair,Fair
9216913,701386,8,PUNCT,",",","
9216914,701386,9,DATE,8 April,the eighth of april


In [111]:
all_data[all_data['sentence_id'].isin(all_data[all_data['after'] == "twenty ten"].sample(1)['sentence_id'])]

Unnamed: 0,sentence_id,token_id,class,before,after
2857532,222878,0,PLAIN,Its,Its
2857533,222878,1,PLAIN,population,population
2857534,222878,2,PLAIN,as,as
2857535,222878,3,PLAIN,of,of
2857536,222878,4,PLAIN,the,the
2857537,222878,5,DATE,2010,twenty ten
2857538,222878,6,PLAIN,Census,Census
2857539,222878,7,PLAIN,was,was
2857540,222878,8,CARDINAL,354,three hundred fifty four
2857541,222878,9,PUNCT,",",","
