In [7]:
%matplotlib inline
import importlib
from pytorch_utils_oh_1 import *

In [8]:
MODEL_SAVE_PATH = 'whole_rnn_1_mod_data'

import pytorch_utils_oh_1; importlib.reload(pytorch_utils_oh_1); from pytorch_utils_oh_1 import *;

# Data loading

In [9]:
all_data = pickle.load(open("data/en_train_not_changed_verb_fix_2.pkl", "rb" ))
all_data_sentence_index = all_data.set_index('sentence_id')

In [10]:
all_data.sample(5)

Unnamed: 0,sentence_id,token_id,class,before,after
5356579,409050,21,VERBATIM,☒,と
5128168,391961,1,NOT_CHANGED,won,won
4736328,362621,2,NOT_CHANGED,time,time
8105442,613902,0,NOT_CHANGED,Fortunato,Fortunato
8640575,653567,6,NOT_CHANGED,to,to


In [11]:
categories_all = all_data["class"].unique()
print(categories_all)
print(len(categories_all))
categories_index = dict((c, i) for i, c in enumerate(categories_all))

['NOT_CHANGED' 'NUMBERS' 'LETTERS' 'PLAIN' 'VERBATIM' 'ELECTRONIC']
6


### Letters all

In [12]:
tmp = sorted(list(set(''.join(all_data['before']))))
characters_all = ['<EOS>', '<SOS>'] + sorted(list(set(tmp)))
characters_all_index = dict((c, i) for i, c in enumerate(characters_all))
print(''.join(characters_all))
print(len(characters_all))

<EOS><SOS> !"#$%&'()+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz|~¡£¥ª«²³µº»¼½¾¿éɒʻˈΩμ—€⅓⅔⅛⅝⅞☒
115


### Common words

In [13]:
common_words, common_words_index = load_common_words_100()
len(common_words)

7381

### More balanced sample

In [14]:
all_data.groupby("class")["class"].count()

class
ELECTRONIC        4964
LETTERS         152981
NOT_CHANGED    9218582
NUMBERS         448172
PLAIN            36472
VERBATIM         56968
Name: class, dtype: int64

In [22]:
sample_data = all_data[~all_data['before'].str.contains(VERBATIM_CHAR)]

In [24]:
len(all_data) - len(sample_data)

45227

In [200]:
balanced_data_classes_select = list(sample_data.groupby('class'))

balanced_data_accessed_counter = 0 
balanced_data_length = 0
def balanced_data_randomize(max_len=20000):
    global balanced_data, balanced_data_length, balanced_data_accessed_counter
    balanced_data = pd.concat([v.sample(min(max_len, len(v))) for k, v in balanced_data_classes_select])
    balanced_data_length = len(balanced_data)
    balanced_data_accessed_counter = 0

def balanced_data_sample_row():
    global balanced_data_accessed_counter
    global balanced_data_last_sample
    balanced_data_accessed_counter += 1
    if balanced_data_accessed_counter/balanced_data_length > 0.2:
        balanced_data_randomize()
    balanced_data_last_sample = balanced_data.iloc[random.randint(1, balanced_data_length-1)]
    return balanced_data_last_sample
    
balanced_data_randomize()

In [32]:
balanced_data.groupby("class")["class"].count()

class
ELECTRONIC      4964
LETTERS        20000
NOT_CHANGED    20000
NUMBERS        20000
PLAIN          20000
VERBATIM       11741
Name: class, dtype: int64

### Samples

In [199]:
def get_random_sample():
    sample_row = balanced_data_sample_row()
    sentence_id = sample_row['class']

    rows = all_data_sentence_index.loc[sample_row['sentence_id']]
    befores = list(rows.before)
        
    token_id_idx = list(rows['token_id']).index(sample_row['token_id'])
    befores[token_id_idx] = SAMPLE_WORD_TOKEN
    
    return sample_row['before'], sample_row['after'], sample_row['class'], befores
            
def tmp():
    s_bef, s_aft, s_class, s_sentence = get_random_sample()
    print(s_class, ':', s_bef, '->', s_aft)
    print(' '.join(s_sentence))
    print(s_sentence)
    print(words_to_tensor(list(s_sentence), common_words_index).shape)
    print(string_to_tensor(s_bef, characters_all_index).shape)
tmp()

LETTERS : MTB -> m t b
<SAMPLE> 316 was sunk and MTB 313 damaged between Reggio di Calabria and Pellaro and twelve British sailors were killed .
['<SAMPLE>', '316', 'was', 'sunk', 'and', 'MTB', '313', 'damaged', 'between', 'Reggio', 'di', 'Calabria', 'and', 'Pellaro', 'and', 'twelve', 'British', 'sailors', 'were', 'killed', '.']
torch.Size([1, 22, 7381])
torch.Size([1, 4, 115])


# Model

### Encoder

In [41]:
class EncoderRNN(nn.Module):
    def __init__(self, words_input_size, chars_input_size, words_hidden_size, chars_hidden_size,
                 words_layers=1, chars_layers=1):
        super(EncoderRNN, self).__init__()
        
        self.words_layers = words_layers
        self.chars_layers = chars_layers
        self.words_hidden_size = words_hidden_size
        self.chars_hidden_size = chars_hidden_size

        self.rnn_words = nn.LSTM(words_input_size, words_hidden_size // 2, words_layers,
                                 batch_first=True, bidirectional=True)

        self.rnn_chars = nn.LSTM(chars_input_size, chars_hidden_size // 2, chars_layers,
                                batch_first=True, bidirectional=True)
        
    def forward(self, word_vectors, string_tensor, hidden = None, init_hidden = True):
        if init_hidden:
            hidden_words, hidden_chars = self.init_hidden()
        
        all_outputs_words, hidden_words = self.rnn_words(word_vectors, hidden_words)
        output_words = all_outputs_words[:, -1]
        
        all_outputs_chars, hidden_chars = self.rnn_chars(string_tensor, hidden_chars)
        output_chars = all_outputs_chars[:, -1]
        
        output = torch.cat((output_words, output_chars), 1)
        
        return output

    def init_hidden(self):
        var1_1 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var1_2 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var2_1 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        var2_2 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))

        var1_1 = var1_1.cuda(); var1_2 = var1_2.cuda()
        var2_1 = var2_1.cuda(); var2_2 = var2_2.cuda()
        return ((var1_1, var1_2), (var2_1, var2_2))

In [146]:
encoder_rnn = EncoderRNN(words_input_size=len(common_words), chars_input_size=len(characters_all),
                         words_hidden_size=128, chars_hidden_size=128,
                         words_layers=2, chars_layers=2).cuda()
encoder_rnn

EncoderRNN (
  (rnn_words): LSTM(7381, 64, num_layers=2, batch_first=True, bidirectional=True)
  (rnn_chars): LSTM(115, 64, num_layers=2, batch_first=True, bidirectional=True)
)

In [49]:
def test_encoder_single_sample():
    s_bef, s_aft, s_class, s_sentence = get_random_sample()
    
    words_t = words_to_tensor(list(s_sentence), common_words_index)
    words_t = Variable(words_t).cuda()
    
    string_t = string_to_tensor(s_bef, characters_all_index)
    string_t = Variable(string_t).cuda()
    
    return encoder_rnn(words_t, string_t)
    
encoder_output = test_encoder_single_sample()
encoder_output.size()

torch.Size([1, 256])

### Decoder

In [67]:
class DecoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(DecoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        
        self.rnn = nn.GRU(input_size, hidden_size, n_layers,
                                 batch_first=True, bidirectional=False)
                         # LSTM would require own hidden included
        
        self.lin_out = nn.Linear(hidden_size, input_size)

    def forward(self, char, hidden):
        #char = char.view(1,1,-1)
        #hidden = hidden.view(1,1,-1)
        output, hidden = self.rnn(char, hidden)
        output = output[:, -1] # view(1,-1)
        output = self.lin_out(output)
        output = F.log_softmax(output)
        return output, hidden

In [147]:
decoder_rnn = DecoderRNN(input_size=len(characters_all), hidden_size=encoder_output.size()[-1], n_layers=1)
decoder_rnn = decoder_rnn.cuda()
decoder_rnn

DecoderRNN (
  (rnn): GRU(115, 256, batch_first=True)
  (lin_out): Linear (256 -> 115)
)

In [69]:
character_sos_t = string_to_tensor([SOS_TOKEN], characters_all_index, include_eos=False)
character_sos_t.size()

torch.Size([1, 1, 115])

In [70]:
tmp_a, tmp_b = decoder_rnn(Variable(character_sos_t).cuda(), encoder_output.view(1,1,-1))
print(tmp_a.size())
print(tmp_a.topk(1)[0])

torch.Size([1, 115])
Variable containing:
-4.6601
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]



In [221]:
tmp = tmp_a.data.cpu().numpy()
tmp[0][-1] = 

0.69

In [231]:
tmp_a.data.topk(1)[1][0][0]

114

In [226]:
characters_all[114]

'☒'

In [442]:
def test_model_single_sample(model=None):
    s_bef, s_aft, s_class, s_sentence = sample = get_random_sample()
        
    words_t = words_to_tensor(list(s_sentence), common_words_index)
    words_t = Variable(words_t).cuda()
    
    string_t = string_to_tensor(s_bef, characters_all_index)
    string_t = Variable(string_t).cuda()
    
    encoder_output = encoder_rnn(words_t, string_t)
    
    encoder_output = encoder_output.view(1,1,-1)
    
    decoder_hidden = encoder_output
    decoder_input = Variable(character_sos_t).cuda()

    decoded_output = []
    max_length = 20
    for _ in range(max_length):
        decoder_output, decoder_hidden = decoder_rnn(decoder_input, decoder_hidden)
        #return decoder_output

        topv, topi = decoder_output.data.topk(1)
        char_index = topi[0][0]
        char = characters_all[char_index] # Use own prediction as next input
                
        if char == EOS_TOKEN:
            break

        decoded_output.append(char)
        
        decoder_input = string_to_tensor([char], characters_all_index, include_eos=False)
        decoder_input = Variable(decoder_input).cuda()
    
    output = ''.join(decoded_output)
    return output, output, s_aft, sample
    
tmp = test_model_single_sample(None)
tmp

9223372034707292159


IndexError: list index out of range

In [110]:
%%time
test_model_accuracy(encoder_rnn, test_model_single_sample)

Accuracy: 0.00% (       0/   10000)
CPU times: user 8min 57s, sys: 3.83 s, total: 9min 1s
Wall time: 2min 15s


0.0

### Training

In [141]:
def train(s_bef, s_aft, s_sentence, encoder_optimizer, decoder_optimizer, loss_function,
          use_teacher_forcing, max_length=20):

    words_t = words_to_tensor(list(s_sentence), common_words_index)
    words_t = Variable(words_t).cuda()
    
    string_t = string_to_tensor(s_bef, characters_all_index)
    string_t = Variable(string_t).cuda()
    
    encoder_output = encoder_rnn(words_t, string_t)
    encoder_output = encoder_output.view(1,1,-1)
    
    decoder_hidden = encoder_output
    decoder_input = Variable(character_sos_t).cuda()
    
    ###
    
    target_arr = list(s_aft) + [EOS_TOKEN]
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0

    decoded_output = []
    for i in range(len(target_arr)):
        decoder_output, decoder_hidden = decoder_rnn(decoder_input, decoder_hidden)

        decoder_target_i = characters_all_index[target_arr[i]]
        decoder_target_i = Variable(torch.LongTensor([decoder_target_i])).cuda()
        loss += loss_function(decoder_output, decoder_target_i)
        
        topv, topi = decoder_output.data.topk(1)
        char_index = topi[0][0]
        char = characters_all[char_index] 
        decoded_output.append(char)
        
        if use_teacher_forcing:
            char_new = target_arr[i] # replace input with right target
        else:
            char_new = char # Use own prediction as next input
            if char == EOS_TOKEN:
                break
        
        decoder_input = string_to_tensor([char], characters_all_index, include_eos=False)
        decoder_input = Variable(decoder_input).cuda()
        
    ###
    if decoded_output[-1] == EOS_TOKEN:
        decoded_output = decoded_output[:-1]
        
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return ''.join(decoded_output), (loss.data[0] / len(target_arr))

In [142]:
def train_iterations(n_iters=100000, lr=0.001, teacher_forcing_ratio=0.5,
                     print_every=10000, plot_every=1000):

    start = time.time()
    
    decoder_rnn.train()
    encoder_rnn.train()

    current_loss = 0
    current_loss_iter = 0

    encoder_optimizer = torch.optim.Adam(encoder_rnn.parameters(), lr=lr)
    decoder_optimizer = torch.optim.Adam(decoder_rnn.parameters(), lr=lr)
    loss_function = nn.NLLLoss()
    
    for iteration in range(1, n_iters + 1):
        model_training.iterations += 1
        
        use_teacher_forcing = random.random() < teacher_forcing_ratio
        
        s_bef, s_aft, s_class, s_sentence = get_random_sample()
        
        result, loss = train(s_bef=s_bef, s_aft=s_aft, s_sentence=s_sentence,
                             encoder_optimizer=encoder_optimizer, decoder_optimizer=decoder_optimizer,
                             loss_function=nn.NLLLoss(), use_teacher_forcing=use_teacher_forcing,
                             max_length=40 )
        
        current_loss += loss
        current_loss_iter += 1

        # Print iter number, loss, name and guess
        if iteration % print_every == 0:
            teacher_forcing_str = ""
            if use_teacher_forcing:
                teacher_forcing_str = "(forcing)"
            correct = '✓' if result == s_aft else "✗: {}".format(s_aft)
            
            print("{:>6d} {:>4.0%} ({:>8}) {:>7.3f}   | {:>6.2f}: {} -> {} ({}) {}".format(
                      model_training.iterations, iteration/n_iters, time_since(start),
                      current_loss/current_loss_iter, loss,
                      s_bef, result, correct, teacher_forcing_str))

        # Add current loss avg to list of losses
        if iteration % plot_every == 0:
            model_training.losses.append(current_loss / plot_every)
            model_training.learning_rates.append(lr)
            current_loss = 0
            current_loss_iter = 0
            
        if model_training.iterations % 50000 == 0 or model_training.iterations == 10:
            model_training.save_models()
            acc = test_model_accuracy(encoder_rnn, test_model_single_sample)
            model_training.accuracy.append(acc)
    
    # test_model_accuracy(model, n_sample=10000)

In [148]:
model_training = ModelTraining(MODEL_SAVE_PATH, [encoder_rnn, decoder_rnn])

Save path: data/models/whole_rnn_1_mod_data


In [149]:
train_iterations(n_iters=50, print_every=9, lr=0.0001)

     9  18% (   0m 0s)   4.743   |   4.75: & -> eevv (✗: and) 
Saved model to data/models/whole_rnn_1_mod_data/10_(EncoderRNN/DecoderRNN)
Accuracy: 0.00% (       0/   10000)
    18  36% (  2m 12s)   4.737   |   4.74: & -> eevv (✗: and) (forcing)
    27  54% (  2m 12s)   4.732   |   4.72: & -> eeev (✗: and) 
    36  72% (  2m 13s)   4.726   |   4.69: www.paidtodream.com -> eeeeeeeevvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv (✗: w w w dot p a i d t o d r e a m dot c o m) 
    45  90% (  2m 13s)   4.717   |   4.67: 21 -> eeeeeeeeeee (✗: twenty one) (forcing)


In [150]:
train_iterations(n_iters=(1000-model_training.iterations), print_every=500, lr=0.0001)

   550  53% (   0m 9s)   2.853   |   2.91: 7 -> t      (✗: seven) (forcing)


In [151]:
train_iterations(n_iters=9000, lr=0.0001, print_every=1000)

  2000  11% (  0m 23s)   2.653   |   2.16: & -> t <EOS> (✗: and) (forcing)
  3000  22% (  0m 46s)   2.526   |   2.91: . -> an (✗: .) 
  4000  33% (  1m 11s)   2.457   |   2.33: 1999 -> t e e e e e e e e e e (✗: nineteen ninety nine) (forcing)
  5000  44% (  1m 34s)   2.340   |   2.98: UK -> and (✗: u k) (forcing)
  6000  56% (   2m 0s)   2.305   |   2.72: 2005 -> t e t e eeeeeeeee (✗: two thousand five) (forcing)
  7000  67% (  2m 24s)   2.265   |   1.75: ForumForum-software.org -> t o t t t t o o o o o o d d d d d d d d d d d d d d d  (✗: f o r u m f o r u m d a s h s o f t w a r e dot o r g) (forcing)
  8000  78% (  2m 51s)   2.242   |   2.49: C. -> a (✗: c) 
  9000  89% (  3m 15s)   2.218   |   1.54: - -> and (✗: to) 
 10000 100% (  3m 39s)   2.109   |   1.79: 19 -> t n teeee (✗: nineteen) 


In [152]:
train_iterations(n_iters=90000, print_every=10000)

 20000  11% (   4m 2s)   1.458   |   2.33: recognised -> arnrriizr<EOS> (✗: recognized) (forcing)
 30000  22% (  8m 24s)   1.140   |   0.40: # -> number (✓) (forcing)
 40000  33% ( 12m 37s)   0.890   |   0.66: 2009 -> two thousand sin t (✗: two thousand nine) 
 50000  44% ( 16m 57s)   0.735   |   2.25: 2007-11-23 -> twe swinty si  t                      sin se   i (✗: the twenty third of november two thousand seven) 
Saved model to data/models/whole_rnn_1_mod_data/50000_(EncoderRNN/DecoderRNN)
Accuracy: 48.06% (    4806/   10000)
 60000  56% ( 22m 33s)   0.683   |   1.03: Aodh -> a ot (✗: a o d h) 
 70000  67% ( 26m 51s)   0.609   |   0.19: CN -> c n (✓) (forcing)
 80000  78% ( 31m 10s)   0.547   |   0.01: sr -> senior (✓) (forcing)
 90000  89% ( 35m 27s)   0.505   |   0.00: vol -> volume (✓) (forcing)
100000 100% ( 39m 55s)   0.494   |   0.02: etc -> etcetera (✓) 
Saved model to data/models/whole_rnn_1_mod_data/100000_(EncoderRNN/DecoderRNN)
Accuracy: 62.19% (    6219/   10000)


In [187]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0.5, lr=0.001)

110475   3% (  5m 38s)   0.435   |   0.54: CSKA -> c m a a (✗: c s k a) 
120475   7% ( 11m 30s)   0.427   |   0.05: NE -> n e (✓) (forcing)
130475  10% ( 17m 21s)   0.437   |   0.44: 1919 -> nineteen ninety nn (✗: nineteen nineteen) (forcing)
140475  13% (  23m 0s)   0.368   |   0.00: & -> and (✓) (forcing)
Saved model to data/models/whole_rnn_1_mod_data/150000_(EncoderRNN/DecoderRNN)
Accuracy: 69.46% (    6946/   10000)
150475  17% ( 30m 41s)   0.397   |   0.00: and -> and (✓) (forcing)
160475  20% ( 36m 16s)   0.373   |   1.62: culture -> culiter (✗: culture) (forcing)
170475  23% ( 41m 51s)   0.404   |   0.00: jr -> junior (✓) (forcing)
180475  27% ( 47m 22s)   0.355   |   0.03: SBE -> s b e (✓) (forcing)
190475  30% ( 52m 52s)   0.345   |   0.00: & -> and (✓) 
Saved model to data/models/whole_rnn_1_mod_data/200000_(EncoderRNN/DecoderRNN)
Accuracy: 71.31% (    7131/   10000)
200475  33% ( 59m 49s)   0.331   |   0.00: & -> and (✓) (forcing)
210475  37% ( 65m 21s)   0.349   |   0.01: 

In [188]:
print_local_wrong_predictions()

LeakyMails.com => l e k a s a l a d d  || l e a k y m a i l s dot c o m 
                  " Argentina : Judge orders all ISPs to block the sites <SAMPLE> and Leakymails.blogspot.com " , OpenNet Initiative , 11 August 2011 .
Nov 2006       => november two thousan || november two thousand six 
                  Frederick , Carl ( <SAMPLE> ) .
labouring      => labounng       || laboring 
                  " On the non existence of sugar in the blood of persons <SAMPLE> under diabetes mellitus " .
481            => four hundred eighty  || four hundred eighty one 
                  It has a population of <SAMPLE> .
.doc           => dot c o m      || dot d o c 
                  " The 36th Annual Saturn Awards Nominations " ( <SAMPLE> ) .
modernised     => morenized      || modernized 
                  Meanwhile , the mission headed by Edwin W. Kemmerer prepared a comprehensive economic overhaul , which <SAMPLE> Ecuador's financial practices .
europeforvisitors.com => eru r r s c s s s r

In [189]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0.1, lr=0.001)

410475   3% (  5m 29s)   0.251   |   0.01: # -> number (✓) 
420475   7% ( 10m 54s)   0.318   |   0.19: green -> green (✓) 
430475  10% ( 16m 30s)   0.256   |   0.00: & -> and (✓) 
440475  13% ( 21m 53s)   0.270   |   0.12: USL- -> u s l (✓) (forcing)
Saved model to data/models/whole_rnn_1_mod_data/450000_(EncoderRNN/DecoderRNN)
Accuracy: 73.67% (    7367/   10000)
450475  17% (  29m 0s)   0.305   |   0.08: Land -> Land (✓) 
460475  20% ( 34m 37s)   0.306   |   0.00: U.S. -> u s (✓) 
470475  23% (  40m 7s)   0.258   |   0.41: eyes -> e es (✗: eyes) 
480475  27% ( 45m 29s)   0.265   |   0.00: FM -> f m (✓) 
490475  30% ( 50m 57s)   0.219   |   0.01: st -> saint (✓) 
Saved model to data/models/whole_rnn_1_mod_data/500000_(EncoderRNN/DecoderRNN)
Accuracy: 75.27% (    7527/   10000)
500475  33% ( 56m 12s)   0.281   |   0.01: ltd -> limited (✓) 
510475  37% ( 60m 13s)   0.272   |   0.00: D. -> d (✓) 
520475  40% ( 64m 12s)   0.290   |   0.00: - -> to (✓) (forcing)
530475  43% ( 68m 12s)   0.

In [190]:
print_local_wrong_predictions()

10:00          => tex t ooe      || ten o'clock 
                  Collins officially commenced sea trials at when she departed the ASC wharf at <SAMPLE> on 31 October 1994 .
colonisation   => conooizatioo   || colonization 
                  The pigeon became extinct following human <SAMPLE> of Henderson , an event that had occurred by 1050 CE.
competed       => compeeed       || competed 
                  Majorie Homer Dixon ( born August 10, 1945 ) is a Canadian sprint canoer who <SAMPLE> in the late 1960s and early 1970s .
July 2009      => july two thousand ni || july two thousand nine 
                  This game is still under development but an alpha version was released in the end of <SAMPLE> .
.201           => point two t oo     || point two o one 
                  G.W. B Huntingford , The Historical Geography of Ethiopia ( London : The British Academy , 1989 ) , p <SAMPLE> .
qctimes.com    => s c m i m e s dot c  || q c t i m e s dot c o m 
                  Retrieved 201

In [191]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0.5, lr=0.0001)

710475   3% (   4m 4s)   0.275   |   0.00: F. -> f (✓) (forcing)
720475   7% (   8m 4s)   0.210   |   6.77: I -> tn (✗: I) 
730475  10% ( 12m 11s)   0.225   |   0.00: VC -> v c (✓) 
740475  13% (  16m 8s)   0.174   |   0.01: 1849 -> eighteen forty nine (✓) 
Saved model to data/models/whole_rnn_1_mod_data/750000_(EncoderRNN/DecoderRNN)
Accuracy: 80.10% (    8010/   10000)
750475  17% ( 21m 22s)   0.201   |   0.00: AD -> a d (✓) (forcing)
760475  20% ( 25m 28s)   0.186   |   1.91: FishBase -> fishbaae (✗: FishBase) (forcing)
770475  23% ( 29m 26s)   0.174   |   0.11: August 14, 2013 -> august fourteenth twenty thirteen (✓) 
780475  27% ( 33m 24s)   0.179   |   0.00: in -> in (✓) 
790475  30% ( 37m 23s)   0.143   |   0.00: A- -> a (✓) (forcing)
Saved model to data/models/whole_rnn_1_mod_data/800000_(EncoderRNN/DecoderRNN)
Accuracy: 80.14% (    8014/   10000)
800475  33% ( 42m 36s)   0.153   |   0.01: Eds -> e d's (✓) (forcing)
810475  37% ( 46m 33s)   0.149   |   0.04: colonises -> coloni

In [192]:
print_local_wrong_predictions()

Coleophoridae  => Colehoporrpe   || Coleophoridae 
                  Coleophora pyrrhulipennella is a moth of the <SAMPLE> family .
PDRacer.com    => p r e r c r r dot c  || p d r a c e r dot c o m 
                  " Puddle Duck Racers with Cabins " <SAMPLE> .
Chinaview.cn   => c h i n n a n e e do || c h i n a v i e w dot c n 
                  <SAMPLE> , 15 August 2008 Nepali PM Prachanda Sworn In .
24 February 2014 => the twenty fourth of || the twenty fourth of february twenty fourteen 
                  ABC News Radio ( <SAMPLE> ) .
December 14, 2011 => december fourteenth  || december fourteenth twenty eleven 
                  Caulfield , Keith ( <SAMPLE> ) .
April 30, 1998 => april thirtieth nige || april thirtieth nineteen ninety eight 
                  Amy Silverman , " Framing Marilyn Zeitlin " ( <SAMPLE> ) .
containing     => contaniing     || containing 
                  Two missile pods each <SAMPLE> two missiles are mounted , one of each side of the turret .
7 Octobe

In [193]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0.5, lr=0.001)

1010475   3% (  3m 57s)   0.279   |   0.00: jr -> junior (✓) 
1020475   7% (  8m 10s)   0.235   |   0.00: US -> u s (✓) (forcing)
1030475  10% ( 12m 17s)   0.268   |   2.77: 4.5 -> fovry point five (✗: four point five) (forcing)
1040475  13% ( 16m 18s)   0.285   |   0.00: S. -> s (✓) (forcing)
Saved model to data/models/whole_rnn_1_mod_data/1050000_(EncoderRNN/DecoderRNN)
Accuracy: 76.74% (    7674/   10000)
1050475  17% ( 21m 37s)   0.257   |   2.08: 1st -> fixtee (✗: first) 
1060475  20% ( 25m 35s)   0.280   |   0.51: capitalized -> capitizized (✗: capitalized) 
1070475  23% ( 29m 36s)   0.354   |   0.04: 1990s -> nineteen nineties (✓) (forcing)
1080475  27% ( 33m 33s)   0.314   |   0.35: larger -> laarer (✗: larger) 
1090475  30% ( 37m 27s)   0.326   |   0.00: & -> and (✓) (forcing)
Saved model to data/models/whole_rnn_1_mod_data/1100000_(EncoderRNN/DecoderRNN)
Accuracy: 75.91% (    7591/   10000)
1100475  33% ( 42m 34s)   0.301   |   0.01: F.G. -> f g (✓) 
1110475  37% ( 46m 39s)  

In [194]:
print_local_wrong_predictions()

7 June 1949    => she f on f  f f of n || the seventh of june nineteen forty nine 
                  Surinder Singh Nijjar ( born <SAMPLE> ) is a former judge of the Supreme Court of India .
Dog            => Don            || Dog 
                  German Shepherd <SAMPLE> Club of Victoria inc Archived from the original on 2008-07-20 .
4: 25          => jont wonty t i tte || four twenty five 
                  Biodiversity and Ecology <SAMPLE> -39 .
1937           => nintteen sary twvent || nineteen thirty seven 
                  Pauline Rhodes ( born <SAMPLE> ) is a New Zealand artist .
2011-11-19     => the twen        f    || the nineteenth of november twenty eleven 
                  Kim , Seung Hyup ( <SAMPLE> ) .
Anthologised   => antooliized    || anthologized 
                  <SAMPLE> in : Poets' Choice , 1977 ; Holes in the Evening ( 1982 ) , Fat Possum Press ; and That Moon Filled Urge ( 1985 ) , Kardoorair Press .
29 April       => the twonty stott  ni || the twenty nint

In [195]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0.2, lr=0.001)

1310475   3% (   4m 1s)   0.453   |   1.29: Side-line.com -> s i d d d e e e e e e e e c e c m  (✗: s i d e d a s h l i n e dot c o m) 
1320475   7% (  7m 59s)   0.412   |   0.39: University -> Univervity (✗: University) (forcing)
1330475  10% (  12m 0s)   0.414   |   0.02: st -> saint (✓) (forcing)
1340475  13% ( 16m 13s)   0.428   |   1.55: footballbrisbane.com.au -> f o t a a a a a a a a a a a a s t s t t d t t t t  (✗: f o o t b a l l b r i s b a n e dot c o m dot a u) 
Saved model to data/models/whole_rnn_1_mod_data/1350000_(EncoderRNN/DecoderRNN)
Accuracy: 70.40% (    7040/   10000)
1350475  17% ( 21m 17s)   0.396   |   0.01: - -> to (✓) 
1360475  20% ( 25m 16s)   0.356   |   0.00: - -> to (✓) 
1370475  23% ( 29m 24s)   0.373   |   0.00: - -> to (✓) 


IndexError: list index out of range

In [233]:
print_local_wrong_predictions()

IndexError: list index out of range

In [None]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0.1, lr=0.001)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0.4, lr=0.0001)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=500000, print_every=10000, teacher_forcing_ratio=0, lr=0.0001)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=500000, print_every=10000, teacher_forcing_ratio=0.3, lr=0.0001)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=500000, print_every=10000, teacher_forcing_ratio=0, lr=0.0001)

In [185]:
def print_local_wrong_predictions(max_results=10):
    arr = get_some_wrong_predictions(None, test_model_single_sample, max_iterations=10000, max_results=max_results)
    for sample, predict, output in arr:
        s_bef, s_aft, s_class, s_sentence = sample
        print("{:<14} => {:<14} || {} \n{:>17} {}".format(s_bef, predict, s_aft, '', ' '.join(s_sentence), ))


In [186]:
print_local_wrong_predictions()

December 21, 1960 => december tienty nine || december twenty first nineteen sixty 
                  Erin Leigh Callin Kenny ( born <SAMPLE> ) is a former Democratic politician from Nevada .
MUSL           => m u l l        || m u s l 
                  In December 2010 , a jackpot winning ticket for Hot Lotto jackpot was purchased near <SAMPLE> headquarters .
ETSI           => e t t          || e t s i 
                  TransNexus , along with Cisco , 3 Com and others was the creator of the <SAMPLE> Open settlement protocol ( OSP ) .
FSM            => f m m          || f s m 
                  The COM <SAMPLE> system also includes the Fisheries and Maritime Institute ( FMI ) on the Yap islands .
September 5, 2012 => sepeember twentyetee || september fifth twenty twelve 
                  Kindelan , Katie ( <SAMPLE> ) .
http://www.nytimes.com/2015/06/12/business/media/line-music-a-new-streaming-service-aims-at-japanese-market.html => h t t p colon slash  || h t t p colon slash slash w