In [1]:
%matplotlib inline
import importlib
from pytorch_utils_oh_2 import *

Pytorch utils oh: pytorch_utils_oh_2.py
Pytorch: 0.2.0_4


In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
MODEL_SAVE_PATH = 'numbers_gen_5_2_layer_output'

In [None]:
import pytorch_utils_oh_2; importlib.reload(pytorch_utils_oh_2); from pytorch_utils_oh_2 import *;

# Data loading

In [4]:
all_data = pickle.load(open("data/en_train_fixed_1.pkl", "rb" ))
all_data_sentence_index = all_data.set_index('sentence_id')

In [5]:
all_data.sample(2)

Unnamed: 0,sentence_id,token_id,class,before,after,class_org
963880,76496,14,NOT_CHANGED,.,.,PUNCT
1948046,151964,4,NOT_CHANGED,),),PUNCT


In [6]:
categories_all = all_data["class"].unique()
print(categories_all)
print(len(categories_all))
categories_index = dict((c, i) for i, c in enumerate(categories_all))

['NOT_CHANGED' 'NUMBERS' 'LETTERS' 'PLAIN' 'VERBATIM' 'ELECTRONIC']
6


### Utils stuff

In [7]:
chars_normal, chars_normal_index = load_characters_pkl('data/en_features/chars_normal.pkl')
print(''.join(chars_normal))

<SOS><EOS>☒ !"#$%&'(),-./0123456789:;ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~£¥ª²³µº¼½¾éɒʻˈΩμ—€⅓⅔⅛


In [8]:
common_words, common_words_index = load_common_words_10k()
len(common_words)
common_words[0:10]

8192

['<EOS>', '<SOS>', '<UNK>', '<0000>', '<SAMPLE>', '.', ',', 'the', '"', 'of']

### More balanced sample

In [9]:
number_data = all_data[all_data['class'] == 'NUMBERS']
print("Data rows: {},  (dropped rows: {})".format(len(number_data), len(all_data)-len(number_data)))
number_data = number_data.reset_index(drop=True)

Data rows: 448172,  (dropped rows: 9470020)


In [10]:
balanced_data = number_data

balanced_data_length = len(balanced_data)
def balanced_data_sample_row():
    global balanced_data_last_sample
    balanced_data_last_sample = balanced_data.iloc[random.randint(1, balanced_data_length-1)]
    return balanced_data_last_sample

In [11]:
 balanced_data_sample_row()

sentence_id      639889
token_id             22
class           NUMBERS
before               12
after            twelve
class_org      CARDINAL
Name: 383543, dtype: object

### Number words

In [23]:
arr = list(set(list(number_data['after'])))
arr = [s.split(' ') for s in arr]
arr = np.concatenate(arr)
arr = sorted(list(set(arr)))
number_words = [EOS_TOKEN, SOS_TOKEN, UNKNOWN_WORD_TOKEN, NUMBER_WORD_TOKEN, SAMPLE_WORD_TOKEN] + arr
number_words_index = dict((c, i) for i, c in enumerate(number_words))
len(number_words)

511

In [24]:
def number_words_to_tensor(words, include_eos=True):
    return words_to_tensor(words, words_lookup_index=number_words_index, include_eos=include_eos)
number_words_to_tensor(['one', 'first']).shape

torch.Size([1, 3, 511])

In [25]:
number_words_index['first']

133

In [26]:
number_words_onehot_sos = number_words_to_tensor([SOS_TOKEN], include_eos=False)
#number_words_onehot_sos = Variable(torch.from_numpy(number_words_onehot_sos)).cuda()
number_words_onehot_sos.size()

torch.Size([1, 1, 511])

### Samples

In [27]:
def get_random_sample():
    sample_row = balanced_data_sample_row()
    sentence_id = sample_row['class']

    rows = all_data_sentence_index.loc[sample_row['sentence_id']]
    befores = list(rows.before)
        
    token_id_idx = list(rows['token_id']).index(sample_row['token_id'])
    befores[token_id_idx] = SAMPLE_WORD_TOKEN
    
    return sample_row['before'], sample_row['after'], sample_row['class'], befores
            
def tmp():
    s_bef, s_aft, s_class, s_sentence = get_random_sample()
    print(s_class, ':', s_bef, '->', s_aft)
    print(' '.join(s_sentence))
    print(s_sentence)
    print(words_to_tensor(list(s_sentence), common_words_index).shape)
    print(string_to_tensor(s_bef, chars_normal_index).shape)
    print(number_words_to_tensor(s_aft.split(' ')).shape)
tmp()

NUMBERS : 1195 -> eleven ninety five
In <SAMPLE> Henricus le Scotte witnessed a charter by David , Earl of Strathearn .
['In', '<SAMPLE>', 'Henricus', 'le', 'Scotte', 'witnessed', 'a', 'charter', 'by', 'David', ',', 'Earl', 'of', 'Strathearn', '.']
torch.Size([1, 16, 8192])
torch.Size([1, 5, 104])
torch.Size([1, 4, 511])


# Model functions

In [28]:
use_cuda = True

### Encoder

In [29]:
class EncoderRNN(nn.Module):
    def __init__(self, words_input_size, chars_input_size, words_hidden_size, chars_hidden_size,
                 words_layers=1, chars_layers=1):
        super(EncoderRNN, self).__init__()
        
        self.words_layers = words_layers
        self.chars_layers = chars_layers
        self.words_hidden_size = words_hidden_size
        self.chars_hidden_size = chars_hidden_size

        self.rnn_words = nn.LSTM(words_input_size, words_hidden_size // 2, words_layers,
                                 batch_first=True, bidirectional=True)

        self.rnn_chars = nn.LSTM(chars_input_size, chars_hidden_size // 2, chars_layers,
                                batch_first=True, bidirectional=True)
        
    def forward(self, word_vectors, string_tensor, hidden = None, init_hidden = True):
        if init_hidden:
            hidden_words, hidden_chars = self.init_hidden()
        
        all_outputs_words, hidden_words = self.rnn_words(word_vectors, hidden_words)
        output_words = all_outputs_words[:, -1]
        
        all_outputs_chars, hidden_chars = self.rnn_chars(string_tensor, hidden_chars)
        output_chars = all_outputs_chars[:, -1]
        
        output = torch.cat((output_words, output_chars), 1)
        
        return output

    def init_hidden(self):
        var1_1 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var1_2 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var2_1 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        var2_2 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        
        var1_1 = var1_1.cuda(); var1_2 = var1_2.cuda()
        var2_1 = var2_1.cuda(); var2_2 = var2_2.cuda()
        return ((var1_1, var1_2), (var2_1, var2_2))

In [30]:
encoder_rnn = EncoderRNN(words_input_size=len(common_words), chars_input_size=len(chars_normal),
                         words_hidden_size=128, chars_hidden_size=128,
                         words_layers=2, chars_layers=2).cuda()
encoder_rnn

EncoderRNN (
  (rnn_words): LSTM(8192, 64, num_layers=2, batch_first=True, bidirectional=True)
  (rnn_chars): LSTM(104, 64, num_layers=2, batch_first=True, bidirectional=True)
)

In [31]:
def test_encoder_single_sample():
    s_bef, s_aft, s_class, s_sentence = get_random_sample()
    
    words_t = words_to_tensor(list(s_sentence), common_words_index)
    words_t = Variable(words_t).cuda()
    
    string_t = string_to_tensor(s_bef, chars_normal_index)
    string_t = Variable(string_t).cuda()
    
    return encoder_rnn(words_t, string_t)
    
encoder_output = test_encoder_single_sample()
encoder_output.size()

torch.Size([1, 256])

### Decoder

In [85]:
class DecoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(DecoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        
        self.rnn = nn.GRU(input_size, hidden_size, n_layers,
                                 batch_first=True, bidirectional=False)
                         # LSTM would require own hidden included
        
        self.lin_out = nn.Linear(hidden_size, input_size)
        #self.softmax = nn.LogSoftmax()

    def forward(self, char, hidden):
        #char = char.view(1,1,-1)
        #hidden = hidden.view(1,1,-1)
        output, hidden = self.rnn(char, hidden)
        output = output[:, -1] # view(1,-1)
        output = self.lin_out(output)
        output = F.log_softmax(output)
        return output, hidden
    
    def init_rest_hidden(self, input_var):
        hid_var = Variable(torch.zeros(self.n_layers - 1, 1, self.hidden_size)).cuda()
        res = torch.cat((input_var, hid_var), 0)
        return res
        

decoder_rnn = DecoderRNN(input_size=len(number_words), hidden_size=encoder_output.size()[-1], n_layers=2)
decoder_rnn = decoder_rnn.cuda()
decoder_rnn

DecoderRNN (
  (rnn): GRU(511, 256, num_layers=2, batch_first=True)
  (lin_out): Linear (256 -> 511)
)

In [86]:
decoder_rnn.init_rest_hidden(encoder_output.view(1,1,-1)).size()

torch.Size([2, 1, 256])

In [90]:
tmp_hiddens = decoder_rnn.init_rest_hidden(encoder_output.view(1,1,-1))
tmp_a, tmp_b = decoder_rnn(Variable(number_words_onehot_sos).cuda(), tmp_hiddens)
print(tmp_a.size())
print(tmp_b.size())
print(tmp_a.topk(1)[1])
print(number_words[tmp_a.topk(1)[1].data[0][0]])

torch.Size([1, 511])
torch.Size([2, 1, 256])
Variable containing:
 476
[torch.cuda.LongTensor of size 1x1 (GPU 0)]

twos


In [91]:
tmp_a, tmp_b = decoder_rnn(Variable(number_words_onehot_sos).cuda(), tmp_b)
print(tmp_a.size())
print(tmp_a.topk(1)[1])
print(number_words[tmp_a.topk(1)[1].data[0][0]])

torch.Size([1, 511])
Variable containing:
 304
[torch.cuda.LongTensor of size 1x1 (GPU 0)]

nineteens


# Training etc

### Accuracy

In [94]:
def test_model_single_sample(model=None):
    s_bef, s_aft, s_class, s_sentence = sample = get_random_sample()
        
    words_t = words_to_tensor(list(s_sentence), common_words_index)
    words_t = Variable(words_t).cuda()
    
    string_t = string_to_tensor(s_bef, chars_normal_index)
    string_t = Variable(string_t).cuda()
    
    encoder_output = encoder_rnn(words_t, string_t)
    
    encoder_output = encoder_output.view(1,1,-1)
    
    decoder_hidden = decoder_rnn.init_rest_hidden(encoder_output)
    decoder_input = Variable(number_words_onehot_sos).cuda()

    decoded_output = []
    max_length = 20
    for _ in range(max_length):
        decoder_output, decoder_hidden = decoder_rnn(decoder_input, decoder_hidden)
        #return decoder_output

        topv, topi = decoder_output.data.topk(1)
        word_index = topi[0][0]
        word = number_words[word_index] # Use own prediction as next input
                
        if word == EOS_TOKEN:
            break

        decoded_output.append(word)
        
        decoder_input = number_words_to_tensor([word], include_eos=False)
        decoder_input = Variable(decoder_input).cuda()
    
    output = ' '.join(decoded_output)
    return output, output, s_aft, sample
    
tmp = test_model_single_sample(None)
tmp

('minutes saint saint saint saint nineteens nineteens nineteens eighties eighties eighties somerville somerville nineteens nineteens eighties eighties somerville nineteens nineteens',
 'minutes saint saint saint saint nineteens nineteens nineteens eighties eighties eighties somerville somerville nineteens nineteens eighties eighties somerville nineteens nineteens',
 'the twenty sixth of june nineteen eleven',
 ('26 June 1911',
  'the twenty sixth of june nineteen eleven',
  'NUMBERS',
  ['<SAMPLE>', '.']))

In [105]:
def print_local_wrong_predictions(max_results=10):
    arr = get_some_wrong_predictions(None, test_model_single_sample, max_iterations=10000, max_results=max_results)
    for sample, predict, output in arr:
        s_bef, s_aft, s_class, s_sentence = sample
        print("{:<14} => {:<14} || {} \n{:>17} {}".format(s_bef, predict, s_aft, '', ' '.join(s_sentence), ))


In [106]:
print_local_wrong_predictions(2)

XI             => one            || the eleventh 
                  Gran cronica de Alfonso <SAMPLE> .
45192          => four thousand seven hundred twenty two || forty five thousand one hundred ninety two 
                  Lieutenant Colonel John Cameron , TD ( <SAMPLE> ) , Royal Regiment of Artillery , Territorial Army ( now TARO ) .


In [61]:
%%time
test_model_accuracy(encoder_rnn, test_model_single_sample)

Accuracy: 0.00% (       0/   10000)
CPU times: user 11min 4s, sys: 7 s, total: 11min 11s
Wall time: 2min 57s


0.0

### Training

In [95]:
def train(s_bef, s_aft, s_sentence, encoder_optimizer, decoder_optimizer, loss_function,
          use_teacher_forcing, max_length=20):
    
    words_t = words_to_tensor(list(s_sentence), common_words_index)
    words_t = Variable(words_t).cuda()
    
    string_t = string_to_tensor(s_bef, chars_normal_index)
    string_t = Variable(string_t).cuda()
    
    encoder_output = encoder_rnn(words_t, string_t)
    encoder_output = encoder_output.view(1,1,-1)
    
    decoder_hidden = decoder_rnn.init_rest_hidden(encoder_output)
    decoder_input = Variable(number_words_onehot_sos).cuda()
    
    ###
    
    target_arr = s_aft.split(' ') + [EOS_TOKEN]
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0
    
    decoded_output = []
    for i in range(len(target_arr)):
        decoder_output, decoder_hidden = decoder_rnn(decoder_input, decoder_hidden)

        decoder_target_i = number_words_index[target_arr[i]]
        decoder_target_i = Variable(torch.LongTensor([decoder_target_i])).cuda()
        loss += loss_function(decoder_output, decoder_target_i)
        
        topv, topi = decoder_output.data.topk(1)
        word_index = topi[0][0]
        word = number_words[word_index] # Use own prediction as next input
        decoded_output.append(word)
        
        if use_teacher_forcing:
            word = target_arr[i] # replace input with right target
        else:
            # use output normally as input 
            if word == EOS_TOKEN:
                break
                
        decoder_input = number_words_to_tensor([word], include_eos=False)
        decoder_input = Variable(decoder_input).cuda()
        
    if decoded_output[-1] == EOS_TOKEN:
        decoded_output = decoded_output[:-1]
        
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return ' '.join(decoded_output), (loss.data[0] / len(target_arr))


In [96]:
def train_iterations(n_iters=100000, lr=0.001, teacher_forcing_ratio=0.5,
                     print_every=10000, plot_every=1000):

    start = time.time()
    
    decoder_rnn.train()
    encoder_rnn.train()

    current_loss = 0
    current_loss_iter = 0

    encoder_optimizer = torch.optim.Adam(encoder_rnn.parameters(), lr=lr)
    decoder_optimizer = torch.optim.Adam(decoder_rnn.parameters(), lr=lr)
    loss_function = nn.NLLLoss()
    
    for iteration in range(1, n_iters + 1):
        model_training.iterations += 1
        
        use_teacher_forcing = random.random() < teacher_forcing_ratio
        
        s_bef, s_aft, s_class, s_sentence = get_random_sample()
        
        result, loss = train(s_bef=s_bef, s_aft=s_aft, s_sentence=s_sentence,
                             encoder_optimizer=encoder_optimizer, decoder_optimizer=decoder_optimizer,
                             loss_function=nn.NLLLoss(), use_teacher_forcing=use_teacher_forcing,
                             max_length=40 )
        
        current_loss += loss
        current_loss_iter += 1

        # Print iter number, loss, name and guess
        if iteration % print_every == 0:
            teacher_forcing_str = ""
            if use_teacher_forcing:
                teacher_forcing_str = "(forcing)"
            correct = '✓' if result == s_aft else "✗: {}".format(s_aft)
            
            print("{:>6d} {:>4.0%} ({:>8}) {:>7.3f}   | {:>6.2f}: {} -> {} ({}) {}".format(
                      model_training.iterations, iteration/n_iters, time_since(start),
                      current_loss/current_loss_iter, loss,
                      s_bef, result, correct, teacher_forcing_str))

        # Add current loss avg to list of losses
        if iteration % plot_every == 0:
            model_training.losses.append(current_loss / plot_every)
            model_training.learning_rates.append(lr)
            current_loss = 0
            current_loss_iter = 0
            
        if model_training.iterations % 50000 == 0 or model_training.iterations == 10:
            model_training.save_models()
            acc = test_model_accuracy(encoder_rnn, test_model_single_sample)
            model_training.accuracy.append(acc)
    
    # test_model_accuracy(model, n_sample=10000)

In [97]:
model_training = ModelTraining(MODEL_SAVE_PATH, [encoder_rnn, decoder_rnn])

Save path: data/models/numbers_gen_5_2_layer_output


In [98]:
train_iterations(n_iters=50, print_every=9, lr=0.0001)

     9  18% (   0m 0s)   5.141   |   1.55: 350 ->  (✗: three hundred fifty) 
Saved model to data/models/numbers_gen_5_2_layer_output/10_(EncoderRNN/DecoderRNN)
Accuracy: 0.00% (       0/   10000)
    18  36% (  0m 46s)   3.891   |   6.14: $16 million -> <EOS> <EOS> <EOS> (✗: sixteen million dollars) (forcing)
    27  54% (  0m 46s)   3.968   |   6.20: 26 January 2013 -> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> (✗: the twenty sixth of january twenty thirteen) (forcing)
    36  72% (  0m 46s)   4.314   |   3.09: 3 ->  (✗: three) 
    45  90% (  0m 46s)   4.176   |   6.11: March 21, 2006 -> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> (✗: march twenty first two thousand six) (forcing)


In [99]:
train_iterations(n_iters=(1000-model_training.iterations), print_every=500, lr=0.0001)

   550  53% (   0m 9s)   2.881   |   3.21: 50 -> nineteen (✗: fifty) (forcing)


In [100]:
train_iterations(n_iters=9000, lr=0.0001, print_every=1000)

  2000  11% (  0m 20s)   2.622   |   3.01: 15 -> two (✗: fifteen) (forcing)
  3000  22% (  0m 42s)   2.535   |   2.19: III -> two (✗: three) 
  4000  33% (   1m 3s)   2.393   |   1.55: 107 -> two hundred (✗: one hundred seven) 
  5000  44% (  1m 23s)   2.370   |   2.39: 90 -> two (✗: ninety) (forcing)
  6000  56% (  1m 45s)   2.287   |   2.15: 266 -> two hundred <EOS> <EOS> (✗: two hundred sixty six) (forcing)
  7000  67% (   2m 6s)   2.228   |   1.14: 1988 -> nineteen ninety (✗: nineteen eighty eight) 
  8000  78% (  2m 27s)   2.209   |   1.70: 2011 -> two thousand (✗: twenty eleven) (forcing)
  9000  89% (  2m 48s)   2.093   |   3.16: January 20, 2012 -> the twenty of of twenty (✗: january twentieth twenty twelve) 
 10000 100% (  3m 10s)   2.094   |   1.19: 1 -> one (✓) 


In [101]:
train_iterations(n_iters=90000, print_every=10000)

 20000  11% (  3m 36s)   1.217   |   1.39: 1891 -> nineteen forty one (✗: eighteen ninety one) (forcing)
 30000  22% (  7m 25s)   0.768   |   0.00: 5 -> five (✓) (forcing)
 40000  33% ( 11m 19s)   0.501   |   4.13: I -> one <EOS> (✗: the first) (forcing)
 50000  44% ( 15m 16s)   0.366   |   0.00: 2007 -> two thousand seven (✓) 
Saved model to data/models/numbers_gen_5_2_layer_output/50000_(EncoderRNN/DecoderRNN)
Accuracy: 73.87% (    7387/   10000)
 60000  56% ( 20m 21s)   0.320   |   0.03: 2015 -> twenty fifteen (✓) (forcing)
 70000  67% ( 24m 47s)   0.272   |   0.00: December 2009 -> december two thousand nine (✓) 
 80000  78% ( 28m 32s)   0.219   |   0.00: 4 -> four (✓) (forcing)
 90000  89% ( 32m 45s)   0.242   |   4.11: 1 kg -> eleven p (✗: one kilogram) (forcing)
100000 100% ( 36m 51s)   0.177   |   0.02: 3.1 -> three point one (✓) 
Saved model to data/models/numbers_gen_5_2_layer_output/100000_(EncoderRNN/DecoderRNN)
Accuracy: 88.56% (    8856/   10000)


In [102]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0.5, lr=0.001)

110000   3% (   3m 3s)   0.229   |   0.00: 3 -> three (✓) 
120000   7% (  5m 55s)   0.233   |   0.00: 133 -> one hundred thirty three (✓) (forcing)
130000  10% (   9m 2s)   0.183   |   0.00: July 10, 2012 -> july tenth twenty twelve (✓) 
140000  13% ( 11m 49s)   0.142   |   0.00: 1894 -> eighteen ninety four (✓) 
150000  17% ( 14m 40s)   0.182   |   0.00: 1st -> first (✓) (forcing)
Saved model to data/models/numbers_gen_5_2_layer_output/150000_(EncoderRNN/DecoderRNN)
Accuracy: 90.55% (    9055/   10000)
160000  20% ( 18m 29s)   0.148   |   0.00: 2 -> two (✓) (forcing)
170000  23% ( 21m 21s)   0.169   |   0.00: 2004 -> two thousand four (✓) (forcing)
180000  27% ( 24m 16s)   0.152   |   0.00: September 26, 2006 -> september twenty sixth two thousand six (✓) 
190000  30% ( 27m 15s)   0.174   |   0.01: 298 -> two hundred ninety eight (✓) 
200000  33% (  30m 7s)   0.139   |   0.00: 12th -> twelfth (✓) (forcing)
Saved model to data/models/numbers_gen_5_2_layer_output/200000_(EncoderRNN/Deco

In [107]:
print_local_wrong_predictions()

-8543          => minus eight thousand four hundred forty three || minus eight thousand five hundred forty three 
                  The result was the identification of BAY 41 -2272 and BAY 41 <SAMPLE> .
2010-07-08     => the eighth of april twenty eighth || the eighth of july twenty ten 
                  Sperling , Nicole ( <SAMPLE> ) .
10930          => ten thousand nine hundred three || ten thousand nine hundred thirty 
                  Asteroid <SAMPLE> Jinyong ( 1998 CR2 ) is named after him .
169-70 ISBN 1611493528 => one two sil six two sil six three sil three sil three || one six nine sil seven o sil i s b n sil one six one one four nine three five two eight 
                  <SAMPLE> Roger Moorhouse , The Devils' Alliance : Hitler's Pact with Stalin , 1939 - 1941 .
60             => sixty          || six o 
                  Duncan Island ( 36 LA <SAMPLE> , 61 ) is a prehistoric archaeological site located in the Susquehanna River at Martic Township in Lancaster County , Pe

In [None]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0.1, lr=0.001)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0.5, lr=0.0001)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=100000, print_every=10000, teacher_forcing_ratio=0, lr=0.001)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0, lr=0.0001)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=200000, print_every=10000, teacher_forcing_ratio=0, lr=0.0001)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=200000, print_every=10000, teacher_forcing_ratio=0, lr=0.0001)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=200000, print_every=10000, teacher_forcing_ratio=0, lr=0.0001)

In [None]:
print_local_wrong_predictions()