In [1]:
%matplotlib inline
import importlib
from pytorch_utils_oh_2 import *

Pytorch utils oh: pytorch_utils_oh_2.py
Pytorch: 0.2.0_4


In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [213]:
MODEL_SAVE_PATH = 'whole_gen_6_after_words'

In [4]:
import pytorch_utils_oh_2; importlib.reload(pytorch_utils_oh_2); from pytorch_utils_oh_2 import *;

Pytorch utils oh: pytorch_utils_oh_2.py
Pytorch: 0.2.0_4


# Data loading

In [5]:
# Load the pre-processed training frame. The context manager closes the file
# handle, which a bare open() passed straight to pickle.load would leak.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("data/en_train_fixed_1.pkl", "rb") as train_file:
    all_data = pickle.load(train_file)
# Same rows indexed by sentence id, so every token of one sentence can be
# fetched with a single .loc lookup.
all_data_sentence_index = all_data.set_index('sentence_id')

In [6]:
categories_all = sorted(all_data["class"].unique())
print(categories_all)
print(len(categories_all))
categories_index = dict((c, i) for i, c in enumerate(categories_all))

['ELECTRONIC', 'LETTERS', 'NOT_CHANGED', 'NUMBERS', 'PLAIN', 'VERBATIM']
6


### Utils stuff

In [7]:
chars_normal, chars_normal_index = load_characters_pkl('data/en_features/chars_normal.pkl')
print(''.join(chars_normal))

<SOS><EOS>☒ !"#$%&'(),-./0123456789:;ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~£¥ª²³µº¼½¾éɒʻˈΩμ—€⅓⅔⅛


common_words, common_words_index = load_common_words_10k()
len(common_words)
common_words[0:10]

In [8]:
wv_vecs, wv_words, wv_idx = load_glove('/home/ohu/koodi/data/glove_wordvec/glove.6B.50d.txt')

### After words handling

In [9]:
# Load the list of output ("after") words; the context manager closes the file
# handle that a bare open() would leak.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("data/en_features/words_after_all.pkl", "rb") as words_file:
    words_after_common = pickle.load(words_file)
# Special tokens go first, so their vocabulary indexes are 0..4.
words_after_common = [EOS_TOKEN, SOS_TOKEN, UNKNOWN_WORD_TOKEN, NUMBER_WORD_TOKEN, SAMPLE_WORD_TOKEN] + words_after_common
words_after_common[0:7]
# Reverse lookup: word -> position in words_after_common.
words_after_index = dict((c, i) for i, c in enumerate(words_after_common))

['<EOS>', '<SOS>', '<UNK>', '<0000>', '<SAMPLE>', 'two', 'twenty']

In [10]:
# Longest words first, so the alternation prefers the longest vocabulary entry
# that matches at a given position.
words_after_by_length = sorted(words_after_common, key=len, reverse=True)
# Escape every word so regex metacharacters inside a vocabulary entry are matched
# literally. Only the whole match is ever read, so no per-word capture groups are
# needed.
words_after_regex = re.compile('|'.join(re.escape(w) for w in words_after_by_length))

In [208]:
def after_sentence_to_word_indexes(sentence, include_eos=True):
    """Convert a normalized ("after") sentence into a list of vocabulary indexes.

    The sentence is scanned with ``words_after_regex``; every match is looked up
    in ``words_after_index``. Text that the regex does not match is skipped.

    Args:
        sentence: the normalized sentence string.
        include_eos: when true, the index of ``EOS_TOKEN`` is appended.

    Returns:
        list of int vocabulary indexes, in order of appearance.
    """
    indexes = []
    for match in words_after_regex.finditer(sentence):
        indexes.append(words_after_index[match.group(0)])
    if include_eos:
        indexes.append(words_after_index[EOS_TOKEN])
    return indexes
tmp = after_sentence_to_word_indexes('one plus two equals one hundred')
tmp
[words_after_common[s] for s in tmp]

[9, 1212, 5, 1252, 9, 10, 0]

['one', 'plus', 'two', 'equals', 'one', 'hundred', '<EOS>']

def after_sentence_to_tensor(sentence, include_eos=True):
    """One-hot encode a normalized ("after") sentence.

    Args:
        sentence: the normalized sentence string.
        include_eos: forwarded to ``after_sentence_to_word_indexes``.

    Returns:
        A float tensor of size (1, n_words, len(words_after_index)) with a single
        1 per row at the word's vocabulary index.
    """
    word_indexes = after_sentence_to_word_indexes(sentence, include_eos)
    one_hot = torch.zeros(len(word_indexes), len(words_after_index))
    index_column = torch.LongTensor(word_indexes).view(-1, 1)
    one_hot.scatter_(1, index_column, 1)
    return one_hot.unsqueeze(0)
tmp = after_sentence_to_tensor('one plus two equals one hundred')
tmp.size()
[words_after_common[i[0]] for i in tmp.topk(1)[1][0]]

In [188]:
# One-hot vector for SOS_TOKEN over the after-word vocabulary, sized
# (1, 1, len(words_after_index)); used as the first decoder input.
onehot_sos = torch.zeros(1, 1, len(words_after_index))
onehot_sos[0, 0, words_after_index[SOS_TOKEN]] = 1

onehot_sos.size()

torch.Size([1, 1, 1351])

### More balanced sample

In [141]:
sample_data = all_data[all_data['class'] != 'NOT_CHANGED']
print("Data rows: {},  (dropped rows: {})".format(len(sample_data), len(all_data)-len(sample_data)))
sample_data = sample_data.reset_index(drop=True)

Data rows: 659544,  (dropped rows: 9258648)


In [142]:
# (class_name, rows) pairs, materialized once so that each re-sampling can
# iterate over them without repeating the groupby.
balanced_data_classes_select = list(sample_data.groupby('class'))

# Module-level state shared by balanced_data_randomize() and
# balanced_data_sample_row(): rows handed out since the last re-sampling, and
# the size of the current balanced pool.
balanced_data_accessed_counter = 0 
balanced_data_length = 0
def balanced_data_randomize(max_len=20000):
    """Rebuild the global balanced pool by sampling each class separately.

    At most ``max_len`` rows are drawn from every class in
    ``balanced_data_classes_select``; classes with fewer rows contribute all of
    them. Resets the access counter and updates ``balanced_data_length``.

    Args:
        max_len: upper bound on the number of rows taken per class.
    """
    global balanced_data, balanced_data_length, balanced_data_accessed_counter
    per_class_samples = []
    for _, class_rows in balanced_data_classes_select:
        take = min(max_len, len(class_rows))
        per_class_samples.append(class_rows.sample(take))
    balanced_data = pd.concat(per_class_samples)
    balanced_data_length = len(balanced_data)
    balanced_data_accessed_counter = 0

def balanced_data_sample_row():
    """Return one uniformly random row from the balanced pool.

    The pool is re-sampled via ``balanced_data_randomize()`` once the number of
    rows handed out exceeds 20% of the pool size, or when the pool has not been
    built yet. The returned row is also stored in the global
    ``balanced_data_last_sample``.

    Returns:
        The selected row of ``balanced_data`` (as returned by ``.iloc``).
    """
    global balanced_data_accessed_counter
    global balanced_data_last_sample
    balanced_data_accessed_counter += 1
    # The length check avoids a ZeroDivisionError if the pool was never built.
    if balanced_data_length == 0 or balanced_data_accessed_counter/balanced_data_length > 0.2:
        balanced_data_randomize()
    # randrange covers 0..length-1; the previous randint(1, length-1) could never
    # select the first row of the pool.
    balanced_data_last_sample = balanced_data.iloc[random.randrange(balanced_data_length)]
    return balanced_data_last_sample
    
balanced_data_randomize()

In [143]:
%%timeit
balanced_data_sample_row()

102 µs ± 3.26 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [144]:
#all_data.groupby('class')['class'].count()
#sample_data.groupby('class')['class'].count()
balanced_data.groupby('class')['class'].count()

class
ELECTRONIC     4964
LETTERS       20000
NUMBERS       20000
PLAIN         20000
VERBATIM      16950
Name: class, dtype: int64

In [145]:
 balanced_data_sample_row()

sentence_id                         507358
token_id                                10
class                              NUMBERS
before                               1.63%
after          one point six three percent
class_org                          MEASURE
Name: 444646, dtype: object

### Samples

In [193]:
def get_random_sample():
    """Draw a random training example together with its sentence context.

    A row is taken from the balanced pool, then all tokens of the same sentence
    are looked up in ``all_data_sentence_index``. In the returned context the
    sampled token is replaced by ``SAMPLE_WORD_TOKEN``.

    Returns:
        tuple ``(before, after, class, befores)`` where ``befores`` is the list
        of the sentence's "before" tokens with the sampled one masked.
    """
    sample_row = balanced_data_sample_row()

    rows = all_data_sentence_index.loc[sample_row['sentence_id']]
    if isinstance(rows, pd.Series):
        # A sentence with a single token makes the scalar .loc lookup return a
        # Series instead of a DataFrame; restore the one-row frame so the column
        # accesses below yield one-element lists rather than scalars.
        rows = rows.to_frame().T
    befores = list(rows['before'])

    token_id_idx = list(rows['token_id']).index(sample_row['token_id'])
    befores[token_id_idx] = SAMPLE_WORD_TOKEN

    return sample_row['before'], sample_row['after'], sample_row['class'], befores
            
def tmp():
    """Print one random sample and its encodings as a quick sanity check."""
    s_bef, s_aft, s_class, s_sentence = get_random_sample()
    print(s_class, ':', s_bef, '->', s_aft)
    print(' '.join(s_sentence))
    print(s_sentence)
    # Size of the character-level encoding of the raw token.
    char_tensor = string_to_tensor(s_bef, chars_normal_index)
    print(char_tensor.shape)
    # Vocabulary indexes of the target sentence, then decoded back to words.
    target_indexes = after_sentence_to_word_indexes(s_aft)
    print(target_indexes)
    target_words = [words_after_common[i] for i in target_indexes]
    print(' '.join(target_words))
tmp()

NUMBERS : 3 February 2008 -> the third of february two thousand eight
Archived from the original on <SAMPLE> .
['Archived', 'from', 'the', 'original', 'on', '<SAMPLE>', '.']
torch.Size([1, 16, 104])
[11, 76, 12, 72, 5, 8, 16, 0]
the third of february two thousand eight <EOS>


In [227]:
%%timeit
get_random_sample()

787 µs ± 116 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


# Model functions

In [45]:
use_cuda = True

### Encoder

In [195]:
class EncoderRNN(nn.Module):
    """Two-branch encoder: one bidirectional LSTM over the sentence's word
    vectors and one over the characters of the token being normalized.

    Each LSTM uses ``hidden_size // 2`` units per direction, so its output width
    is ``2 * (hidden_size // 2)`` (equal to ``hidden_size`` for even sizes).

    Args:
        words_input_size: feature size of each word vector.
        chars_input_size: feature size of each character vector.
        words_hidden_size: total (both directions) output width of the word LSTM.
        chars_hidden_size: total (both directions) output width of the char LSTM.
        words_layers: number of stacked layers in the word LSTM.
        chars_layers: number of stacked layers in the char LSTM.
    """

    def __init__(self, words_input_size, chars_input_size, words_hidden_size, chars_hidden_size,
                 words_layers=1, chars_layers=1):
        super(EncoderRNN, self).__init__()

        self.words_layers = words_layers
        self.chars_layers = chars_layers
        self.words_hidden_size = words_hidden_size
        self.chars_hidden_size = chars_hidden_size

        self.rnn_words = nn.LSTM(words_input_size, words_hidden_size // 2, words_layers,
                                 batch_first=True, bidirectional=True)

        self.rnn_chars = nn.LSTM(chars_input_size, chars_hidden_size // 2, chars_layers,
                                 batch_first=True, bidirectional=True)

    def forward(self, word_vectors, string_tensor, hidden = None, init_hidden = True):
        """Encode one sample.

        Args:
            word_vectors: batch-first input for the word LSTM.
            string_tensor: batch-first input for the character LSTM.
            hidden: optional pair ``(hidden_words, hidden_chars)`` in the format
                returned by ``init_hidden()``; only used when ``init_hidden`` is
                false.
            init_hidden: when true (default) fresh zero states are used and
                ``hidden`` is ignored.

        Returns:
            The last time step of both LSTM outputs concatenated along dim 1.

        Raises:
            ValueError: if ``init_hidden`` is false and no ``hidden`` is given.
        """
        if init_hidden:
            hidden_words, hidden_chars = self.init_hidden()
        elif hidden is None:
            # Previously this path failed with an UnboundLocalError.
            raise ValueError("hidden must be provided when init_hidden is False")
        else:
            hidden_words, hidden_chars = hidden

        all_outputs_words, hidden_words = self.rnn_words(word_vectors, hidden_words)
        # NOTE(review): for a bidirectional LSTM the backward half of the last
        # time step has only processed the final element - confirm this is the
        # intended summary before changing it (trained checkpoints depend on it).
        output_words = all_outputs_words[:, -1]

        all_outputs_chars, hidden_chars = self.rnn_chars(string_tensor, hidden_chars)
        output_chars = all_outputs_chars[:, -1]

        output = torch.cat((output_words, output_chars), 1)

        return output

    def init_hidden(self):
        """Build zero (h0, c0) pairs for both LSTMs, for batch size 1.

        The states are placed on the GPU only when the module's parameters live
        there, so the encoder also works on a CPU-only machine.

        Returns:
            ``((h0_words, c0_words), (h0_chars, c0_chars))``.
        """
        on_gpu = next(self.parameters()).is_cuda

        def zero_state(n_layers, hidden_size):
            # 2 * n_layers: one state per direction and layer.
            state = Variable(torch.zeros(2 * n_layers, 1, hidden_size // 2))
            return state.cuda() if on_gpu else state

        words_state = (zero_state(self.words_layers, self.words_hidden_size),
                       zero_state(self.words_layers, self.words_hidden_size))
        chars_state = (zero_state(self.chars_layers, self.chars_hidden_size),
                       zero_state(self.chars_layers, self.chars_hidden_size))
        return (words_state, chars_state)

In [196]:
encoder_rnn = EncoderRNN(words_input_size=wv_vecs.shape[-1], chars_input_size=len(chars_normal),
                         words_hidden_size=128, chars_hidden_size=256,
                         words_layers=1, chars_layers=1).cuda()
encoder_rnn

EncoderRNN (
  (rnn_words): LSTM(50, 64, batch_first=True, bidirectional=True)
  (rnn_chars): LSTM(104, 128, batch_first=True, bidirectional=True)
)

In [197]:
def test_encoder_single_sample():
    """Run the global ``encoder_rnn`` on one random sample and return its output."""
    s_bef, _, _, s_sentence = get_random_sample()

    # Sentence context as word vectors, raw token as a character tensor.
    word_vectors = words_to_word_vectors_tensor(list(s_sentence), wv_vecs, wv_idx)
    char_tensor = string_to_tensor(s_bef, chars_normal_index)

    words_t = Variable(word_vectors).cuda()
    string_t = Variable(char_tensor).cuda()
    return encoder_rnn(words_t, string_t)
    
encoder_output = test_encoder_single_sample()
encoder_output.size()

torch.Size([1, 384])

### Decoder

In [198]:
class DecoderRNN(nn.Module):
    """GRU decoder that maps a one-hot input word to log-probabilities over the
    same vocabulary.

    Args:
        input_size: vocabulary size (width of the one-hot input and of the output).
        hidden_size: GRU hidden size.
        n_layers: number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, n_layers=1):
        super(DecoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        self.rnn = nn.GRU(input_size, hidden_size, n_layers,
                          batch_first=True, bidirectional=False)

        self.lin_out = nn.Linear(hidden_size, input_size)

    def forward(self, char, hidden):
        """Run the GRU over the input and score its last time step.

        Args:
            char: batch-first input for the GRU.
            hidden: GRU hidden state.

        Returns:
            ``(output, hidden)``: log-softmax scores of the last time step and
            the updated hidden state.
        """
        output, hidden = self.rnn(char, hidden)
        output = output[:, -1]
        output = self.lin_out(output)
        # No dim argument: the notebook runs PyTorch 0.2.0, whose log_softmax
        # does not accept one.
        output = F.log_softmax(output)
        return output, hidden

    def init_rest_hidden(self, input_var):
        """Extend a first-layer hidden state with zeros for the remaining layers.

        The zero states copy the batch size and device of ``input_var``, so this
        works for any batch size and on CPU as well as GPU.

        Args:
            input_var: hidden state for the first layer, sized
                (1, batch, hidden_size).

        Returns:
            ``input_var`` itself when there is one layer, otherwise a tensor of
            size (n_layers, batch, hidden_size) with ``input_var`` first.
        """
        if self.n_layers > 1:
            hid_var = Variable(torch.zeros(self.n_layers - 1, input_var.size(1), self.hidden_size))
            if input_var.is_cuda:
                hid_var = hid_var.cuda()
            return torch.cat((input_var, hid_var), 0)
        return input_var
        

decoder_rnn = DecoderRNN(input_size=len(words_after_common), hidden_size=128+256, n_layers=1)
decoder_rnn = decoder_rnn.cuda()
decoder_rnn

DecoderRNN (
  (rnn): GRU(1351, 384, batch_first=True)
  (lin_out): Linear (384 -> 1351)
)

In [199]:
tmp_hiddens = decoder_rnn.init_rest_hidden(encoder_output.view(1,1,-1))
tmp_a, tmp_b = decoder_rnn(Variable(onehot_sos).cuda(), tmp_hiddens)
print(tmp_a.size())
print(tmp_b.size())
print(tmp_a.topk(1)[1])
print(words_after_common[tmp_a.topk(1)[1].data[0][0]])

torch.Size([1, 1351])
torch.Size([1, 1, 384])
Variable containing:
 1176
[torch.cuda.LongTensor of size 1x1 (GPU 0)]

lumens


# Training etc

### Accuracy

In [200]:
def test_model_single_sample(model=None, max_length=20):
    """Greedily decode one random sample with the global encoder/decoder.

    Args:
        model: unused; the global ``encoder_rnn`` and ``decoder_rnn`` are always
            used. Kept so existing callers can keep passing it.
        max_length: maximum number of words to decode before giving up. Targets
            longer than this (e.g. spelled-out URLs) can never be reproduced in
            full, so raise it when evaluating such samples.

    Returns:
        tuple ``(output, output, s_aft, sample)``: the decoded sentence (twice),
        the expected sentence, and the full tuple from ``get_random_sample()``.
    """
    s_bef, s_aft, s_class, s_sentence = sample = get_random_sample()

    words_t = Variable(words_to_word_vectors_tensor(list(s_sentence), wv_vecs, wv_idx)).cuda()
    string_t = Variable(string_to_tensor(s_bef, chars_normal_index)).cuda()

    # The encoder output becomes the first-layer hidden state of the decoder.
    encoder_output = encoder_rnn(words_t, string_t).view(1, 1, -1)

    decoder_hidden = decoder_rnn.init_rest_hidden(encoder_output)
    decoder_input = Variable(onehot_sos).cuda()

    decoded_output = []
    for _ in range(max_length):
        decoder_output, decoder_hidden = decoder_rnn(decoder_input, decoder_hidden)

        _, topi = decoder_output.data.topk(1)
        word_index = topi[0][0]
        word = words_after_common[word_index]

        if word == EOS_TOKEN:
            break

        decoded_output.append(word)

        # Feed the model's own prediction back in as the next one-hot input.
        decoder_input = torch.zeros(1, 1, len(words_after_index))
        decoder_input[0, 0, word_index] = 1
        decoder_input = Variable(decoder_input).cuda()

    output = ' '.join(decoded_output)
    return output, output, s_aft, sample
    
tmp = test_model_single_sample(None)
tmp

('lead lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens',
 'lead lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens',
 'p l o s',
 ('PLOS',
  'p l o s',
  'LETTERS',
  ['<SAMPLE>', 'ONE', 'e', '45', ':', 'e', '45', '.']))

In [201]:
def print_local_wrong_predictions(max_results=10):
    """Print wrong predictions with the expected output and sentence context.

    Args:
        max_results: forwarded to ``get_some_wrong_predictions`` as ``max_results``.
    """
    wrong_cases = get_some_wrong_predictions(None, test_model_single_sample,
                                             max_iterations=10000, max_results=max_results)
    line_format = "{:<14} => {:<14} || {} \n{:>17} {}"
    for sample, predict, _output in wrong_cases:
        s_bef, s_aft, _s_class, s_sentence = sample
        context = ' '.join(s_sentence)
        print(line_format.format(s_bef, predict, s_aft, '', context))


In [202]:
print_local_wrong_predictions(2)

7              => capitalizing heer lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter || seven 
                  " Lego Harry Potter Years 5 - <SAMPLE> Review " .
PRR            => capitalizing heer lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter lumens kohlschutter || p r r 
                  " <SAMPLE> CHRONOLOGY 1971 ( June 2005 Edition ) " ( PDF ) .


In [203]:
%%time
test_model_accuracy(encoder_rnn, test_model_single_sample, n_sample=1000)

Accuracy: 0.00% (       0/    1000)
CPU times: user 12 s, sys: 200 ms, total: 12.2 s
Wall time: 12.2 s


0.0

### Training

In [214]:
def train(s_bef, s_aft, s_sentence, encoder_optimizer, decoder_optimizer, loss_function,
          use_teacher_forcing, max_length=20):
    """Run one optimisation step on a single sample.

    The sample is encoded with the global ``encoder_rnn``; the global
    ``decoder_rnn`` then predicts the target words one step at a time. The
    per-step losses are summed, back-propagated, and both optimizers are stepped.

    Args:
        s_bef: raw ("before") token string.
        s_aft: normalized ("after") target sentence.
        s_sentence: sentence context tokens, passed to
            ``words_to_word_vectors_tensor``.
        encoder_optimizer: optimizer stepped for the encoder.
        decoder_optimizer: optimizer stepped for the decoder.
        loss_function: called as ``loss_function(decoder_output, target)`` per step.
        use_teacher_forcing: when true the next decoder input is the target
            word; otherwise it is the decoder's own prediction and decoding
            stops at a predicted EOS.
        max_length: not used in the body.

    Returns:
        tuple ``(decoded, loss)``: the predicted words joined by spaces (a
        trailing EOS is dropped) and the summed loss divided by the target
        length.
    """
    
    words_t = Variable(words_to_word_vectors_tensor(list(s_sentence), wv_vecs, wv_idx)).cuda()
    
    string_t = string_to_tensor(s_bef, chars_normal_index)
    string_t = Variable(string_t).cuda()
    
    encoder_output = encoder_rnn(words_t, string_t)
    encoder_output = encoder_output.view(1,1,-1)
    
    # The encoder output is used as the decoder's initial hidden state;
    # decoding starts from the one-hot SOS vector.
    decoder_hidden = decoder_rnn.init_rest_hidden(encoder_output)
    decoder_input = Variable(onehot_sos).cuda()
    
    ###
    
    # Target word indexes, always terminated by EOS, so there is at least one step.
    target_arr = after_sentence_to_word_indexes(s_aft, include_eos=True)
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0
    
    decoded_output = []
    for i in range(len(target_arr)):
        decoder_output, decoder_hidden = decoder_rnn(decoder_input, decoder_hidden)

        #decoder_target_i = chars_after_index[target_arr[i]]
        decoder_target_i = target_arr[i]
        decoder_target_i = Variable(torch.LongTensor([decoder_target_i])).cuda()
        loss += loss_function(decoder_output, decoder_target_i)
        
        # Greedy prediction for this step.
        topv, topi = decoder_output.data.topk(1)
        word_index = topi[0][0]
        word = words_after_common[word_index] # Use own prediction as next input
        decoded_output.append(word)
        
        if use_teacher_forcing:
            word_index = target_arr[i] # replace input with right target
        else:
            # use output normally as input 
            # An early predicted EOS ends decoding; the remaining target words
            # then add nothing to the loss.
            if word == EOS_TOKEN:
                break
                
        # One-hot encode the chosen word as the next decoder input.
        decoder_input = torch.zeros(1, 1, len(words_after_index))
        decoder_input[0, 0, word_index] = 1
        decoder_input = Variable(decoder_input).cuda()
        
    if decoded_output[-1] == EOS_TOKEN:
        decoded_output = decoded_output[:-1]
        
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    # Normalized by the full target length, even if decoding stopped early.
    return ' '.join(decoded_output), (loss.data[0] / len(target_arr))


In [215]:
def train_iterations(n_iters=100000, lr=0.001, teacher_forcing_ratio=0.5,
                     print_every=10000, plot_every=1000, save_every=50000):
    """Train the global encoder/decoder on randomly drawn samples.

    Fresh Adam optimizers are created on every call, so optimizer state is not
    carried over between calls. Progress is recorded on the global
    ``model_training`` object.

    Args:
        n_iters: number of single-sample training steps to run.
        lr: learning rate for both Adam optimizers.
        teacher_forcing_ratio: probability that a step uses teacher forcing.
        print_every: print a progress line every this many iterations.
        plot_every: append the averaged loss to ``model_training.losses`` every
            this many iterations.
        save_every: save the models and measure accuracy whenever the global
            iteration count is a multiple of this value (and once at global
            iteration 10 as an early smoke test).
    """
    start = time.time()

    decoder_rnn.train()
    encoder_rnn.train()

    current_loss = 0
    current_loss_iter = 0

    encoder_optimizer = torch.optim.Adam(encoder_rnn.parameters(), lr=lr)
    decoder_optimizer = torch.optim.Adam(decoder_rnn.parameters(), lr=lr)
    # Built once and reused, instead of constructing a new NLLLoss every iteration.
    loss_function = nn.NLLLoss()

    for iteration in range(1, n_iters + 1):
        model_training.iterations += 1

        use_teacher_forcing = random.random() < teacher_forcing_ratio

        s_bef, s_aft, s_class, s_sentence = get_random_sample()

        result, loss = train(s_bef=s_bef, s_aft=s_aft, s_sentence=s_sentence,
                             encoder_optimizer=encoder_optimizer, decoder_optimizer=decoder_optimizer,
                             loss_function=loss_function, use_teacher_forcing=use_teacher_forcing,
                             max_length=40 )

        current_loss += loss
        current_loss_iter += 1

        # Print iter number, running loss, sample loss, input and prediction
        if iteration % print_every == 0:
            teacher_forcing_str = ""
            if use_teacher_forcing:
                teacher_forcing_str = "(forcing)"
            correct = '✓' if result == s_aft else "✗: {}".format(s_aft)

            print("{:>6d} {:>4.0%} ({:>8}) {:>7.3f}   | {:>6.2f}: {} -> {} ({}) {}".format(
                      model_training.iterations, iteration/n_iters, time_since(start),
                      current_loss/current_loss_iter, loss,
                      s_bef, result, correct, teacher_forcing_str))

        # Add current loss avg to list of losses
        if iteration % plot_every == 0:
            model_training.losses.append(current_loss / plot_every)
            model_training.learning_rates.append(lr)
            current_loss = 0
            current_loss_iter = 0

        # Checkpoint and evaluate on the global (cross-call) iteration count.
        if model_training.iterations % save_every == 0 or model_training.iterations == 10:
            model_training.save_models()
            acc = test_model_accuracy(encoder_rnn, test_model_single_sample)
            model_training.accuracy.append(acc)

In [216]:
model_training = ModelTraining(MODEL_SAVE_PATH, [encoder_rnn, decoder_rnn])

Save path: data/models/whole_gen_6_after_words


In [217]:
train_iterations(n_iters=50, print_every=9, lr=0.0001)

     9  18% (   0m 0s)   7.209   |   7.19: τ -> lumens lumens (✗: tau) (forcing)
Saved model to data/models/whole_gen_6_after_words/10_(EncoderRNN/DecoderRNN)
Accuracy: 0.00% (       0/   10000)
    18  36% (   2m 7s)   7.198   |   7.19: A. K. -> lumens two lumens (✗: a k) 
    27  54% (   2m 7s)   7.190   |   7.15: F.C. -> two two (✗: f c) (forcing)
    36  72% (   2m 7s)   6.979   |   7.12: st -> <EOS> (✗: saint) (forcing)
    45  90% (   2m 8s)   6.774   |   7.16: 16 April 2005 -> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> (✗: the sixteenth of april two thousand five) (forcing)


In [218]:
train_iterations(n_iters=(1000-model_training.iterations), print_every=500, lr=0.0001)

   550  53% (   0m 6s)   3.506   |   5.94: 21 June 2014 -> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> (✗: the twenty first of june twenty fourteen) (forcing)


In [219]:
train_iterations(n_iters=9000, lr=0.0001, print_every=1000)

  2000  11% (  0m 13s)   2.916   |   5.27: 6653084 -> to <EOS> <EOS> <EOS> <EOS> <EOS> e e e e (✗: six million six hundred fifty three thousand eighty four) (forcing)
  3000  22% (  0m 26s)   2.676   |   3.28: ι -> and (✗: iota) (forcing)
  4000  33% (  0m 39s)   2.566   |   2.25: 1424 -> nineteen (✗: fourteen twenty four) 
  5000  44% (  0m 51s)   2.407   |   0.72: - -> and (✗: to) (forcing)
  6000  56% (   1m 4s)   2.351   |   0.18: & -> and (✓) 
  7000  67% (  1m 18s)   2.275   |   2.60: ST -> p (✗: saint) 
  8000  78% (  1m 31s)   2.148   |   3.35: 9 April 2013 -> the twenty twenty twenty two <EOS> (✗: the ninth of april twenty thirteen) (forcing)
  9000  89% (  1m 44s)   2.122   |   2.99: 5 -> to (✗: five) 
 10000 100% (  1m 57s)   2.061   |   2.46: TV -> p (✗: t v) 


In [220]:
train_iterations(n_iters=90000, print_every=10000)

 20000  11% (  2m 20s)   1.443   |   1.89: kilometres -> kilometers (✓) 
 30000  22% (  4m 47s)   0.946   |   3.47: $32,083 -> two thousand three hundred eighty (✗: thirty two thousand eighty three dollars) 
 40000  33% (  7m 22s)   0.712   |   0.01: colours -> colors (✓) (forcing)
 50000  44% (  10m 7s)   0.653   |   0.90: MG -> g m (✗: m g) 
Saved model to data/models/whole_gen_6_after_words/50000_(EncoderRNN/DecoderRNN)
Accuracy: 68.06% (    6806/   10000)
 60000  56% ( 13m 22s)   0.587   |   7.15: OO -> o o (✗: oxygen monoxide) 
 70000  67% ( 15m 45s)   0.499   |   0.20: A.V.V. -> a v v (✓) 
 80000  78% (  18m 9s)   0.549   |   6.45: 3½ -> thirty hundred <EOS> <EOS> (✗: three and a half) (forcing)
 90000  89% ( 20m 37s)   0.511   |   0.00: & -> and (✓) 
100000 100% (  23m 6s)   0.484   |   0.00: US -> u s (✓) 
Saved model to data/models/whole_gen_6_after_words/100000_(EncoderRNN/DecoderRNN)
Accuracy: 77.89% (    7789/   10000)


In [221]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0.5, lr=0.001)

110000   3% (  2m 26s)   0.487   |   0.31: Spike.com -> s p i k e dot c o m (✓) (forcing)
120000   7% (  4m 58s)   0.386   |   0.00: B. -> b (✓) (forcing)
130000  10% (  7m 22s)   0.421   |   0.00: bros -> brothers (✓) 
140000  13% (  9m 51s)   0.413   |   0.00: 2006 -> two thousand six (✓) (forcing)
150000  17% ( 12m 15s)   0.384   |   0.00: & -> and (✓) (forcing)
Saved model to data/models/whole_gen_6_after_words/150000_(EncoderRNN/DecoderRNN)
Accuracy: 80.19% (    8019/   10000)
160000  20% ( 15m 17s)   0.394   |   0.00: US -> u s (✓) 
170000  23% ( 17m 40s)   0.336   |   0.00: catalogue -> catalog (✓) 
180000  27% (  20m 1s)   0.409   |   0.35: UCSF -> u c f f (✗: u c s f) 
190000  30% ( 22m 37s)   0.369   |   0.00: - -> to (✓) 
200000  33% (  25m 5s)   0.383   |   1.02: www.samizdat.com -> w w w dot a a m m d a a dot dot c o m (✗: w w w dot s a m i z d a t dot c o m) (forcing)
Saved model to data/models/whole_gen_6_after_words/200000_(EncoderRNN/DecoderRNN)
Accuracy: 82.31% (    8

In [222]:
print_local_wrong_predictions()

WordPress.com  => w o r d p r s s e r dot c c m || w o r d p r e s s dot c o m 
                  Kirk of the Antarctic ( Blog at <SAMPLE> ) .
ι              => epsilon        || iota 
                  This term derives from μ ε λ <SAMPLE> ζ ε ι ( melizei , " to cut " ) .
1,373          => thirteen seventy three || one thousand three hundred seventy three 
                  As of the census of 2010 , there were 3,810 people , <SAMPLE> households , and 1,121 families residing in the village .
δ              => tau            || delta 
                  " Bound pillar " , from Greek <SAMPLE> ε σ μ ο ς ( desmos ) , bond , and σ τ ῦ λ ο ς ( stulos ) , pillar , referring to the shape of the molars .
$39,789        => thirty seven thousand nine hundred eighty three dollars || thirty nine thousand seven hundred eighty nine dollars 
                  The median income for a household in the CDP was <SAMPLE> , and the median income for a family was $46,035 .
http://www.altpress.com/news/8375.h

In [223]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0.3, lr=0.001)

410000   3% (  2m 48s)   0.332   |   0.11: DG -> d g (✓) 
420000   7% (  5m 41s)   0.302   |   0.00: 2003 -> two thousand three (✓) (forcing)
430000  10% (  8m 36s)   0.269   |   0.00: μ -> mu (✓) (forcing)
440000  13% ( 11m 44s)   0.319   |   0.00: & -> and (✓) (forcing)
450000  17% ( 14m 30s)   0.299   |   0.00: & -> and (✓) 
Saved model to data/models/whole_gen_6_after_words/450000_(EncoderRNN/DecoderRNN)
Accuracy: 83.54% (    8354/   10000)
460000  20% (  18m 7s)   0.309   |   0.93: PatentlyO.com -> p n t t l t o y dot c c o m (✗: p a t e n t l y o dot c o m) (forcing)
470000  23% (  21m 2s)   0.321   |   0.02: 1 -> one (✓) (forcing)
480000  27% (  24m 3s)   0.312   |   0.00: & -> and (✓) (forcing)
490000  30% ( 26m 56s)   0.267   |   0.78: 0 -> o (✗: zero) 
500000  33% ( 29m 48s)   0.301   |   0.00: 1931 -> nineteen thirty one (✓) (forcing)
Saved model to data/models/whole_gen_6_after_words/500000_(EncoderRNN/DecoderRNN)
Accuracy: 84.41% (    8441/   10000)
510000  37% ( 33m 32s) 

In [224]:
print_local_wrong_predictions()

Worldconstructiontoday.com => w o r l d c o t o n d o c o c o t c o m || w o r l d c o n s t r u c t i o n t o d a y dot c o m 
                  <SAMPLE> ( 6 April 2011 ) .
TheFutonCritic.com => t h e f u t i n c i o c i t c c c dot c o || t h e f u t o n c r i t i c dot c o m 
                  " Breaking News — Bravo's ' Live Earth ' Coverage Marks Network's Best Saturday — Ever — Across All Key Demos — <SAMPLE> " .
ο              => alpha          || omicron 
                  These two words are ( a ) metamellomai / μ ε τ α μ ε λ λ <SAMPLE> μ α ι or ( b ) metanoeo / μ ε τ α ν ο ἐ ω and its cognate metanoia / μ ε τ α ν ο ι α .
http://asstr.org => h t t p colon colon slash slash s s t s dot g r g || h t t p colon slash slash a s s t r dot o r g 
                  The huge site <SAMPLE> archives and indexes erotic and pornographic stories posted to the Usenet group alt.sex.stories .
Salem-News.com => s a l e e m s e s dot c o m s dot c o m || s a l e m d a s h n e w s dot c o m 
    

In [225]:
train_iterations(n_iters=100000, print_every=10000, teacher_forcing_ratio=0.5, lr=0.0001)

710000  10% (  2m 54s)   0.240   |   0.00: 1781 -> seventeen eighty one (✓) 
720000  20% (  5m 48s)   0.200   |   0.00: colonisation -> colonization (✓) (forcing)
730000  30% (  8m 50s)   0.212   |   0.01: 1985 -> nineteen eighty five (✓) (forcing)
740000  40% ( 11m 42s)   0.206   |   0.00: - -> to (✓) (forcing)
750000  50% ( 14m 32s)   0.216   |   0.00: NWTF -> n w t f (✓) 
Saved model to data/models/whole_gen_6_after_words/750000_(EncoderRNN/DecoderRNN)
Accuracy: 87.22% (    8722/   10000)
760000  60% ( 18m 12s)   0.202   |   0.00: T. -> t (✓) (forcing)
770000  70% ( 20m 53s)   0.245   |   0.00: P- -> p (✓) (forcing)
780000  80% ( 23m 16s)   0.189   |   1.14: http://www.msnbc.msn.com/id/42103936/ -> h t t p colon slash slash w w w dot b b s c c dot com e slash e com slash c n slash t i u r t e o slash n e slash n h r e e slash i n e e h r e e s i x o o (✗: h t t p colon slash slash w w w dot m s n b c dot m s n dot com slash i d slash f o u r t w o o n e o t h r e e n i n e t h r e e

In [228]:
print_local_wrong_predictions()

2012           => two thousand twelve || twenty twelve 
                  Tom Shutt from Case Western Reserve University was LUX co spokesperson between 2007 - <SAMPLE> .
st             => saint          || street 
                  Grandstands for Charleston Athletic Association Park ( velodrome ) ( 1898 ) , northwest corner of Meeting st and Sheppard <SAMPLE> , charleston south carolina .
ο              => iota           || omicron 
                  " Ε θ ν ι κ ο κ α ι Κ α π ο δ ι σ τ ρ ι α κ ο Π α ν ε π ι σ τ η μ ι ο Α θ η ν ω ν , Τ μ η μ α Μ ο υ σ ι κ ω ν Σ π ο υ δ ω ν , Β ι β λ ι ο θ η κ η Τ μ η μ α τ <SAMPLE> ς Μ ο υ σ ι κ ω ν Σ π ο υ δ ω ν , Gregorios Protopsaltes Archive , Dossier 137 " .
will.i.amno    => w i l l dot i dot a n || w i l l dot i dot a m n o 
                  " The Jackass Song " uploaded by will.i.amBlog written by <SAMPLE> byline ( 2009-01-19 ) .
ι              => sigma          || iota 
                  Great Synaxaristes : ( Greek ) Ὁ Ὅ σ ι ο ς Β α σ ι λ ε

In [229]:
train_iterations(n_iters=100000, print_every=10000, teacher_forcing_ratio=0.1, lr=0.001)

810000  10% (   3m 4s)   0.263   |   1.46: Malayalamcinema.com -> m a l a m a m a a m m a m m m o (✗: m a l a y a l a m c i n e m a dot c o m) 
820000  20% (  5m 40s)   0.279   |   0.00: A. -> a (✓) 
830000  30% (  8m 16s)   0.222   |   0.00: & -> and (✓) 
840000  40% ( 10m 45s)   0.306   |   0.35: 5000 -> five thousand hundred (✗: five thousand) 
850000  50% ( 13m 19s)   0.325   |   0.00: 1974 -> nineteen seventy four (✓) 
Saved model to data/models/whole_gen_6_after_words/850000_(EncoderRNN/DecoderRNN)
Accuracy: 85.97% (    8597/   10000)
860000  60% ( 16m 26s)   0.267   |   0.19: Aatw.com -> a a t w dot c o m (✓) 
870000  70% ( 18m 52s)   0.274   |   0.00: 2010 -> twenty ten (✓) 
880000  80% ( 21m 15s)   0.238   |   0.00: colour -> color (✓) 
890000  90% ( 23m 46s)   0.282   |   0.55: PILYA -> p y l y a a (✗: p i l y a) (forcing)
900000 100% ( 26m 12s)   0.279   |   0.00: st -> saint (✓) 
Saved model to data/models/whole_gen_6_after_words/900000_(EncoderRNN/DecoderRNN)
Accuracy: 85.

In [230]:
print_local_wrong_predictions()

Β              => omicron        || beta 
                  Babrius ( Greek : <SAMPLE> α β ρ ι ο ς , Babrios ; florida .
www.worldcat.org => w w w dot o o r d a r c dot o g || w w w dot w o r l d c a t dot o r g 
                  WorldCat ( <SAMPLE> ; OCLC 45342846 ) .
st             => saint          || street 
                  Born in Gun <SAMPLE> of Polish Jewish parents .
pp.vii-viii    => p p dot i i i i i i i i i i i i i i i i i || p p dot v i i d a s h v i i i 
                  Preface , Castle Tubin ( Norwood , Mass : Press One , 2006 ) , <SAMPLE> .
Eds            => e d s          || e d's 
                  Bruckman , A.S. , Guzdial , M. , Kolodner , J.L. , Ram , A. , ( <SAMPLE> .
Viddsee.com    => d i d d a e e e dot c o m || v i d d s e e dot c o m 
                  " <SAMPLE> Site Info " .
29.530589      => two hundred fifty five million five hundred eighty five hundred || twenty nine point five three o five eight nine 
                  It turns with a modeled rotatio

In [231]:
train_iterations(n_iters=200000, print_every=10000, teacher_forcing_ratio=0.1, lr=0.0005)

910000   5% (  2m 29s)   0.253   |   1.19: ε -> omicron (✗: epsilon) 
920000  10% (   5m 0s)   0.226   |   0.00: & -> and (✓) 
930000  15% (  7m 35s)   0.283   |   0.22: specialise -> specialize (✓) 
940000  20% (  10m 4s)   0.235   |   1.60: κ -> omicron (✗: kappa) 
950000  25% ( 12m 32s)   0.244   |   0.00: K. -> k (✓) 
Saved model to data/models/whole_gen_6_after_words/950000_(EncoderRNN/DecoderRNN)
Accuracy: 86.40% (    8640/   10000)
960000  30% ( 15m 43s)   0.237   |   0.00: 3.8 -> three point eight (✓) 
970000  35% ( 18m 13s)   0.194   |   0.00: AJ -> a j (✓) 
980000  40% ( 20m 42s)   0.253   |   0.01: st -> saint (✓) (forcing)
990000  45% (  23m 8s)   0.255   |   0.00: PDF -> p d f (✓) 
1000000  50% ( 25m 35s)   0.267   |   1.37: Weltfussball.de -> w e l t f u s e l l l l d dot d l (✗: w e l t f u s s b a l l dot d e) 
Saved model to data/models/whole_gen_6_after_words/1000000_(EncoderRNN/DecoderRNN)
Accuracy: 87.71% (    8771/   10000)
1010000  55% ( 28m 40s)   0.242   |   3.1

In [232]:
print_local_wrong_predictions()

81-7017-211    => one sil sil sil one one one one one sil one sil || eight one sil seven o one seven sil two one one 
                  ISBN <SAMPLE> - X . William Hunter Workman ; Fanny Bullock Workman ( 1904 ) .
Gamlehaugen.no => g a m l e a l n n g n e dot dot n n || g a m l e h a u g e n dot n o 
                  " <SAMPLE> Parken " ( in Norwegian ) .
Savour         => savior         || savor 
                  " <SAMPLE> John Keats' poetry in garden where he wrote " .
NO             => nitrogen monoxide || n o 
                  The H 2 <SAMPLE> campaign had been conducted through an Internet memo to distributors and restaurants .
#              => number         || hash 
                  Paper <SAMPLE> IAC- 12- A 1 .
http://www.stuff.co.nz/national/politics/64209246/rizalman-report-may-see-disciplinary-action => h t t p colon slash slash w w w dot s f u i dot dot dot dot slash || h t t p colon slash slash w w w dot s t u f f dot c o dot n z slash n a t i o n a l slash p o l i t

In [None]:
train_iterations(n_iters=300000, print_every=10000, teacher_forcing_ratio=0.1, lr=0.0001)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=200000, print_every=10000, teacher_forcing_ratio=0, lr=0.0001)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=200000, print_every=10000, teacher_forcing_ratio=0, lr=0.0001)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=500000, print_every=10000, teacher_forcing_ratio=0, lr=0.0001)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=500000, print_every=10000, teacher_forcing_ratio=0, lr=0.0001)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=500000, print_every=10000, teacher_forcing_ratio=0, lr=0.0001)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=500000, print_every=10000, teacher_forcing_ratio=0, lr=0.0001)

In [None]:
print_local_wrong_predictions()