In [2]:
import os
import time
import random
import io
import numpy as np
import tensorflow as tf
import pandas as pd
from collections import defaultdict, Counter

from sklearn.metrics import roc_auc_score, accuracy_score
import nltk

from utils.standard_hparams_utils import standard_hparams
from train import train
from nmt import create_hparams, create_or_load_hparams

%matplotlib inline
%load_ext autoreload
%autoreload 2

  from ._conv import register_converters as _register_converters


In [3]:
root_data_path = "/home/sven/dialog_corpus"
# root_data_path = "E:\\Documents\\dialog_corpus"
full_path = os.path.join(root_data_path, "movie_lines.txt")
train_path = os.path.join(root_data_path, "train.tgt")
perturbed_train_path = os.path.join(root_data_path, "train.src")
val_path = os.path.join(root_data_path, "val.tgt")
perturbed_val_path = os.path.join(root_data_path, "val.src")
test_path = os.path.join(root_data_path, "test.tgt")
perturbed_test_path = os.path.join(root_data_path, "test.src")
tgt_vocab_path = os.path.join(root_data_path, "vocab.tgt")
src_vocab_path = os.path.join(root_data_path, "vocab.src")
model_path = os.path.join(root_data_path, "dialog_correcter_model_testnltk")

In [22]:
with open(os.path.join(root_data_path, "movie_lines_raw.txt"), "r") as raw_data, \
        io.open(full_path, "w", encoding='utf-8') as out:
    for line in raw_data:
        parts = line.split(" +++$+++ ")
        dialog_line = parts[-1]
        s = dialog_line.strip().lower()
        preprocessed_line = " ".join(nltk.word_tokenize(s))
        if preprocessed_line:
            out.write(unicode("\n" + preprocessed_line))

In [45]:
import ast
import dis
import astunparse
ast_path = "E:\\Downloads\\py150\\python100k_train.json"
with open(ast_path, "r") as ast_data:
    ast_tree = ast_data.readline()
    print(astunparse.unparse((ast.parse(ast_tree).body)))

AttributeError: 'Unparser' object has no attribute '_str'

In [4]:
DROPOUT_TOKENS = {"a", "an", "the", "'ll", "'s", "'m", "'ve"}

REPLACEMENTS = {"there": "their", "their": "there", "then": "than", "than": "then"}

count = 0
with open(full_path, "r") as raw_data, \
        io.open(tgt_vocab_path, "w", encoding="utf-8") as tgt_vocab, \
        io.open(src_vocab_path, "w", encoding="utf-8") as src_vocab:
    word_counter = Counter(raw_data.read().split())
    most_frequent_words = [ word for (word, frequency) in word_counter.most_common(50000)]
    tgt_vocab.write(unicode("\n".join(most_frequent_words)))
    src_vocab.write(unicode("\n".join(most_frequent_words)))
    
with open(full_path, "r") as raw_data:
    for line in raw_data:
        count += 1
            
train_count = 0
val_count = 0
test_count = 0
with open(full_path, "r") as raw_data, \
            io.open(train_path, "w", encoding="utf-8") as tgt_train, \
            io.open(perturbed_train_path, "w", encoding="utf-8") as src_train, \
            io.open(val_path, "w", encoding="utf-8") as tgt_val, \
            io.open(perturbed_val_path, "w", encoding="utf-8") as src_val, \
            io.open(test_path, "w", encoding="utf-8") as tgt_test, \
            io.open(perturbed_test_path, "w", encoding="utf-8") as src_test:
        for i, line in enumerate(raw_data):
            if not line.strip(): continue
            tokens = line.lower().strip().split()
            source = []
            target = []

            for token in tokens:
                target.append(token)

                # Randomly dropout some words from the input.
                dropout_token = (token in DROPOUT_TOKENS and
                                random.random() < 0.25)
                replace_token = (token in REPLACEMENTS and
                                random.random() < 0.25)

                if replace_token:
                    source.append(REPLACEMENTS[token])
                elif not dropout_token:
                    source.append(token)
                    
            if i+1 < 0.998*count:
                tgt_train.write(unicode(" ".join(target) + "\n"))
                src_train.write(unicode(" ".join(source) + "\n"))
                train_count += 1
            elif i+1 < 0.999*count:
                tgt_val.write(unicode(" ".join(target) + "\n"))
                src_val.write(unicode(" ".join(source) + "\n"))
                val_count += 1
            else:
                tgt_test.write(unicode(" ".join(target) + "\n"))
                src_test.write(unicode(" ".join(source) + "\n"))
                test_count += 1
print("Train Lines: {}/{}, {}%".format(train_count, count, train_count * 100.0 / count))
print("Val Lines: {}/{}, {}%".format(val_count, count, val_count * 100.0 / count))
print("Test Lines: {}/{}, {}%".format(test_count, count, test_count * 100.0 / count))

Train Lines: 303837/304447, 99.7996367184%
Val Lines: 304/304447, 0.0998531764149%
Test Lines: 305/304447, 0.100181640811%


In [5]:
with open(full_path, "r") as raw_data:
    word_counter = Counter(raw_data.read().split())
    print(len(list(word_counter.keys())))

483441


In [24]:
count=0
with open(train_path, "r") as infile:
    for line in infile:
        if not line.strip(): 
            count += 1
print(count)        

0


In [5]:
class objectview(object):
    def __init__(self, d):
        self.__dict__ = d

In [8]:
standard_flags = standard_hparams()
flags = {
    "attention": "scaled_luong",
    "src": "src",
    "tgt": "tgt",
    "vocab_prefix": os.path.join(root_data_path, "vocab"),
    "train_prefix": os.path.join(root_data_path, "train"),
    "dev_prefix": os.path.join(root_data_path, "val"),
    "test_prefix": os.path.join(root_data_path, "test"),
    "out_dir": os.path.join(root_data_path, 'nmt_attention_model'),
    "num_train_steps": 12000,
    "steps_per_stats": 100,
    "num_layers": 2,
    "num_units": 128,
    "dropout": 0.2,
    "metrics": "bleu"
}
standard_flags.update(flags)
hparams = create_or_load_hparams(standard_flags["out_dir"], \
                                 create_hparams(objectview(standard_flags)), \
                                 standard_flags["hparams_path"], \
                                 save_hparams=False)

# hparams:
  src=src
  tgt=tgt
  train_prefix=/home/sven/dialog_corpus/train
  dev_prefix=/home/sven/dialog_corpus/val
  test_prefix=/home/sven/dialog_corpus/test
  out_dir=/home/sven/dialog_corpus/nmt_attention_model
# Vocab file /home/sven/dialog_corpus/vocab.src exists
The first 3 vocab words [., ,, you] are not [<unk>, <s>, </s>]
# Vocab file /home/sven/dialog_corpus/vocab.tgt exists
The first 3 vocab words [., ,, you] are not [<unk>, <s>, </s>]
  attention=scaled_luong
  attention_architecture=standard
  avg_ckpts=False
  batch_size=128
  beam_width=0
  best_bleu=0
  best_bleu_dir=/home/sven/dialog_corpus/nmt_attention_model/best_bleu
  check_special_token=True
  colocate_gradients_with_ops=True
  decay_scheme=
  dev_prefix=/home/sven/dialog_corpus/val
  dropout=0.2
  embed_prefix=None
  encoder_type=uni
  eos=</s>
  epoch_step=0
  forget_bias=1.0
  infer_batch_size=32
  init_op=uniform
  init_weight=0.1
  learning_rate=1.0
  length_penalty_weight=0.0
  log_device_placement=False


## Train

In [None]:
train(hparams)

# creating train graph ...
  num_layers = 2, num_residual_layers=0
  cell 0  LSTM, forget_bias=1  DropoutWrapper, dropout=0.2   DeviceWrapper, device=/gpu:0
  cell 1  LSTM, forget_bias=1  DropoutWrapper, dropout=0.2   DeviceWrapper, device=/gpu:0
  cell 0  LSTM, forget_bias=1  DropoutWrapper, dropout=0.2   DeviceWrapper, device=/gpu:0
  cell 1  LSTM, forget_bias=1  DropoutWrapper, dropout=0.2   DeviceWrapper, device=/gpu:0
  learning_rate=1, warmup_steps=0, warmup_scheme=t2t
  decay_scheme=, start_decay_step=12000, decay_steps 0, decay_factor 1
# Trainable variables
  embeddings/encoder/embedding_encoder:0, (50003, 128), /device:CPU:0
  embeddings/decoder/embedding_decoder:0, (50003, 128), /device:CPU:0
  dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0, (256, 512), /device:GPU:0
  dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0, (512,), /device:GPU:0
  dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0, (256, 5

  step 1900 lr 1 step-time 4.05s wps 0.85K ppl 4.06 gN 8.76 bleu 0.00, Sat Jun  2 00:41:47 2018
  step 2000 lr 1 step-time 4.00s wps 0.86K ppl 4.49 gN 25.35 bleu 0.00, Sat Jun  2 00:48:28 2018
# Save eval, global step 2000
INFO:tensorflow:Restoring parameters from /home/sven/dialog_corpus/nmt_attention_model/translate.ckpt-2000
  loaded infer model parameters from /home/sven/dialog_corpus/nmt_attention_model/translate.ckpt-2000, time 0.38s
  # 130
    src: why did you do that ?
    ref: why did you do that ?
    nmt: why did you do that ?
INFO:tensorflow:Restoring parameters from /home/sven/dialog_corpus/nmt_attention_model/translate.ckpt-2000
  loaded eval model parameters from /home/sven/dialog_corpus/nmt_attention_model/translate.ckpt-2000, time 0.16s
  eval dev: perplexity 3.61, time 4s, Sat Jun  2 00:48:34 2018.
  eval test: perplexity 6.60, time 4s, Sat Jun  2 00:48:38 2018.
  step 2100 lr 1 step-time 4.03s wps 0.86K ppl 4.30 gN 14.59 bleu 0.00, Sat Jun  2 00:55:21 2018
  step 22

    ref: you ca n't win 'em all .
    nmt: you ca n't win 'em all .
INFO:tensorflow:Restoring parameters from /home/sven/dialog_corpus/nmt_attention_model/translate.ckpt-5000
  loaded eval model parameters from /home/sven/dialog_corpus/nmt_attention_model/translate.ckpt-5000, time 0.33s
  eval dev: perplexity 2.25, time 4s, Sat Jun  2 04:10:33 2018.
  eval test: perplexity 3.24, time 5s, Sat Jun  2 04:10:38 2018.
INFO:tensorflow:Restoring parameters from /home/sven/dialog_corpus/nmt_attention_model/translate.ckpt-5000
  loaded infer model parameters from /home/sven/dialog_corpus/nmt_attention_model/translate.ckpt-5000, time 0.13s
  # 147
    src: but look at what 's been done with hearts and kidneys !
    ref: but look at what 's been done with hearts and kidneys !
    nmt: but look at what 's been done with picard and wilkins !
INFO:tensorflow:Restoring parameters from /home/sven/dialog_corpus/nmt_attention_model/translate.ckpt-5000
  loaded infer model parameters from /home/sven/dial

  step 8100 lr 1 step-time 3.98s wps 0.86K ppl 2.01 gN 92.10 bleu 81.38, Sat Jun  2 07:38:12 2018
  step 8200 lr 1 step-time 3.95s wps 0.87K ppl 2.12 gN 115.70 bleu 81.38, Sat Jun  2 07:44:48 2018
  step 8300 lr 1 step-time 3.97s wps 0.87K ppl 2.33 gN 216.71 bleu 81.38, Sat Jun  2 07:51:25 2018
  step 8400 lr 1 step-time 3.97s wps 0.86K ppl 1.95 gN 32.72 bleu 81.38, Sat Jun  2 07:58:02 2018
  step 8500 lr 1 step-time 4.04s wps 0.87K ppl 2.07 gN 161.12 bleu 81.38, Sat Jun  2 08:04:46 2018


In [12]:
with open(test_path, "r") as tgt_data, \
        open(perturbed_test_path, "r") as src_data, \
        open(os.path.join(root_data_path, "nmt_attention_model/output_test"), "r") as nmt_data:
    for i in range(100):
        print(src_data.readline().strip())
        print(tgt_data.readline().strip())
        print(nmt_data.readline().strip())
        print()

so you 're on your way to tir asleen , huh ? i hate to tell you this , willow , but tir asleen dos n't exist .
so you 're on your way to tir asleen , huh ? i hate to tell you this , willow , but tir asleen dos n't exist .
so you 're on your way to attract livingston , huh ? i hate to tell you this , languages , but contacted livingston g n't exist .

i did ?
i did ?
i did ?

madmartigan ! you saved her life !
madmartigan ! you saved her life !
hooked ! you saved her life !

when i left the crossroads , i got ambushed by an elf !
when i left the crossroads , i got ambushed by an elf !
when i left the needle , i got the languages by an <unk> !

elora danan !
elora danan !
rok discipline !

now willow , i know you 're gon na blame me for this but it was n't my fault ... !
now willow , i know you 're gon na blame me for this but it was n't my fault ... !
now languages , i know you 're gon na blame me for this but it was n't my fault ... !

i thought you had her !
i thought you had her !
i 

In [23]:
data_reader = MovieDialogReader(config, train_path)

In [24]:
tf.reset_default_graph()
train(data_reader, train_path, val_path, model_path)

Reading data; train = C:\Users\svenk\Documents\dialog_corpus\train.txt, test = C:\Users\svenk\Documents\dialog_corpus\val.txt
Creating 4 layers of 512 units.
Reading model parameters from C:\Users\svenk\Documents\dialog_corpus\dialog_correcter_model_testnltk\translate.ckpt-10800
INFO:tensorflow:Restoring parameters from C:\Users\svenk\Documents\dialog_corpus\dialog_correcter_model_testnltk\translate.ckpt-10800
Training bucket sizes: [198232, 85878, 50002, 70314]
Total train size: 404426.0
global step 10900 learning rate 0.3850 step-time 6.70 perplexity 21.27
  eval: bucket 0 perplexity 3.61
  eval: bucket 1 perplexity 40.10
  eval: bucket 2 perplexity 120.03
  eval: bucket 3 perplexity 175.38
global step 11000 learning rate 0.3850 step-time 5.66 perplexity 21.15
  eval: bucket 0 perplexity 7.59
  eval: bucket 1 perplexity 71.84
  eval: bucket 2 perplexity 172.14
  eval: bucket 3 perplexity 361.24
global step 11100 learning rate 0.3850 step-time 5.85 perplexity 29.12
  eval: bucket 0 pe

  eval: bucket 0 perplexity 2.77
  eval: bucket 1 perplexity 21.29
  eval: bucket 2 perplexity 70.76
  eval: bucket 3 perplexity 139.60
global step 14700 learning rate 0.3517 step-time 5.98 perplexity 9.24
  eval: bucket 0 perplexity 2.55
  eval: bucket 1 perplexity 25.23
  eval: bucket 2 perplexity 43.40
  eval: bucket 3 perplexity 81.04


KeyboardInterrupt: 

## Decode sentences

In [25]:
data_reader = MovieDialogReader(config, train_path, dropout_prob=0.25, replacement_prob=0.25, dataset_copies=1)

In [26]:
corrective_tokens = get_corrective_tokens(data_reader, train_path)

In [5]:
import pickle
with open(os.path.join(root_data_path, "corrective_tokens.pickle"), "wb") as f:
    pickle.dump(corrective_tokens, f)

In [6]:
import pickle
with open(os.path.join(root_data_path, "token_to_id.pickle"), "wb") as f:
    pickle.dump(data_reader.token_to_id, f)

In [27]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = create_model(sess, True, model_path, config=config)

Reading model parameters from C:\Users\svenk\Documents\dialog_corpus\dialog_correcter_model_testnltk\translate.ckpt-14700
INFO:tensorflow:Restoring parameters from C:\Users\svenk\Documents\dialog_corpus\dialog_correcter_model_testnltk\translate.ckpt-14700


In [28]:
# Test a sample from the test dataset.
decoded = decode_sentence(sess, model, data_reader, "you must have girlfriend", corrective_tokens=corrective_tokens)

Input: you must have girlfriend
Output: you must an an



In [9]:
decoded

['you', 'must', 'have', 'the', 'must', 'have']

In [29]:
decoded = decode_sentence(sess, model, data_reader,
                          "did n't you say that they 're going to develop this revolutionary new thing ...",
                          corrective_tokens=corrective_tokens)

Input: did n't you say that they 're going to develop this revolutionary new thing ...
Output: did you 're to develop to revolutionary ... you 're to UNK ...



In [31]:
decode_sentence(sess, model, data_reader, "kvothe went to market", corrective_tokens=corrective_tokens, verbose=False)

['kvothe', 'went', 'to', 'UNK']

In [32]:
decode_sentence(sess, model, data_reader, "blablahblah and bladdddd went to market", corrective_tokens=corrective_tokens,
                verbose=False)

['blablahblah', 'and', 'and', 'bladdddd', 'to', 'UNK']

In [33]:
decode_sentence(sess, model, data_reader, "do you have book", corrective_tokens=corrective_tokens, verbose=False)

['do', 'you', 'have']

In [34]:
decode_sentence(sess, model, data_reader, "the cardinals did better then the cubs", corrective_tokens=corrective_tokens, verbose=False)

['the', 'cardinals', 'did', 'cubs', 'UNK', 'the', 'the', 'UNK', 'UNK']

In [23]:
# 4 layers, 40k steps
errors = evaluate_accuracy(sess, model, data_reader, corrective_tokens, test_path)#, max_samples=1000)

KeyboardInterrupt: 

In [9]:
# 4 layers, 30k steps
errors = evaluate_accuracy(sess, model, data_reader, corrective_tokens, test_path)#, max_samples=1000)

Bucket 0: (10, 10)
	Baseline BLEU = 0.8368
	Model BLEU = 0.8425
	Baseline Accuracy: 0.9110
	Model Accuracy: 0.9303
Bucket 1: (15, 15)
	Baseline BLEU = 0.8818
	Model BLEU = 0.8459
	Baseline Accuracy: 0.8063
	Model Accuracy: 0.8014
Bucket 2: (20, 20)
	Baseline BLEU = 0.8891
	Model BLEU = 0.7986
	Baseline Accuracy: 0.7309
	Model Accuracy: 0.6281
Bucket 3: (40, 40)
	Baseline BLEU = 0.9099
	Model BLEU = 0.5997
	Baseline Accuracy: 0.6007
	Model Accuracy: 0.1607


In [13]:
# 4 layers, 20k steps
errors = evaluate_accuracy(sess, model, data_reader, corrective_tokens, test_path)#, max_samples=1000)

Bucket 0: (10, 10)
	Baseline BLEU = 0.8330
	Model BLEU = 0.8335
	Baseline Accuracy: 0.9067
	Model Accuracy: 0.9218
Bucket 1: (15, 15)
	Baseline BLEU = 0.8772
	Model BLEU = 0.8100
	Baseline Accuracy: 0.7980
	Model Accuracy: 0.7437
Bucket 2: (20, 20)
	Baseline BLEU = 0.8898
	Model BLEU = 0.7636
	Baseline Accuracy: 0.7366
	Model Accuracy: 0.5370
Bucket 3: (40, 40)
	Baseline BLEU = 0.9098
	Model BLEU = 0.5387
	Baseline Accuracy: 0.6041
	Model Accuracy: 0.1117


In [16]:
errors = evaluate_accuracy(sess, model, data_reader, corrective_tokens, test_path)#, max_samples=1000)

Bucket 0: (10, 10)
	Baseline BLEU = 0.8341
	Model BLEU = 0.8516
	Baseline Accuracy: 0.9083
	Model Accuracy: 0.9384
Bucket 1: (15, 15)
	Baseline BLEU = 0.8850
	Model BLEU = 0.8860
	Baseline Accuracy: 0.8156
	Model Accuracy: 0.8491
Bucket 2: (20, 20)
	Baseline BLEU = 0.8876
	Model BLEU = 0.8880
	Baseline Accuracy: 0.7291
	Model Accuracy: 0.7817
Bucket 3: (40, 40)
	Baseline BLEU = 0.9099
	Model BLEU = 0.9045
	Baseline Accuracy: 0.6073
	Model Accuracy: 0.6425


In [15]:
for decoding, target in errors:
    print("Decoding: " + " ".join(decoding))
    print("Target:   " + " ".join(target) + "\n")

Decoding: you beg for mercy in a second .
Target:   you 'll beg for mercy in a second .

Decoding: i 'm dying for a shower . you could use the one too . and we 'd better check that bandage .
Target:   i 'm dying for a shower . you could use one too . and we 'd better check that bandage .

Decoding: whatever ... they 've become hotshot computer guys so they get a job to build el computer grande ... skynet ... for the government . right ?
Target:   whatever ... they become the hotshot computer guys so they get the job to build el computer grande ... skynet ... for the government . right ?

Decoding: did n't you say that they 're going to develop this revolutionary a new thing ...
Target:   did n't you say that they 're going to develop this revolutionary new thing ...

Decoding: bag some z ?
Target:   bag some z 's ?

Decoding: sleep . it 'll be a light soon .
Target:   sleep . it 'll be light soon .

Decoding: well , at least i know what to name him . i do n't suppose you 'd know who fa