In [38]:
import os
import pickle
import time
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import roc_auc_score, accuracy_score

from correct_text import train, decode, decode_sentence, evaluate_accuracy, create_model,\
    get_corrective_tokens, DefaultPTBConfig, DefaultMovieDialogConfig
from text_corrector_data_readers import PTBDataReader, MovieDialogReader

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [39]:
# Directory that holds the dialog corpus and every artifact derived from it.
# Set the DIALOG_CORPUS_DIR environment variable to run the notebook on another
# machine; when it is unset the original author's location is used.
root_data_path = os.environ.get("DIALOG_CORPUS_DIR", "C:\\Users\\svenk\\Documents\\dialog_corpus")

# Raw corpus and the three files the split cell writes from it.
full_path = os.path.join(root_data_path, "movie_lines.txt")
train_path = os.path.join(root_data_path, "train.txt")
val_path = os.path.join(root_data_path, "val.txt")
test_path = os.path.join(root_data_path, "test.txt")

# Location handed to train() / create_model() for the model files.
model_path = os.path.join(root_data_path, "dialog_correcter_model_testnltk")

# Configuration object shared by the data readers and the model.
config = DefaultMovieDialogConfig()

In [34]:
# Split the raw corpus, in file order, into train (first ~70% of lines),
# validation (next ~15%) and test (remaining ~15%) files.

# First pass: count the lines so the split boundaries can be computed.
with open(full_path, "r") as raw_data:
    count = sum(1 for _ in raw_data)

train_count = 0
val_count = 0
test_count = 0

# Second pass: route each line to its split.
# The handles are deliberately named *_file: binding one of them to `train`
# would replace the `train` function imported from correct_text and break the
# training cell further down.
with open(full_path, "r") as raw_data, \
        open(train_path, "w") as train_file, \
        open(val_path, "w") as val_file, \
        open(test_path, "w") as test_file:
    for i, line in enumerate(raw_data):
        if i + 1 < 0.7 * count:
            train_file.write(line)
            train_count += 1
        elif i + 1 < 0.85 * count:
            val_file.write(line)
            val_count += 1
        else:
            test_file.write(line)
            test_count += 1

# Avoid a ZeroDivisionError in the summary when the corpus file is empty.
percent_base = count if count else 1
print("Train Lines: {}/{}, {}%".format(train_count, count, train_count * 100.0 / percent_base))
print("Val Lines: {}/{}, {}%".format(val_count, count, val_count * 100.0 / percent_base))
print("Test Lines: {}/{}, {}%".format(test_count, count, test_count * 100.0 / percent_base))

Train Lines: 213299/304713, 69.99996718223377%
Val Lines: 45707/304713, 15.000016408883113%
Test Lines: 45707/304713, 15.000016408883113%


## Train

In [40]:
# Build the data reader over the training split using `config`; none of the
# reader's optional keyword arguments are overridden here.
data_reader = MovieDialogReader(config, train_path)

In [41]:
# Reset TensorFlow's global default graph before the model is built.
tf.reset_default_graph()
# Train on train_path and evaluate on val_path (the log below labels it "test").
# model_path is presumably where the model files are written — TODO confirm in
# correct_text.train.
# NOTE(review): the captured run was stopped by a KeyboardInterrupt around
# global step 400.
train(data_reader, train_path, val_path, model_path)

Reading data; train = C:\Users\svenk\Documents\dialog_corpus\train.txt, test = C:\Users\svenk\Documents\dialog_corpus\val.txt
Creating 4 layers of 512 units.
Created model with fresh parameters.
Training bucket sizes: [198232, 85878, 50002, 70314]
Total train size: 404426.0
global step 100 learning rate 0.5000 step-time 10.70 perplexity 300.21
  eval: bucket 0 perplexity 93.15
  eval: bucket 1 perplexity 134.16
  eval: bucket 2 perplexity 145.36
  eval: bucket 3 perplexity 170.63
global step 200 learning rate 0.5000 step-time 10.00 perplexity 120.32
  eval: bucket 0 perplexity 72.97
  eval: bucket 1 perplexity 138.97
  eval: bucket 2 perplexity 157.70
  eval: bucket 3 perplexity 174.74
global step 300 learning rate 0.5000 step-time 10.22 perplexity 93.21
  eval: bucket 0 perplexity 48.15
  eval: bucket 1 perplexity 100.84
  eval: bucket 2 perplexity 143.31
  eval: bucket 3 perplexity 162.79
global step 400 learning rate 0.5000 step-time 10.13 perplexity 79.39
  eval: bucket 0 perplexit

KeyboardInterrupt: 

## Decode sentences

In [42]:
# Rebinds `data_reader` (also assigned in the Train section) with explicit
# dropout_prob, replacement_prob and dataset_copies values.
# NOTE(review): these presumably control how input sentences are perturbed —
# confirm against MovieDialogReader.
data_reader = MovieDialogReader(config, train_path, dropout_prob=0.25, replacement_prob=0.25, dataset_copies=1)

In [43]:
# Derive the corrective tokens from the training split; the result is passed to
# every decode_sentence / evaluate_accuracy call further down.
corrective_tokens = get_corrective_tokens(data_reader, train_path)

In [44]:
# Save the corrective tokens as a pickle inside the corpus directory.
with open(os.path.join(root_data_path, "corrective_tokens.pickle"), "wb") as f:
    pickle.dump(corrective_tokens, f)

In [45]:
# Save the reader's token -> id mapping as a pickle inside the corpus directory.
with open(os.path.join(root_data_path, "token_to_id.pickle"), "wb") as f:
    pickle.dump(data_reader.token_to_id, f)

In [46]:
# Start from an empty default graph and open an interactive session that the
# decoding and evaluation cells below reuse.
tf.reset_default_graph()
sess = tf.InteractiveSession()
# Build the model from model_path. The second positional argument (True) is
# presumably a forward-only / inference flag — TODO confirm against
# correct_text.create_model.
# NOTE(review): the captured output says "Created model with fresh parameters.",
# which suggests no trained weights were restored in this run; that would
# explain the repeated-token outputs of the decode cells that follow.
model = create_model(sess, True, model_path, config=config)

Created model with fresh parameters.


In [47]:
# Test a sample from the test dataset.
# With the default verbosity the call prints the Input/Output pair shown below
# and returns the decoded tokens as a list.
decoded = decode_sentence(sess, model, data_reader, "you must have girlfriend", corrective_tokens=corrective_tokens)

Input: you must have girlfriend
Output: must must must must must must must must must must



In [48]:
# Display the token list returned by the previous decode_sentence call.
decoded

['must',
 'must',
 'must',
 'must',
 'must',
 'must',
 'must',
 'must',
 'must',
 'must']

In [49]:
# Decode a longer sentence; the Input/Output pair is printed and the returned
# token list replaces the previous value of `decoded`.
decoded = decode_sentence(sess, model, data_reader,
                          "did n't you say that they 're going to develop this revolutionary new thing ...",
                          corrective_tokens=corrective_tokens)

Input: did n't you say that they 're going to develop this revolutionary new thing ...
Output: that that that that that that that that that that that that that that that that that that that that



In [50]:
# verbose=False skips the Input/Output printout, so only the returned token
# list is shown as the cell result.
# NOTE(review): "kvothe" is presumably meant as an out-of-vocabulary word — confirm.
decode_sentence(sess, model, data_reader, "kvothe went to market", corrective_tokens=corrective_tokens, verbose=False)

['went',
 'went',
 'went',
 'went',
 'went',
 'than',
 'than',
 'than',
 'than',
 'than']

In [20]:
# Quiet decode of a sentence containing two made-up words; the returned token
# list is the cell result.
decode_sentence(sess, model, data_reader, "blablahblah and bladdddd went to market", corrective_tokens=corrective_tokens,
                verbose=False)

["'ll", "'ll", "'ll", "'ll", "'ll", "'ll", "'ll", 'PAD', 'PAD', 'PAD']

In [21]:
# Quiet decode of a sentence that is missing an article before "book".
decode_sentence(sess, model, data_reader, "do you have book", corrective_tokens=corrective_tokens, verbose=False)

["'ll", "'ll", "'m", "'m", "'m", "'m", "'m", "'m", "'m", "'m"]

In [22]:
# Quiet decode of a sentence that uses "then" where "than" would be expected.
decode_sentence(sess, model, data_reader, "the cardinals did better then the cubs", corrective_tokens=corrective_tokens, verbose=False)

["'s", "'s", "'s", "'s", 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']

In [23]:
# 4 layers, 40k steps
# Evaluate on the test split. The commented-out `max_samples=1000` tail is an
# optional argument that was disabled for this call.
# NOTE(review): this run was interrupted (KeyboardInterrupt below), so there are
# no results for this configuration.
errors = evaluate_accuracy(sess, model, data_reader, corrective_tokens, test_path)#, max_samples=1000)

KeyboardInterrupt: 

In [9]:
# 4 layers, 30k steps
# Same call as the other evaluation cells; the numbers below depend on whichever
# model weights were loaded in `model` when the cell ran. The execution count of
# this cell is out of sequence with its neighbours.
errors = evaluate_accuracy(sess, model, data_reader, corrective_tokens, test_path)#, max_samples=1000)

Bucket 0: (10, 10)
	Baseline BLEU = 0.8368
	Model BLEU = 0.8425
	Baseline Accuracy: 0.9110
	Model Accuracy: 0.9303
Bucket 1: (15, 15)
	Baseline BLEU = 0.8818
	Model BLEU = 0.8459
	Baseline Accuracy: 0.8063
	Model Accuracy: 0.8014
Bucket 2: (20, 20)
	Baseline BLEU = 0.8891
	Model BLEU = 0.7986
	Baseline Accuracy: 0.7309
	Model Accuracy: 0.6281
Bucket 3: (40, 40)
	Baseline BLEU = 0.9099
	Model BLEU = 0.5997
	Baseline Accuracy: 0.6007
	Model Accuracy: 0.1607


In [13]:
# 4 layers, 20k steps
# Same call as the other evaluation cells; the numbers below depend on whichever
# model weights were loaded in `model` when the cell ran.
errors = evaluate_accuracy(sess, model, data_reader, corrective_tokens, test_path)#, max_samples=1000)

Bucket 0: (10, 10)
	Baseline BLEU = 0.8330
	Model BLEU = 0.8335
	Baseline Accuracy: 0.9067
	Model Accuracy: 0.9218
Bucket 1: (15, 15)
	Baseline BLEU = 0.8772
	Model BLEU = 0.8100
	Baseline Accuracy: 0.7980
	Model Accuracy: 0.7437
Bucket 2: (20, 20)
	Baseline BLEU = 0.8898
	Model BLEU = 0.7636
	Baseline Accuracy: 0.7366
	Model Accuracy: 0.5370
Bucket 3: (40, 40)
	Baseline BLEU = 0.9098
	Model BLEU = 0.5387
	Baseline Accuracy: 0.6041
	Model Accuracy: 0.1117


In [16]:
# NOTE(review): the model configuration / step count behind this run is not
# recorded in the notebook. The per-bucket results are printed below; the
# returned `errors` is iterated as (decoding, target) pairs by the next cell.
errors = evaluate_accuracy(sess, model, data_reader, corrective_tokens, test_path)#, max_samples=1000)

Bucket 0: (10, 10)
	Baseline BLEU = 0.8341
	Model BLEU = 0.8516
	Baseline Accuracy: 0.9083
	Model Accuracy: 0.9384
Bucket 1: (15, 15)
	Baseline BLEU = 0.8850
	Model BLEU = 0.8860
	Baseline Accuracy: 0.8156
	Model Accuracy: 0.8491
Bucket 2: (20, 20)
	Baseline BLEU = 0.8876
	Model BLEU = 0.8880
	Baseline Accuracy: 0.7291
	Model Accuracy: 0.7817
Bucket 3: (40, 40)
	Baseline BLEU = 0.9099
	Model BLEU = 0.9045
	Baseline Accuracy: 0.6073
	Model Accuracy: 0.6425


In [15]:
# Print each (decoding, target) pair from `errors`, one pair per paragraph,
# with the tokens joined by single spaces.
for decoding, target in errors:
    decoded_text = " ".join(decoding)
    target_text = " ".join(target)
    print("Decoding: {}".format(decoded_text))
    print("Target:   {}\n".format(target_text))

Decoding: you beg for mercy in a second .
Target:   you 'll beg for mercy in a second .

Decoding: i 'm dying for a shower . you could use the one too . and we 'd better check that bandage .
Target:   i 'm dying for a shower . you could use one too . and we 'd better check that bandage .

Decoding: whatever ... they 've become hotshot computer guys so they get a job to build el computer grande ... skynet ... for the government . right ?
Target:   whatever ... they become the hotshot computer guys so they get the job to build el computer grande ... skynet ... for the government . right ?

Decoding: did n't you say that they 're going to develop this revolutionary a new thing ...
Target:   did n't you say that they 're going to develop this revolutionary new thing ...

Decoding: bag some z ?
Target:   bag some z 's ?

Decoding: sleep . it 'll be a light soon .
Target:   sleep . it 'll be light soon .

Decoding: well , at least i know what to name him . i do n't suppose you 'd know who fa