In [1]:
file = open("data/cornell movie-dialogs corpus/movie_lines.txt", "r",encoding='utf-8', errors='ignore')
data = file.read()

In [4]:
from nltk.corpus import wordnet
from nltk.metrics import edit_distance
import re
import enchant
from nltk.tokenize import word_tokenize

replacement_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', '\g<1> will'),
    (r'(\w+)n\'t', '\g<1> not'),
    (r'(\w+)\'ve', '\g<1> have'),
    (r'(\w+)\'s', '\g<1> is'),
    (r'(\w+)\'re', '\g<1> are'),
    (r'(\w+)\'d', '\g<1> would')
]

class RegexpReplacer(object):
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
        
    def replace(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            s = re.sub(pattern, repl, s)
        return s
    

class SpellingReplacer(object):
    def __init__(self, dict_name="en", max_dist=2, min_word_length=1):
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist
        self.min_word_length = min_word_length
        
    def replace(self, word):
        if self.spell_dict.check(word):
            return word
        if len(word) < self.min_word_length:
            return word
        suggestions = self.spell_dict.suggest(word)
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        else:
            return word

class AntonymReplacer(object):
    def replace(self, word, pos=None):
        antonyms = set()
        for syn in wordnet.synsets(word, pos=pos):
            for lemma in syn.lemmas():
                for antonym in lemma.antonyms():
                    antonyms.add(antonym.name())
        if len(antonyms) == 1:
            return antonyms.pop()
        else:
            return None
        
    def replace_negations(self, sent):
        i, length = 0, len(sent)
        words = []
        while i < length:
            word = sent[i]
            if word == "not" and i+1 < length:
                ant = self.replace(sent[i+1])
                if ant:
                    words.append(ant)
                    i += 2
                    continue
            words.append(word)
            i += 1
        return words


class RepeatReplacer(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        
    def replace(self, word):
        if wordnet.synsets(word):
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word
        
def clean_text(text, replace_neg=False, use_unknown=False, dict_name="en", max_dist=2, min_word_length=1):
    rep = RepeatReplacer()
    ant = AntonymReplacer()
    reg = RegexpReplacer()
    spell = SpellingReplacer(dict_name, max_dist, min_word_length)
    # Get rid of some abbreviations
    text = reg.replace(text)
    words = word_tokenize(text)
    if replace_neg:
        words = ant.replace_negations(words)
    words = [spell.replace(word) for word in words]
    if not use_unknown:
        spell_dict = enchant.Dict(dict_name)
        words = [word for word in words if spell_dict.check(word)]
    return words

In [5]:
import re
from tqdm import tqdm

def clean_movie_data(data):
    pattern = " \+\+\+\$\+\+\+ u[0-9] \+\+\+\$\+\+\+ m[0-9] \+\+\+\$\+\+\+ \w* \+\+\+\$\+\+\+ "
    lines = re.split(pattern, data.lower())
    line_words = []
    for line in tqdm(lines):
        try:
            cleaned_line = clean_text(line, max_dist=0)
            if len(cleaned_line) >= 1:
                line_words.append(cleaned_line)
        except KeyboardInterrupt:
            break
    return line_words

In [127]:
%%time
import pandas as pd
# texts = clean_movie_data(data)
df = pd.read_csv("data/All-seasons.csv")

CPU times: user 62.5 ms, sys: 46.9 ms, total: 109 ms
Wall time: 322 ms


In [131]:
df.head()

Unnamed: 0,Season,Episode,Character,Line
0,10,1,Stan,"You guys, you guys! Chef is going away. \n"
1,10,1,Kyle,Going away? For how long?\n
2,10,1,Stan,Forever.\n
3,10,1,Chef,I'm sorry boys.\n
4,10,1,Stan,"Chef said he's been bored, so he joining a gro..."


In [133]:
texts = df["Line"].values

In [135]:
texts[:10]

array(['You guys, you guys! Chef is going away. \n',
       'Going away? For how long?\n', 'Forever.\n', "I'm sorry boys.\n",
       "Chef said he's been bored, so he joining a group called the Super Adventure Club. \n",
       'Wow!\n',
       'Chef?? What kind of questions do you think adventuring around the world is gonna answer?!\n',
       "What's the meaning of life? Why are we here?\n",
       "I hope you're making the right choice.\n",
       "I'm gonna miss him.  I'm gonna miss Chef and I...and I don't know how to tell him! \n"], dtype=object)

In [30]:
import numpy as np

def batch(inputs, max_sequence_length=None):
    """
    Args:
        inputs:
            list of sentences (integer lists)
        max_sequence_length:
            integer specifying how large should `max_time` dimension be.
            If None, maximum sequence length would be used
    
    Outputs:
        inputs_time_major:
            input sentences transformed into time-major matrix 
            (shape [max_time, batch_size]) padded with 0s
        sequence_lengths:
            batch-sized list of integers specifying amount of active 
            time steps in each input sequence
    """
    
    sequence_lengths = [len(seq) for seq in inputs]
    batch_size = len(inputs)
    
    if max_sequence_length is None:
        max_sequence_length = max(sequence_lengths)
    sequence_lengths = [min(length, max_sequence_length) for length in sequence_lengths]
    
    inputs_batch_major = np.zeros(shape=[batch_size, max_sequence_length], dtype=np.int32) # == PAD
    
    for i, seq in enumerate(inputs):
        seq = seq[:max_sequence_length]
        for j, element in enumerate(seq):
            inputs_batch_major[i, j] = element

    return inputs_batch_major, sequence_lengths


def random_sequences(length_from, length_to,
                     vocab_lower, vocab_upper,
                     batch_size):
    """ Generates batches of random integer sequences,
        sequence length in [length_from, length_to],
        vocabulary in [vocab_lower, vocab_upper]
    """
    if length_from > length_to:
            raise ValueError('length_from > length_to')

    def random_length():
        if length_from == length_to:
            return length_from
        return np.random.randint(length_from, length_to + 1)
    
    while True:
        yield [
            np.random.randint(low=vocab_lower,
                              high=vocab_upper,
                              size=random_length()).tolist()
            for _ in range(batch_size)
        ]
        
def get_length(sequence):
    """Get length of each instance in batch
    
    Args:
        sequence: tensor, shape = (batch_size, length)
            or shape = (batch_size, length) + shape
            
    Returns:
        tensor: shape = (None,), length of each instance
    """
    shape = sequence.get_shape().as_list()
    if len(shape) < 2:
        used = tf.sign(tf.abs(sequence))
    else:
        reduction_indices = list(range(2, len(shape)))
        used = tf.sign(tf.reduce_max(tf.abs(sequence),
                       reduction_indices=reduction_indices))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length

In [271]:
import tensorflow as tf
import tensorflow.contrib.rnn as rnn
import tensorflow.contrib.seq2seq as seq2seq
from hedgeable_ai.models.nn import BaseModel, get_shape, get_length

import tensorflow as tf

from hedgeable_ai.models.nn.params import nn_is_logit
from hedgeable_ai.models.nn import BaseNN, get_shape

from hedgeable_ai.models.nn.rnn import get_cell


class DialogueAgent(BaseNN):
    def __init__(self, processor, maxlen=None, conf=None, *args, **kwargs):
        self.emb_size = 300
        # add padding index 0 and index 1 for <eos>
        self.vocab_size = processor.vocab_size + 2
        self.maxlen = maxlen
        super().__init__(processor=processor, conf=conf, *args, **kwargs)
        
    def _build_graph(self):
        """Build tensorflow graph
        
        Note:
            You build graphs for output and input, which will be used 
            for training and prediction.
        """
        self.encoder_input = tf.placeholder(tf.int32, shape=(None, None), name="encoder_input")
        self.encoder_input_length = tf.placeholder(shape=(None,), dtype=tf.int32, name='encoder_input_length')
        self.decoder_target = tf.placeholder(tf.int32, shape=(None, None), name="decoder_target")
        encoder_length = get_length(self.encoder_input)
        self.encoder_length = encoder_length
        decoder_length = get_length(self.decoder_target)
        # Encoder
        encoder_cell = get_cell(self.conf["model"])
        embeddings = tf.Variable(tf.random_uniform([self.vocab_size, self.emb_size], -1.0, 1.0, dtype=tf.float32))
        encoder_input_embedded = tf.nn.embedding_lookup(embeddings, self.encoder_input)
        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
            encoder_cell, encoder_input_embedded, sequence_length=encoder_length, dtype=tf.float32, 
            time_major=False, scope="encoder")
        
        decoder_outputs_ta, decoder_final_state, _ = tf.nn.raw_rnn(decoder_cell, loop_fn)
        decoder_outputs = decoder_outputs_ta.stack()
        decoder_max_steps, decoder_batch_size, decoder_dim = tf.unstack(tf.shape(decoder_outputs))
        decoder_outputs_flat = tf.reshape(decoder_outputs, (-1, decoder_output_size))
        decoder_logits_flat = tf.matmul(decoder_outputs_flat, W)
        decoder_logits = tf.reshape(decoder_logits_flat, (decoder_max_steps, decoder_batch_size, self.vocab_size))
        decoder_logits = tf.transpose(decoder_logits, [1, 0, 2])
        self.decoder_prediction = tf.argmax(decoder_logits, 2)
        self.decoder_logits = decoder_logits

        # Decoder
        decoder_cell = get_cell(self.conf["model"])
        batch_size, encoder_max_time = tf.unstack(tf.shape(self.encoder_input))
        decoder_output_size = self._get_output_size(self.conf["model"])
        # We use a decoder length 10 words longer than encoder
        max_decoder_length = self.encoder_input_length + 6
        self.max_decoder_length = max_decoder_length
        W = tf.Variable(tf.random_uniform([decoder_output_size, self.vocab_size], -1, 1), dtype=tf.float32)
        b = tf.Variable(tf.zeros([self.vocab_size]), dtype=tf.float32)
        # Prepare for padding and EOS
        eos_time_slice = tf.ones([batch_size], dtype=tf.int32, name='EOS')
        pad_time_slice = tf.zeros([batch_size],  dtype=tf.int32, name='PAD')
        eos_step_embedded = tf.nn.embedding_lookup(embeddings, eos_time_slice)
        pad_step_embedded = tf.nn.embedding_lookup(embeddings, pad_time_slice)
        
        def loop_fn_initial():
            initial_elements_finished = (0 >= max_decoder_length)
            initial_input = eos_step_embedded
            initial_cell_state = encoder_final_state
            initial_cell_output  = None
            initial_loop_state = None
            return (initial_elements_finished,
                initial_input,
                initial_cell_state,
                initial_cell_output,
                initial_loop_state)
        
        def loop_fn_transition(time, previous_output,  previous_state, previous_loop_state):
            output_logits = tf.matmul(previous_output, W)
            prediction  = tf.argmax(output_logits, axis=-1)
            
            def get_next_input():
                # output_logits = tf.add(tf.matmul(previous_output, W), b)
                next_input = tf.nn.embedding_lookup(embeddings,  prediction)
                return next_input
            
            prediciton_finished = (prediction <= tf.ones_like(prediction))
            elements_finished = (time >= max_decoder_length)
            finished = tf.logical_or(elements_finished, prediciton_finished)
            finished = tf.reduce_all(finished)
            input_ = tf.cond(finished, lambda: pad_step_embedded, get_next_input)
            state  = previous_state
            output = previous_output
            loop_state = None
            return (elements_finished, 
                input_,
                state,
                output,
                loop_state)
        
        def loop_fn(time, previous_output, previous_state,  previous_loop_state):
            if previous_state is None:
                assert previous_output is None and previous_state is None
                return loop_fn_initial()
            else:
                return loop_fn_transition(time, previous_output, previous_state,  previous_loop_state)
        
        decoder_outputs_ta, decoder_final_state, _ = tf.nn.raw_rnn(decoder_cell, loop_fn)
        decoder_outputs = decoder_outputs_ta.stack()
        decoder_max_steps, decoder_batch_size, decoder_dim = tf.unstack(tf.shape(decoder_outputs))
        decoder_outputs_flat = tf.reshape(decoder_outputs, (-1, decoder_output_size))
        decoder_logits_flat = tf.matmul(decoder_outputs_flat, W)
        decoder_logits = tf.reshape(decoder_logits_flat, (decoder_max_steps, decoder_batch_size, self.vocab_size))
        decoder_logits = tf.transpose(decoder_logits, [1, 0, 2])
        self.decoder_prediction = tf.argmax(decoder_logits, 2)
        self.decoder_logits = decoder_logits
        
        # Optimization
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.one_hot(self.decoder_target, depth=self.vocab_size, dtype=tf.float32),
            logits=decoder_logits)
        self.loss = tf.reduce_mean(cross_entropy)
        self.learning_rate_op = self._get_learning_rate()
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            self.optimizer = self._get_optimizer(self.optimizer_name, self.learning_rate_op, self.optimizer_conf)
            self.train_step = self.optimizer.minimize(self.loss)
        
    def _optimize(self, batch_X, batch_y, *args, **kwargs):
        batch_X = [x[::-1] for x in batch_X[0]]
        batch_X, Xlen = batch(batch_X, self.maxlen)
        length = np.max(Xlen) + 6
        batch_y = self._batch_padding(batch_y, length)
        
        feed_dict = {self.encoder_input: batch_X,
                     self.decoder_target: batch_y,
                     self.encoder_input_length: Xlen,
                     self.training: True}
        # print(self.sess.run(self.max_decoder_length, feed_dict=feed_dict))
        _, loss = self.sess.run([self.train_step, self.loss], feed_dict=feed_dict)
        return loss
    
    def _get_output_size(self, conf):
        if isinstance(conf, list) or isinstance(conf, tuple):
            x = conf[-1]
        else:
            x = conf
        return x["num_units"]
    
    def generate_sentences(self, sentences):
        X = [self.processor.encode(sentence) for sentence in sentences]
        X = [x_[::-1] for x_ in X]
        X, Xlen = batch(X, self.maxlen)
        feed_dict = {self.encoder_input: X,
                     self.encoder_input_length: Xlen,
                     self.training: False}
        word_idx = self.sess.run(self.decoder_prediction, feed_dict=feed_dict)
        print(word_idx)
        return [self.processor.decode(i) for i in word_idx]
    
    def _batch_padding(self, batch, length):
        EOS = 1
        PAD = 0
        padded_batch = []
        for x in batch:
            x = list(x)
            if len(x) > length:
                x = x[:length]
            elif len(x) < length:
                x.append(EOS)
            while len(x) < length:
                x.append(PAD)
            padded_batch.append(x)
        return np.array(padded_batch)
    
    def attention(self, query, attentions=None):
        if attentions is None:
            return query
        else:
            query = tf.expand_dims(query, 1)
            score = tf.matmul(query, attentions, transpose_b=True)
        score = tf.squeeze(score, [1])
        alignments = tf.nn.softmax(score)
        return alignments
    

In [277]:
import tensorflow as tf

conf = {
        "learning_rate": 0.5,
        "learning_rate_minimum": 0.5,
        "learning_rate_decay": 0.9,
        "learning_rate_decay_step": 20,
        "batch_size": 64,
        "model_dir": "./logs",
        "load_file_path": None,
        "save_file_path": None,
        "log_freq": 1,
        "model":{"name":"lstm", "num_units":256},
}

tf.reset_default_graph()
agent = DialogueAgent(processor, maxlen=30, conf=conf)
train_X = processor.data[:-1]
train_y = []
for y in processor.data[1:]:
    list_y = list(y)
    list_y.append(1)
    train_y.append(np.array(list_y))
train_y = np.array(train_y)
agent.fit(train_X[:100], train_y[:100], num_epochs=100, batch_bar=False, log_freq=1, batch_log_freq=10)






[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A











  0%|          | 0/100 [00:00<?, ?it/s]

Model saved in file: params/model.ckpt


[A[A[A[A[A[A[A[A[A[A[A[A



[A[A[A[A










  1%|          | 1/100 [00:06<11:21,  6.88s/it][A[A[A[A[A[A[A[A[A[A[A










  2%|▏         | 2/100 [00:23<16:10,  9.91s/it][A[A[A[A[A[A[A[A[A[A[A










  3%|▎         | 3/100 [00:42<20:27, 12.66s/it][A[A[A[A[A[A[A[A[A[A[A










  4%|▍         | 4/100 [00:57<21:02, 13.15s/it][A[A[A[A[A[A[A[A[A[A[A










  5%|▌         | 5/100 [01:17<24:18, 15.36s/it][A[A[A[A[A[A[A[A[A[A[A










  6%|▌         | 6/100 [01:34<24:47, 15.82s/it][A[A[A[A[A[A[A[A[A[A[A










  7%|▋         | 7/100 [01:54<26:28, 17.08s/it][A[A[A[A[A[A[A[A[A[A[A










  8%|▊         | 8/100 [02:09<25:22, 16.55s/it][A[A[A[A[A[A[A[A[A[A[A










  9%|▉         | 9/100 [02:25<24:28, 16.14s/it][A[A[A[A[A[A[A[A[A[A[A










 10%|█         | 10/100 [02:41<24:23, 16.26s/it][A[A[A[A[A[A[A[A[A[A[A










 11%|█         | 11/100 [0

Model saved in file: params/model.ckpt













[A[A[A[A[A[A[A[A[A[A[A

In [280]:
texts_tilde = [("").join(text) for text in texts[:10]]
agent.generate_sentences(texts_tilde)

[[17496     1     1    36     1     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [   36     1     1     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [ 7949  7949   899     1 17914     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [17877  7184     1    36     1     1     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [17877     1  8492     1    36     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [174

[['what', '.'],
 ['.'],
 ['i', 'i', 'am', 'you'],
 ['yeah', 'guys', '.'],
 ['yeah', 'is', '.'],
 ['what'],
 ['what', 'is', 'is', 'you'],
 ['i', 'i', 'i', 'you', '.'],
 ['i', 'you', '.'],
 ['shreds', 'misses', 'fatal']]

In [281]:
texts_tilde

['You guys, you guys! Chef is going away. \n',
 'Going away? For how long?\n',
 'Forever.\n',
 "I'm sorry boys.\n",
 "Chef said he's been bored, so he joining a group called the Super Adventure Club. \n",
 'Wow!\n',
 'Chef?? What kind of questions do you think adventuring around the world is gonna answer?!\n',
 "What's the meaning of life? Why are we here?\n",
 "I hope you're making the right choice.\n",
 "I'm gonna miss him.  I'm gonna miss Chef and I...and I don't know how to tell him! \n"]

In [110]:
processor.data

array([array([996, 270, 657]), array([ 996,  270, 1018]),
       array([489, 476, 904,   2]), array([873, 675]),
       array([550, 515, 403,   2]), array([1140]),
       array([ 675, 1150,   57,  641, 1018,  544,  483, 1018,  552,    2]),
       array([650]),
       array([ 489,   42,  527,    2, 1150,  536,  483,  912, 1150,  523,   91,
       1002,  727,   45, 1150,  270,  657,  536,  483, 1018,  793]),
       array([ 555,  633,  339,  668, 1097,  711]),
       array([ 989,  805, 1150,    2]), array([1105,  411,  949]),
       array([ 489,  352, 1150, 1139,  396, 1018,  989,  411,  949,  308,    2]),
       array([ 986,  404,  491,  489,  428, 1018,  446,  680,  620,  943,   12,
       1151,  185,    3]),
       array([ 597,    2, 1002,  302,    3,   76,    2,  489,   42,  555,  124,
        634,    2]),
       array([1105,  217]), array([ 270, 1150,  560, 1018, 1002,  217]),
       array([650,   3]),
       array([ 992,  843,  491, 1150,  403,   50,  554, 1150,   57,  569,  555,
  

In [162]:
from sklearn import preprocessing
import nltk
import numpy as np

class BasicProcessor(object):
    """Process data for estimators."""
        
    def batch_process(self, X, y=None):
        """Make sure to have numpy data for input and target"""
        if y is None:
            return np.array(X)
        else:
            return np.array(X), np.array(y)

    def batch_process_y(self, y):
        return np.array(y)
    
class Word2IndexProcessor(BasicProcessor):
    def __init__(self,  texts, is_processed=False):
        _texts = []
        lengths = []
        for text in texts:
            if is_processed:
                words = text
            else:
                words = clean_text(text.lower(), max_dist=0, min_word_length=1)
            if len(words) > 0:
                _texts.extend(words)
                lengths.append(len(words))
        lengths = list(np.cumsum(lengths))
        lengths.insert(0, 0)
        self.encoder = preprocessing.LabelEncoder()
        # 0 and 1 are taken for padding and <eos>
        indices = self.encoder.fit_transform(_texts) + 2
        # split to sentences
        self.data = np.array([indices[lengths[i]:lengths[i+1]] for i in range(len(lengths) - 1)])
        
    def encode(self, text):
        words = clean_text(text.lower())
        return self.encoder.transform(words) + 2
    
    def decode(self, index):
        return [self.encoder.inverse_transform(i-2) for i in index if i >=2]    
            
    
    def batch_process_test(self, X, y=None):
        if y is None:
            return np.array([self.encode(x_i) for x_i in X])
        else:
            return np.array([self.encode(x_i) for x_i in X]), np.array(y)
        
    @property
    def vocab_size(self):
        return len(self.encoder.classes_)

In [164]:
%%time

processor = Word2IndexProcessor(texts, is_processed=False)

CPU times: user 38min 50s, sys: 1min 51s, total: 40min 42s
Wall time: 40min 44s


In [165]:
[processor.decode(index) for index in processor.data[:10]]

[['you', 'guys', 'you', 'guys', 'chef', 'is', 'going', 'away', '.'],
 ['going', 'away', 'for', 'how', 'long'],
 ['forever', '.'],
 ['i', 'am', 'sorry', 'boys', '.'],
 ['chef',
  'said',
  'he',
  'is',
  'been',
  'bored',
  'so',
  'he',
  'joining',
  'a',
  'group',
  'called',
  'the',
  'super',
  'adventure',
  'club',
  '.'],
 ['wow'],
 ['chef',
  'what',
  'kind',
  'of',
  'questions',
  'do',
  'you',
  'think',
  'adventuring',
  'around',
  'the',
  'world',
  'is',
  'answer'],
 ['what', 'is', 'the', 'meaning', 'of', 'life', 'why', 'are', 'we', 'here'],
 ['i', 'hope', 'you', 'are', 'making', 'the', 'right', 'choice', '.'],
 ['i',
  'am',
  'miss',
  'him',
  '.',
  'i',
  'am',
  'miss',
  'chef',
  'and',
  'i',
  '...',
  'and',
  'i',
  'do',
  'not',
  'know',
  'how',
  'to',
  'tell',
  'him']]

In [157]:
texts[:100]

array(['You guys, you guys! Chef is going away. \n',
       'Going away? For how long?\n', 'Forever.\n', "I'm sorry boys.\n",
       "Chef said he's been bored, so he joining a group called the Super Adventure Club. \n",
       'Wow!\n',
       'Chef?? What kind of questions do you think adventuring around the world is gonna answer?!\n',
       "What's the meaning of life? Why are we here?\n",
       "I hope you're making the right choice.\n",
       "I'm gonna miss him.  I'm gonna miss Chef and I...and I don't know how to tell him! \n",
       'Dude, how are we gonna go on? Chef was our fuh...f-ffriend. \n',
       'And we will all miss you, Chef,  but we know you must do what your heart tells you..\n',
       'Bye-bye!\n', 'Good-bye!\n', 'So long!\n', 'So long, Chef!\n',
       'Good-bye, Chef!\n',
       'Good-bye, Chef! Have a great time with the Super Adventure Club!\n',
       'Good-bye! ..\n', 'Draw two card, fatass.\n',
       'Reverse to you, Jew. \n', "I'll get it. \n",
     

In [346]:
import tensorflow as tf

conf = {
        "learning_rate": 0.5,
        "learning_rate_minimum": 0.5,
        "learning_rate_decay": 0.9,
        "learning_rate_decay_step": 20,
        "batch_size": 128,
        "model_dir": "./logs",
        "load_file_path": None,
        "save_file_path": None,
        "log_freq": 1,
        "model":{"name":"lstm", "num_units":256},
        "attension_size": 512,
}

tf.reset_default_graph()
agent = DialogueAgent(processor, maxlen=30, conf=conf)
train_X = processor.data[:-1]
train_y = processor.data[1:]
agent.fit(train_X, train_y, num_epochs=10, batch_bar=True, log_freq=1, batch_log_freq=10)

ValueError: The two structures don't have the same number of elements. First structure: LSTMStateTuple(c=<tf.Tensor 'encoder/while/Exit_2:0' shape=(?, 256) dtype=float32>, h=<tf.Tensor 'encoder/while/Exit_3:0' shape=(?, 256) dtype=float32>), second structure: AttentionWrapperState(cell_state=LSTMStateTuple(c=256, h=256), attention=256, time=TensorShape([]), alignments=<tf.Tensor 'LuongAttention/strided_slice_2:0' shape=() dtype=int32>, alignment_history=()).

In [334]:
train_X

array([array([75, 19, 53]), array([75, 19, 77]), array([33, 31, 67,  0]),
       array([66, 55]), array([40, 35, 26,  0]), array([85]),
       array([55, 86, 10, 50, 77, 39, 32, 77, 41,  0]), array([52]),
       array([33,  6, 37,  0, 86, 38, 32, 68, 86, 36, 12, 76, 60,  8, 86, 19, 53,
       38, 32, 77, 61]),
       array([43, 48, 23, 54, 80, 59]), array([73, 62, 86,  0]),
       array([82, 28, 70]),
       array([33, 24, 86, 84, 25, 77, 73, 28, 70, 21,  0]),
       array([71, 27, 34, 33, 29, 77, 30, 57, 47, 69,  3, 87, 16,  1]),
       array([46,  0, 76, 20,  1, 11,  0, 33,  6, 43, 14, 49,  0]),
       array([82, 17]), array([19, 86, 44, 77, 76, 17]), array([52,  1]),
       array([74, 64, 34, 86, 26,  9, 42, 86, 10, 45, 43,  7, 22, 56,  2,  0]),
       array([86,  5, 13, 76, 65]), array([15]),
       array([74, 72, 35,  4, 86, 29, 77, 63,  0]), array([81, 52,  1]),
       array([86, 51, 78, 77, 26, 58, 83, 18, 86])], dtype=object)

In [308]:
texts_tilde = [(" ").join(text) for text in texts[:100]]
agent.generate_sentences(texts_tilde)

[['hope', 'had', 'selfish'],
 ['hope', 'had', 'selfish'],
 ['says', 'not'],
 ['not', 'would', 'and', 'my', 'they', 'learn'],
 ['hope', 'all', 'is'],
 ['hope',
  'extra',
  'would',
  'what',
  'fear',
  'they',
  'thank',
  'go',
  'sometimes',
  'do'],
 ['would', 'about', 'babble', 'then', 'say'],
 ['hope',
  'extra',
  'would',
  'what',
  'fear',
  'they',
  'thank',
  'go',
  'sometimes',
  'do'],
 ['need'],
 ['wearing', 'go', 'sometimes'],
 ['hope', 'had', 'selfish'],
 ['wearing', 'go', 'sometimes'],
 ['hope', 'to'],
 ['wearing', 'but'],
 ['hope', 'to'],
 ['hope',
  'extra',
  'would',
  'what',
  'fear',
  'they',
  'thank',
  'go',
  'sometimes',
  'do'],
 ['hope', 'to'],
 ['not', 'would', 'and', 'my', 'they', 'kidding', 'hear'],
 ['but'],
 ['wearing', 'go', 'sometimes'],
 ['crap', 'would', 'lighter', 'they', 'then', 'but'],
 ['been'],
 ['hope',
  'extra',
  'would',
  'what',
  'fear',
  'they',
  'thank',
  'go',
  'sometimes',
  'do'],
 ['hope', 'to'],
 ['would', 'myself', 't

In [236]:
texts_tilde

['theydonot',
 'theydoto',
 'ihopeso.',
 'sheokay',
 'letisgo.',
 'wow',
 'okayyouareneedtolearnhowtolie.',
 'no',
 'iamkidding.youknowhowsometimesyoujustbecomethispersonaandyoudonotknowhowtoquit',
 'likemyfearofwearingpastels',
 'therealyou.',
 'whatgoodstuff',
 'ifiguredyouwouldgettothegoodstuffeventually.',
 'thankgodifihadtohearonemorestoryaboutyourcoiffure...',
 'me.thisendless...babble.iamlikeboringmyself.',
 'whatcrap',
 'doyoulistentothiscrap',
 'no...',
 'thensaysifyougoanylighteryouarelooklikeanextraon90210.',
 'youalwaysbeenthisselfish',
 'but',
 'thenthatisallyouhadtosay.',
 'wellno...',
 'youneverwantedtogooutwithdidyou',
 'iwas']

In [25]:
from nltk.corpus import wordnet
from nltk.metrics import edit_distance
import re
import enchant
from nltk.tokenize import word_tokenize

replacement_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', '\g<1> will'),
    (r'(\w+)n\'t', '\g<1> not'),
    (r'(\w+)\'ve', '\g<1> have'),
    (r'(\w+)\'s', '\g<1> is'),
    (r'(\w+)\'re', '\g<1> are'),
    (r'(\w+)\'d', '\g<1> would')
]

class RegexpReplacer(object):
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
        
    def replace(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            s = re.sub(pattern, repl, s)
        return s
    

class SpellingReplacer(object):
    def __init__(self, dict_name="en", max_dist=2):
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist
        
    def replace(self, word):
        if self.spell_dict.check(word):
            return word
        suggestions = self.spell_dict.suggest(word)
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        else:
            return word

class AntonymReplacer(object):
    def replace(self, word, pos=None):
        antonyms = set()
        for syn in wordnet.synsets(word, pos=pos):
            for lemma in syn.lemmas():
                for antonym in lemma.antonyms():
                    antonyms.add(antonym.name())
        if len(antonyms) == 1:
            return antonyms.pop()
        else:
            return None
        
    def replace_negations(self, sent):
        i, length = 0, len(sent)
        words = []
        while i < length:
            word = sent[i]
            if word == "not" and i+1 < length:
                ant = self.replace(sent[i+1])
                if ant:
                    words.append(ant)
                    i += 2
                    continue
            words.append(word)
            i += 1
        return words


class RepeatReplacer(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        
    def replace(self, word):
        if wordnet.synsets(word):
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word
        
def clean_text(text, replace_neg=False, use_unknown=False, dict_name="en",):
    rep = RepeatReplacer()
    ant = AntonymReplacer()
    reg = RegexpReplacer()
    spell = SpellingReplacer(dict_name)
    # Get rid of some abbreviations
    text = reg.replace(text)
    words = word_tokenize(text)
    if replace_neg:
        words = ant.replace_negations(words)
    words = [spell.replace(word) for word in words]
    if not use_unknown:
        spell_dict = enchant.Dict(dict_name)
        words = [word for word in words if spell_dict.check(word)]
    return words

In [26]:
clean_text("I don\'t swim in the sea werwerwe.")

['I', 'do', 'not', 'swim', 'in', 'the', 'sea', '.']

In [6]:
from nltk.tokenize import word_tokenize
word_tokenize("I don\'t swim in the sea")

['I', 'do', "n't", 'swim', 'in', 'the', 'sea']

In [51]:
replacer = AntonymReplacer()
replacer.replace_negations(["I", "am",  "not","good", "guy"])

{'evil', 'badness', 'bad', 'evilness', 'ill'}


['I', 'am', 'not', 'good', 'guy']

In [104]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stemmer.stem('fucking')

'fuck'

In [112]:
replacer = RegexpReplacer()
replacer.replace("I should've done that thing I didn't do")

'I should have done that thing I did not do'

In [114]:
wordnet.synsets("erqeqr")

[]

In [100]:
syn.lemmas()[0].name()

'name'

In [50]:
"i doingOwerwe".lower()

'i doingowerwe'

In [55]:
replacer = RepeatReplacer()
replacer.replace(["ahhhhhhhhhhhhhhhhh", "Looooove"])

AttributeError: 'list' object has no attribute 'lower'

In [74]:
texts[100:200]

array(['I specializes in your asshole, Kyle. \n',
       "...Man, I can't believe all this time, Chef just wanted us for sex.\n",
       "He didn't want us for sex, fatass! Something is making him say those things.\n",
       '(Like what?)\n',
       'Something must have happened to Chef while he was gone. Maybe he hit his head or, or got stuck in some quantum time vortex.\n',
       "Well look: he spent the last three months with that adventurers' club. Maybe they know what happened to him.\n",
       '(Yeah! I think...)\n', 'Yeah!\n', 'All right, come on guys!\n',
       "Hey you guys, you know what they call a Jewish woman's boobs? Jewbs. \n",
       'May I help you.\n',
       'Ahh, hi, can we speak to the head guy or something?\n',
       'Right this way. \n',
       "Now, the upper rim of Kilimanjaro should be quite a trek, and so we'll need to have a-\n",
       'Excuse me, sir. These boys wanted to speak with you.\n',
       "Ahh yes, splendid! Good afternoon, lads! I'm Head Ad

In [70]:
[processor.decode(text) for text in processor.data[1:10]]

[["goin'ta", 'aware', '=', 'footsteps', 'hover', 'loneliness', '='],
 ['forests', '-where'],
 ['hände', "'loverboy", 'sori', 'boyle', '-where'],
 ['cheesy',
  'sahr',
  'hbc',
  "'red",
  'beelzeboot',
  'borders',
  '+1',
  'snusunarich',
  'hbc',
  'join',
  '_________',
  'groundings',
  'call',
  'thay',
  'sunshine',
  'advantaaage',
  'clown',
  '-where'],
 ['wounds'],
 ['cheesy',
  '=',
  '=',
  'whassat',
  'kincade',
  'oen',
  'question',
  'dmvs',
  'yorkshire',
  'thingy',
  'adventures',
  'arose',
  'thay',
  'works',
  'iru',
  'gomez',
  'n-word',
  'another',
  '='],
 ['whassat',
  "'red",
  'thay',
  'meanest',
  'oen',
  'lieu',
  '=',
  'whup',
  'arctic',
  'wayward',
  'herder',
  '='],
 ['hände',
  'hoowwdy',
  'yorkshire',
  "'quiet",
  'make…pot-bellied',
  'thay',
  'rigged',
  'chocolaty',
  '-where'],
 ['hände',
  "'loverboy",
  'gomez',
  'n-word',
  'misled',
  'hilt',
  '-where',
  'hände',
  "'loverboy",
  'gomez',
  'n-word',
  'misled',
  'cheesy',
  '

In [49]:
[processor.decode(text) for text in processor.data[:100]+2]

[['you', 'guys', 'e', 'you', 'guys', 'e', 'chef', 'is', 'going', 'away', '.'],
 ['going', 'away', 'e', 'for', 'how', 'long', 'e'],
 ['forever', '.'],
 ['i', 'm', 'sorry', 'boys', '.'],
 ['chef',
  'said',
  'he',
  'is',
  'been',
  'bored',
  'e',
  'so',
  'he',
  'joining',
  'a',
  'group',
  'called',
  'the',
  'super',
  'adventure',
  'club',
  '.'],
 ['wow', 'e'],
 ['chef',
  'e',
  'e',
  'what',
  'kind',
  'of',
  'questions',
  'do',
  'you',
  'think',
  'adventuring',
  'around',
  'the',
  'world',
  'is',
  'hon',
  'an',
  'answer',
  'e',
  'e'],
 ['what',
  'is',
  'the',
  'meaning',
  'of',
  'life',
  'e',
  'why',
  'are',
  'we',
  'here',
  'e'],
 ['i', 'hope', 'you', 'are', 'making', 'the', 'right', 'choice', '.'],
 ['i',
  'm',
  'hon',
  'an',
  'miss',
  'him',
  '.',
  'i',
  'm',
  'hon',
  'an',
  'miss',
  'chef',
  'and',
  'i',
  '...',
  'and',
  'i',
  'do',
  'not',
  'know',
  'how',
  'to',
  'tell',
  'him',
  'e'],
 ['dude',
  'e',
  'how',
  

In [41]:
texts[:10]

array(['You guys, you guys! Chef is going away. \n',
       'Going away? For how long?\n', 'Forever.\n', "I'm sorry boys.\n",
       "Chef said he's been bored, so he joining a group called the Super Adventure Club. \n",
       'Wow!\n',
       'Chef?? What kind of questions do you think adventuring around the world is gonna answer?!\n',
       "What's the meaning of life? Why are we here?\n",
       "I hope you're making the right choice.\n",
       "I'm gonna miss him.  I'm gonna miss Chef and I...and I don't know how to tell him! \n"], dtype=object)

In [32]:
import nltk

sentence = nltk.word_tokenize(X[0][0])

In [19]:
np.mean([len(x) for x in processor.data])

15.11404028436019

In [62]:
%%time
index = processor.batch_process(processor.data[:10])

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 263 µs


In [64]:
index.shape

(10, 21)

In [53]:
xt, xlen = batch(index)

In [38]:
x = np.arange(10)
np.cumsum(x)

array([ 0,  1,  3,  6, 10, 15, 21, 28, 36, 45])

In [2]:
import tensorflow.contrib.seq2seq as seq2seq

In [43]:
_texts = []
lengths = []
for text in texts:
    words = nltk.word_tokenize(text)
    _texts.extend(words)
    lengths.append(len(words))

In [40]:
lengths.insert(0, 0)
# lenghts = np.cumsum(lengths)

In [47]:
from copy import deepcopy

_lengths = deepcopy(lengths)
_lengths.insert(0, 0)
np.cumsum(_lengths)

array([      0,      11,      18, ..., 1071516, 1071523, 1071525])

In [49]:
_texts = []
lengths = []
for text in texts:
    words = nltk.word_tokenize(text)
    _texts.extend(words)
    lengths.append(len(words))
lengths = list(np.cumsum(lengths))
lengths.insert(0, 0)

In [50]:
lengths

[0,
 11,
 18,
 20,
 25,
 43,
 45,
 65,
 77,
 86,
 112,
 129,
 149,
 151,
 153,
 156,
 161,
 165,
 179,
 182,
 188,
 194,
 199,
 204,
 208,
 210,
 213,
 223,
 229,
 236,
 240,
 247,
 254,
 258,
 283,
 285,
 307,
 309,
 332,
 336,
 394,
 400,
 416,
 421,
 442,
 452,
 471,
 485,
 492,
 499,
 515,
 536,
 551,
 566,
 573,
 592,
 609,
 622,
 628,
 630,
 644,
 647,
 660,
 663,
 672,
 674,
 679,
 682,
 688,
 690,
 705,
 708,
 726,
 734,
 746,
 754,
 766,
 770,
 793,
 800,
 817,
 823,
 826,
 832,
 839,
 844,
 882,
 884,
 892,
 894,
 904,
 913,
 924,
 940,
 959,
 964,
 974,
 988,
 1002,
 1013,
 1033,
 1041,
 1059,
 1077,
 1082,
 1109,
 1132,
 1139,
 1141,
 1148,
 1165,
 1170,
 1184,
 1188,
 1209,
 1222,
 1250,
 1253,
 1255,
 1276,
 1295,
 1298,
 1308,
 1316,
 1330,
 1402,
 1405,
 1485,
 1488,
 1511,
 1513,
 1521,
 1537,
 1546,
 1557,
 1563,
 1566,
 1590,
 1597,
 1611,
 1627,
 1637,
 1641,
 1648,
 1654,
 1669,
 1683,
 1685,
 1712,
 1722,
 1739,
 1750,
 1757,
 1767,
 1776,
 1786,
 1794,
 1804,
 18