In [185]:
import tensorflow as tf
#import keras from tensorflow
import pandas as pd
import nltk
import math
import itertools
import numpy as np
import collections
import random

In [186]:
# Read the scripts CSV into a DataFrame; the path is relative to the notebook's
# working directory. Only the 'Dialogue' column is used by the following cells.
seinfeld_data = pd.read_csv('Sentence_generation/scripts.csv')

In [187]:
# Keep just the 'Dialogue' column. The double brackets select a one-column
# DataFrame rather than a Series.
seinfeld_text_data = seinfeld_data[['Dialogue']]

In [188]:
# --- Vocabulary settings ---
# Number of token ids, counting the slot reserved for the unknown-token marker.
VOCAB_SIZE = 10000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# --- Network dimensions ---
embedding_size = 128  # Dimension of the embedding vector.
hidden_size = 128  # Units in each LSTM layer.
num_steps = 12  # Tokens per input window fed to the model.

# --- Training schedule ---
batch_size = 128
num_epochs = 10
use_dropout = False  # When True, a Dropout(0.5) layer is added after the LSTMs.

# --- Sampling parameters (not referenced by the cells shown in this section) ---
skip_window = 4  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.
num_sampled = 64

In [189]:
# Split every dialogue entry into sentences and gather them in one flat list.
# Iterating the column directly avoids building a row Series per index lookup,
# which is what the positional .iloc[i]['Dialogue'] access does.
sentences = []
for dialogue in seinfeld_text_data['Dialogue']:
    # Non-string cells are skipped (presumably NaN for rows with no dialogue —
    # confirm against the CSV).
    if isinstance(dialogue, str):
        sentences.extend(nltk.sent_tokenize(dialogue))

In [190]:
# Tokenize each sentence into words and punctuation: one token list per sentence.
tokenized_sentences = list(map(nltk.word_tokenize, sentences))

In [191]:
# Spot-check: display the third sentence produced by the sentence splitter.
sentences[2]

'To be out, this is out...and out is one of the single most enjoyable experiences of life.'

In [192]:
# Concatenate the per-sentence token lists into one continuous token stream.
flat_sentences = list(itertools.chain.from_iterable(tokenized_sentences))


In [194]:
# Count how often each token occurs across all tokenized sentences.
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))

In [195]:
# Keep the VOCAB_SIZE - 1 most frequent tokens; the unknown-token marker is
# appended last, so it receives the highest id.
vocab = word_freq.most_common(VOCAB_SIZE - 1)
index_to_word = [token for token, _count in vocab] + [unknown_token]
# Inverse lookup: token -> position in index_to_word.
word_to_index = {token: idx for idx, token in enumerate(index_to_word)}

In [196]:
# Swap every out-of-vocabulary token for the unknown-token marker so that each
# remaining token has an entry in word_to_index.
flat_sentences = [
    unknown_token if token not in word_to_index else token
    for token in flat_sentences
]

In [198]:
def _encode_tokens(tokens):
    """Return the vocabulary ids of ``tokens`` as a NumPy array."""
    return np.asarray([word_to_index[token] for token in tokens])


# Token positions at which the stream is cut into train / validation / test.
_TRAIN_END = 600000
_VALID_END = 800000

train_data = _encode_tokens(flat_sentences[:_TRAIN_END])
valid_data = _encode_tokens(flat_sentences[_TRAIN_END:_VALID_END])
test_data = _encode_tokens(flat_sentences[_VALID_END:])

In [199]:
# Peek at the first 20 encoded validation tokens.
valid_data[:20]

array([  21,    1,  514,    3,   35,    1,    2,   15,   13,   87,   37,
          4,   29,   60, 9999,    7, 9999, 9999, 7771,    0])

In [200]:
class KerasBatchGenerator(object):
    """Endless source of (input, one-hot target) batches for next-token training.

    Each sample is a window of ``num_steps`` consecutive ids taken from ``data``;
    its target is the same window shifted one position to the right, one-hot
    encoded over ``vocabulary`` classes.

    Args:
        data: 1-D sequence of integer token ids.
        num_steps: Number of tokens per sample window.
        batch_size: Number of windows per yielded batch.
        vocabulary: Number of classes used for the one-hot targets.
        skip_step: How far the read position advances after each window.
    """

    def __init__(self, data, num_steps, batch_size, vocabulary, skip_step=5):
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocabulary = vocabulary
        # Read position in ``data``. It lives on the instance, so every
        # generator obtained from generate() continues from where the previous
        # one stopped; it wraps back to zero when a window no longer fits.
        self.current_idx = 0
        # Number of tokens skipped between the starts of consecutive windows.
        self.skip_step = skip_step

    def generate(self):
        """Yield ``(x, y)`` batches forever.

        Yields:
            x: float array of shape ``(batch_size, num_steps)`` holding token ids.
            y: float array of shape ``(batch_size, num_steps, vocabulary)``
               holding the one-hot encoded next-token targets.

        Raises:
            ValueError: on first iteration, if ``data`` is too short to supply
                one window plus its shifted target.
        """
        if len(self.data) <= self.num_steps:
            raise ValueError(
                "data must contain more than num_steps (%d) tokens, got %d"
                % (self.num_steps, len(self.data)))

        step_positions = np.arange(self.num_steps)
        while True:
            # Allocate fresh arrays for every batch. Consumers such as Keras'
            # fit_generator prefetch several batches into a queue; reusing one
            # pair of buffers would make every queued batch alias the same
            # memory while it is being overwritten with the next batch.
            x = np.zeros((self.batch_size, self.num_steps))
            y = np.zeros((self.batch_size, self.num_steps, self.vocabulary))
            for i in range(self.batch_size):
                if self.current_idx + self.num_steps >= len(self.data):
                    # The shifted target window would run past the end of the
                    # data: restart from the beginning.
                    self.current_idx = 0
                start = self.current_idx
                stop = start + self.num_steps
                x[i, :] = self.data[start:stop]
                targets = np.asarray(self.data[start + 1:stop + 1], dtype=int)
                # One-hot encode: set y[i, t, targets[t]] = 1 for every step t.
                y[i, step_positions, targets] = 1
                self.current_idx += self.skip_step
            yield x, y

In [201]:
# Both generators share one configuration. skip_step equals num_steps, so
# consecutive windows start num_steps tokens apart and do not overlap.
_generator_options = dict(
    num_steps=num_steps,
    batch_size=batch_size,
    vocabulary=VOCAB_SIZE,
    skip_step=num_steps,
)
train_data_generator = KerasBatchGenerator(train_data, **_generator_options)
valid_data_generator = KerasBatchGenerator(valid_data, **_generator_options)

In [202]:
# Layer stack: embedding -> two sequence-returning LSTMs -> optional dropout ->
# a Dense projection to VOCAB_SIZE applied at every time step -> softmax.
_layers = [
    tf.keras.layers.Embedding(VOCAB_SIZE, embedding_size, input_length=num_steps),
    tf.keras.layers.LSTM(hidden_size, return_sequences=True),
    tf.keras.layers.LSTM(hidden_size, return_sequences=True),
]
if use_dropout:
    _layers.append(tf.keras.layers.Dropout(0.5))
_layers.extend([
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(VOCAB_SIZE)),
    tf.keras.layers.Activation('softmax'),
])
model = tf.keras.Sequential(_layers)

In [203]:
# Targets from the batch generator are one-hot vectors, hence the categorical
# (not sparse) loss and accuracy metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)


In [204]:
# The generators advance num_steps tokens per window, so one batch consumes
# batch_size * num_steps tokens. These step counts make an epoch roughly one
# pass over the respective data set.
_tokens_per_batch = batch_size * num_steps
_train_steps = len(train_data) // _tokens_per_batch
_valid_steps = len(valid_data) // _tokens_per_batch

model.fit_generator(
    train_data_generator.generate(),
    steps_per_epoch=_train_steps,
    epochs=num_epochs,
    validation_data=valid_data_generator.generate(),
    validation_steps=_valid_steps,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a34e5fef0>

In [205]:
# Qualitative check on the training data: slide a one-token-stride window over
# train_data and compare the model's prediction for the token after each
# window with the token that actually follows it.
dummy_iters = 40
num_predict = 10
example_training_generator = KerasBatchGenerator(
    train_data, num_steps, 1, VOCAB_SIZE, skip_step=1)
print("Training data:")

# Skip the first dummy_iters windows. The read position is stored on the
# generator object, so each fresh generate() call resumes where the last stopped.
for _ in range(dummy_iters):
    next(example_training_generator.generate())

actual_words = []
predicted_words = []
for offset in range(num_predict):
    window, _targets = next(example_training_generator.generate())
    probabilities = model.predict(window)
    # Distribution at the last time step, i.e. the guess for the token that
    # follows the window.
    predicted_id = np.argmax(probabilities[:, num_steps - 1, :])
    # The window starts at dummy_iters + offset, so the token right after it
    # sits at num_steps + dummy_iters + offset.
    actual_words.append(index_to_word[train_data[num_steps + dummy_iters + offset]])
    predicted_words.append(index_to_word[predicted_id])

true_print_out = "Actual words: " + "".join(word + " " for word in actual_words)
pred_print_out = "Predicted words: " + "".join(word + " " for word in predicted_words)
print(true_print_out)
print(pred_print_out)

Training data:
Actual words: This is what theyre talking about ... this whole thing 
Predicted words: , , , , , , , , , , 
