<a href="https://colab.research.google.com/github/keshvi-srivastava/star-wars-dialogue-generation/blob/main/Yoda_Model5_GLOVE_Sliding_window_bidirectional_with_return_seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook builds a model that generates the next words of a sentence:
1. Convert the data into a list of tokens
2. Convert the tokenised sentences into sliding windows
3. Encode the sentences as integers
4. Build a simple LSTM model
5. Make the model bidirectional
6. Add GloVe word embeddings

- Make sentence sequences from the whole token list
- Use sliding windows of 5 tokens each

Reference:

https://medium.com/@plusepsilon/the-bidirectional-language-model-1f3961d1fb27

In [4]:
import tensorflow as tf
import numpy as np
import os
import time
import pandas as pd
import re
from numpy import array
from pickle import dump
import string
from random import randint
from pickle import load
from tensorflow.keras.layers.experimental import preprocessing

from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import TimeDistributed
from keras.layers import Embedding
from keras.layers import GlobalMaxPool1D
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


from __future__ import print_function
#import Keras library
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, Input, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.metrics import categorical_accuracy

#import spacy, and spacy french model
# spacy is used to work on text

#import other libraries
import numpy as np
import random
import sys
import os
import time
import codecs
import collections
from six.moves import cPickle

In [5]:
# Folder on the mounted Drive that holds the per-episode dialogue CSV files.
path_to_file = '/content/drive/MyDrive/SNLP Project/Filtered_Data/'

# Read every file first and concatenate once at the end.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside the loop re-copies all accumulated rows on each iteration.
# `sorted` makes the row order independent of the filesystem listing order.
episode_frames = []
for file in sorted(os.listdir(path_to_file)):
    print(file)
    episode_frames.append(pd.read_csv(path_to_file+file))

if episode_frames:
    data = pd.concat(episode_frames, ignore_index=True)
else:
    # No input files: keep the empty two-column frame the rest of the cell expects.
    data = pd.DataFrame(columns = ['character', 'dialogue'])

data['character'] = data["character"].str.lower()

# Merge aliases of the same speaker. With regex=True the pattern is replaced
# wherever it occurs inside the (lower-cased) character name.
data['character'] = data.character.replace("anakin", "vader", regex=True)
data['character'] = data.character.replace("obi-wan", "ben", regex=True)
data['character'] = data.character.replace("c-3po", "threepio", regex=True)

unique_characters = data.character.unique()

# character name -> list of that character's dialogue lines
data_dict = data.groupby('character')['dialogue'].apply(lambda g: g.values.tolist()).to_dict()

SW_EpisodeI.csv
SW_EpisodeII.csv
SW_EpisodeIII.csv
SW_EpisodeIV.csv
SW_EpisodeV.csv
SW_EpisodeVI.csv


In [6]:
def preprocess_text(sen):
    """Normalise one line of dialogue into a list of lowercase word tokens.

    Steps, in order:
      1. drop digit runs that begin a whitespace-separated token, including
         one at the very start of the line;
      2. turn runs of dots (ellipses) into a space so the words on either
         side stay separate;
      3. delete every remaining ASCII punctuation character
         (``string.punctuation``);
      4. lowercase and split on whitespace.

    Args:
        sen: the raw dialogue string.

    Returns:
        list of str: the tokens. Empty when the line holds nothing but
        numbers, punctuation or whitespace.
    """
    # Raw strings: "\d" and "\." inside ordinary string literals are invalid
    # escape sequences, which newer Python versions warn about.
    # `(?:^|\s)` also catches a number that opens the line, which the former
    # " \d+" pattern (literal leading space required) left in place.
    sentence = re.sub(r"(?:^|\s)\d+", " ", sen)

    # Ellipses become a space so "you...worry" does not fuse into one token.
    sentence = re.sub(r"\.+", " ", sentence)

    # Punctuation is deleted rather than replaced, so "obi-wan" -> "obiwan".
    sentence = re.sub("[%s]" % re.escape(string.punctuation), "", sentence)

    # split() without arguments already collapses runs of whitespace, so no
    # separate "remove multiple spaces" pass is needed.
    return sentence.lower().split()

In [7]:
# NOTE: the variables keep their `obi_wan_` prefix because other cells refer
# to them by these names, but the lines processed here are the 'yoda' entry
# of data_dict.

# One token list per dialogue line.
obi_wan_tokens = list(map(preprocess_text, data_dict['yoda']))
print(obi_wan_tokens[:5])

# The same lines re-joined into cleaned, space-separated strings.
obi_wan_data = list(map(' '.join, obi_wan_tokens))
print(obi_wan_data)

# All tokens of all lines, flattened into a single list.
obi_wan_token_list = []
for line_tokens in obi_wan_tokens:
    obi_wan_token_list.extend(line_tokens)
print(obi_wan_token_list)

print("Total # of tokens(words)", len(obi_wan_token_list), sep="\n")

print("Total # of unique tokens(words)", len(set(obi_wan_token_list)), sep="\n")

[['the', 'very', 'republic', 'is', 'threatened', 'if', 'involved', 'the', 'sith', 'are'], ['hard', 'to', 'see', 'the', 'dark', 'side', 'is', 'discover', 'who', 'this', 'assassin', 'is', 'we'], ['with', 'this', 'naboo', 'queen', 'you', 'must', 'stay', 'quigon', 'protect', 'her'], ['may', 'the', 'force', 'be', 'with', 'you'], ['contd', 'master', 'quigon', 'more', 'to', 'say', 'have', 'you']]
['the very republic is threatened if involved the sith are', 'hard to see the dark side is discover who this assassin is we', 'with this naboo queen you must stay quigon protect her', 'may the force be with you', 'contd master quigon more to say have you', 'a vergence you say', 'but you do rrevealed your opinion is', 'trained as a jedi you request for him', 'tested he will be', 'good good young one how feel you', 'afraid are you', 'see through you we can', 'afraid to lose her i think', 'eveything fear is the path to the dark side fear leads to', 'a jedi must have the deepest commitment the most serio

In [8]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(obi_wan_token_list)
unique_words = set(obi_wan_token_list)
sequences_tokenised = tokenizer.texts_to_sequences(obi_wan_tokens)

vocab_size = len(unique_words)+1
n_sentences = len(obi_wan_tokens)

In [9]:
# Show the vocabulary set, the encoded dialogue lines and the two size
# figures, each on its own line.
for shown in (unique_words, sequences_tokenised, vocab_size, n_sentences):
    print(shown)

{'end', 'my', 'you', 'sway', 'already', 'stink', 'way', 'temple', 'grave', 'fight', 'help', 'question', 'greater', 'could', 'father', 'make', 'queens', 'youworry', 'need', 'wookiees', 'coded', 'quicker', 'careful', 'defense', 'heart', 'obiwan', 'influence', 'sky', 'afraid', 'how', 'studied', 'yet', 'yourself', 'serve', 'look', 'onlygoing', 'lightly', 'heard', 'master', 'conquer', 'planets', 'thecentre', 'would', 'did', 'confident', 'jedis', 'does', 'run', 'anything', 'death', 'one', 'physical', 'war', 'rim', 'miss', 'out', 'continuingnot', 'mud', 'luke', 'might', 'exile', 'lord', 'masked', 'choice', 'or', 'ship', 'short', 'finished', 'turned', 'me', 'hope', 'more', 'must', 'council', 'her', 'clonewar', 'solitude', 'timing', 'through', 'him', 'inform', 'rushed', 'life', 'gratitude', 'perhaps', 'care', 'working', 'our', 'all', 'this', 'clear', 'sees', 'even', 'pain', 'goodbye', 'tarfful', 'leave', 'needed', 'wait', 'just', 'protect', 'act', 'feeling', 'appointment', 'destiny', 'fathers',

In [10]:
# Fetch and unpack the GloVe 6B vectors. The flags make the cell safe to re-run:
#   wget -nc  : skip the ~822 MB download when glove.6B.zip is already present
#               (otherwise a second copy is saved as glove.6B.zip.1)
#   unzip -n  : never overwrite already-extracted files (otherwise unzip stops
#               to ask whether each file should be replaced)
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip
!unzip -n glove.6B.zip
!ls
!pwd

--2021-04-14 20:18:51--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-04-14 20:18:51--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-04-14 20:18:51--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [11]:
 # load the whole embedding into memory
# word -> float32 vector parsed from one line of the GloVe text file
embeddings_index = dict()

# `with` closes the file even if a line fails to parse; the explicit encoding
# keeps the read independent of the platform's default locale.
with open('/content/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]                                   # first field is the token
        coefs = np.asarray(values[1:], dtype='float32')    # the rest are its coefficients
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

# create a weight matrix for words in training docs
# Row i holds the vector of the word whose tokenizer id is i. Rows stay all
# zeros for id 0 and for words that have no GloVe entry.
embedding_matrix = np.zeros((vocab_size, 300))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Loaded 400000 word vectors.


In [12]:
# Sanity check: expected (vocab_size, 300).
print(np.shape(embedding_matrix))

(635, 300)


In [13]:
#Create sliding windows
seq_length = 5      # number of context tokens in each training example
sequences_step = 1  # stride between the starts of consecutive windows
dataX = []          # windows of `seq_length` consecutive token ids
dataY = []          # for each window, the id of the token that follows it
for dialogue in sequences_tokenised:
  # Windows never span two dialogue lines; a line with `seq_length` tokens or
  # fewer contributes nothing because there is no following token to predict.
  # The loop now honours seq_length / sequences_step instead of a literal 5
  # and an implicit step of 1.
  for i in range(0, len(dialogue) - seq_length, sequences_step):
    dataX.append(dialogue[i:i + seq_length])
    dataY.append(dialogue[i + seq_length])


In [15]:
# Number of input windows and number of targets (the two must match).
print(len(dataX), len(dataY), sep="\n")

1333
1333


In [16]:
# Raw input windows, followed by the targets rendered as an indicator table.
print(dataX)
targets_one_hot = pd.get_dummies(dataY)
print(targets_one_hot)

[[1, 257, 258, 3, 259], [257, 258, 3, 259, 32], [258, 3, 259, 32, 260], [3, 259, 32, 260, 1], [259, 32, 260, 1, 94], [261, 4, 40, 1, 23], [4, 40, 1, 23, 33], [40, 1, 23, 33, 3], [1, 23, 33, 3, 147], [23, 33, 3, 147, 48], [33, 3, 147, 48, 18], [3, 147, 48, 18, 262], [147, 48, 18, 262, 3], [21, 18, 263, 264, 2], [18, 263, 264, 2, 11], [263, 264, 2, 11, 265], [264, 2, 11, 265, 58], [2, 11, 265, 58, 266], [59, 1, 9, 12, 21], [267, 34, 58, 37, 4], [34, 58, 37, 4, 60], [58, 37, 4, 60, 10], [38, 2, 27, 269, 15], [2, 27, 269, 15, 270], [75, 41, 7, 28, 2], [41, 7, 28, 2, 271], [7, 28, 2, 271, 25], [42, 42, 76, 43, 77], [42, 76, 43, 77, 78], [149, 4, 150, 148, 6], [273, 44, 3, 1, 61], [44, 3, 1, 61, 4], [3, 1, 61, 4, 1], [1, 61, 4, 1, 23], [61, 4, 1, 23, 33], [4, 1, 23, 33, 44], [1, 23, 33, 44, 152], [7, 28, 11, 10, 1], [28, 11, 10, 1, 153], [11, 10, 1, 153, 154], [10, 1, 153, 154, 1], [1, 153, 154, 1, 110], [153, 154, 1, 110, 155], [154, 1, 110, 155, 50], [276, 18, 277, 51, 3], [18, 277, 51, 3,

In [17]:
# X = np.zeros((len(dataX), seq_length, vocab_size), dtype=np.bool)
# y = np.zeros((len(dataX), vocab_size), dtype=np.bool)
# for i, sentence in enumerate(dataX):
#   for t, word in enumerate(sentence):
#     X[i, t, word] = 1
#   y[i, dataY[i]] = 1

In [18]:
# print(X.shape)
# print(y.shape)

In [19]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# One-hot targets: row i has a single True in the column given by dataY[i].
# The builtin `bool` replaces `np.bool`, an alias that was deprecated in
# NumPy 1.20 and removed in 1.24 (where it raises AttributeError).
y = np.zeros((len(dataX), vocab_size), dtype=bool)
target_ids = np.asarray(dataY, dtype=int)
# A single indexed assignment sets every (row, target id) pair at once.
y[np.arange(len(target_ids)), target_ids] = True

print(y.shape)

# Every window already has seq_length entries; pad_sequences turns the list
# of lists into a 2-D integer array of that width.
X = pad_sequences(dataX, maxlen=seq_length)
print(X.shape)

(1333, 635)
(1333, 5)


In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout, GRU, Flatten
def bidirectional_lstm_model(seq_length, vocab_size):
    """Build and compile the next-word prediction network.

    Architecture: frozen pre-trained embedding -> bidirectional LSTM
    (200 units per direction, returning the full sequence) -> global max
    pooling over time -> dense ReLU layer -> softmax over the vocabulary.

    Besides its arguments, the function reads two module-level names at call
    time: `embedding_matrix` (the pre-trained weights, one row per token id)
    and `learning_rate` (step size for Adam).

    Args:
        seq_length: number of token ids in each input window.
        vocab_size: number of rows in the embedding and width of the output.

    Returns:
        The compiled Keras `Model`.
    """
    embedding_layer = Embedding(vocab_size,
                                embedding_matrix.shape[1],
                                weights=[embedding_matrix],
                                input_length=seq_length,  # was a literal 5 that ignored the argument
                                trainable=False)          # keep the pre-trained vectors fixed
    inp = Input(shape=(seq_length,))
    x = embedding_layer(inp)
    x = Bidirectional(LSTM(200,return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
    # Collapse the time axis by keeping each feature's maximum over the window.
    x = GlobalMaxPool1D()(x)
    x = Dense(vocab_size,activation='relu')(x)
    # Exactly one next word is correct per window, so the output must be a
    # probability distribution over the vocabulary. Softmax provides that and
    # is the activation categorical cross-entropy is meant to be paired with;
    # the former sigmoid scored every word independently.
    x = Dense(vocab_size,activation='softmax')(x)
    model = Model(inputs=inp,outputs=x)
    # `learning_rate=` is the supported keyword; `lr=` is a deprecated alias.
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer,loss='categorical_crossentropy',metrics=['accuracy'])

    return model

In [21]:
# Hyper-parameters.
learning_rate = 0.001  # read as a global inside bidirectional_lstm_model
seq_length = 5         # tokens per input window
rnn_size = 256         # only referenced by commented-out code; the model builder does not read it

# Build the network and show its layer summary.
md = bidirectional_lstm_model(seq_length=seq_length, vocab_size=vocab_size)
md.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 5)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 5, 300)            190500    
_________________________________________________________________
bidirectional (Bidirectional (None, 5, 400)            801600    
_________________________________________________________________
global_max_pooling1d (Global (None, 400)               0         
_________________________________________________________________
dense (Dense)                (None, 635)               254635    
_________________________________________________________________
dense_1 (Dense)              (None, 635)               403860    
Total params: 1,650,595
Trainable params: 1,460,095
Non-trainable params: 190,500
_____________________________________________

In [22]:
# Training schedule.
num_epochs = 100  # upper bound on the number of epochs
batch_size = 32   # examples per minibatch

# Stop once validation loss has gone 4 epochs without improving.
stop_when_stalled = EarlyStopping(monitor='val_loss', patience=4)

# Write a checkpoint file named after the epoch number and validation loss.
# NOTE(review): `period` is deprecated in newer tf.keras releases in favour
# of `save_freq`; kept here so the checkpoint cadence is unchanged.
periodic_checkpoint = ModelCheckpoint(
    filepath="./" + 'my_model_gen_sentences.{epoch:02d}-{val_loss:.2f}.hdf5',
    monitor='val_loss',
    verbose=0,
    mode='auto',
    period=2,
)
callbacks = [stop_when_stalled, periodic_checkpoint]

# Fit, holding out 10% of the examples for validation.
history = md.fit(
    X, y,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_split=0.1,
    shuffle=True,
    callbacks=callbacks,
)

# Save the trained model to Drive.
md.save("/content/drive/MyDrive/Colab Notebooks/" + "my_model_generate_sentences.h5")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [23]:
# Write the trained model to Drive.
saved_model_path = "/content/drive/MyDrive/Colab Notebooks/" + "my_model_generate_sentences.h5"
md.save(saved_model_path)

In [24]:
# Read the saved model back from Drive; the generation cell uses `model`.
saved_model_path = "/content/drive/MyDrive/Colab Notebooks/" + "my_model_generate_sentences.h5"
model = load_model(saved_model_path)

In [25]:
def sample(preds, temperature=2.0):
    """Draw one index from a score array after temperature scaling.

    Each score p is re-weighted to p ** (1 / temperature) and the result is
    normalised, so values below 1 sharpen the distribution and values above
    1 flatten it. Entries equal to 0 are never drawn.

    Args:
        preds: 1-D array-like of non-negative scores. They need not sum to 1.
        temperature: scaling factor. 0 selects the highest-scoring index
            deterministically (greedy decoding).

    Returns:
        The sampled index (a NumPy integer).

    Raises:
        ValueError: if no entry of `preds` is positive, so there is nothing
            to sample from.
    """
    preds = np.asarray(preds).astype('float64')

    # Dividing by a zero temperature is undefined; treat it as "always take
    # the most likely entry".
    if temperature == 0:
        return np.argmax(preds)

    # log(0) = -inf is intended (it becomes probability 0 below), so silence
    # the divide-by-zero warning NumPy would otherwise emit for it.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    peak = np.max(logits)
    if not np.isfinite(peak):
        raise ValueError("preds must contain at least one positive, finite value")

    # Subtracting the maximum before exponentiating leaves the normalised
    # result unchanged but stops every term underflowing to 0 (and the
    # division below turning into 0/0) when the temperature is small.
    exp_preds = np.exp(logits - peak)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [26]:
# Inverse of tokenizer.word_index: token id -> word, used to decode predictions.
reverse_word_dict = dict(
    (token_id, token) for token, token_id in tokenizer.word_index.items()
)

In [40]:

#initiate sentences
generated = ''

# Seed the generation with a random training window.
# random.randint includes BOTH endpoints, so the upper bound has to be
# len(dataX) - 1; with len(dataX) the draw could land one past the end of
# the list and raise IndexError.
sentence = dataX[randint(0, len(dataX) - 1)]
print(sentence)
# Decode the seed ids to words; `sentence` is kept as a one-element list
# because texts_to_sequences expects a list of texts.
sentence = [' '.join([reverse_word_dict[word] for word in sentence])]

generated += sentence[0]

#then, we generate the text
for i in range(5):

    seq = tokenizer.texts_to_sequences(sentence)
    padded = pad_sequences(seq, maxlen=seq_length)
    #calculate next word
    # Copy into a fresh float64 array so the entry below can be overwritten.
    preds = np.array(model.predict(padded, verbose=0)[0], dtype='float64')

    # Id 0 is not in reverse_word_dict (tokenizer.word_index assigns no word
    # to it), so sampling it would raise KeyError below. Zero its score so it
    # can never be drawn.
    preds[0] = 0

    next_index = sample(preds, 0.33)
    next_word = reverse_word_dict[next_index]

    #add the next word to the text
    generated += " " + next_word
    print("generated sentence: ", generated)
    # Slide the context: drop the oldest word, append the new one.
    sentence = [' '.join(sentence[0].split()[1:]) + " " + next_word]

#print the whole text
print(generated)

[27, 13, 403, 152, 4]
generated sentence:  do not attachment leads to jealousy
generated sentence:  do not attachment leads to jealousy the
generated sentence:  do not attachment leads to jealousy the shadow
generated sentence:  do not attachment leads to jealousy the shadow of
generated sentence:  do not attachment leads to jealousy the shadow of greed
do not attachment leads to jealousy the shadow of greed


In [31]:
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu

In [41]:
# Candidate sentences to score with BLEU. The second entry matches the output
# of the generation cell shown above; the first is presumably from an earlier
# run of that cell (TODO confirm).
gen = ['learned luke there is another too to be be be', 'do not attachment leads to jealousy the shadow of greed']
# References: the tokenised dialogue lines, one token list per line.
ref = obi_wan_tokens

In [42]:
# Show the reference token lists, then the tokens of the second candidate.
print(ref, gen[1].split(), sep="\n")

[['the', 'very', 'republic', 'is', 'threatened', 'if', 'involved', 'the', 'sith', 'are'], ['hard', 'to', 'see', 'the', 'dark', 'side', 'is', 'discover', 'who', 'this', 'assassin', 'is', 'we'], ['with', 'this', 'naboo', 'queen', 'you', 'must', 'stay', 'quigon', 'protect', 'her'], ['may', 'the', 'force', 'be', 'with', 'you'], ['contd', 'master', 'quigon', 'more', 'to', 'say', 'have', 'you'], ['a', 'vergence', 'you', 'say'], ['but', 'you', 'do', 'rrevealed', 'your', 'opinion', 'is'], ['trained', 'as', 'a', 'jedi', 'you', 'request', 'for', 'him'], ['tested', 'he', 'will', 'be'], ['good', 'good', 'young', 'one', 'how', 'feel', 'you'], ['afraid', 'are', 'you'], ['see', 'through', 'you', 'we', 'can'], ['afraid', 'to', 'lose', 'her', 'i', 'think'], ['eveything', 'fear', 'is', 'the', 'path', 'to', 'the', 'dark', 'side', 'fear', 'leads', 'to'], ['a', 'jedi', 'must', 'have', 'the', 'deepest', 'commitment', 'the', 'most', 'serious', 'mind', 'i'], ['then', 'continue', 'we', 'will'], ['correct', 'yo

In [44]:
# Score the first candidate sentence against all reference token lists.
bleu = sentence_bleu(ref, gen[0].split())
print('BLEU score -> {}'.format(bleu))

BLEU score -> 0.48109772909788073
