<a href="https://colab.research.google.com/github/keshvi-srivastava/star-wars-dialogue-generation/blob/main/Yoda_Model5_GLOVE_Sliding_window_bidirectional_with_return_seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Model to generate a sequence of following words:
1. Convert the data into token list
2. Convert data to token sentences with sliding windows
3. Encode the sentence
4. Simple LSTM model
5. Create a bidirectional model
6. Add Glove word embeddings

- Makes sentences sequence from the whole token list
- Make a sliding window of size 5 each

Reference:

https://medium.com/@plusepsilon/the-bidirectional-language-model-1f3961d1fb27

In [6]:
import tensorflow as tf
import numpy as np
import os
import time
import pandas as pd
import re
from numpy import array
from pickle import dump
import string
from random import randint
from pickle import load
from tensorflow.keras.layers.experimental import preprocessing

from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import TimeDistributed
from keras.layers import Embedding
from keras.layers import GlobalMaxPool1D
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


from __future__ import print_function
#import Keras library
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, Input, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.metrics import categorical_accuracy

#import spacy, and spacy french model
# spacy is used to work on text

#import other libraries
import numpy as np
import random
import sys
import os
import time
import codecs
import collections
from six.moves import cPickle

In [7]:
path_to_file = '/content/drive/MyDrive/SNLP Project/Filtered_Data/'

data = pd.DataFrame(columns = ['character', 'dialogue'])

for file in os.listdir(path_to_file):
    print(file)
    df = pd.read_csv(path_to_file+file)
    data = data.append(df, ignore_index=True)

data['character'] = data["character"].str.lower()

data['character'] = data.character.replace("anakin", "vader", regex=True)
data['character'] = data.character.replace("obi-wan", "ben", regex=True)
data['character'] = data.character.replace("c-3po", "threepio", regex=True)

unique_characters = data.character.unique()

data_dict = data.groupby('character')['dialogue'].apply(lambda g: g.values.tolist()).to_dict()

SW_EpisodeI.csv
SW_EpisodeII.csv
SW_EpisodeIII.csv
SW_EpisodeIV.csv
SW_EpisodeV.csv
SW_EpisodeVI.csv


In [8]:
def preprocess_text(sen):

    # # Single character removal
    # sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sen)

    # Remove punctuations
    sentence = re.sub('[%s]' % re.escape(string.punctuation), '', sen)

    # Remove ....
    sentence = re.sub('\.+', ' ', sentence)

    # Remove extra spaces
    sentence = re.sub(' +', ' ', sentence)

    # Remove numbers
    sentence = ''.join(filter(lambda x: not x.isdigit(), sentence))

    # Lower case
    sentence = sentence.lower()

    # Return a list of tokens (words)
    sentence = sentence.split()

    return sentence

In [9]:
yoda_tokens = [preprocess_text(row) for row in data_dict['yoda']]
print(yoda_tokens[:5])

yoda_data = [' '.join(row) for row in yoda_tokens]
print(yoda_data)

yoda_token_list = [item for sublist in yoda_tokens for item in sublist]
print(yoda_token_list)

print("Total # of tokens(words)")
print(len(yoda_token_list))

print("Total # of unique tokens(words)")
print(len(set(yoda_token_list)))

[['the', 'very', 'republic', 'is', 'threatened', 'if', 'involved', 'the', 'sith', 'are'], ['hard', 'to', 'see', 'the', 'dark', 'side', 'is', 'discover', 'who', 'this', 'assassin', 'is', 'we'], ['with', 'this', 'naboo', 'queen', 'you', 'must', 'stay', 'quigon', 'protect', 'her'], ['may', 'the', 'force', 'be', 'with', 'you'], ['contd', 'master', 'quigon', 'more', 'to', 'say', 'have', 'you']]
['the very republic is threatened if involved the sith are', 'hard to see the dark side is discover who this assassin is we', 'with this naboo queen you must stay quigon protect her', 'may the force be with you', 'contd master quigon more to say have you', 'a vergence you say', 'but you do rrevealed your opinion is', 'trained as a jedi you request for him', 'tested he will be', 'good good young one how feel you', 'afraid are you', 'see through you we can', 'afraid to lose heri think', 'eveything fear is the path to the dark side fear leads to', 'a jedi must have the deepest commitment the most seriou

In [10]:
file1 = open("YodaProcessedData.txt","a")
for row in yoda_tokens:
  str_val = ' '.join(row)
  file1.writelines(str_val + '\n')
file1.close()

In [11]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(yoda_token_list)
unique_words = set(yoda_token_list)
sequences_tokenised = tokenizer.texts_to_sequences(yoda_tokens)

vocab_size = len(unique_words)+1
n_sentences = len(yoda_tokens)

In [12]:
print(unique_words)
print(sequences_tokenised)

print(vocab_size)
print(n_sentences)

{'havedisappeared', 'when', 'with', 'force', 'kenobi', 'maste', 'mind', 'accept', 'involved', 'stop', 'miss', 'later', 'timing', 'need', 'why', 'binds', 'give', 'might', 'hmph', 'longer', 'up', 'finished', 'move', 'commune', 'must', 'failure', 'thepossibilities', 'planet', 'sick', 'reckless', 'not', 'begins', 'queens', 'heard', 'path', 'destroy', 'kashyyyk', 'be', 'its', 'consumed', 'frowningmuch', 'returned', 'strongyoung', 'duty', 'suggest', 'enough', 'shadow', 'line', 'until', 'placeis', 'creates', 'training', 'sith', 'achild', 'his', 'old', 'vader', 'let', 'prophecy', 'would', 'am', 'friends', 'level', 'mindful', 'confident', 'watched', 'last', 'far', 'recordings', 'has', 'tothe', 'gratitude', 'strength', 'harder', 'consciousness', 'once', 'amsure', 'serious', 'danger', 'moreno', 'apprentice', 'thendivide', 'obiwans', 'welcome', 'misplaced', 'greed', 'different', 'take', 'planetfind', 'treethe', 'always', 'contact', 'naboo', 'help', 'to', 'join', 'doing', 'seductive', 'goodbye', 'd

In [13]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
!ls
!pwd

--2021-04-15 21:13:00--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-04-15 21:13:00--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-04-15 21:13:01--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [14]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('/content/glove.6B.300d.txt')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 300))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Loaded 400000 word vectors.


In [15]:
print(embedding_matrix.shape)

(650, 300)


In [16]:
#Create sliding windows
seq_length = 5
sequences_step = 1
dataX = []
dataY = []
for dialogue in sequences_tokenised:
  window = []
  for i in range(len(dialogue)-5):
    dataX.append(dialogue[i:i+5])
    dataY.append(dialogue[i+5])


In [18]:
print(len(dataX))
print(len(dataY))

1304
1304


In [19]:
print(dataX)
print(pd.get_dummies(dataY))

[[1, 251, 252, 3, 253], [251, 252, 3, 253, 31], [252, 3, 253, 31, 254], [3, 253, 31, 254, 1], [253, 31, 254, 1, 90], [255, 4, 40, 1, 21], [4, 40, 1, 21, 32], [40, 1, 21, 32, 3], [1, 21, 32, 3, 146], [21, 32, 3, 146, 46], [32, 3, 146, 46, 18], [3, 146, 46, 18, 256], [146, 46, 18, 256, 3], [20, 18, 257, 258, 2], [18, 257, 258, 2, 11], [257, 258, 2, 11, 259], [258, 2, 11, 259, 57], [2, 11, 259, 57, 260], [58, 1, 9, 12, 20], [262, 37, 57, 38, 4], [37, 57, 38, 4, 59], [57, 38, 4, 59, 10], [36, 2, 29, 264, 13], [2, 29, 264, 13, 265], [71, 41, 7, 26, 2], [41, 7, 26, 2, 266], [7, 26, 2, 266, 25], [42, 42, 91, 43, 72], [42, 91, 43, 72, 92], [269, 48, 3, 1, 60], [48, 3, 1, 60, 4], [3, 1, 60, 4, 1], [1, 60, 4, 1, 21], [60, 4, 1, 21, 32], [4, 1, 21, 32, 48], [1, 21, 32, 48, 150], [7, 26, 11, 10, 1], [26, 11, 10, 1, 151], [11, 10, 1, 151, 152], [10, 1, 151, 152, 1], [1, 151, 152, 1, 109], [151, 152, 1, 109, 153], [152, 1, 109, 153, 49], [272, 18, 273, 61, 3], [18, 273, 61, 3, 274], [273, 61, 3, 274

In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
y = np.zeros((len(dataX), vocab_size), dtype=np.bool)
for i, sentence in enumerate(dataX):
  y[i, dataY[i]] = 1

print(y.shape)

X = pad_sequences(dataX, maxlen=5)
print(X.shape)

(1304, 650)
(1304, 5)


In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout, GRU, Flatten
def bidirectional_lstm_model(seq_length, vocab_size):

    embedding_layer = Embedding(vocab_size,
                                300,
                                weights=[embedding_matrix],
                                input_length=5,
                                trainable=False)
    inp = Input(shape=(5,))
    x = embedding_layer(inp)
    x = Bidirectional(LSTM(200,return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(vocab_size,activation='relu')(x)
    x = Dense(vocab_size,activation='sigmoid')(x)
    model = Model(inputs=inp,outputs=x)
    optimizer = Adam(lr=learning_rate)
    callbacks=[EarlyStopping(patience=2, monitor='val_loss')]
    #call the functions in the metrics 
    model.compile(optimizer=optimizer,loss='categorical_crossentropy',metrics=['accuracy'])

    return model

In [22]:
rnn_size = 256 # size of RNN
seq_length = 5 # sequence length
learning_rate = 0.001 #learning rate

md = bidirectional_lstm_model(seq_length, vocab_size)
md.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 5)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 5, 300)            195000    
_________________________________________________________________
bidirectional (Bidirectional (None, 5, 400)            801600    
_________________________________________________________________
global_max_pooling1d (Global (None, 400)               0         
_________________________________________________________________
dense (Dense)                (None, 650)               260650    
_________________________________________________________________
dense_1 (Dense)              (None, 650)               423150    
Total params: 1,680,400
Trainable params: 1,485,400
Non-trainable params: 195,000
_____________________________________________

In [23]:
batch_size = 32 # minibatch size
num_epochs = 100 # number of epochs

callbacks=[EarlyStopping(patience=4, monitor='val_loss'),
           ModelCheckpoint(filepath="./" + 'my_model_gen_sentences.{epoch:02d}-{val_loss:.2f}.hdf5',\
                           monitor='val_loss', verbose=0, mode='auto', period=2)]
#fit the model
history = md.fit(X, y,
                 batch_size=batch_size,
                 shuffle=True,
                 epochs=num_epochs,
                 callbacks=callbacks,
                 validation_split=0.1)

#save the model
md.save("/content/drive/MyDrive/Colab Notebooks/" + "my_model_generate_sentences.h5")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


In [24]:
md.save("/content/drive/MyDrive/Colab Notebooks/" + "my_model_generate_sentences.h5")

In [25]:
model = load_model("/content/drive/MyDrive/Colab Notebooks/" + "my_model_generate_sentences.h5")

In [26]:
def sample(preds, temperature=2.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [27]:
reverse_word_dict = {v: k for k, v in tokenizer.word_index.items()}

In [28]:
def generate_random_text():

  #initiate sentences
  generated = ''
  sentence = dataX[randint(0,len(dataX))]
  sentence = [' '.join([reverse_word_dict[word] for word in sentence])]
  generated += sentence[0]

  #we generate the next text
  for i in range(5):
    
      seq = tokenizer.texts_to_sequences(sentence)
      padded = pad_sequences(seq, maxlen=5)

      #calculate next word
      preds = model.predict(padded, verbose=0)[0]
      next_index = sample(preds, 0.33)
      next_word = reverse_word_dict[next_index]

      #add the next word to the text
      generated += " " + next_word
      sentence = [' '.join(sentence[0].split()[1:]) + " " + next_word]
      
  #print the whole text
  return (generated)

In [29]:
n_sentences_gen = 10
generated_sentences = []
for i in range(n_sentences_gen):
  generated_sentences.append(generate_random_text())

print(generated_sentences)

['remains vader you must confront you become ready you you', 'transform into the force mourn you not it your you', 'hurry careful timing we will need i are are you', 'times nothing is what itappears to the dark side side', 'twilight is upon me and the dark side the dark', 'then a jedi will you have for i old to', 'of the emperor or suffer the force the dark side', 'itappears to be but the force you are be with', 'train yourself to let go it it is will be', 'a fully trained jedi knight the deepest force become you']


In [30]:
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu

In [31]:
gen = generated_sentences
ref = yoda_tokens

In [32]:
print(ref)
print(gen[1].split())

[['the', 'very', 'republic', 'is', 'threatened', 'if', 'involved', 'the', 'sith', 'are'], ['hard', 'to', 'see', 'the', 'dark', 'side', 'is', 'discover', 'who', 'this', 'assassin', 'is', 'we'], ['with', 'this', 'naboo', 'queen', 'you', 'must', 'stay', 'quigon', 'protect', 'her'], ['may', 'the', 'force', 'be', 'with', 'you'], ['contd', 'master', 'quigon', 'more', 'to', 'say', 'have', 'you'], ['a', 'vergence', 'you', 'say'], ['but', 'you', 'do', 'rrevealed', 'your', 'opinion', 'is'], ['trained', 'as', 'a', 'jedi', 'you', 'request', 'for', 'him'], ['tested', 'he', 'will', 'be'], ['good', 'good', 'young', 'one', 'how', 'feel', 'you'], ['afraid', 'are', 'you'], ['see', 'through', 'you', 'we', 'can'], ['afraid', 'to', 'lose', 'heri', 'think'], ['eveything', 'fear', 'is', 'the', 'path', 'to', 'the', 'dark', 'side', 'fear', 'leads', 'to'], ['a', 'jedi', 'must', 'have', 'the', 'deepest', 'commitment', 'the', 'most', 'serious', 'mind', 'i'], ['then', 'continue', 'we', 'will'], ['correct', 'you', 

In [33]:
def get_bleu_score(ref, gen):
  bleu_score = 0
  for sen in gen:
    bleu_score += sentence_bleu(ref, sen.split())
  return (bleu_score/10)

In [34]:
print('BLEU score -> {}'.format(get_bleu_score(ref, gen)))

BLEU score -> 0.5636460841171014


In [35]:
file1 = open("YodaGeneratedSen.txt","a")
for row in generated_sentences:
  file1.writelines(row + '\n')
file1.close()