<a href="https://colab.research.google.com/github/keshvi-srivastava/star-wars-dialogue-generation/blob/main/Threepio_Model5_GLOVE_Sliding_window_bidirectional_with_return_seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Model to generate a sequence of following words:
1. Convert the data into token list
2. Convert data to token sentences with sliding windows
3. Encode the sentence
4. Simple LSTM model
5. Create a bidirectional model
6. Add Glove word embeddings

- Makes sentences sequence from the whole token list
- Make a sliding window of size 5 each

Reference:

https://medium.com/@plusepsilon/the-bidirectional-language-model-1f3961d1fb27

In [1]:
import tensorflow as tf
import numpy as np
import os
import time
import pandas as pd
import re
from numpy import array
from pickle import dump
import string
from random import randint
from pickle import load
from tensorflow.keras.layers.experimental import preprocessing

from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import TimeDistributed
from keras.layers import Embedding
from keras.layers import GlobalMaxPool1D
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


from __future__ import print_function
#import Keras library
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, Input, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.metrics import categorical_accuracy

#import spacy, and spacy french model
# spacy is used to work on text

#import other libraries
import numpy as np
import random
import sys
import os
import time
import codecs
import collections
from six.moves import cPickle

In [3]:
path_to_file = '/content/drive/MyDrive/SNLP Project/Filtered_Data/'

data = pd.DataFrame(columns = ['character', 'dialogue'])

for file in os.listdir(path_to_file):
    print(file)
    df = pd.read_csv(path_to_file+file)
    data = data.append(df, ignore_index=True)

data['character'] = data["character"].str.lower()

data['character'] = data.character.replace("anakin", "vader", regex=True)
data['character'] = data.character.replace("obi-wan", "ben", regex=True)
data['character'] = data.character.replace("c-3po", "threepio", regex=True)

unique_characters = data.character.unique()

data_dict = data.groupby('character')['dialogue'].apply(lambda g: g.values.tolist()).to_dict()

SW_EpisodeI.csv
SW_EpisodeII.csv
SW_EpisodeIII.csv
SW_EpisodeIV.csv
SW_EpisodeV.csv
SW_EpisodeVI.csv


In [4]:
def preprocess_text(sen):

    # # Single character removal
    # sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sen)

    # Remove punctuations
    sentence = re.sub('[%s]' % re.escape(string.punctuation), '', sen)

    # Remove ....
    sentence = re.sub('\.+', ' ', sentence)

    # Remove extra spaces
    sentence = re.sub(' +', ' ', sentence)

    # Remove numbers
    sentence = ''.join(filter(lambda x: not x.isdigit(), sentence))

    # Lower case
    sentence = sentence.lower()

    # Return a list of tokens (words)
    sentence = sentence.split()

    return sentence

In [5]:
threepio_tokens = [preprocess_text(row) for row in data_dict['threepio']]
print(threepio_tokens[:5])

threepio_data = [' '.join(row) for row in threepio_tokens]
print(threepio_data)

threepio_token_list = [item for sublist in threepio_tokens for item in sublist]
print(threepio_token_list)

print("Total # of tokens(words)")
print(len(threepio_token_list))

print("Total # of unique tokens(words)")
print(len(set(threepio_token_list)))

[['how', 'do', 'you', 'do', 'i', 'am', 'seethreepio', 'human', 'cyborg', 'relations', 'how'], ['i', 'beg', 'your', 'pardonwhat', 'do', 'you', 'mean', 'im', 'naked'], ['contd', 'my', 'parts', 'are', 'showing', 'oh', 'my', 'goodness', 'how'], ['oh', 'my', 'space', 'travel', 'sounds', 'rather', 'perilous'], ['i', 'can', 'assure', 'you', 'they', 'will', 'never', 'get', 'me', 'onto', 'one', 'of', 'those']]
['how do you do i am seethreepio human cyborg relations how', 'i beg your pardonwhat do you mean im naked', 'contd my parts are showing oh my goodness how', 'oh my space travel sounds rather perilous', 'i can assure you they will never get me onto one of those', 'he has to complete two more circuits oh dear', 'master annie you are my maker and i wish you well although', 'sell me', 'good evening may i help you', 'oh my oh my masteranakin my goodness i can hardlybelieve it and this must be misspadme', 'oh dear im so terribly sorrymaster annie', 'i think wed better go inside', 'master lars m

In [6]:
file1 = open("ThreepioProcessedData.txt","a")
for row in threepio_tokens:
  str_val = ' '.join(row)
  file1.writelines(str_val + '\n')
file1.close()

In [7]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(threepio_token_list)
unique_words = set(threepio_token_list)
sequences_tokenised = tokenizer.texts_to_sequences(threepio_tokens)

vocab_size = len(unique_words)+1
n_sentences = len(threepio_tokens)

In [8]:
print(unique_words)
print(sequences_tokenised)

print(vocab_size)
print(n_sentences)

{'suicide', 'grease', 'nesting', 'leia', 'things', 'had', 'surviving', 'somebody', 'inquire', 'pardonwhat', 'ready', 'so', 'didnt', 'observant', 'admit', 'know', 'bo', 'flying', 'damaged', 'nobody', 'themi', 'terminated', 'abide', 'frozen', 'strange', 'lars', 'masteranakin', 'adventures', 'most', 'heartily', 'counterpart', 'programmed', 'meltingthis', 'mean', 'mistake', 'a', 'mmmm', 'place', 'cant', 'hutt', 'looked', 'dreadful', 'alliance', 'you', 'minded', 'theyregenerating', 'fluent', 'perfect', 'considerable', 'lost', 'ill', 'oh', 'real', 'destroyer', 'showing', 'and', 'no', 'nobodywould', 'luke', 'somethingyou', 'reward', 'may', 'please', 'antilles', 'meaning', 'field', 'say', 'extreme', 'five', 'rumor', 'having', 'head', 'systems', 'suffer', 'up', 'control', 'unit', 'helpless', 'ii', 'safe', 'wasnt', 'seven', 'only', 'hundred', 'asks', 'behind', 'halfsized', 'her', 'hurry', 'anyouter', 'ourselves', 'sure', 'stick', 'yourself', 'unhappy', 'about', 'im', 'coupled', 'account', 'jump'

In [9]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
!ls
!pwd

--2021-04-15 21:28:39--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-04-15 21:28:39--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-04-15 21:28:40--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [10]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('/content/glove.6B.300d.txt')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 300))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Loaded 400000 word vectors.


In [11]:
print(embedding_matrix.shape)

(1042, 300)


In [12]:
#Create sliding windows
seq_length = 5
sequences_step = 1
dataX = []
dataY = []
for dialogue in sequences_tokenised:
  window = []
  for i in range(len(dialogue)-5):
    dataX.append(dialogue[i:i+5])
    dataY.append(dialogue[i+5])


In [13]:
print(len(dataX))
print(len(dataY))
print(dataY[1450])

2523
2523
240


In [14]:
print(dataX)
print(pd.get_dummies(dataY))

[[43, 32, 2, 32, 1], [32, 2, 32, 1, 58], [2, 32, 1, 58, 161], [32, 1, 58, 161, 207], [1, 58, 161, 207, 422], [58, 161, 207, 422, 208], [1, 162, 30, 423, 32], [162, 30, 423, 32, 2], [30, 423, 32, 2, 143], [423, 32, 2, 143, 11], [424, 9, 260, 17, 261], [9, 260, 17, 261, 5], [260, 17, 261, 5, 9], [17, 261, 5, 9, 97], [5, 9, 262, 263, 163], [9, 262, 263, 163, 122], [1, 48, 426, 2, 59], [48, 426, 2, 59, 71], [426, 2, 59, 71, 111], [2, 59, 71, 111, 72], [59, 71, 111, 72, 13], [71, 111, 72, 13, 427], [111, 72, 13, 427, 79], [72, 13, 427, 79, 10], [20, 62, 4, 429, 430], [62, 4, 429, 430, 123], [4, 429, 430, 123, 164], [429, 430, 123, 164, 5], [23, 264, 2, 17, 9], [264, 2, 17, 9, 265], [2, 17, 9, 265, 24], [17, 9, 265, 24, 1], [9, 265, 24, 1, 165], [265, 24, 1, 165, 2], [24, 1, 165, 2, 39], [74, 432, 166, 1, 55], [5, 9, 5, 9, 433], [9, 5, 9, 433, 9], [5, 9, 433, 9, 97], [9, 433, 9, 97, 1], [433, 9, 97, 1, 48], [9, 97, 1, 48, 434], [97, 1, 48, 434, 18], [1, 48, 434, 18, 24], [48, 434, 18, 24, 15

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
y = np.zeros((len(dataX), vocab_size), dtype=np.bool)
for i, sentence in enumerate(dataX):
  y[i, dataY[i]] = 1

print(y.shape)

X = pad_sequences(dataX, maxlen=5)
print(X.shape)

(2523, 1042)
(2523, 5)


In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout, GRU, Flatten
def bidirectional_lstm_model(seq_length, vocab_size):

    embedding_layer = Embedding(vocab_size,
                                300,
                                weights=[embedding_matrix],
                                input_length=5,
                                trainable=False)
    inp = Input(shape=(5,))
    x = embedding_layer(inp)
    x = Bidirectional(LSTM(200,return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(vocab_size,activation='relu')(x)
    x = Dense(vocab_size,activation='sigmoid')(x)
    model = Model(inputs=inp,outputs=x)
    optimizer = Adam(lr=learning_rate)
    callbacks=[EarlyStopping(patience=2, monitor='val_loss')]
    #call the functions in the metrics 
    model.compile(optimizer=optimizer,loss='categorical_crossentropy',metrics=['accuracy'])

    return model

In [17]:
rnn_size = 256 # size of RNN
seq_length = 5 # sequence length
learning_rate = 0.001 #learning rate

md = bidirectional_lstm_model(seq_length, vocab_size)
md.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 5)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 5, 300)            312600    
_________________________________________________________________
bidirectional (Bidirectional (None, 5, 400)            801600    
_________________________________________________________________
global_max_pooling1d (Global (None, 400)               0         
_________________________________________________________________
dense (Dense)                (None, 1042)              417842    
_________________________________________________________________
dense_1 (Dense)              (None, 1042)              1086806   
Total params: 2,618,848
Trainable params: 2,306,248
Non-trainable params: 312,600
_____________________________________________

In [18]:
batch_size = 32 # minibatch size
num_epochs = 100 # number of epochs

callbacks=[EarlyStopping(patience=4, monitor='val_loss'),
           ModelCheckpoint(filepath="./" + 'my_model_gen_sentences.{epoch:02d}-{val_loss:.2f}.hdf5',\
                           monitor='val_loss', verbose=0, mode='auto', period=2)]
#fit the model
history = md.fit(X, y,
                 batch_size=batch_size,
                 shuffle=True,
                 epochs=num_epochs,
                 callbacks=callbacks,
                 validation_split=0.1)

#save the model
md.save("/content/drive/MyDrive/Colab Notebooks/" + "my_model_generate_sentences.h5")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


In [19]:
md.save("/content/drive/MyDrive/Colab Notebooks/" + "my_model_generate_sentences.h5")

In [20]:
model = load_model("/content/drive/MyDrive/Colab Notebooks/" + "my_model_generate_sentences.h5")

In [21]:
def sample(preds, temperature=2.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [22]:
reverse_word_dict = {v: k for k, v in tokenizer.word_index.items()}

In [23]:
def generate_random_text():

  #initiate sentences
  generated = ''
  sentence = dataX[randint(0,len(dataX))]
  sentence = [' '.join([reverse_word_dict[word] for word in sentence])]
  generated += sentence[0]

  #we generate the next text
  for i in range(5):
    
      seq = tokenizer.texts_to_sequences(sentence)
      padded = pad_sequences(seq, maxlen=5)

      #calculate next word
      preds = model.predict(padded, verbose=0)[0]
      next_index = sample(preds, 0.33)
      next_word = reverse_word_dict[next_index]

      #add the next word to the text
      generated += " " + next_word
      sentence = [' '.join(sentence[0].split()[1:]) + " " + next_word]
      
  #print the whole text
  return (generated)

In [24]:
n_sentences_gen = 10
generated_sentences = []
for i in range(n_sentences_gen):
  generated_sentences.append(generate_random_text())

print(generated_sentences)

['where are we going i cant see know should you', 'gone with master luke than stay here with you well', 'told you artoodetoo you know better than to trust trust', 'were coming out of the asteroid field has direct to', 'this thats why ive also been programmed to make up', 'communicate but it has the most peculiar dialect i believe', 'has been rather a longtime on you master luke is', 'get me onto one of those youre going to get', 'oh my id forgotten how much i hate space travel', 'wait oh dear artoo artoo artoo artoo you might to']


In [25]:
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu

In [26]:
gen = generated_sentences
ref = threepio_tokens

In [27]:
print(ref)
print(gen[1].split())

[['how', 'do', 'you', 'do', 'i', 'am', 'seethreepio', 'human', 'cyborg', 'relations', 'how'], ['i', 'beg', 'your', 'pardonwhat', 'do', 'you', 'mean', 'im', 'naked'], ['contd', 'my', 'parts', 'are', 'showing', 'oh', 'my', 'goodness', 'how'], ['oh', 'my', 'space', 'travel', 'sounds', 'rather', 'perilous'], ['i', 'can', 'assure', 'you', 'they', 'will', 'never', 'get', 'me', 'onto', 'one', 'of', 'those'], ['he', 'has', 'to', 'complete', 'two', 'more', 'circuits', 'oh', 'dear'], ['master', 'annie', 'you', 'are', 'my', 'maker', 'and', 'i', 'wish', 'you', 'well', 'although'], ['sell', 'me'], ['good', 'evening', 'may', 'i', 'help', 'you'], ['oh', 'my', 'oh', 'my', 'masteranakin', 'my', 'goodness', 'i', 'can', 'hardlybelieve', 'it', 'and', 'this', 'must', 'be', 'misspadme'], ['oh', 'dear', 'im', 'so', 'terribly', 'sorrymaster', 'annie'], ['i', 'think', 'wed', 'better', 'go', 'inside'], ['master', 'lars', 'master', 'owensomebody', 'to', 'see', 'you'], ['please', 'dont', 'leave', 'us', 'miss', 'p

In [28]:
def get_bleu_score(ref, gen):
  bleu_score = 0
  for sen in gen:
    bleu_score += sentence_bleu(ref, sen.split())
  return (bleu_score/10)

In [29]:
print('BLEU score -> {}'.format(get_bleu_score(ref, gen)))

BLEU score -> 0.7684978670123553


In [30]:
file1 = open("ThreepioGeneratedSen.txt","a")
for row in generated_sentences:
  file1.writelines(row + '\n')
file1.close()