Model to generate a sequence of following words:
1. Extract relevant characters data
2. Convert data to token sequence of size 5
3. Encode the sentence 
4. Create a bidirectional LSTM model
5. Add Glove word embeddings

Reference:

~ https://medium.com/@plusepsilon/the-bidirectional-language-model-1f3961d1fb27

~ https://www.kaggle.com/guidant/mimicking-star-wars-characters-using-a-i-rnn#2.-Data-Preparation

~ https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/

~ https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

~ https://towardsdatascience.com/simple-text-generation-d1c93f43f340



In [122]:
# Imports
import numpy as np
import pandas as pd
import string
from random import randint
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.models import Model
from keras.models import load_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import Embedding
from keras.layers import Input
from keras.layers import Bidirectional
from keras.layers import GlobalMaxPool1D
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping


from __future__ import print_function
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu

# Get Pre-trained GLOVE embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
!ls
!pwd

#load the whole embedding into memory
embeddings_index = dict()
f = open('/content/glove.6B.300d.txt')

for line in f:
  values = line.split()
  word = values[0]
  coefs = np.asarray(values[1:], dtype='float32')
  embeddings_index[word] = coefs

f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

--2021-04-22 01:37:29--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-04-22 01:37:29--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-04-22 01:37:29--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.6’


2021

In [123]:
# Add file path to where the filtered data may be
# One can also add the folder temporarily in the runtime location
# path_to_file = '/content/Filtered Data/'
path_to_file = '/content/drive/MyDrive/SNLP Project/Filtered Data/'

# Create Dataframe of all characters and dialogues
data = pd.DataFrame(columns = ['character', 'dialogue'])

for file in os.listdir(path_to_file):
    print(file)
    df = pd.read_csv(path_to_file+file)
    data = data.append(df, ignore_index=True)

data['character'] = data["character"].str.lower()

data['character'] = data.character.replace("obi-wan", "ben", regex=True)
data['character'] = data.character.replace("c-3po", "threepio", regex=True)

unique_characters = data.character.unique()

data_dict = data.groupby('character')['dialogue'].apply(lambda g: g.values.tolist()).to_dict()

SW_EpisodeI.csv
SW_EpisodeII.csv
SW_EpisodeIII.csv
SW_EpisodeIV.csv
SW_EpisodeV.csv
SW_EpisodeVI.csv


In [124]:
def preprocess_text(sen):

    # Remove ....
    sentence = re.sub('\.+', ' ', sen)

    # Remove punctuations
    sentence = re.sub('[%s]' % re.escape(string.punctuation), '', sentence)

    # Remove extra spaces
    sentence = re.sub(' +', ' ', sentence)

    # Remove numbers
    sentence = ''.join(filter(lambda x: not x.isdigit(), sentence))

    # Lower case
    sentence = sentence.lower()

    # Return a list of tokens (words)
    sentence = sentence.split()

    return sentence

In [125]:
def get_char_df(name, preprocessed_sen_filename):

  global tokens, char_data, token_list

  tokens = [preprocess_text(row) for row in data_dict[name]]
  char_data = [' '.join(row) for row in tokens]
  token_list = [item for sublist in tokens for item in sublist]

  preprocessed_output_file = open(preprocessed_sen_filename,"a")
  for row in tokens:
    str_val = ' '.join(row)
    preprocessed_output_file.writelines(str_val + '\n')
  preprocessed_output_file.close() 

In [126]:
def model_gen(name):

  global tokenizer, reverse_word_dict, dataX

  # integer encode sequences of words
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(token_list)
  unique_words = set(token_list)
  sequences_tokenised = tokenizer.texts_to_sequences(tokens)

  vocab_size = len(unique_words)+1
  n_sentences = len(tokens)

  print("# of Unique Words: ", unique_words)
  print("Tokenised sequences ", sequences_tokenised)
  print("Vocabulary Size: ", vocab_size)
  print("Number of sentences ", n_sentences)

  reverse_word_dict = {v: k for k, v in tokenizer.word_index.items()} 

  # create a weight matrix for words in training docs
  embedding_matrix = np.zeros((vocab_size, 300))

  for word, i in tokenizer.word_index.items():
      embedding_vector = embeddings_index.get(word)
      if embedding_vector is not None:
          embedding_matrix[i] = embedding_vector

  #Create sliding windows of size 5
  seq_length = 5
  dataX = []
  dataY = []
  for dialogue in sequences_tokenised:
    for i in range(len(dialogue)-5):
      dataX.append(dialogue[i:i+5])
      dataY.append(dialogue[i+5])

  y = np.zeros((len(dataY), vocab_size), dtype=np.bool)
  for i, sentence in enumerate(dataY):
    y[i, dataY[i]] = 1

  X = pad_sequences(dataX, maxlen=5)

  learning_rate = 0.001
  embedding_layer = Embedding(vocab_size,
                                300,
                                weights=[embedding_matrix],
                                input_length=seq_length,
                                trainable=False)
  inp = Input(shape=(seq_length,))
  x = embedding_layer(inp)
  x = Bidirectional(LSTM(200,return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
  x = GlobalMaxPool1D()(x)
  x = Dense(vocab_size,activation='relu')(x)
  x = Dense(vocab_size,activation='sigmoid')(x)
  model = Model(inputs=inp,outputs=x)
  optimizer = Adam(lr=learning_rate)

  #call the functions in the metrics 
  model.compile(optimizer=optimizer,loss='categorical_crossentropy',metrics=['accuracy'])
  model.summary()

  batch_size = 32 # minibatch size
  num_epochs = 100 # number of epochs

  callbacks=[EarlyStopping(patience=4, monitor='val_loss'),
              ModelCheckpoint(filepath="./" + name + 'model_gen_sentences.{epoch:02d}-{val_loss:.2f}.hdf5',\
                              monitor='val_loss', verbose=0, mode='auto', period=2)]
  #fit the model
  history = model.fit(X, y,
                    batch_size=batch_size,
                    shuffle=True,
                    epochs=num_epochs,
                    callbacks=callbacks,
                    validation_split=0.1)

  #save the model
  model.save("/content/drive/MyDrive/Colab Notebooks/" + name + "_model_generate_sentences.h5")


In [127]:
def sample(preds, temperature=2.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [128]:
def generate_sentence(model):

  #initiate sentences
  generated = ''
  sentence = dataX[randint(0,len(dataX))]
  sentence = [' '.join([reverse_word_dict[word] for word in sentence])]
  generated += sentence[0]

  #we generate the next text
  for i in range(5):
    
      seq = tokenizer.texts_to_sequences(sentence)
      padded = pad_sequences(seq, maxlen=5)

      #calculate next word
      preds = model.predict(padded, verbose=0)[0]
      next_index = sample(preds, 0.33)
      next_word = reverse_word_dict[next_index]

      #add the next word to the text
      generated += " " + next_word
      sentence = [' '.join(sentence[0].split()[1:]) + " " + next_word]
      
  #print the whole text
  return (generated)

In [129]:
def get_bleu_score(ref, gen):
  bleu_score = 0
  for sen in gen:
    bleu_score += sentence_bleu(ref, sen.split())
  return (bleu_score/10)

In [130]:
def main(name, preprocessed_sen_filename, output_sen_filename):

  # Get individual character Dataframe
  get_char_df(name, preprocessed_sen_filename)

  # Prepare and train the model
  model_gen(name)

  # Load the trained model
  model = load_model("/content/drive/MyDrive/Colab Notebooks/" + name + "_model_generate_sentences.h5")

  # Generate 10 random sentences from 10 randomly selected seed sentences from original tokens
  n_sentences_gen = 10
  generated_sentences = []
  for i in range(n_sentences_gen):
    generated_sentences.append(generate_sentence(model))

  # Create output file for generated sentences
  output_file = open(output_sen_filename,"a")
  for row in generated_sentences:
    output_file.writelines(row + '\n')
  

  # Check Bleu Score of the output
  output_file.writelines('\n')
  output_file.writelines('BLEU score for {} -> {}'.format(name, get_bleu_score(tokens, generated_sentences)))

  output_file.close()


In [131]:
main("ben", "ObiWanProcessedData.txt","ObiWanGeneratedSen.txt")

Tokenised sequences  [[4, 12, 5, 275, 276, 91, 15], [29, 13, 91, 1, 277, 23, 29], [23, 79, 550, 4, 124, 11, 384, 7, 1, 551], [98, 23, 125, 27, 2, 68, 1, 552, 553, 25, 554, 30], [555, 385, 143, 15, 277, 10, 386, 1, 556, 557], [42, 12, 558, 559], [29, 111, 560, 278], [2, 44, 50, 91, 87, 160, 23, 1, 561, 44], [144, 23, 1, 562, 563, 37, 228], [98, 23], [279, 15], [59, 42, 99, 74, 42, 25, 564, 74, 565, 74, 145, 161], [28, 191, 15, 43, 13, 60, 112, 280, 26, 566, 567], [281, 162, 80, 88, 192, 7, 1, 568, 42, 25, 146], [2, 8, 1, 282, 283, 5, 569, 570, 31, 571, 3, 572, 7], [23, 279, 5, 573], [24, 22, 284, 7, 51, 23], [23, 75, 27, 2, 163, 574, 164, 387, 165, 575, 285], [75, 44, 2, 388, 166, 166], [42, 388, 2, 229, 69, 389], [193, 21, 1, 286], [44, 576, 167], [390, 100], [2, 577, 9], [31, 10, 9], [47, 81, 39], [18, 19, 287], [230, 88, 288, 7, 18, 168, 578, 391, 1, 579, 7], [45, 169, 39, 8, 163, 63, 7, 289], [39, 23, 231, 29, 290, 63, 7, 1, 101, 580, 1], [29, 392, 32, 170, 34, 581], [582, 9, 231], 



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


In [133]:
main("yoda", "YodaProcessedData.txt","YodaGeneratedSen.txt")

# of Unique Words:  {'lets', 'ourselves', 'aboutpolitics', 'have', 'missingfrom', 'palpatine', 'luke', 'safe', 'learn', 'immortality', 'opinion', 'today', 'with', 'solitude', 'barely', 'through', 'nevertheless', 'obiwan', 'little', 'dismantling', 'influence', 'out', 'you', 'opens', 'feeling', 'rim', 'makes', 'respect', 'assassin', 'hundredjedi', 'clonewar', 'wait', 'friend', 'that', 'strong', 'kenobi', 'war', 'would', 'powers', 'must', 'obiwanyounglings', 'welcome', 'how', 'sidious', 'creates', 'kashyyyk', 'not', 'wemust', 'sky', 'or', 'our', 'most', 'burden', 'sleep', 'temple', 'doing', 'confident', 'one', 'maste', 'security', 'grievous', 'only', 'windu', 'obiwans', 'tothe', 'agree', 'tatooine', 'shadow', 'continuingthe', 'remains', 'powerful', 'droid', 'own', 'nothing', 'too', 'hard', 'chewbacca', 'then', 'transform', 'lightly', 'jedis', 'vader', 'oh', 'misread', 'just', 'failure', 'goand', 'yes', 'many', 'puzzle', 'agent', 'other', 'impossible', 'things', 'eveything', 'twilight', 't



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [135]:
main("threepio", "ThreepioProcessedData.txt","ThreepioGeneratedSen.txt")

# of Unique Words:  {'venture', 'lets', 'oil', 'rusty', 'hasnt', 'better', 'somethingshappened', 'behave', 'curse', 'ourselves', 'hangar', 'have', 'commented', 'tricked', 'luke', 'expect', 'safe', 'replace', 'readily', 'alternative', 'bad', 'deity', 'itif', 'hersoh', 'opinion', 'with', 'ever', 'needing', 'bounty', 'bear', 'barely', 'lars', 'through', 'aatwentythree', 'switch', 'obiwan', 'load', 'impersonate', 'damage', 'leia', 'said', 'least', 'entirely', 'little', 'worries', 'dull', 'saying', 'weight', 'empire', 'delusions', 'out', 'sent', 'fortunate', 'you', 'havent', 'alliance', 'veranda', 'person', 'overweight', 'twenty', 'peculiar', 'feeling', 'naked', 'wooly', 'heading', 'thirtyfive', 'makes', 'mischka', 'surrender', 'extreme', 'supposed', 'bolt', 'fast', 'wait', 'maam', 'odds', 'friend', 'therell', 'that', 'rocky', 'approximately', 'ey', 'mutualattraction', 'kenobi', 'navigating', 'shutting', 'would', 'million', 'condition', 'responsibility', 'must', 'welcome', 'quitefound', 'ta



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
