Model to generate the words that follow a seed sequence:
1. Extract the relevant characters' dialogue data
2. Convert the data to token sequences of size 5
3. Encode the sentences
4. Create a bidirectional LSTM model
5. Add GloVe word embeddings

Reference:

~ https://medium.com/@plusepsilon/the-bidirectional-language-model-1f3961d1fb27

~ https://www.kaggle.com/guidant/mimicking-star-wars-characters-using-a-i-rnn#2.-Data-Preparation

~ https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/

~ https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

~ https://towardsdatascience.com/simple-text-generation-d1c93f43f340



In [1]:
# Imports
import numpy as np
import pandas as pd
import string
import os
import re
from random import randint
from tensorflow.keras.preprocessing.sequence import pad_sequences


from keras.models import Sequential
from keras.models import Model
from keras.models import load_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import Embedding
from keras.layers import Input
from keras.layers import Bidirectional
from keras.layers import GlobalMaxPool1D
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping


from __future__ import print_function
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu

# Get Pre-trained GLOVE embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
!ls
!pwd

# Load the whole GloVe embedding file into memory.
# embeddings_index maps each word to its float32 coefficient vector.
embeddings_index = dict()

# The context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids depending on the platform default (GloVe is UTF-8).
with open('/content/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

--2021-04-24 00:40:58--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-04-24 00:40:58--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-04-24 00:40:58--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [2]:
# Add file path to where the filtered data may be
# One can also add the folder temporarily in the runtime location
# path_to_file = '/content/Filtered Data/'
# --> Contains all the generated csv files
# (SW_EpisodeI.csv, SW_EpisodeII.csv, SW_EpisodeIII.csv, SW_EpisodeIV.csv, SW_EpisodeV.csv, SW_EpisodeVI.csv)
# Currently pointing to private drive folder
path_to_file = '/content/drive/MyDrive/SNLP Project/Filtered Data/'

# Read every CSV in the folder and build one dataframe of all characters and
# dialogues. Files are visited in sorted order so the row order is
# reproducible, and non-CSV entries (e.g. checkpoint folders) are ignored.
frames = []
for file in sorted(os.listdir(path_to_file)):
    if not file.lower().endswith('.csv'):
        continue
    print(file)
    df = pd.read_csv(os.path.join(path_to_file, file))
    frames.append(df)

# DataFrame.append was removed in pandas 2.0; a single concat is the
# supported replacement and avoids re-copying the frame on every iteration.
if frames:
    data = pd.concat(frames, ignore_index=True)
else:
    data = pd.DataFrame(columns=['character', 'dialogue'])

# Lower case all the characters
data['character'] = data["character"].str.lower()

# Unify all the characters (different spellings map to one name)
data['character'] = data.character.replace("obi-wan", "ben", regex=True)
data['character'] = data.character.replace("c-3po", "threepio", regex=True)

# Get all the unique characters
unique_characters = data.character.unique()

# data_dict :
#     key   -> character name
#     value -> list of all dialogues spoken by the character
data_dict = data.groupby('character')['dialogue'].apply(lambda g: g.values.tolist()).to_dict()

SW_EpisodeI.csv
SW_EpisodeII.csv
SW_EpisodeIII.csv
SW_EpisodeIV.csv
SW_EpisodeV.csv
SW_EpisodeVI.csv


In [3]:
def preprocess_text(sen):
    """Normalise one dialogue string and return it as a list of word tokens.

    Steps, in order: runs of dots become a single space, every character in
    ``string.punctuation`` is removed, digits are removed, the text is
    lower-cased and finally split on whitespace.

    Params:
        sen -> raw dialogue string

    Returns:
        list of lower-case tokens (empty list for an empty/blank string)
    """
    # Replace "...." style ellipses with a space so the neighbouring words
    # are not glued together (raw string: '\.' is not a valid str escape).
    sentence = re.sub(r'\.+', ' ', sen)

    # Remove punctuation
    sentence = re.sub('[%s]' % re.escape(string.punctuation), '', sentence)

    # Remove numbers
    sentence = ''.join(ch for ch in sentence if not ch.isdigit())

    # Lower case, then split on any run of whitespace; split() already
    # discards repeated spaces, so no separate space-collapsing pass is needed.
    return sentence.lower().split()

In [4]:
# Function to retrieve all the dialogues spoken by the character
# Params :
#     name                      -> Name of the character
#                                 (eg. ben, yoda, threepio)
#     preprocessed_sen_filename -> Name of the output file to write the filtered
#                                  data (eg. ObiwanProcessedData,
#                                            YodaProcessedData,
#                                            ThreepioProcessedData)

def get_char_df(name, preprocessed_sen_filename):
    """Preprocess one character's dialogues and append them to a text file.

    Sets three module-level globals that the later steps read:
        tokens     -> list of token lists, one per dialogue
        char_data  -> list of dialogues re-joined into cleaned strings
        token_list -> flat list of every token spoken by the character

    Raises KeyError if ``name`` is not a key of ``data_dict``.
    """
    global tokens, char_data, token_list

    # Non-string entries (e.g. NaN from an empty CSV cell) cannot be
    # preprocessed, so they are skipped.
    tokens = [preprocess_text(row) for row in data_dict[name] if isinstance(row, str)]
    char_data = [' '.join(row) for row in tokens]
    token_list = [item for sublist in tokens for item in sublist]

    # NOTE: the file is opened in append mode, so re-running for the same
    # character adds the dialogues again rather than replacing them.
    with open(preprocessed_sen_filename, "a") as preprocessed_output_file:
        preprocessed_output_file.writelines(line + '\n' for line in char_data)

In [5]:
# Function to generate the model and train the same
def model_gen(name):
    """Build, train and save the next-word prediction model for a character.

    Reads the module-level globals ``tokens`` and ``token_list`` (set by
    get_char_df) and ``embeddings_index`` (the loaded GloVe vectors).
    Sets the globals ``tokenizer``, ``reverse_word_dict`` and ``dataX``,
    which generate_sentence uses afterwards.

    Params:
        name -> character name; used as a prefix for the checkpoint files and
                the final saved model file.

    Raises:
        ValueError -> if no dialogue is longer than the sequence length, so
                      there is nothing to train on.
    """
    global tokenizer, reverse_word_dict, dataX

    seq_length = 5        # number of context words fed to the model
    embedding_dim = 300   # width of the embedding matrix rows

    # Integer-encode the dialogues
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(token_list)
    sequences_tokenised = tokenizer.texts_to_sequences(tokens)

    # Index 0 is never assigned to a word by the tokenizer, hence the +1.
    # Sizing from word_index guarantees every index fits in the matrices.
    vocab_size = len(tokenizer.word_index) + 1

    reverse_word_dict = {v: k for k, v in tokenizer.word_index.items()}

    # Weight matrix for the words in the training docs (wrt GloVe embeddings);
    # words without a pre-trained vector keep an all-zero row.
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # Sliding windows: seq_length input words -> the word that follows them
    dataX = []
    dataY = []
    for dialogue in sequences_tokenised:
        for i in range(len(dialogue) - seq_length):
            dataX.append(dialogue[i:i + seq_length])
            dataY.append(dialogue[i + seq_length])

    if not dataX:
        raise ValueError(
            "No dialogue of %r is longer than %d tokens; nothing to train on."
            % (name, seq_length))

    # One-hot encode the targets (np.bool was removed from NumPy; use bool)
    y = np.zeros((len(dataY), vocab_size), dtype=bool)
    y[np.arange(len(dataY)), dataY] = 1

    X = pad_sequences(dataX, maxlen=seq_length)

    learning_rate = 0.001
    embedding_layer = Embedding(vocab_size,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=seq_length,
                                trainable=False)
    inp = Input(shape=(seq_length,))
    x = embedding_layer(inp)
    x = Bidirectional(LSTM(200, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(vocab_size, activation='relu')(x)
    # Exactly one next word is correct per sample, so the output must be a
    # probability distribution over the vocabulary: softmax, not sigmoid,
    # is the activation that matches categorical cross-entropy.
    x = Dense(vocab_size, activation='softmax')(x)
    model = Model(inputs=inp, outputs=x)
    # `lr` is a deprecated alias; `learning_rate` is the supported keyword.
    optimizer = Adam(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    batch_size = 32   # minibatch size
    num_epochs = 100  # upper bound on epochs; early stopping may end sooner

    # NOTE(review): `period` is flagged as deprecated in newer Keras releases;
    # kept here to preserve the every-2-epochs checkpoint behaviour.
    callbacks = [EarlyStopping(patience=4, monitor='val_loss'),
                 ModelCheckpoint(filepath="./" + name + 'model_gen_sentences.{epoch:02d}-{val_loss:.2f}.hdf5',
                                 monitor='val_loss', verbose=0, mode='auto', period=2)]

    # Fit the model, holding out 10% of the windows for validation
    history = model.fit(X, y,
                        batch_size=batch_size,
                        shuffle=True,
                        epochs=num_epochs,
                        callbacks=callbacks,
                        validation_split=0.1)

    # Save the model
    model.save("/content/drive/MyDrive/Colab Notebooks/" + name + "_model_generate_sentences.h5")


In [6]:
# Function to draw the next word index from the predicted probabilities
def sample(preds, temperature=2.0):
    """Randomly sample one index from a probability array with temperature.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised, then a single index is drawn from that distribution.
    Temperatures below 1 sharpen the distribution, above 1 flatten it.

    Params:
        preds       -> 1-D array-like of non-negative scores/probabilities
        temperature -> re-weighting temperature

    Returns:
        the sampled index (numpy integer)

    Raises:
        ValueError -> if ``preds`` has no strictly positive entry
    """
    preds = np.asarray(preds).astype('float64')

    # log(0) = -inf is fine here (it becomes probability 0), so silence the
    # divide-by-zero warning instead of letting it spam the output.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    peak = np.max(logits)
    if not np.isfinite(peak):
        raise ValueError("preds must contain at least one positive probability")

    # Subtracting the maximum before exponentiating leaves the normalised
    # result unchanged but prevents every term underflowing to 0 (which would
    # give 0/0 = NaN) when the temperature is small.
    exp_preds = np.exp(logits - peak)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [7]:
# Function to generate the sentences based on random seed words
def generate_sentence(model, n_words=5):
    """Generate a sentence: a random training window plus sampled next words.

    A seed window is picked at random from the global ``dataX``; the model
    then predicts ``n_words`` further words one at a time, each time sliding
    the window forward by one word.

    Params:
        model   -> trained model whose predict() returns one score per
                   vocabulary index
        n_words -> number of words to generate after the seed (default 5)

    Returns:
        the seed words followed by the generated words, space separated

    Raises:
        ValueError -> if ``dataX`` is empty
    """
    if not dataX:
        raise ValueError("dataX is empty; run model_gen() first")

    # randint is inclusive on both ends, so the upper bound must be len - 1;
    # using len(dataX) would occasionally raise IndexError.
    window = list(dataX[randint(0, len(dataX) - 1)])
    seq_length = len(window)
    words = [reverse_word_dict[idx] for idx in window]

    for _ in range(n_words):
        padded = pad_sequences([window], maxlen=seq_length)

        # Calculate the next word
        preds = np.array(model.predict(padded, verbose=0)[0], dtype='float64')
        # Index 0 is the padding slot and has no word in reverse_word_dict;
        # zero it out so it can never be sampled (would raise KeyError).
        preds[0] = 0.0
        next_index = sample(preds, 0.33)

        # Add the next word and slide the window forward by one
        words.append(reverse_word_dict[next_index])
        window = window[1:] + [int(next_index)]

    return ' '.join(words)

In [8]:
# Get the average BLEU score for the generated dialogues
# Params :
#     ref -> List of tokenised training dialogues
#     gen -> List of generated dialogues

def get_bleu_score(ref, gen):
    """Return the mean sentence-level BLEU score of ``gen`` against ``ref``.

    Each generated dialogue is split on whitespace and scored against the
    full list of reference token lists; the scores are then averaged.
    Returns 0.0 when ``gen`` is empty.
    """
    if not gen:
        return 0.0
    total = sum(sentence_bleu(ref, sen.split()) for sen in gen)
    # Average over the actual number of sentences rather than a fixed 10.
    return total / len(gen)

In [9]:
# Run the whole pipeline for one character: preprocess, train, generate, score
# Params :
#     name                      -> Name of character
#     preprocessed_sen_filename -> Name of filtered dialogues output file
#     output_sen_filename       -> Name of generated dialogues output file

def main(name, preprocessed_sen_filename, output_sen_filename):
    """Train a model for ``name`` and append generated sentences to a file.

    The output file receives one generated sentence per line, a blank line,
    and a final line with the average BLEU score of the generated sentences
    against the character's tokenised training dialogues.
    """
    # Preprocess the character's dialogues (sets the globals used below)
    get_char_df(name, preprocessed_sen_filename)

    # Prepare and train the model
    model_gen(name)

    # Load the trained model back from where model_gen saved it
    model = load_model("/content/drive/MyDrive/Colab Notebooks/" + name + "_model_generate_sentences.h5")

    # Generate 10 sentences, each from a randomly selected seed window
    n_sentences_gen = 10
    generated_sentences = [generate_sentence(model) for _ in range(n_sentences_gen)]

    # The context manager closes the file even if scoring fails part-way.
    with open(output_sen_filename, "a") as output_file:
        for row in generated_sentences:
            output_file.write(row + '\n')

        # Bleu score of the output
        output_file.write('\n')
        output_file.write('BLEU score for {} -> {}'.format(name, get_bleu_score(tokens, generated_sentences)))
        # The file is opened in append mode, so end with a newline; otherwise
        # the next run's first sentence is glued onto the score line.
        output_file.write('\n')


Below are the commands to run to generate the dialogues for three characters (Obi-Wan Kenobi, Yoda and C-3PO).

In [10]:
# Run the pipeline for Obi-Wan; "obi-wan" was unified to "ben" during data loading.
main("ben", "ObiWanProcessedData.txt","ObiWanGeneratedSen.txt")

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 5)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 5, 300)            435300    
_________________________________________________________________
bidirectional (Bidirectional (None, 5, 400)            801600    
_________________________________________________________________
global_max_pooling1d (Global (None, 400)               0         
_________________________________________________________________
dense (Dense)                (None, 1451)              581851    
_________________________________________________________________
dense_1 (Dense)              (None, 1451)              2106852   
Total params: 3,925,603
Trainable params: 3,490,303
Non-trainable params: 435,300
_____________________________________________

In [11]:
# Run the pipeline for Yoda: preprocess, train, then write generated sentences and BLEU score.
main("yoda", "YodaProcessedData.txt","YodaGeneratedSen.txt")

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 5)]               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 5, 300)            190500    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 5, 400)            801600    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 400)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 635)               254635    
_________________________________________________________________
dense_3 (Dense)              (None, 635)               403860    
Total params: 1,650,595
Trainable params: 1,460,095
Non-trainable params: 190,500
___________________________________________

In [12]:
# Run the pipeline for C-3PO; "c-3po" was unified to "threepio" during data loading.
main("threepio", "ThreepioProcessedData.txt","ThreepioGeneratedSen.txt")

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 5)]               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 5, 300)            308700    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 5, 400)            801600    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 400)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1029)              412629    
_________________________________________________________________
dense_5 (Dense)              (None, 1029)              1059870   
Total params: 2,582,799
Trainable params: 2,274,099
Non-trainable params: 308,700
___________________________________________