In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils 
import tensorflow as tf
import numpy as np 
import pandas as pd
import numpy as np
import string
from textwrap3 import wrap
import warnings
from nltk.corpus import stopwords
# from langdetect import detect
import nltk
warnings.filterwarnings('ignore')
import re


In [81]:
# Load the story corpus CSV directly from GitHub into a DataFrame.
df = pd.read_csv(
    "https://raw.githubusercontent.com/jessiececilya/Projects/main/hippoCorpusV2.csv"
)


Convert the `summary` column of the stories into a Python list (one summary per story)

In [4]:
# Pull the 'summary' column out of the DataFrame as a plain Python list.
corpus = df["summary"].tolist()

Clean the text data by removing punctuation, lower-casing, and dropping any non-ASCII characters

In [5]:

def text_cleaner(text):
    """Return `text` with punctuation removed, lower-cased, and reduced to ASCII.

    Args:
        text: The string to clean.

    Returns:
        A new string in which every character found in `string.punctuation`
        has been removed, the remainder lower-cased, and every non-ASCII
        character dropped (not transliterated).
    """
    kept_chars = filter(lambda ch: ch not in string.punctuation, text)
    lowered = "".join(kept_chars).lower()
    # Non-ASCII characters become bytes that are not valid ASCII, so decoding
    # with errors="ignore" silently discards them.
    utf8_bytes = lowered.encode("utf8")
    return utf8_bytes.decode("ascii", "ignore")

# Apply text_cleaner to every story summary, keeping the original order.
corpus = list(map(text_cleaner, corpus))

Consider only the first 5000 stories for the model, and tokenize the text

In [6]:
# Keep only the first 5000 cleaned stories.
corpus = corpus[:5000]
# Fit a Keras Tokenizer on those stories to build the word -> integer-id vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
# Vocabulary size, later used as the embedding input size and the number of output classes.
# The +1 presumably accounts for id 0, which Keras reserves (used as the pad value) -- TODO confirm.
total_words = len(word_index) + 1
total_words

5909

Create n-gram sequences from each story: every prefix of two or more tokens becomes one sequence. These prefixes are the patterns used to train the model.

In [7]:
# Build the training sequences: for each story, every token prefix of
# length 2 or more is stored as one n-gram sequence.
input_sequences = []

for sentence in corpus:
    token_list = tokenizer.texts_to_sequences([sentence])[0]
    input_sequences.extend(
        token_list[:prefix_len] for prefix_len in range(2, len(token_list) + 1)
    )

In [8]:
# Cap the training set at the first 30000 n-gram sequences.
# NOTE(review): sequences are appended story by story, so this keeps only the
# n-grams of the earliest stories, while the vocabulary was fit on all 5000
# stories -- confirm this mismatch is intended.
input_sequences=input_sequences[:30000]

Pad the input sequences up to the length of the longest sequence, so that they all have the same length for training

In [9]:
# Length of the longest n-gram sequence; every sequence is padded to this length.
max_sequence_len = max(map(len, input_sequences))
# Front-pad (padding='pre') so that the final token of every sequence ends up
# in the last column of the resulting array.
input_sequences = np.array(
    pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
)

Separate the input and output tokens: the first n-1 words of each sequence are used to predict the nth word

In [10]:
# Model inputs: every column of each padded sequence except the last.
predictors = input_sequences[:, :-1]
# Targets: the last token id of each sequence.
label = input_sequences[:, -1]
# One-hot encode the target ids over the whole vocabulary.
label = tf.keras.utils.to_categorical(label, num_classes=total_words)


In [11]:
# Shape of the padded sequence array: (number of sequences, padded sequence length).
input_sequences.shape

(30000, 63)

In [12]:
# Inspect the first one-hot target vector and its shape, one per line.
print(label[0], label[0].shape, sep="\n")

[0. 0. 0. ... 0. 0. 0.]
(5909,)


Create the model using a Bidirectional LSTM, as it works well for sequence classification. I have introduced a dropout layer to regularize the network, so that it does not rely too heavily on any individual neurons.

In [13]:
# Next-word prediction model:
#   Embedding      : token ids -> 10-dimensional vectors, for inputs of
#                    max_sequence_len-1 tokens (the predictor columns)
#   Bidirectional  : LSTM with 200 units per direction
#   Dropout        : rate 0.3
#   Dense softmax  : one output per vocabulary id (total_words classes)
model = Sequential([
    Embedding(total_words, 10, input_length=max_sequence_len-1),
    Bidirectional(LSTM(200)),
    Dropout(0.3),
    Dense(total_words, activation='softmax'),
])
# categorical_crossentropy matches the one-hot encoded labels.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 62, 10)            59090     
_________________________________________________________________
bidirectional (Bidirectional (None, 400)               337600    
_________________________________________________________________
dropout (Dropout)            (None, 400)               0         
_________________________________________________________________
dense (Dense)                (None, 5909)              2369509   
Total params: 2,766,199
Trainable params: 2,766,199
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
# Train on the (predictors, label) pairs for 100 epochs with progress output;
# the returned History object is kept in `history`.
history = model.fit(
    x=predictors,
    y=label,
    epochs=100,
    verbose=1,
)

In [79]:
# Generate text greedily: starting from a seed phrase, repeatedly predict the
# most likely next word and append it to the running text.
input_text = "A month ago"
# NOTE(review): `input` shadows the Python built-in; the name is kept because
# the printing cell below reads it to show the original seed.
input = input_text
next_words = 50

for _ in range(next_words):
    # Encode the text so far and pad it to the model's input length.
    token_list = tokenizer.texts_to_sequences([input_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
    # Sequential.predict_classes() was removed from recent TensorFlow releases;
    # taking the argmax of the softmax output is the documented replacement.
    probabilities = model.predict(token_list, verbose=0)
    predicted = int(np.argmax(probabilities, axis=-1)[0])
    # index_word is the tokenizer's reverse (id -> word) mapping, replacing the
    # linear scan over word_index. An id with no entry yields an empty word,
    # matching the previous behavior.
    output_word = tokenizer.index_word.get(predicted, "")
    input_text += " " + output_word



In [80]:
# Show the seed phrase, then the generated text wrapped to 50 columns.
print("Input text: ", input)
print("Output text: ")
print(*wrap(input_text, width=50), sep="\n")

Input text:  A month ago
Output text: 
A month ago of mothers and the excitement for the
milestone transition from toddler to grade
schooler and got up with pay and find the best
time there is very be and i got to her they coming
able of the lives this was never food that the
promotion and was extremely scary


In [22]:
# Replace `model` with a previously saved model loaded from disk.
# NOTE(review): hard-coded absolute path; the file must exist at this location.
model = load_model('/content/storymodel (2).h5')

In [82]:
# !pip install textwrap3