In [1]:
import numpy
import re
import pandas as pd
import numpy as np
import keras
import string
import nltk
nltk.download('punkt')

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Embedding
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

string.punctuation = string.punctuation +'“'+'”'+'-'+'’'+'‘'+'—'
string.punctuation = string.punctuation.replace('.', '')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Loads the data and preprocesses data and stores corpus in raw_text
raw_text = open('wonderland_ch1.txt', encoding = 'utf8').read()

file_nl_removed = ""
for line in raw_text:
  line_nl_removed = line.replace("\n", " ")           
#removes newlines
  file_nl_removed += line_nl_removed

file_p = "".join([char for char in file_nl_removed if char not in string.punctuation])   
#removes all special characters
sents = nltk.sent_tokenize(file_p)
print("The number of sentences is", len(sents)) 
#prints the number of sentences

string.punctuation = string.punctuation + '.'
file_q = "".join([char for char in file_p if char not in string.punctuation])   #removes even periods.
words = nltk.word_tokenize(file_q)
print("The number of tokens is", len(words)) 
#prints the number of tokens

average_tokens = round(len(words)/len(sents))
print("The average number of tokens per sentence is", average_tokens) 
#prints the average number of tokens per sentence

unique_tokens = set(words)
print("The number of unique tokens are", len(unique_tokens)) 
#prints the number of unique tokens

preprocessed_text = file_p.lower()       
#converts corpus into lowercase

The number of sentences is 43
The number of tokens is 2140
The average number of tokens per sentence is 50
The number of unique tokens are 651


In [3]:
# Uses the preprocessed data and create raw_text
raw_text = preprocessed_text   #periods have not been removed for better results

# creates mapping of unique characters to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))
# Prints the total characters and character vocab size
n_chars = len(raw_text)
n_vocab = len(chars)

print("The number of total characters are", n_chars)
print("\nThe character vocab size is", n_vocab)

The number of total characters are 10801

The character vocab size is 28


In [4]:
#Prepares dataset where the input is sequence of 100 characters and target is next character.
seq_length = 100

dataX = []
dataY = []

for i in range(0, n_chars - seq_length, 1):
  seq_in = raw_text[i:i + seq_length]
  seq_out = raw_text[i + seq_length]

  dataX.append([char_to_int[char] for char in seq_in])
  dataY.append(char_to_int[seq_out])

n_patterns = len(dataX)
print ("Total Patterns: ", n_patterns)
# reshapes X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))

# one hot encodes the output variable
y = np_utils.to_categorical(dataY)

Total Patterns:  10701


In [5]:
embedding_dim =100
max_length =100

model1 = Sequential()
model1.add(Embedding(n_vocab, embedding_dim, input_length=max_length))
model1.add(LSTM(256, input_shape=(X.shape[1], embedding_dim),return_sequences=True))
model1.add(Dropout(0.2))
model1.add(LSTM(256))
model1.add(Dropout(0.2))
model1.add(Dense(y.shape[1], activation='softmax'))
model1.compile(loss='categorical_crossentropy', optimizer='adam')
model1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          2800      
                                                                 
 lstm (LSTM)                 (None, 100, 256)          365568    
                                                                 
 dropout (Dropout)           (None, 100, 256)          0         
                                                                 
 lstm_1 (LSTM)               (None, 256)               525312    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense (Dense)               (None, 28)                7196      
                                                                 
Total params: 900,876
Trainable params: 900,876
Non-trai

In [6]:
# Uses validation split of 0.2 while training
history = model1.fit(X, y, epochs = 20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [7]:
# Generates the sequence similar to above methods. Gets the generated string using the model.
def predict_next_n_chars(pattern, n):
    for i in range(n):
      x = numpy.reshape(pattern, (1, len(pattern), 1))
      prediction = model1.predict(x, verbose=0)
      print (int_to_char[numpy.argmax(prediction)], end = '')   #get next char index.
      seq_in = [int_to_char[value] for value in pattern]
      pattern.append(numpy.argmax(prediction))
      pattern = pattern[1:len(pattern)]

In [8]:
#picks a random seed
start = numpy.random.randint(0, len(dataX)-1)
pattern = dataX[start]
input_str = ''.join([int_to_char[value] for value in pattern])
print ("Seed -",  input_str, sep = '\n\n')
print ("\nGenerated string -\n")

predict_next_n_chars(pattern, 200)
# specifies an unseen input string
input_str = "The boy laughed at the fright he had caused. This time, the villagers left angrily. The third day, as the boy went up\
 the small hill, he suddenly saw a wolf attacking his sheep. He cried as hard as he could, “Wolf! Wolf! Wolf!”, but not \
 a single villager came to help him. The villagers thought that he was trying to fool them again and did not come to rescue \
 him or his sheep."

#Uses the first 100 characters from given input_str as input to generate next 200 characters. 
input_str = input_str.lower()
input_string = ''
for each in input_str:
  if each in chars:
    if (len (input_string)<100):
      input_string += each

pattern = []
pattern.append([char_to_int[char] for char in input_string])

print ("Seed -",  input_str, sep = '\n\n')
print ("\nGenerated string -\n")
predict_next_n_chars(pattern[0], 200)

Seed -

sidering how in the world she was to get out again.  the rabbithole went straight on like a tunnel f

Generated string -

or some way and stupid for life to go on in the country is you know. please maam is this no use in crying to do that said alice it was all very well to say drink me but the was not a moment she troundSeed -

the boy laughed at the fright he had caused. this time, the villagers left angrily. the third day, as the boy went up the small hill, he suddenly saw a wolf attacking his sheep. he cried as hard as he could, “wolf! wolf! wolf!”, but not  a single villager came to help him. the villagers thought that he was trying to fool them again and did not come to rescue  him or his sheep.

Generated string -

it was all very well to say drink me but the was not a moment she tround the door and see the remame of the country is you know. please maam is this no use in crying to do that said alice it was all v