### Statistical Language Model Trained on Charles Dickens's A Christmas Carol

In [11]:
# Read the whole book into memory. The context manager closes the handle
# even if read() raises, which the manual open()/close() pair did not.
with open('christmascarol.txt', 'r') as file:
    text = file.read()



In [12]:
# Show the first 300 characters of the loaded text.
print(text[:300])

In Prose

BEING A GHOST STORY OF CHRISTMAS




STAVE ONE

MARLEY'S GHOST


Marley was dead, to begin with. There is no doubt whatever about that.
The register of his burial was signed by the clergyman, the clerk, the
undertaker, and the chief mourner. Scrooge signed it. And Scrooge's name
was good u


* Replace '-' with a space so that hyphen-joined words split cleanly.
* Split words based on white space.
* Remove all punctuation from words to reduce the vocabulary size (e.g. ‘What?’ becomes ‘What’).
* Remove all words that are not alphabetic to remove standalone punctuation tokens.
* Normalize all words to lowercase to reduce the vocabulary size.

In [13]:
import string

def staging(text):
    """Turn raw text into a list of lowercase, purely alphabetic word tokens.

    Steps, in order:
      1. Replace every '-' with a space so hyphen-joined words split apart.
      2. Split on whitespace.
      3. Strip every ``string.punctuation`` character from each token.
      4. Drop tokens that are not purely alphabetic (numbers, and the empty
         strings left behind by standalone punctuation).
      5. Lowercase what remains.

    Args:
        text: The raw document as a single string.

    Returns:
        list[str]: The cleaned tokens, in their original order.
    """
    text = text.replace('-', ' ')
    tokens = text.split()

    # remove punctuation from each token: one translation table that deletes
    # every character listed in string.punctuation
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]

    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # make lower case
    tokens = [word.lower() for word in tokens]

    return tokens

# Clean the full text and report how many word tokens remain.
tokens = staging(text)
print(len(tokens))

28834


In [14]:
# Each saved line holds seq_len consecutive tokens; the later X/y split uses
# the first 50 as input and the last one as the word to predict.
seq_len = 51
seq_all = []

# Slide a seq_len-wide window over the tokens, advancing one token at a time.
# NOTE(review): the range stops at len(tokens) - 1, so the window that ends on
# the very last token is never emitted. Use len(tokens) + 1 as the upper bound
# if that final sequence should be included (the count below grows by one).
for i in range(seq_len , len(tokens)):
    seq = tokens[i - seq_len : i]
    line = ' '.join(seq)
    seq_all.append(line)
print("Number of sequences to avail: ", len(seq_all))


#save to file, one sequence per line; the context manager guarantees the
#handle is flushed and closed even if the write fails
data = '\n'.join(seq_all)
with open('data_in.txt', 'w') as file:
    file.write(data)


Number of sequences to avail:  28783


In [18]:
#load file into memory

def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read (opened in text mode).

    Returns:
        str: The full contents of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

# Reload the saved sequences and split them back into one string per line.
filename_0 = 'data_in.txt'
content = load_doc(filename_0)
lines = content.split('\n')



In [30]:
from keras.preprocessing.text import Tokenizer

# Fit the tokenizer on the sequence lines, then encode every line as a list
# of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
# One more than the number of distinct words: per the Keras Tokenizer docs the
# word ids start at 1, so index 0 needs its own slot in embedding/output layers.
vocab_size = len(tokenizer.word_index) + 1   #non-zero offset

In [47]:
import numpy as np
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

# Split each encoded window: every column but the last is the model input,
# and the last column is the word to predict (one-hot over the vocabulary).
sequences = np.array(sequences)
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)
seq_length = X.shape[1]

In [46]:

# define model: embedding -> two stacked LSTMs -> dense hidden layer ->
# softmax over the whole vocabulary
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
# return_sequences=True hands the full sequence to the second LSTM
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the cell output.
model.summary()


# compile model: y is one-hot encoded, hence categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=100)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            214150    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 4283)              432583    
Total params: 797,633
Trainable params: 797,633
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
E

<keras.callbacks.callbacks.History at 0x7f47c8018a60>

In [48]:
from pickle import dump, load

# save the model 
model.save('model.h5')
# save the tokenizer as pkl; the context manager flushes and closes the file,
# which the bare open() passed straight to dump() never did
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [49]:
# load model and tokenizer

model = load_model('model.h5')
# NOTE: pickle can execute arbitrary code while loading; only unpickle files
# this notebook wrote itself. The context manager closes the handle, which
# the bare open() passed straight to load() never did.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)