# Game ot thrones
## Text Generation
<hr>

### Modelling

In [1]:
#import libraries
import os
import numpy as np
import pandas as pd
from random import randint
from scipy import sparse

from pickle import dump

from keras.preprocessing.text import Tokenizer
from keras.utils import plot_model
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.utils import pad_sequences

#### Read the sequences

In [2]:
file = open('../datasets/got1_sequences.txt', 'r')
# read all text
txtDataset = file.read()
# close the file
file.close()
lstSequences = txtDataset.split('\n')

### Encode Sequences
The word embedding layer expects input sequences to be comprised of integers. We can map
each word in our vocabulary to a unique integer and encode our input sequences. Later, when
we make predictions, we can convert the prediction to numbers and look up their associated
words in the same mapping. To do this encoding, we will use the Tokenizer class in the Keras
API.

In [5]:
lstSequences[100]

'saw them gared said if he says they are dead thats proof enough for me will had known they would drag him into the quarrel sooner or later he wished it had been later rather than sooner my mother told me that dead men sing no songs he put in my'

In [15]:

tokenizer = Tokenizer()
tokenizer.fit_on_texts(lstSequences)
sequences = tokenizer.texts_to_sequences(lstSequences)

In [13]:
sequences[0]

[4,
 1116,
 5,
 1741,
 1323,
 46,
 5,
 4,
 1032,
 5,
 602,
 2,
 248,
 65,
 3339,
 11938,
 11938,
 2403,
 11940,
 63,
 181,
 1367,
 57,
 1096,
 1207,
 16,
 1,
 791,
 252,
 3,
 1095,
 228,
 126,
 32,
 1,
 2138,
 55,
 148,
 59,
 1,
 148,
 2012,
 10,
 40,
 1322,
 889,
 143,
 18,
 230,
 1,
 2580]

In [5]:
vocab_size=len(tokenizer.word_index) + 1
print (f'Size of vocabulary : {vocab_size}')

Size of vocabulary : 11941


Now that we have encoded the input sequences, we need to separate them into input (X) and
output (y) elements, remember in the previos stage I split the text into a 50 words secuence + 1 to be the target label

In [12]:
#original script 
sequences = np.array(sequences,dtype=object)


In [25]:
#option 3. using pandas
dfsequences = pd.DataFrame(sequences, index=None)
dfsequences

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43,44,45,46,47,48,49,50,51,52
0,4,1116,5,1741,1323,46,5,4,1032,5,...,40,1322,889,143,18,230,1,2580,,
1,1116,5,1741,1323,46,5,4,1032,5,602,...,1322,889,143,18,230,1,2580,5,,
2,5,1741,1323,46,5,4,1032,5,602,2,...,889,143,18,230,1,2580,5,4,,
3,1741,1323,46,5,4,1032,5,602,2,248,...,143,18,230,1,2580,5,4,274,,
4,1323,46,5,4,1032,5,602,2,248,65,...,18,230,1,2580,5,4,274,1096,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293481,3,9,190,9,102,3337,340,1005,11937,31,...,5,187,1,157,125,744,18,1,,
293482,9,190,9,102,3337,340,1005,11937,31,99,...,187,1,157,125,744,18,1,1742,,
293483,190,9,102,3337,340,1005,11937,31,99,285,...,1,157,125,744,18,1,1742,5,,
293484,9,102,3337,340,1005,11937,31,99,285,2,...,157,125,744,18,1,1742,5,536,,


In [27]:
X = dfsequences.iloc[:,0:49]
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39,40,41,42,43,44,45,46,47,48
0,4,1116,5,1741,1323,46,5,4,1032,5,...,1,148,2012,10,40,1322,889,143,18,230
1,1116,5,1741,1323,46,5,4,1032,5,602,...,148,2012,10,40,1322,889,143,18,230,1
2,5,1741,1323,46,5,4,1032,5,602,2,...,2012,10,40,1322,889,143,18,230,1,2580
3,1741,1323,46,5,4,1032,5,602,2,248,...,10,40,1322,889,143,18,230,1,2580,5
4,1323,46,5,4,1032,5,602,2,248,65,...,40,1322,889,143,18,230,1,2580,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293481,3,9,190,9,102,3337,340,1005,11937,31,...,137,101,11,1809,5,187,1,157,125,744
293482,9,190,9,102,3337,340,1005,11937,31,99,...,101,11,1809,5,187,1,157,125,744,18
293483,190,9,102,3337,340,1005,11937,31,99,285,...,11,1809,5,187,1,157,125,744,18,1
293484,9,102,3337,340,1005,11937,31,99,285,2,...,1809,5,187,1,157,125,744,18,1,1742


In [28]:
y = dfsequences.iloc[:,50]
y.dtype

dtype('int64')

In [29]:

X = dfsequences.iloc[:,0:49].values
y = dfsequences.iloc[:,50].values
y = to_categorical(y, num_classes=vocab_size)
#y= sparse.csr_matrix((np.ones(len(y)), (np.arange(len(y)), y)), shape=(len(y), vocab_size))
seq_length = X.shape[1]

In [35]:
print(y.shape)

(293486, 11941)


Let's create the model

In [31]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))


In [70]:
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summarize defined model
model.summary()
plot_model(model, to_file='../assets/model.png', show_shapes=True)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 49, 50)            597050    
                                                                 
 lstm (LSTM)                 (None, 49, 100)           60400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 11941)             1206041   
                                                                 
Total params: 1,953,991
Trainable params: 1,953,991
Non-trainable params: 0
_________________________________________________________________
You must install pydot (`pip install pydot`) a

In [71]:
# fit model
model.fit(X, y, batch_size=128, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x254b7f18f40>

### Saving the model

In [98]:
# save the model to file
model.save('../model/model_got.h5')
# save the tokenizer
dump(tokenizer, open('../model/tokenizer_got.pkl', 'wb'))


### Predict

In [73]:
# select a seed text whatever
seed_text = lstSequences[randint(0,len(lstSequences))]
print(seed_text + '\n')


toward harrenhal burning as he goes grim and grimmer thought catelyn it was worse than shed imagined you mean to meet him here she asked if he comes so far but no one thinks he will robb said ive sent word to howland reed fathers old friend at greywater watch if



In order to predict I need to encode selected line with the same tokenizer

In [74]:
encodedLine = tokenizer.texts_to_sequences([seed_text])[0]

In [65]:
print(f'seed line : {seed_text}')
print(f'encoded line : {encodedLine}')

seed line : on his face watching heward turn over tiles and enjoying the view page 000 ned paused at the foot of the stair and pulled on his gloves its time we took our leave my business here is done heward lurched to his feet hurriedly gathering up his things as you will
encoded line : [24, 7, 88, 679, 3320, 743, 89, 9611, 2, 4535, 1, 2201, 67, 77, 51, 2021, 23, 1, 566, 5, 1, 1736, 2, 366, 24, 7, 1906, 99, 101, 63, 119, 195, 264, 28, 1117, 86, 27, 225, 3320, 1941, 3, 7, 190, 3295, 2223, 50, 7, 377, 16, 10, 43]


In [96]:
lstResult = list()
in_text = seed_text
# generate a fixed number of words, 50 because is the lenght of line
for iteration in range(50): 
    # encode the text as integer
    encodedLine = tokenizer.texts_to_sequences([in_text])[0]
    # truncate sequences to a fixed length, maxlen= 50 -1
    encodedLine = pad_sequences([encodedLine], maxlen=49, truncating='pre')
    
    # predict probabilities for each word
    prediction = np.argmax(model.predict(encodedLine), axis=-1)
    print(f'the index predicted {prediction} in iteration number {iteration}')
    
    # map predicted word index to word
    out_word = ''
    out_word = {i for i in tokenizer.word_index if tokenizer.word_index[i]==prediction}
    
    predictedWord = out_word.pop()
    in_text += ' ' + predictedWord
    lstResult.append(predictedWord)


the index predicted [55] in iteration number 0
the index predicted [6355] in iteration number 1
the index predicted [41] in iteration number 2
the index predicted [1] in iteration number 3
the index predicted [5] in iteration number 4
the index predicted [55] in iteration number 5
the index predicted [2947] in iteration number 6
the index predicted [14] in iteration number 7
the index predicted [10] in iteration number 8
the index predicted [3] in iteration number 9
the index predicted [4] in iteration number 10
the index predicted [5] in iteration number 11
the index predicted [1556] in iteration number 12
the index predicted [2011] in iteration number 13
the index predicted [903] in iteration number 14
the index predicted [1] in iteration number 15
the index predicted [5] in iteration number 16
the index predicted [5] in iteration number 17
the index predicted [1] in iteration number 18
the index predicted [8] in iteration number 19
the index predicted [5] in iteration number 20
the 

In [97]:
#finally I print the resultant line
print(' '.join(lstResult))

are prickly me the of are beloved i you to a of lives stewards eat the of of the was of the was of the was speaks escaped said the of of eyrie unhappy and grant the prowess the insolence come the of gerold out garrons dozen bows horsemen a


The poor accuracy affect the quality of generated text. I think I'm going to change the text preprocessing