### Importing Lib and data

In [46]:
import string
import re
import warnings
warnings.filterwarnings("ignore")


In [1]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [23]:
# load document
in_filename = 'republic_clean.txt'
doc = load_doc(in_filename)
print(doc[:200])




BOOK I. The Republic opens with a truly Greek scene—a festival in
honour of the goddess Bendis which is held in the Piraeus; to this is
added the promise of an equestrian torch-race in the evening. 


In [24]:
# turn a doc into clean tokens
def clean_doc(doc):
    # replace '--' with a space ' '
    doc = doc.replace('--', ' ')
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # make lower case
    tokens = [word.lower() for word in tokens]
    return tokens

In [25]:
# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))


['book', 'i', 'the', 'republic', 'opens', 'with', 'a', 'truly', 'greek', 'festival', 'in', 'honour', 'of', 'the', 'goddess', 'bendis', 'which', 'is', 'held', 'in', 'the', 'piraeus', 'to', 'this', 'is', 'added', 'the', 'promise', 'of', 'an', 'equestrian', 'torchrace', 'in', 'the', 'evening', 'the', 'whole', 'work', 'is', 'supposed', 'to', 'be', 'recited', 'by', 'socrates', 'on', 'the', 'day', 'after', 'the', 'festival', 'to', 'a', 'small', 'party', 'consisting', 'of', 'critias', 'timaeus', 'hermocrates', 'and', 'another', 'this', 'we', 'learn', 'from', 'the', 'first', 'words', 'of', 'the', 'timaeus', 'when', 'the', 'rhetorical', 'advantage', 'of', 'reciting', 'the', 'dialogue', 'has', 'been', 'gained', 'the', 'attention', 'is', 'not', 'distracted', 'by', 'any', 'reference', 'to', 'the', 'audience', 'nor', 'is', 'the', 'reader', 'further', 'reminded', 'of', 'the', 'extraordinary', 'length', 'of', 'the', 'narrative', 'of', 'the', 'numerous', 'company', 'three', 'only', 'take', 'any', 'ser

### Save Clean Text

In [26]:
# organize into sequences of tokens
length = 70 + 1
sequences = list()
for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i-length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))


Total Sequences: 209707


In [27]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()


In [28]:
# save sequences to file
out_filename = 'republic_sequences.txt'
save_doc(sequences, out_filename)


### Train Language Model
### Load Sequences


In [29]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text
# load
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
lines

['book i the republic opens with a truly greek festival in honour of the goddess bendis which is held in the piraeus to this is added the promise of an equestrian torchrace in the evening the whole work is supposed to be recited by socrates on the day after the festival to a small party consisting of critias timaeus hermocrates and another this we learn from the first words of the',
 'i the republic opens with a truly greek festival in honour of the goddess bendis which is held in the piraeus to this is added the promise of an equestrian torchrace in the evening the whole work is supposed to be recited by socrates on the day after the festival to a small party consisting of critias timaeus hermocrates and another this we learn from the first words of the timaeus',
 'the republic opens with a truly greek festival in honour of the goddess bendis which is held in the piraeus to this is added the promise of an equestrian torchrace in the evening the whole work is supposed to be recited by 

###  Encode Sequences

In [13]:
from numpy import array
from pickle import dump
from tensorflow.keras.preprocessing.text import Tokenizer
# from keras.utils.vis_utils import plot_model
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [30]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
sequences

[[293,
  16,
  1,
  300,
  10309,
  28,
  7,
  363,
  227,
  1412,
  6,
  280,
  2,
  1,
  3133,
  6321,
  11,
  5,
  844,
  6,
  1,
  2709,
  4,
  30,
  5,
  1050,
  1,
  1760,
  2,
  49,
  6320,
  3724,
  6,
  1,
  2708,
  1,
  150,
  186,
  5,
  403,
  4,
  10,
  6318,
  23,
  157,
  57,
  1,
  358,
  164,
  1,
  1412,
  4,
  7,
  372,
  1247,
  3722,
  2,
  3721,
  787,
  6316,
  3,
  80,
  30,
  22,
  512,
  40,
  1,
  78,
  169,
  2,
  1],
 [16,
  1,
  300,
  10309,
  28,
  7,
  363,
  227,
  1412,
  6,
  280,
  2,
  1,
  3133,
  6321,
  11,
  5,
  844,
  6,
  1,
  2709,
  4,
  30,
  5,
  1050,
  1,
  1760,
  2,
  49,
  6320,
  3724,
  6,
  1,
  2708,
  1,
  150,
  186,
  5,
  403,
  4,
  10,
  6318,
  23,
  157,
  57,
  1,
  358,
  164,
  1,
  1412,
  4,
  7,
  372,
  1247,
  3722,
  2,
  3721,
  787,
  6316,
  3,
  80,
  30,
  22,
  512,
  40,
  1,
  78,
  169,
  2,
  1,
  787],
 [1,
  300,
  10309,
  28,
  7,
  363,
  227,
  1412,
  6,
  280,
  2,
  1,
  3133,
  6321,
  11,
  

In [33]:
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1
# separate into input and output
sequences = array(sequences)
X, y = sequences[:10000,:-1], sequences[:10000,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [34]:
def define_model(vocab_size, seq_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 50, input_length=seq_length))
    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))
    # compile network
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize defined model
    model.summary()
    # plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [35]:
# define model
model = define_model(vocab_size, seq_length)
model



<Sequential name=sequential, built=False>

In [36]:
# fit model
model.fit(X, y, batch_size=128, epochs=100)

Epoch 1/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 221ms/step - accuracy: 0.0730 - loss: 8.0850
Epoch 2/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 225ms/step - accuracy: 0.0802 - loss: 6.0964
Epoch 3/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 223ms/step - accuracy: 0.0817 - loss: 5.9490
Epoch 4/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 214ms/step - accuracy: 0.0803 - loss: 5.8417
Epoch 5/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 223ms/step - accuracy: 0.0891 - loss: 5.7931
Epoch 6/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 229ms/step - accuracy: 0.1117 - loss: 5.6499
Epoch 7/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 220ms/step - accuracy: 0.1146 - loss: 5.5509
Epoch 8/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 211ms/step - accuracy: 0.1160 - loss: 5.5158
Epoch 9/100
[1m79/79[0

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 211ms/step - accuracy: 0.3390 - loss: 2.8635
Epoch 68/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 199ms/step - accuracy: 0.3420 - loss: 2.8352
Epoch 69/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 203ms/step - accuracy: 0.3511 - loss: 2.7754
Epoch 70/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 200ms/step - accuracy: 0.3553 - loss: 2.7375
Epoch 71/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 196ms/step - accuracy: 0.3588 - loss: 2.7361
Epoch 72/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 206ms/step - accuracy: 0.3717 - loss: 2.6878
Epoch 73/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 208ms/step - accuracy: 0.3841 - loss: 2.6181
Epoch 74/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 218ms/step - accuracy: 0.3803 - loss: 2.6263
Epoch 75/100
[1m79/79[0m [

<keras.src.callbacks.history.History at 0x22aec142dd0>

In [37]:
# save the model to file
model.save('model.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))



### Use Language Model

In [63]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [39]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [40]:
# load the model
model = load_model('model.h5')
# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))



In [42]:
#Generate Text
'''The first step in generating text is preparing a seed input. We will select a random line of text
from the input text for this purpose. Once selected, we will print it so that we have some idea
of what was used.'''

# select a seed text
seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')

#encoding the seed text using the tokenizer
encoded = tokenizer.texts_to_sequences([seed_text])[0]
print('\n Encoded message',encoded)

a harmony which is diffused through the whole making the dwellers in the city to be of one mind and attuning the upper and middle and lower classes like the strings of an instrument whether you suppose them to differ in wisdom strength or wealth and now we are near the spot let us draw in and surround the cover and watch with all our eyes lest justice should slip away


 Encoded message [7, 298, 11, 5, 3960, 345, 1, 150, 330, 1, 6698, 6, 1, 168, 4, 10, 2, 32, 114, 3, 6699, 1, 873, 3, 671, 3, 507, 530, 61, 1, 1849, 2, 49, 1202, 104, 21, 179, 25, 4, 1120, 6, 315, 516, 13, 432, 3, 76, 22, 14, 835, 1, 1441, 98, 67, 616, 6, 3, 3342, 1, 2876, 3, 1586, 28, 34, 58, 354, 907, 83, 65, 4987, 243]


In [64]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        #yhat = model.predict_classes(encoded, verbose=0)
        predict_x=model.predict(encoded, verbose=0) 
        yhat=np.argmax(predict_x,axis=1)

        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [56]:
# load cleaned text sequences
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1
# load the model
model = load_model('model.h5')
# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))
tokenizer



<keras.src.legacy.preprocessing.text.Tokenizer at 0x22b5e574850>

In [67]:
# select a seed text
seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')
# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)

who has tasted the entrails of a single human victim minced up with the entrails of other victims is destined to become a wolf did you never hear it oh yes and the protector of the people is like him having a mob entirely at his disposal he is not restrained from shedding the blood of kinsmen by the favourite method of false accusation he brings them into court and murders

then is indignant that the argument only is inevitable the guardian of justice he is developing the governors of mankind may be affected by the argument and unjust imagine the giant between thee in the perfect state and the just and the orphic action in accordance with universal experience or
