# Importing the libraries that the program needs

In [1]:
import sys
print("Python:", sys.version)

import numpy as np
print("NumPy:", np.__version__)

import pandas as pd
print("Pandas:", pd.__version__)

import sklearn
print("Scikit-learn:", sklearn.__version__)

import spacy
from spacy.tokens import Doc
from spacy.attrs import IS_TITLE, LOWER, IS_ALPHA, IS_UPPER, IS_DIGIT
print("SpaCy:", spacy.__version__)

import nltk

Python: 3.8.5 (default, Sep  4 2020, 07:30:14) 
[GCC 7.3.0]
NumPy: 1.16.4
Pandas: 0.25.3
Scikit-learn: 0.23.2
SpaCy: 2.3.4


# Downloading the brown corpus

In [2]:
import nltk
#nltk.download('brown')
from nltk.corpus import brown

In [3]:
nlp = spacy.load("en")

In [4]:
words = [word.lower() for word in brown.words()]
words[:10]

['the',
 'fulton',
 'county',
 'grand',
 'jury',
 'said',
 'friday',
 'an',
 'investigation',
 'of']

In [110]:
"""spl = int(0.80*len(words))
train = words[:spl]
test = words[spl:]"""

In [5]:
# Remove rare words from the corpus
fdist = nltk.FreqDist(w for w in words)
vocabulary = set(map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.items())))

In [6]:
lower_words = [x for x in words if x.isalpha() and x in vocabulary]

In [7]:
lower_words[:10]

['the',
 'fulton',
 'county',
 'grand',
 'jury',
 'said',
 'friday',
 'an',
 'investigation',
 'of']

In [8]:
unique_word_number = len(set(lower_words))
print("Total words: ", len(lower_words))
print("Unique words: ", unique_word_number)

Total words:  935574
Unique words:  13366


In [9]:
# organize into sequences of tokens
length = 5 + 1
sequences = list()
for i in range(length, len(lower_words)):
    seq = lower_words[i-length:i]
    line = ' '.join(seq)
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 935568


In [10]:
sequences[:3]

['the fulton county grand jury said',
 'fulton county grand jury said friday',
 'county grand jury said friday an']

In [19]:
from tensorflow.keras.preprocessing.text import Tokenizer

# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
sents = tokenizer.texts_to_sequences(sequences)

In [20]:
sents[:3]

[[1, 5458, 625, 2239, 1585, 53],
 [5458, 625, 2239, 1585, 53, 1760],
 [625, 2239, 1585, 53, 1760, 29]]

In [21]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

13367

In [22]:
from tensorflow.keras.utils import to_categorical

sents_array = np.array(sents)
X, y = sents_array[:,:-1], sents_array[:,-1]

In [23]:
y = to_categorical(y, num_classes=vocab_size, dtype=np.uint8)
seq_length = X.shape[1]

In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM

model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 5, 50)             668350    
_________________________________________________________________
lstm (LSTM)                  (None, 5, 100)            60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 13367)             1350067   
Total params: 2,169,317
Trainable params: 2,169,317
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, batch_size=128, epochs=100)

In [29]:
import string
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text
 
# turn a doc into clean tokens
def clean_doc(doc):
	# replace '--' with a space ' '
	doc = doc.replace('--', ' ')
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', string.punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# make lower case
	tokens = [word.lower() for word in tokens]
	return tokens
 
# save tokens to file, one dialog per line
def save_doc(lines, filename):
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()
 
# load document
in_filename = 'republic_clean.txt'
doc = load_doc(in_filename)
print(doc[:200])
 
# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))
 
# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
	# select sequence of tokens
	seq = tokens[i-length:i]
	# convert into a line
	line = ' '.join(seq)
	# store
	sequences.append(line)
print('Total Sequences: %d' % len(sequences))
 
# save sequences to file
out_filename = 'republic_sequences.txt'
save_doc(sequences, out_filename)

BOOK I.

I went down yesterday to the Piraeus with Glaucon the son of Ariston,
that I might offer up my prayers to the goddess (Bendis, the Thracian
Artemis.); and also because I wanted to see in what
['book', 'i', 'i', 'went', 'down', 'yesterday', 'to', 'the', 'piraeus', 'with', 'glaucon', 'the', 'son', 'of', 'ariston', 'that', 'i', 'might', 'offer', 'up', 'my', 'prayers', 'to', 'the', 'goddess', 'bendis', 'the', 'thracian', 'artemis', 'and', 'also', 'because', 'i', 'wanted', 'to', 'see', 'in', 'what', 'manner', 'they', 'would', 'celebrate', 'the', 'festival', 'which', 'was', 'a', 'new', 'thing', 'i', 'was', 'delighted', 'with', 'the', 'procession', 'of', 'the', 'inhabitants', 'but', 'that', 'of', 'the', 'thracians', 'was', 'equally', 'if', 'not', 'more', 'beautiful', 'when', 'we', 'had', 'finished', 'our', 'prayers', 'and', 'viewed', 'the', 'spectacle', 'we', 'turned', 'in', 'the', 'direction', 'of', 'the', 'city', 'and', 'at', 'that', 'instant', 'polemarchus', 'the', 'son', 'of', 'c

In [39]:
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
 
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text
 
# load
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
 
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1
 
# separate into input and output
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [32]:
vocab_size

7410

In [60]:
118633 * 7410 

879070530

In [116]:
923538 * 11692 

10798006296

In [117]:
10798006296 / 879070530

12.283435660162558

In [None]:
 923538 /751280