In [None]:
import re
import string
import numpy as np
from unicodedata import normalize
from nltk import word_tokenize

def load_file(path: str):
    """Read the UTF-8 encoded text file at ``path`` and return its whole content as one string."""
    with open(path, mode='r', encoding='utf8') as handle:
        return handle.read()

def clean_data(content: str, min_words: int = 15):
    """Split raw text into cleaned, lowercase paragraphs ready for tokenization.

    The text is lowercased, accented letters are folded onto their base letter,
    and the result is split into paragraphs on runs of two or more newlines.
    Inside each paragraph every run of characters other than ``a-z``,
    whitespace or an apostrophe becomes a single space, apostrophes are then
    deleted (``don't`` -> ``dont``), and whitespace is collapsed to single
    spaces.

    Args:
        content: Raw document text.
        min_words: Paragraphs with ``min_words`` or fewer words are discarded,
            so the default of 15 keeps paragraphs of 16 words or more.

    Returns:
        A 1-D NumPy string array holding one cleaned paragraph per element.
        The array has shape ``(0,)`` when no paragraph is long enough.
    """
    # Lowercase text and remove leading and trailing newlines
    text = content.lower().strip('\n')
    # Decompose accented letters (NFD) and drop the combining marks so that
    # e.g. "naïve" becomes "naive". Leaving the marks in would let the
    # character filter below turn them into spaces and split the word.
    text = re.sub(r'[\u0300-\u036f]', '', normalize('NFD', text))
    # Paragraphs are separated by at least two newlines
    par_match = re.compile(r'\n{2,}')
    # Anything that is not a lowercase letter, whitespace or an apostrophe
    junk_match = re.compile(r"[^a-z\s']+")
    cleaned = []
    for paragraph in par_match.split(text):
        # Apostrophes survive the substitution and are removed afterwards so
        # that contractions stay a single token instead of being split.
        paragraph = junk_match.sub(' ', paragraph).replace("'", '')
        # split() also swallows single newlines and repeated spaces
        words = paragraph.split()
        if len(words) > min_words:
            cleaned.append(' '.join(words))
    # An explicit dtype keeps an empty result a string array instead of float64
    return np.array(cleaned, dtype=str)

In [None]:
# First corpus: load, clean into paragraphs, and give every paragraph class label 0.
fyodor = load_file('/content/28054-0.txt')
f_data = clean_data(fyodor)
f_labels = np.zeros(len(f_data))
# Peek at the first 20 paragraphs and their labels
f_data[:20], f_labels[:20]

(array(['this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with almost no restrictions whatsoever you may copy it give it away or re use it under the terms of the project gutenberg license included with this ebook or online at http www gutenberg org license if you are not located in the united states you ll have to check the laws of the country where you are located before using this ebook',
        'part i book i the history of a family chapter i fyodor pavlovitch karamazov chapter ii he gets rid of his eldest son chapter iii the second marriage and the second family chapter iv the third son alyosha chapter v elders book ii an unfortunate gathering chapter i they arrive at the monastery chapter ii the old buffoon chapter iii peasant women who have faith chapter iv a lady of little faith chapter v so be it so be it chapter vi why is such a man alive chapter vii a young man bent on a career chapter viii the scandalous scen

In [None]:
# Second corpus: load, clean into paragraphs, and give every paragraph class label 1.
doyle = load_file('/content/pg1661.txt')
d_data = clean_data(doyle)
d_labels = np.ones(len(d_data))
# Peek at the first 20 paragraphs and their labels
d_data[:20], d_labels[:20]

(array(['this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever you may copy it give it away or re use it under the terms of the project gutenberg license included with this ebook or online at www gutenberg net',
        'i a scandal in bohemia ii the red headed league iii a case of identity iv the boscombe valley mystery v the five orange pips vi the man with the twisted lip vii the adventure of the blue carbuncle viii the adventure of the speckled band ix the adventure of the engineers thumb x the adventure of the noble bachelor xi the adventure of the beryl coronet xii the adventure of the copper beeches',
        'to sherlock holmes she is always the woman i have seldom heard him mention her under any other name in his eyes she eclipses and predominates the whole of her sex it was not that he felt any emotion akin to love for irene adler all emotions and that one particularly were abhorrent to his cold precise but admirably balanced mind 

In [None]:
# Third corpus: load, clean into paragraphs, and give every paragraph class label 2.
austen = load_file('/content/pg31100.txt')
a_data = clean_data(austen)
a_labels = np.full(len(a_data), 2.0)
# Peek at the first 20 paragraphs and their labels
a_data[:20], a_labels[:20]

(array(['this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever you may copy it give it away or re use it under the terms of the project gutenberg license included with this ebook or online at www gutenberg org',
        'note the accompanying html file has active links to all the volumes and chapters in this set',
        'sir walter elliot of kellynch hall in somersetshire was a man who for his own amusement never took up any book but the baronetage there he found occupation for an idle hour and consolation in a distressed one there his faculties were roused into admiration and respect by contemplating the limited remnant of the earliest patents there any unwelcome sensations arising from domestic affairs changed naturally into pity and contempt as he turned over the almost endless creations of the last century and there if every other leaf were powerless he could read his own history with an interest which never failed this was the page at

In [None]:
# Stack the three corpora and their label vectors in the same order, so that
# position i of `labels` still describes position i of `data`.
n = sum(part.shape[0] for part in (f_data, d_data, a_data))

data = np.concatenate((f_data, d_data, a_data))
assert data.shape[0] == n

labels = np.concatenate((f_labels, d_labels, a_labels))
assert labels.shape[0] == n

data[:10], labels[:10]

(array(['this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with almost no restrictions whatsoever you may copy it give it away or re use it under the terms of the project gutenberg license included with this ebook or online at http www gutenberg org license if you are not located in the united states you ll have to check the laws of the country where you are located before using this ebook',
        'part i book i the history of a family chapter i fyodor pavlovitch karamazov chapter ii he gets rid of his eldest son chapter iii the second marriage and the second family chapter iv the third son alyosha chapter v elders book ii an unfortunate gathering chapter i they arrive at the monastery chapter ii the old buffoon chapter iii peasant women who have faith chapter iv a lady of little faith chapter v so be it so be it chapter vi why is such a man alive chapter vii a young man bent on a career chapter viii the scandalous scen

In [None]:
from keras.utils import to_categorical

# One-hot encode the integer class ids (0, 1, 2) into an (n, 3) matrix.
# `labels` is overwritten under the same name, so the ndim guard keeps this
# cell idempotent: running it a second time would otherwise one-hot encode the
# already encoded matrix and silently change its shape.
if labels.ndim == 1:
    labels = to_categorical(labels, num_classes=3)

assert labels.shape == (n, 3)
labels.shape

(14068, 3)

In [None]:
# 60/20/20 train/validation/test split.
# NOTE: n_train counts the train *and* validation rows together (80% of the
# data); the training slice itself is the first n_train - n_val shuffled rows.
n_train = int(data.shape[0] * .8)
n_val = int(data.shape[0] * .2)

# A seeded generator makes the shuffle, and therefore the split and every
# metric computed from it, identical on each Restart & Run All.
SPLIT_SEED = 42
indices = np.random.default_rng(SPLIT_SEED).permutation(data.shape[0])
train_indices = indices[:n_train-n_val]
val_indices = indices[n_train-n_val:n_train]
test_indices = indices[n_train:]

X_train, y_train = data[train_indices], labels[train_indices]
X_val, y_val = data[val_indices], labels[val_indices]
X_test, y_test = data[test_indices], labels[test_indices]

# The three slices must partition the data set: nothing dropped, nothing repeated
assert X_train.shape[0] + X_val.shape[0] + X_test.shape[0] == n
assert y_train.shape[0] + y_val.shape[0] + y_test.shape[0] == n

In [None]:
# Word count of the longest cleaned paragraph in the whole data set
max_seq_length = max(map(len, map(str.split, data)))
max_seq_length

2681

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the training paragraphs only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Encode the training text as integer sequences, then pad each one at its end
# ('post') up to max_seq_length entries
seqs = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(seqs, padding='post', maxlen=max_seq_length)

In [None]:
# Shape of the padded training matrix and its first ten rows
X_train_pad.shape, X_train_pad[:10]

((8441, 2681), array([[ 481,    6,   83, ...,    0,    0,    0],
        [ 580, 1470,   22, ...,    0,    0,    0],
        [  48, 2291,   24, ...,    0,    0,    0],
        ...,
        [  27, 5226,  625, ...,    0,    0,    0],
        [  22,   42,    8, ...,    0,    0,    0],
        [  27,  139,   52, ...,    0,    0,    0]], dtype=int32))

In [None]:
# Encode and pad the held-out splits with the tokenizer fitted on X_train
seqs = tokenizer.texts_to_sequences(X_val)
X_val_pad = pad_sequences(seqs, maxlen=max_seq_length, padding='post')

# The test matrix has to be built from X_test so that its rows line up with
# y_test; encoding X_val here would evaluate on validation text instead.
seqs = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(seqs, maxlen=max_seq_length, padding='post')

# Each padded matrix must have one row per paragraph of its own split
assert X_val_pad.shape[0] == X_val.shape[0]
assert X_test_pad.shape[0] == X_test.shape[0]
X_val_pad.shape, X_test_pad.shape

((2813, 2681), (2813, 2681))

In [None]:
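# One-time setup: uncomment both lines to download and unpack the GloVe 6B word vectors (822 MB zip).
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip -q glove.6B.zip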


--2021-04-23 03:38:11--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-04-23 03:38:11--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-04-23 03:38:11--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2021

In [None]:
# Parse the GloVe file into a {word: float32 vector} lookup. Each line holds a
# token followed by its space-separated coefficients.
embeddings_index = {}
# The encoding is given explicitly so that the file is decoded the same way on
# every platform instead of with the locale's default codec.
with open("glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        # Split off the token only; the remainder is the coefficient string
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [None]:
# Matrix height: the tokenizer's vocabulary size plus two spare rows
num_tokens = len(tokenizer.word_index) + 2
embedding_dim = 100
hits, misses = 0, 0

# Start from an all-zero matrix and copy in the GloVe vector of every
# vocabulary word that has one. Rows that are never written (words without a
# GloVe vector, and any index absent from word_index) stay all-zero.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 16631 words (1185 misses)


In [None]:
from keras.layers import Embedding, Conv1D, ZeroPadding1D, Dense, GlobalMaxPooling1D
from keras.models import Sequential
from keras.initializers import Constant

# Frozen pre-trained embedding -> zero-padded 1-D convolution -> max over the
# sequence axis -> dense head with a 3-way softmax.
cnn_model = Sequential([
    Embedding(
        num_tokens,
        embedding_dim,
        embeddings_initializer=Constant(embedding_matrix),
        trainable=False,  # keep the pre-trained vectors fixed during training
        input_length=max_seq_length
    ),
    ZeroPadding1D(3),
    Conv1D(128, 4, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax'),
])
cnn_model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2681, 100)         1781800   
_________________________________________________________________
zero_padding1d_1 (ZeroPaddin (None, 2687, 100)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2684, 128)         51328     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 195       
Total params: 1,841,579
Trainable params: 59,779
Non-trainable params: 1,781,800
_______________________________________

In [None]:
from keras.metrics import Precision, Recall
from keras.optimizers import Adam

# `learning_rate` is the supported keyword for the step size; the older `lr`
# alias is deprecated and is rejected by newer Keras releases.
cnn_model.compile(optimizer=Adam(learning_rate=1E-3), loss='categorical_crossentropy',
                  metrics=['acc', Precision(), Recall()])

# Train on the padded training matrix and monitor the validation split each epoch
history = cnn_model.fit(X_train_pad, y_train, validation_data=(X_val_pad, y_val),
                        batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
from keras.layers import LSTM

# Frozen pre-trained embedding -> LSTM emitting one output per time step ->
# max over the sequence axis -> dense head with a 3-way softmax.
lstm_model = Sequential([
    Embedding(
        num_tokens,
        embedding_dim,
        embeddings_initializer=Constant(embedding_matrix),
        trainable=False,  # keep the pre-trained vectors fixed during training
        input_length=max_seq_length
    ),
    LSTM(256, return_sequences=True),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(3, activation='softmax'),
])
lstm_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 2681, 100)         1781800   
_________________________________________________________________
lstm (LSTM)                  (None, 2681, 256)         365568    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 387       
Total params: 2,180,651
Trainable params: 398,851
Non-trainable params: 1,781,800
_________________________________________________________________


In [None]:
# `learning_rate` is the supported keyword for the step size; the older `lr`
# alias is deprecated and is rejected by newer Keras releases.
lstm_model.compile(optimizer=Adam(learning_rate=1E-3), loss='categorical_crossentropy',
                  metrics=['acc', Precision(), Recall()])

# Train on the padded training matrix and monitor the validation split each epoch
history = lstm_model.fit(X_train_pad, y_train, validation_data=(X_val_pad, y_val),
                         batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# LSTM variant that classifies from the final hidden state only
# (return_sequences=False) instead of pooling over all time steps.
lstm_model = Sequential()
lstm_model.add(Embedding(
        num_tokens,
        embedding_dim,
        embeddings_initializer=Constant(embedding_matrix),
        trainable=False,
        # The inputs are zero-padded at the end up to max_seq_length. Masking
        # id 0 makes the LSTM skip those steps, so the state handed to the
        # dense layers is the one after the last real word rather than after
        # a long run of padding.
        mask_zero=True,
        input_length=max_seq_length
))
lstm_model.add(LSTM(256, return_sequences=False))
lstm_model.add(Dense(128, activation='relu'))
lstm_model.add(Dense(3, activation='softmax'))
lstm_model.summary()