In [1]:
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

## Load GloVe

In [2]:
import numpy as np
import codecs
glove_vectors = 'myglove.6B.50d.txt'
filecp = codecs.open(glove_vectors, encoding = 'utf-8')
glove = np.loadtxt(filecp, dtype='str', comments=None)
# Extract the vectors and words
vectors = glove[:, 1:].astype('float')
words = glove[:, 0]

# Create lookup of words to vectors
word_lookup = {word: vector for word, vector in zip(words, vectors)}


# Create a mapping from unique characters to indices
word2idx = {char:index for index, char in enumerate(words)}
idx2word = np.array(words)
vocab = len(words)
print("Vocabulary:",vocab)

Vocabulary: 400001


## Load books

In [3]:
def load_file(filename):
    fin = open(filename, 'rb')
    txt = fin.read().decode(encoding='utf-8')
    fin.close()
    return txt

book_names = ("hugo","dickens","wells","kipling")
texts = {}
for bn in book_names:
    texts[bn] = load_file(bn+'.txt')

for k in texts.keys():
    print(k, "characters:",len(texts[k]),"distinct characters:",len(set(texts[k])))

num_books = len(texts.keys())

hugo characters: 3303543 distinct characters: 119
dickens characters: 181399 distinct characters: 83
wells characters: 361811 distinct characters: 90
kipling characters: 298210 distinct characters: 87


In [None]:
num_books = len(texts.keys())

## Functions

In [4]:
def clean_txt(txt):
    txt = txt.replace('\r','')
    # txt = txt.replace('\n',' \n ')
    txt = txt.replace('\n',' ')
    txt = txt.replace(',',' ')
    txt = txt.replace(';',' ')
    txt = txt.replace('.',' ')
    txt = txt.replace('(','')
    txt = txt.replace(')','')
    txt = txt.replace('!',' ')
    txt = txt.replace('?',' ')
    txt = txt.replace('_',' ')
    txt = txt.replace('“','')
    txt = txt.replace('„','')
    txt = txt.replace('""','')
    txt = txt.lower()
    for i in range(5):
        txt = txt.replace('  ',' ')
    return txt

# converts a string to list of words
def text2words(dataset_txt):
    cleaned_dataset_txt = clean_txt(dataset_txt)
    words = cleaned_dataset_txt.rstrip().split(' ')
    return words

def to_text(sample):
    return ' '.join([idx2word[int(x)] for x in sample])


In [5]:
words = []
for k in texts.keys():
    w = text2words(texts[k])
    words.append(w)
    print(k,"words:",len(w),"distinct words:",len(set(w)))


hugo words: 573082 distinct words: 27267
dickens words: 32247 distinct words: 5152
wells words: 63317 distinct words: 7605
kipling words: 54907 distinct words: 5711


## Convert the dataset from 'words' to 'integers'

In [6]:
def words2ints(words):
    d = []
    for word in words:
        if word in word2idx:
            d.append(word2idx[word])
        else:
            #print(word)
            d.append(0)
    return np.array(d)

ibooks = []
for w in words:
    ibooks.append(words2ints(w))

for dint in ibooks:
    l = len(dint)
    l0 = np.count_nonzero(dint==0)
    print("words:",l,"not known words:",l0, "% of unknown words:", l0/l)    

words: 573082 not known words: 16270 % of unknown words: 0.028390352514997854
words: 32247 not known words: 1540 % of unknown words: 0.047756380438490405
words: 63317 not known words: 1158 % of unknown words: 0.018288927144368812
words: 54907 not known words: 2135 % of unknown words: 0.038883931010617954


## Create samples and labels

In [7]:
# take MAX word
MAX=30000
# length of one sample
LEN=200

samples = []
labels = []
for b,book in enumerate(ibooks):
    for i in range(0,MAX-LEN,LEN):
        samples.append(book[i:LEN+i])
        labels.append(b)
samples = np.array(samples,dtype=float)
samples = np.expand_dims(samples,axis=2)
labels = np.array(labels,dtype=float)
labels = tf.keras.utils.to_categorical(labels)
print("Samples:",samples.shape)
print("Labels:",labels.shape)

Samples: (596, 200, 1)
Labels: (596, 4)


## Replace word with the embedding (vector of 50 values)

In [8]:
samples50 = np.zeros((len(samples),LEN,50))
for i in range(len(samples)):
    for j in range(LEN):
        si = samples[i,j,0]
        v = word_lookup[idx2word[int(si)]]
        samples50[i,j] = v
        
print("Samples with embedding",samples50.shape)

Samples with embedding (596, 200, 50)


In [9]:
def build_model(size):
    model = tf.keras.Sequential()
    model.add(LSTM(1024, input_shape=(None,size))) #size is the size of ONE sample
    model.add(Dense(num_books,activation='softmax'))
    return model  
        
model1 = build_model(1)
model50 = build_model(50)

model1.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy'])
model50.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy'])
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 1024)              4202496   
_________________________________________________________________
dense (Dense)                (None, 4)                 4100      
Total params: 4,206,596
Trainable params: 4,206,596
Non-trainable params: 0
_________________________________________________________________


In [10]:
import sklearn.model_selection
(trainSamples, testSamples, trainSamples50, testSamples50, trainLabels, testLabels) = sklearn.model_selection.train_test_split(samples, samples50, labels, test_size=0.5, random_state=1)


In [11]:
EPOCHS=30
H = model1.fit(trainSamples,trainLabels,epochs=EPOCHS,verbose=1,validation_data=(testSamples,testLabels))

Train on 298 samples, validate on 298 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [12]:
EPOCHS=30
H = model50.fit(trainSamples50,trainLabels,epochs=EPOCHS,verbose=1,validation_data=(testSamples50,testLabels))

Train on 298 samples, validate on 298 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [13]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,cohen_kappa_score
def show_results(model,testSamples,testLabels):
    testResults = model.predict(testSamples)
    print(confusion_matrix(testLabels.argmax(axis=1), testResults.argmax(axis=1)))
    print(classification_report(testLabels.argmax(axis=1), testResults.argmax(axis=1)))
    print("Cohen's Kappa: {}".format(cohen_kappa_score(testLabels.argmax(axis=1), testResults.argmax(axis=1))))
    print("Accuracy: ",accuracy_score(testLabels.argmax(axis=1), testResults.argmax(axis=1)))

In [14]:
print("Model 1")
print(book_names)
show_results(model1,testSamples,testLabels)

Model 1
('hugo', 'dickens', 'wells', 'kipling')
[[16 32 20 12]
 [13 34 10  9]
 [14 17 27 17]
 [12 21 19 25]]
              precision    recall  f1-score   support

           0       0.29      0.20      0.24        80
           1       0.33      0.52      0.40        66
           2       0.36      0.36      0.36        75
           3       0.40      0.32      0.36        77

    accuracy                           0.34       298
   macro avg       0.34      0.35      0.34       298
weighted avg       0.34      0.34      0.33       298

Cohen's Kappa: 0.12809565749600682
Accuracy:  0.3422818791946309


In [15]:
print("Model 50")
show_results(model50,testSamples50,testLabels)

Model 50
[[57  9  7  7]
 [13 24 13 16]
 [10  9 45 11]
 [ 2 12 10 53]]
              precision    recall  f1-score   support

           0       0.70      0.71      0.70        80
           1       0.44      0.36      0.40        66
           2       0.60      0.60      0.60        75
           3       0.61      0.69      0.65        77

    accuracy                           0.60       298
   macro avg       0.59      0.59      0.59       298
weighted avg       0.59      0.60      0.60       298

Cohen's Kappa: 0.4655796009403822
Accuracy:  0.6006711409395973


In [None]:
model.save("model95_30k")