In [1]:
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

## Load GloVe

In [3]:
import numpy as np
import codecs
glove_vectors = 'myglove.6B.50d.txt'
filecp = codecs.open(glove_vectors, encoding = 'utf-8')
glove = np.loadtxt(filecp, dtype='str', comments=None)
# Extract the vectors and words
vectors = glove[:, 1:].astype('float')
words = glove[:, 0]

# Create lookup of words to vectors
word_lookup = {word: vector for word, vector in zip(words, vectors)}


# Create a mapping from unique characters to indices
word2idx = {char:index for index, char in enumerate(words)}
idx2word = np.array(words)
vocab = len(words)
print("Vocabulary:",vocab)

Vocabulary: 400001


## Load books

In [4]:
def load_file(filename):
    fin = open(filename, 'rb')
    txt = fin.read().decode(encoding='utf-8')
    fin.close()
    return txt

book_names = ("hugo","dickens","wells","kipling")
texts = {}
for bn in book_names:
    texts[bn] = load_file(bn+'.txt')

for k in texts.keys():
    print(k, "characters:",len(texts[k]),"distinct characters:",len(set(texts[k])))

num_books = len(texts.keys())

hugo characters: 3303543 distinct characters: 119
dickens characters: 181399 distinct characters: 83
wells characters: 361811 distinct characters: 90
kipling characters: 298210 distinct characters: 87


## Functions

In [5]:
def clean_txt(txt):
    txt = txt.replace('\r','')
    # txt = txt.replace('\n',' \n ')
    txt = txt.replace('\n',' ')
    txt = txt.replace(',',' ')
    txt = txt.replace(';',' ')
    txt = txt.replace('.',' ')
    txt = txt.replace('(','')
    txt = txt.replace(')','')
    txt = txt.replace('!',' ')
    txt = txt.replace('?',' ')
    txt = txt.replace('_',' ')
    txt = txt.replace('“','')
    txt = txt.replace('„','')
    txt = txt.replace('""','')
    txt = txt.lower()
    for i in range(5):
        txt = txt.replace('  ',' ')
    return txt

# converts a string to list of words
def text2words(dataset_txt):
    cleaned_dataset_txt = clean_txt(dataset_txt)
    words = cleaned_dataset_txt.rstrip().split(' ')
    return words

def to_text(sample):
    return ' '.join([idx2word[int(x)] for x in sample])


In [6]:
words = []
for k in texts.keys():
    w = text2words(texts[k])
    words.append(w)
    print(k,"words:",len(w),"distinct words:",len(set(w)))


hugo words: 573082 distinct words: 27267
dickens words: 32247 distinct words: 5152
wells words: 63317 distinct words: 7605
kipling words: 54907 distinct words: 5711


## Convert the dataset from 'words' to 'integers'

In [7]:
def words2ints(words):
    d = []
    for word in words:
        if word in word2idx:
            d.append(word2idx[word])
        else:
            #print(word)
            d.append(0)
    return np.array(d)

ibooks = []
for w in words:
    ibooks.append(words2ints(w))

for dint in ibooks:
    l = len(dint)
    l0 = np.count_nonzero(dint==0)
    print("words:",l,"not known words:",l0, "% of unknown words:", l0/l)    

words: 573082 not known words: 16270 % of unknown words: 0.028390352514997854
words: 32247 not known words: 1540 % of unknown words: 0.047756380438490405
words: 63317 not known words: 1158 % of unknown words: 0.018288927144368812
words: 54907 not known words: 2135 % of unknown words: 0.038883931010617954


## Create samples and labels

In [8]:
# take MAX word
MAX=30000
# length of one sample
LEN=200

samples = []
labels = []
for b,book in enumerate(ibooks):
    for i in range(0,MAX-LEN,LEN):
        samples.append(book[i:LEN+i])
        labels.append(b)
samples = np.array(samples,dtype=float)
samples = np.expand_dims(samples,axis=2)
labels = np.array(labels,dtype=float)
labels = tf.keras.utils.to_categorical(labels)
print("Samples:",samples.shape)
print("Labels:",labels.shape)

Samples: (596, 200, 1)
Labels: (596, 4)


## Replace word with the embedding (vector of 50 values)

In [9]:
samples50 = np.zeros((len(samples),LEN,50))
for i in range(len(samples)):
    for j in range(LEN):
        si = samples[i,j,0]
        v = word_lookup[idx2word[int(si)]]
        samples50[i,j] = v
        
print("Samples with embedding",samples50.shape)

Samples with embedding (596, 200, 50)


In [10]:
def build_model(size):
    model = tf.keras.Sequential()
    model.add(LSTM(1024, input_shape=(None,size))) #size is the size of ONE sample
    model.add(Dense(num_books,activation='softmax'))
    return model  
        
model1 = build_model(1)
model50 = build_model(50)

model1.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy'])
model50.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy'])
model1.summary()
model50.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 1024)              4202496   
_________________________________________________________________
dense (Dense)                (None, 4)                 4100      
Total params: 4,206,596
Trainable params: 4,206,596
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 1024)              4403200   
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 4100      
Total params: 4,407,300
Trainable params: 4,407,300
Non-trainable params: 0
_________________________________________________________________


In [11]:
import sklearn.model_selection
(trainSamples, testSamples, trainSamples50, testSamples50, trainLabels, testLabels) = sklearn.model_selection.train_test_split(samples, samples50, labels, test_size=0.5, random_state=1)


# Train model without GloVo

In [12]:
EPOCHS=10
H = model1.fit(trainSamples,trainLabels,epochs=EPOCHS,verbose=1,validation_data=(testSamples,testLabels))

Train on 298 samples, validate on 298 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Train model using GloVo

In [15]:
EPOCHS=10
H = model50.fit(trainSamples50,trainLabels,epochs=EPOCHS,verbose=1,validation_data=(testSamples50,testLabels))

Train on 298 samples, validate on 298 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,cohen_kappa_score
def show_results(model,testSamples,testLabels):
    testResults = model.predict(testSamples)
    print(confusion_matrix(testLabels.argmax(axis=1), testResults.argmax(axis=1)))
    #print(classification_report(testLabels.argmax(axis=1), testResults.argmax(axis=1),labels=book_names))
    print(classification_report(testLabels.argmax(axis=1), testResults.argmax(axis=1)))
    print("Cohen's Kappa: {}".format(cohen_kappa_score(testLabels.argmax(axis=1), testResults.argmax(axis=1))))
    print("Accuracy: ",accuracy_score(testLabels.argmax(axis=1), testResults.argmax(axis=1)))

In [1]:
print("Model 1")
print(book_names)
show_results(model1,testSamples,testLabels)

Model 1


NameError: name 'book_names' is not defined

In [None]:
print("Model 50")
print(book_names)
show_results(model50,testSamples50,testLabels)

In [None]:
model.save("model95_30k")

In [None]:
book_names = ["hugo","dickens","wells","kipling"]
book_names