In [189]:
import numpy as np
import torch
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import defaultdict
import random

# Import text

In [198]:
data_files = ['Beyond good and Evil.txt',
              'The Birth of Tragedy.txt',
              'On the Genealogy of Morality.txt',
              'Thus Spoke Zarathustra.txt',
              'The Antichrist.txt']
book_names = ['Beyond good and Evil',
              'The Birth of Tragedy',
              'On the Genealogy of Morality',
              'Thus Spoke Zarathustra',
              'The Antichrist']
class_count = len(data_files)

data_lines =[]
for file in data_files:
    path = 'datasets/{}'.format(file)
    with open(path) as f:
        lines = f.readlines()
    data_lines.append(lines)
    
data_strings = [" ".join(d).replace("\n","") for d in data_lines]
data_sentences = []
for st in data_strings:
    st = re.sub("[^a-zA-Z0-9\.\?\! ]", "", st)
    st = re.sub("\d+\.", "", st)
    sentences = re.split("\. |\! |\?", st)
    data_sentences.append([[elem.lower() for elem in sent.split(" ") if elem != ""] for sent in sentences])
    
print(data_sentences[0][:5])

[['chapter', 'i'], ['prejudices', 'of', 'philosophers', 'the', 'will', 'to', 'truth', 'which', 'is', 'to', 'tempt', 'us', 'to', 'many', 'a', 'hazardous', 'enterprise', 'the', 'famous', 'truthfulness', 'of', 'which', 'all', 'philosophers', 'have', 'hitherto', 'spoken', 'with', 'respect', 'what', 'questions', 'has', 'this', 'will', 'to', 'truth', 'not', 'laid', 'before', 'us'], ['what', 'strange', 'perplexing', 'questionable', 'questions'], ['it', 'is', 'already', 'a', 'long', 'story', 'yet', 'it', 'seems', 'as', 'if', 'it', 'were', 'hardly', 'commenced'], ['is', 'it', 'any', 'wonder', 'if', 'we', 'at', 'last', 'grow', 'distrustful', 'lose', 'patience', 'and', 'turn', 'impatiently', 'away']]


# Make vocabulary

In [132]:
index2word = ["<UNK>"]+list(set([word for book in data_sentences for sent in book for word in sent]))
word2index_ = defaultdict(int)
i = 0
for word in index2word:
    word2index_[word] = i
    i+=1
def word2index(word):
    if word in word2index_.keys():
        return word2index_[word]
    else:
        return 0

vocab_size = len(index2word)

print(word2index('bone'))
print(index2word[word2index('bone')])
print(vocab_size)

8330
bone
22723


# Make datasets

In [154]:
# word indices per sentence per book
data_sentences_indices = [
    [[word2index(word) for word in sent] for sent in book]
    for book in data_sentences]

# many-hot sentence representation, book index
data = []
for book_index in range(len(data_sentences_indices)):
    for word_indices in data_sentences_indices[book_index]:
        manyhot = np.zeros(vocab_size)
        manyhot[word_indices] = 1
        data.append((manyhot, book_index))
        
print(data[0])

(array([0., 0., 0., ..., 0., 0., 0.]), 0)


In [188]:
# shuffle data set
random.shuffle(data)

# segregate data
data_size = len(data)
data_train = data[:int(data_size*.9)]
data_test = data[int(data_size*.9):]

# Classifier definition

In [196]:
class Classifier(nn.Module):
    
    def __init__(self):
        super(Classifier, self).__init__()
        # neural network with two fully connected layers
        self.fc1 = nn.Linear(vocab_size, 20)
        self.fc2 = nn.Linear(20, len(book_names))
        
    def forward(self, x):
        x = F.sigmoid(self.fc1(x))
        x = self.fc2(x)
        return x

# Train and Test

In [215]:
net = Classifier()
print(net)

optimiser = optim.SGD(net.parameters(), lr=0.01)
criterion = nn.MSELoss()

for epoch in range(10):
    print("Starting epoch {}".format(epoch))
    for x,t in data_train:
        inp = torch.from_numpy(x).float()
        target = torch.zeros(class_count)
        target[t] = 1
        optimiser.zero_grad()
        output = net(inp)
        loss = criterion(F.softmax(output), target)
        loss.backward()
        optimiser.step()

Classifier(
  (fc1): Linear(in_features=22723, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=5, bias=True)
)
Starting epoch 0


  from ipykernel import kernelapp as app


Starting epoch 1
Starting epoch 2
Starting epoch 3
Starting epoch 4
Starting epoch 5
Starting epoch 6
Starting epoch 7
Starting epoch 8
Starting epoch 9


In [275]:
def test(model):
    total = len(data_test)
    total_correct = 0
    correct_per_class = np.zeros(class_count)
    total_per_class = np.zeros(class_count)
    for x,t in data_test:
        inp = torch.from_numpy(x).float()
        output = model(inp).detach()
        total_correct += 1 if np.argmax(output) == t else 0
        total_per_class[t] += 1
        correct_per_class[t] += 1 if np.argmax(output) == t else 0
    accuracy = total_correct / total
#     accuracy_per_class = correct_per_class / total_per_class
    return accuracy, correct_per_class, total_per_class

In [281]:
acc, class_cor, class_tot = test(net)
print("The accuracy of the sentence to book classifier is {:.02f}%".format(acc*100))
print("Accuracy per book:")
class_acc = class_cor / class_tot
for i in range(class_count):
    print("\t{}: {:.0f}% ({:d}/{:d})".format(book_names[i],class_acc[i]*100, int(class_cor[i]),int(class_tot[i])))

The accuracy of the sentence to book classifier is 65.48%
Accuracy per book:
	Beyond good and Evil: 28% (41/145)
	The Birth of Tragedy: 57% (86/151)
	On the Genealogy of Morality: 0% (0/130)
	Thus Spoke Zarathustra: 99% (666/674)
	The Antichrist: 0% (0/111)


# Prediction

In [294]:
def sentence2manyhot(sentence):
    # filter characters
    indices = [word2index(word.lower()) for word in sentence.split(" ")]
    # many-hot representation
    manyhot = np.zeros(vocab_size)
    manyhot[indices] = 1
    return torch.from_numpy(manyhot)

def predict(model, sentence):
    manyhot = sentence2manyhot(sentence).float()
    output = model(manyhot).detach()
    return book_names[np.argmax(output)]

In [310]:
sentence = "A few weeks later: and he found himself under the walls of Metz, still wrestling with the notes of interrogation he had set down concerning the alleged cheerfulness of the Greeks and of Greek art; till at last, in that month of deep suspense, when peace was debated at Versailles, he too attained to peace with himself, and, slowly recovering from a disease brought home from the field, made up his mind definitely regarding the Birth of Tragedy from the Spirit of _Music"

print("The sentence \"{}\" fits best in {}.".format(sentence, predict(net, sentence)))

The sentence "Superman" fits best in Thus Spoke Zarathustra.
