# Purpose

Train a neural network to predict which horror author wrote a given sentence.

## 0. Imports

In [20]:
import string
from collections import Counter

import numpy as np
import pandas as pd
import spacy
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

## 1. Preprocessing

The dataset consists of sentences written by three horror authors: Edgar Allan Poe (EAP), H. P. Lovecraft (HPL), and Mary Wollstonecraft Shelley (MWS).

Dataset source - https://www.kaggle.com/competitions/spooky-author-identification/data

In [21]:
# Load the training set; train.csv is expected in the notebook's working directory.
dataset = pd.read_csv('train.csv')

In [22]:
# Preview the first rows to confirm the columns (id, text, author).
dataset.head(3)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP


In [23]:
# One-hot encode the target: one indicator column per author label.
# The dtype is pinned explicitly because pandas >= 2.0 returns boolean dummy
# columns by default; uint8 matches the older default, so the encoded targets
# stay numeric 0/1 regardless of the installed pandas version.
one_hot = pd.get_dummies(dataset['author'], dtype=np.uint8)
one_hot.head(3)

Unnamed: 0,EAP,HPL,MWS
0,1,0,0
1,0,1,0
2,1,0,0


In [24]:
# Targets: the indicator columns in a fixed EAP / HPL / MWS order.
y = one_hot[['EAP','HPL','MWS']].values
# Features: the raw sentences, still as plain text at this stage.
X = dataset['text'].values

In [25]:
# Shared resources for text cleaning: the small English spaCy pipeline and the
# ASCII punctuation characters that cleanup_text filters out of the token stream.
nlp = spacy.load('en_core_web_sm')
punctuations = string.punctuation

def cleanup_text(docs, logging=False):
    """Lemmatize and normalize raw texts before tokenization.

    Each document is run through the module-level spaCy pipeline ``nlp`` (with
    the parser and NER components disabled). Every token is replaced by its
    lower-cased, stripped lemma; tokens are then dropped when they are

    * the ``'-PRON-'`` pronoun placeholder,
    * contained in the module-level ``stopwords`` collection, or
    * empty or made up solely of characters from ``punctuations``.

    NOTE(review): ``'-PRON-'`` is the pronoun lemma produced by spaCy v2 models;
    newer models presumably never emit it, making that check a no-op there --
    confirm against the installed spaCy version.

    Parameters
    ----------
    docs : sequence of str
        Raw texts to clean.
    logging : bool, default False
        When True, print a progress line after every 1000 documents.

    Returns
    -------
    pandas.Series
        One space-joined string of kept lemmas per input document, in input order.
    """
    # Sets give O(1) membership tests; the stop-word collection is otherwise
    # scanned linearly for every single token.
    stop_set = set(stopwords)
    punct_chars = set(punctuations)

    texts = []
    # nlp.pipe processes the texts in batches, which is considerably faster than
    # calling nlp(...) once per document and yields the same Doc objects.
    for counter, doc in enumerate(nlp.pipe(docs, disable=['parser', 'ner']), start=1):
        if logging and counter % 1000 == 0:
            print("Processed %d out of %d documents." % (counter, len(docs)))
        lemmas = (tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-')
        # all(...) is True for an empty string, so blank lemmas are dropped as well.
        # Checking character by character also removes multi-character punctuation
        # such as '...' or '--', which a substring test against the punctuation
        # string lets through.
        kept = [
            lemma for lemma in lemmas
            if lemma not in stop_set and not all(ch in punct_chars for ch in lemma)
        ]
        texts.append(' '.join(kept))
    return pd.Series(texts)

In [26]:
# English stop-word list used by cleanup_text.
# The words are read through the `nltk_stopwords` alias of the NLTK corpus reader
# because this cell rebinds the name `stopwords` to a plain list: evaluating
# `stopwords.words(...)` a second time would then fail with AttributeError,
# so the cell could not be re-run without restarting the kernel.
stopwords = nltk_stopwords.words('english')

In [27]:
# Clean the raw sentences and check that no document was lost on the way.
# The text is read straight from `dataset` instead of `X`: `X` is later rebound
# to the padded integer matrix, so relying on it here breaks a re-run of this cell.
raw_texts = dataset['text'].values
print('Original training data shape: ', raw_texts.shape)
train_cleaned = cleanup_text(raw_texts, logging=True)
print('Cleaned up training data shape: ', train_cleaned.shape)
assert len(train_cleaned) == len(raw_texts), 'cleanup_text must return one string per input document'

Original training data shape:  (19579,)
Processed 1000 out of 19579 documents.
Processed 2000 out of 19579 documents.
Processed 3000 out of 19579 documents.
Processed 4000 out of 19579 documents.
Processed 5000 out of 19579 documents.
Processed 6000 out of 19579 documents.
Processed 7000 out of 19579 documents.
Processed 8000 out of 19579 documents.
Processed 9000 out of 19579 documents.
Processed 10000 out of 19579 documents.
Processed 11000 out of 19579 documents.
Processed 12000 out of 19579 documents.
Processed 13000 out of 19579 documents.
Processed 14000 out of 19579 documents.
Processed 15000 out of 19579 documents.
Processed 16000 out of 19579 documents.
Processed 17000 out of 19579 documents.
Processed 18000 out of 19579 documents.
Processed 19000 out of 19579 documents.
Cleaned up training data shape:  (19579,)


In [28]:
# Tokenization settings and vocabulary fitting for the model input.
# NOTE(review): the tokenizer is fitted on all cleaned texts, i.e. before the
# train/test split made further down, so its vocabulary also covers sentences
# that end up in the test set.
vocab_size = 25000 # upper bound on the number of distinct words the tokenizer keeps
max_len = 250 # fixed sequence length (in tokens) used when padding sentences
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_cleaned)
# Number of distinct words actually found in the cleaned corpus.
len(tokenizer.word_index)

19374

In [29]:
# Convert every cleaned sentence into a list of integer word indices.
tokenized_train = tokenizer.texts_to_sequences(train_cleaned)

In [30]:
# Bring every index sequence to exactly max_len entries (Keras pads and truncates
# at the start of the sequence by default).
# NOTE: this rebinds X, which held the raw text array up to this point.
X = pad_sequences(tokenized_train,maxlen=max_len)

In [31]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps the
# partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)

# Report the shape of each partition, one line per array.
print('\n'.join(
    template.format(part.shape)
    for template, part in (
        ('X_train size: {}', X_train),
        ('X_test size: {}', X_test),
        ('y_train size: {}', y_train),
        ('y_test size: {}', y_test),
    )
))

X_train size: (15663, 250)
X_test size: (3916, 250)
y_train size: (15663, 3)
y_test size: (3916, 3)


## 2. Training of neural network

Recurrent neural networks based on LSTM (long short-term memory) cells are well suited to sequence tasks such as text classification, so that type of network is used here.

In [32]:
# Seed TensorFlow's global random generator before any weights are created so
# that initialization (and TensorFlow-driven randomness such as dropout) is
# repeatable when the notebook is re-run; 21 matches the seed used for the
# train/test split. Some GPU kernels may still cause small run-to-run differences.
tf.random.set_seed(21)

emd_dim = 32  # size of each learned word-embedding vector
model = tf.keras.Sequential([
    # Map every word index to a trainable emd_dim-dimensional vector.
    tf.keras.layers.Embedding(vocab_size,emd_dim,input_length=max_len),
    # Read the sequence in both directions; return_sequences=True keeps the
    # per-timestep outputs so they can be pooled by the next layer.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50,return_sequences=True)),
    # Collapse the time axis by taking the maximum of every feature.
    tf.keras.layers.GlobalMaxPool1D(),
    tf.keras.layers.Dense(50, activation='relu'),
    # Light regularization before the classifier head.
    tf.keras.layers.Dropout(0.2),
    # One probability per author (EAP, HPL, MWS).
    tf.keras.layers.Dense(3, activation='softmax')
])

In [33]:
# Training configuration: one-hot targets with a softmax output call for
# categorical cross-entropy; accuracy is tracked as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [34]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 250, 32)           800000    
                                                                 
 bidirectional_1 (Bidirectio  (None, 250, 100)         33200     
 nal)                                                            
                                                                 
 global_max_pooling1d_1 (Glo  (None, 100)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 50)                5050      
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense_3 (Dense)             (None, 3)                

In [35]:
# Fit the network; 20% of the training arrays are held out by Keras for validation.
batch_size = 264
epochs = 6
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [36]:
# Report the accuracy reached in the final epoch on the training and validation parts.
print("Training accuracy: %.2f%% / Validation accuracy: %.2f%%" % tuple(
    100 * history.history[metric][-1] for metric in ('accuracy', 'val_accuracy')
))

Training accuracy: 95.42% / Validation accuracy: 79.80%


## 3. Check model on test

In [37]:
# Predict class probabilities for the held-out test set: one row per sample,
# one column per author.
predicted_prob = model.predict(X_test)
print(predicted_prob.shape)

(3916, 3)


In [38]:
# Accuracy on the held-out test set: share of samples whose most probable
# predicted author matches the one-hot label.
test_accuracy = tf.keras.metrics.CategoricalAccuracy()
test_accuracy.update_state(y_true=y_test, y_pred=predicted_prob)

print("Test accuracy: %.2f%% " % (100 * test_accuracy.result().numpy()))

Test accuracy: 81.18% 


## Conclusion:

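The model reaches about 81% accuracy on the held-out test set, so an LSTM-based recurrent network works well for this task. The gap between training accuracy (about 95%) and validation accuracy (about 80%) suggests some overfitting.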