In [None]:
!pip install gensim
!pip install nltk

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
from nltk.tokenize import WordPunctTokenizer, sent_tokenize
from gensim.models import Word2Vec
from tensorflow.keras.utils import to_categorical

In [3]:
train = pd.read_csv('open/train.csv', encoding = 'utf-8')

In [4]:
train['text'] = train['text'].str.lower()

In [34]:
sentences = []
for text in train['text']:
    for sentence in sent_tokenize(text):
        sentences.append(sentence)

In [6]:
word_tokenizer = WordPunctTokenizer()
tokenized_sentences = [word_tokenizer.tokenize(sentence) for sentence in sentences]

In [27]:
X_train = np.array([x for x in train['text']])
y_train = np.array([x for x in train['author']])

In [8]:
embedding_dim=64

In [9]:
wv_model = Word2Vec(sentences=tokenized_sentences, size=embedding_dim, window=5, min_count=5, workers=4, sg=0)

In [10]:
padding_type='post'
max_length=500

In [13]:
def gen_sequences(model, input_data):
    sequences = []
    for sentence in input_data:
        sequence = []
        for word in word_tokenizer.tokenize(sentence):
            try:
                sequence.append(wv_model.wv[word])
            except KeyError:
                pass
        sequences.append(sequence)
    return sequences

train_sequences = gen_sequences(wv_model, X_train)
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

test_sequences = gen_sequences(wv_model, X_test)
test_padded = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)

In [14]:
train_padded.shape

(54879, 500, 64)

In [32]:
model = tf.keras.Sequential([
    tf.keras.layers.GlobalAveragePooling1D(input_shape=(max_length,embedding_dim)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=.01), metrics=['acc'])

In [29]:
model = tf.keras.Sequential([
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True), input_shape=(max_length,embedding_dim)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(5, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=.01), metrics=['acc'])

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_2 (Bidirection (None, 500, 128)          66048     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 645       
Total params: 165,509
Trainable params: 165,509
Non-trainable params: 0
_________________________________________________________________
None


In [33]:
# fit model
num_epochs = 20
history = model.fit(train_padded, to_categorical(y_train), 
                    epochs=num_epochs, verbose=2, 
                    validation_split=0.2)

Epoch 1/20
1372/1372 - 9s - loss: 1.2395 - acc: 0.5043 - val_loss: 1.1475 - val_acc: 0.5474
Epoch 2/20
1372/1372 - 9s - loss: 1.1262 - acc: 0.5563 - val_loss: 1.1078 - val_acc: 0.5635
Epoch 3/20
1372/1372 - 9s - loss: 1.1008 - acc: 0.5652 - val_loss: 1.0799 - val_acc: 0.5745
Epoch 4/20
1372/1372 - 9s - loss: 1.0850 - acc: 0.5716 - val_loss: 1.0499 - val_acc: 0.5927
Epoch 5/20
1372/1372 - 9s - loss: 1.0732 - acc: 0.5776 - val_loss: 1.0509 - val_acc: 0.5899
Epoch 6/20
1372/1372 - 9s - loss: 1.0632 - acc: 0.5797 - val_loss: 1.0523 - val_acc: 0.5899
Epoch 7/20
1372/1372 - 10s - loss: 1.0562 - acc: 0.5816 - val_loss: 1.0358 - val_acc: 0.5901
Epoch 8/20
1372/1372 - 9s - loss: 1.0469 - acc: 0.5869 - val_loss: 1.0281 - val_acc: 0.5953
Epoch 9/20
1372/1372 - 10s - loss: 1.0397 - acc: 0.5902 - val_loss: 1.0243 - val_acc: 0.5995
Epoch 10/20
1372/1372 - 9s - loss: 1.0371 - acc: 0.5921 - val_loss: 1.0326 - val_acc: 0.5930
Epoch 11/20
1372/1372 - 9s - loss: 1.0305 - acc: 0.5945 - val_loss: 1.0319 - 