In [8]:
import pandas as pd
import numpy as np
import nltk
import pickle

from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Flatten
from keras.utils import np_utils
from keras.preprocessing.text import one_hot
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

Using TensorFlow backend.


In [2]:
train_df = pd.read_csv('D:/Program/dataset/Spooky_Author_Identification/train.csv')
test_df = pd.read_csv('D:/Program/dataset/Spooky_Author_Identification/test.csv')

train_df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [12]:
le = LabelEncoder()
le.fit(train_df['author'])
print(le.classes_)

y = le.transform(train_df['author'])
y_train = np_utils.to_categorical(y)

y_train

['EAP' 'HPL' 'MWS']


array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])

In [3]:
def textClean(text):
    
    # except [A-Za-z0-9^,!.\/'+-=], others will replace to " "
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = text.lower().split()
    
    stops = {'so', 'his', 't', 'y', 'ours', 'herself', 
             'your', 'all', 'some', 'they', 'i', 'of', 'didn', 
             'them', 'when', 'will', 'that', 'its', 'because', 
             'while', 'those', 'my', 'don', 'again', 'her', 'if',
             'further', 'now', 'does', 'against', 'won', 'same', 
             'a', 'during', 'who', 'here', 'have', 'in', 'being', 
             'it', 'other', 'once', 'itself', 'hers', 'after', 're',
             'just', 'their', 'himself', 'theirs', 'whom', 'then', 'd', 
             'out', 'm', 'mustn', 'where', 'below', 'about', 'isn',
             'shouldn', 'wouldn', 'these', 'me', 'to', 'doesn', 'into',
             'the', 'until', 'she', 'am', 'under', 'how', 'yourself',
             'couldn', 'ma', 'up', 'than', 'from', 'themselves', 'yourselves',
             'off', 'above', 'yours', 'having', 'mightn', 'needn', 'on', 
             'too', 'there', 'an', 'and', 'down', 'ourselves', 'each',
             'hadn', 'ain', 'such', 've', 'did', 'be', 'or', 'aren', 'he', 
             'should', 'for', 'both', 'doing', 'this', 'through', 'do', 'had',
             'own', 'but', 'were', 'over', 'not', 'are', 'few', 'by', 
             'been', 'most', 'no', 'as', 'was', 'what', 's', 'is', 'you', 
             'shan', 'between', 'wasn', 'has', 'more', 'him', 'nor',
             'can', 'why', 'any', 'at', 'myself', 'very', 'with', 'we', 
             'which', 'hasn', 'weren', 'haven', 'our', 'll', 'only',
             'o', 'before'}
                         
    text = [w for w in text if not w in stops]    
    text = " ".join(text)
    text = text.replace("."," ").replace(","," ")
    return(text)

In [4]:
trainText = []
for it in train_df['text']:
    newT = textClean(it)
    trainText.append(newT)
    
testText = []
for it in test_df['text']:
    newT = textClean(it)
    testText.append(newT)

In [5]:
%%time
count_vectorizer = TfidfVectorizer(ngram_range=(1,1), max_df=0.65,
                        tokenizer=nltk.word_tokenize,
                        strip_accents='unicode', max_features=1000,
                        lowercase =True, analyzer='word', token_pattern=r'\w+',
                        use_idf=True, smooth_idf=True, sublinear_tf=False, 
                        stop_words = 'english')
bag_of_words = count_vectorizer.fit_transform(trainText)
print(bag_of_words.shape)
X_test = count_vectorizer.transform(testText)
print(X_test.shape)

(19579, 1000)
(8392, 1000)
Wall time: 5.23 s


In [6]:
NUM_WORDS = 16000
N = 12
MAX_LEN = 1000

In [9]:
model = Sequential() 
model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
model.add(LSTM(N, dropout=0.2, recurrent_dropout=0.2))
# model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax',name="output"))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 12)          192000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 12)                1200      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                832       
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
output (Dense)               (None, 3)                 195       
Total params: 194,227
Trainable params: 194,227
Non-trainable params: 0
_________________________________________________________________


In [10]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
train_history = model.fit(bag_of_words.todense(), y_train, validation_split=0.2, epochs=5, batch_size=200, verbose=1)

Train on 15663 samples, validate on 3916 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
