In [1]:
import pandas as pd
import numpy as np
import pickle

from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Flatten
from keras.utils import np_utils
from keras.preprocessing.text import one_hot
from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing
from keras.regularizers import l1,l2

Using TensorFlow backend.


In [2]:
train_df = pd.read_csv('D:/Program/dataset/Spooky_Author_Identification/train.csv')
test_df = pd.read_csv('D:/Program/dataset/Spooky_Author_Identification/test.csv')

# Label Onehot encoding  

le = LabelEncoder()
le.fit(train_df['author'])
print(le.classes_)

y = le.transform(train_df['author'])
y_train = np_utils.to_categorical(y)

y_train

['EAP' 'HPL' 'MWS']


array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])

In [3]:
with open('keras_input_train.pkl', 'rb') as f:
    sequences = pickle.load(f)
with open('keras_input_test.pkl', 'rb') as f:
    test_sequences = pickle.load(f)
    
x_train = sequences
x_test = test_sequences

print(x_train.shape)
print(x_test.shape)

(19579, 24992)
(8392, 24992)


In [4]:
# tfidf

NUM_WORDS = 16000
N = 12
MAX_LEN = 300

tmp_X = train_df['text']
tmp_Y = train_df['author']
tmp_X_test = test_df['text']

tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(tmp_X)

ttrain_x = tokenizer.texts_to_sequences(tmp_X)
ttrain_x = pad_sequences(ttrain_x, maxlen=MAX_LEN)

ttest_x = tokenizer.texts_to_sequences(tmp_X_test)
ttest_x = pad_sequences(ttest_x, maxlen=MAX_LEN)

lb = preprocessing.LabelBinarizer()
lb.fit(tmp_Y)

ttrain_y = lb.transform(tmp_Y)

In [5]:
model = Sequential()

# If use LSTM, add Embedding layer first
# If "return_sequences=True" , output shape will add one dimension in the middle
# Add "Flatten()" let output dimension to one
model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
model.add(LSTM(N, dropout=0.2, recurrent_dropout=0.2, return_sequences=True,kernel_regularizer=l2(0.00001),
                activity_regularizer=l1(0.0001)))
model.add(Flatten())

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 12)           192000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 300, 12)           1200      
_________________________________________________________________
flatten_1 (Flatten)          (None, 3600)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                230464    
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 195       
Total params: 423,859
Trainable params: 423,859
Non-trainable params: 0
_________________________________________________________________


In [6]:
model.fit(ttrain_x, ttrain_y, 
                  validation_split=0.1,
                  batch_size=128, epochs=10, 
                  verbose=1,
                 )

Train on 17621 samples, validate on 1958 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x14b7f24dfd0>

In [7]:
scores = model.evaluate(ttrain_x, ttrain_y)  
print()
print(scores)


[0.10033964700557353, 0.97727156651385971]


In [8]:
%%time
prediction = model.predict(ttest_x)
print()
print("Show 10 prediction result :")  
print(prediction[:10])
print(prediction.shape)


Show 10 prediction result :
[[  5.00613474e-04   5.14466228e-05   9.99447882e-01]
 [  9.99988914e-01   7.27468796e-06   3.81492782e-06]
 [  2.41188519e-03   9.97187793e-01   4.00399585e-04]
 [  9.98840034e-01   1.08106434e-03   7.89987680e-05]
 [  9.99252141e-01   2.92358192e-04   4.55542846e-04]
 [  3.46272409e-01   6.52702510e-01   1.02507859e-03]
 [  8.08854043e-01   1.80783898e-01   1.03620673e-02]
 [  6.27815127e-02   4.72375825e-02   8.89980853e-01]
 [  9.99999523e-01   4.78674679e-07   2.61229989e-08]
 [  9.79163945e-01   1.29354699e-03   1.95425507e-02]]
(8392, 3)
Wall time: 27.6 s


In [9]:
# submission

submission = pd.DataFrame(prediction, columns=le.classes_)
submission["id"] = test_df["id"]
cols = submission.columns.tolist()
cols = cols[-1:]+cols[:-1]
submission = submission[cols]
submission.head(10)

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.000501,5.144662e-05,0.9994479
1,id24541,0.999989,7.274688e-06,3.814928e-06
2,id00134,0.002412,0.9971878,0.0004003996
3,id27757,0.99884,0.001081064,7.899877e-05
4,id04081,0.999252,0.0002923582,0.0004555428
5,id27337,0.346272,0.6527025,0.001025079
6,id24265,0.808854,0.1807839,0.01036207
7,id25917,0.062782,0.04723758,0.8899809
8,id04951,1.0,4.786747e-07,2.6123e-08
9,id14549,0.979164,0.001293547,0.01954255


In [10]:
submission.to_csv('submission_lstm.csv',index=False)

In [11]:
x_train.todense().shape

(19579, 24992)

In [12]:
ttrain_x.shape

(19579, 300)

In [13]:
ttrain_y

array([[1, 0, 0],
       [0, 1, 0],
       [1, 0, 0],
       ..., 
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0]])

In [14]:
y_train

array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])