In [13]:
import pandas as pd
import numpy as np
import pickle

from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Flatten
from keras.utils import np_utils
from keras.preprocessing.text import one_hot
from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing

In [14]:
train_df = pd.read_csv('D:/Program/dataset/Spooky_Author_Identification/train.csv')
test_df = pd.read_csv('D:/Program/dataset/Spooky_Author_Identification/test.csv')

# Label Onehot encoding  

le = LabelEncoder()
le.fit(train_df['author'])
print(le.classes_)

y = le.transform(train_df['author'])
y_train = np_utils.to_categorical(y)

y_train

['EAP' 'HPL' 'MWS']


array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])

In [15]:
with open('keras_input_train.pkl', 'rb') as f:
    sequences = pickle.load(f)
with open('keras_input_test.pkl', 'rb') as f:
    test_sequences = pickle.load(f)
    
x_train = sequences
x_test = test_sequences

print(x_train.shape)
print(x_test.shape)

(19579, 24992)
(8392, 24992)


In [17]:
# tfidf

NUM_WORDS = 16000
N = 12
MAX_LEN = 300

tmp_X = train_df['text']
tmp_Y = train_df['author']
tmp_X_test = test_df['text']

tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(tmp_X)

ttrain_x = tokenizer.texts_to_sequences(tmp_X)
ttrain_x = pad_sequences(ttrain_x, maxlen=MAX_LEN)

ttest_x = tokenizer.texts_to_sequences(tmp_X_test)
ttest_x = pad_sequences(ttest_x, maxlen=MAX_LEN)

lb = preprocessing.LabelBinarizer()
lb.fit(tmp_Y)

ttrain_y = lb.transform(tmp_Y)

In [18]:
model = Sequential()

# If use LSTM, add Embedding layer first
# If "return_sequences=True" , output shape will add one dimension in the middle
# Add "Flatten()" let output dimension to one
model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
model.add(LSTM(N, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(Flatten())

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 12)           192000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 300, 12)           1200      
_________________________________________________________________
flatten_1 (Flatten)          (None, 3600)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                230464    
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 195       
Total params: 423,859
Trainable params: 423,859
Non-trainable params: 0
_________________________________________________________________


In [19]:
model.fit(ttrain_x, ttrain_y, 
                  validation_split=0.1,
                  batch_size=128, epochs=10, 
                  verbose=1,
                 )

Train on 17621 samples, validate on 1958 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x228a642fcc0>

In [20]:
scores = model.evaluate(ttrain_x, ttrain_y)  
print()
print(scores)


[0.10344426550426419, 0.97609683843718176]


In [21]:
%%time
prediction = model.predict(ttest_x)
print()
print("Show 10 prediction result :")  
print(prediction[:10])
print(prediction.shape)


Show 10 prediction result :
[[  5.11398481e-04   1.68094994e-05   9.99471843e-01]
 [  1.00000000e+00   1.04812869e-08   5.85301851e-09]
 [  1.27971254e-03   9.98595417e-01   1.24923972e-04]
 [  3.58149819e-02   9.64108169e-01   7.68208411e-05]
 [  9.99523997e-01   2.56788946e-04   2.19228867e-04]
 [  4.32424575e-01   5.67425549e-01   1.49814121e-04]
 [  9.94698644e-01   4.89916280e-03   4.02204547e-04]
 [  1.05108891e-03   2.53042235e-04   9.98695910e-01]
 [  1.00000000e+00   4.56178828e-09   9.12870127e-12]
 [  9.95947182e-01   2.65623850e-04   3.78729939e-03]]
(8392, 3)
Wall time: 22.3 s


In [22]:
# submission

submission = pd.DataFrame(prediction, columns=le.classes_)
submission["id"] = test_df["id"]
cols = submission.columns.tolist()
cols = cols[-1:]+cols[:-1]
submission = submission[cols]
submission.head(10)

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.000511,1.68095e-05,0.9994718
1,id24541,1.0,1.048129e-08,5.853019e-09
2,id00134,0.00128,0.9985954,0.000124924
3,id27757,0.035815,0.9641082,7.682084e-05
4,id04081,0.999524,0.0002567889,0.0002192289
5,id27337,0.432425,0.5674255,0.0001498141
6,id24265,0.994699,0.004899163,0.0004022045
7,id25917,0.001051,0.0002530422,0.9986959
8,id04951,1.0,4.561788e-09,9.128701e-12
9,id14549,0.995947,0.0002656239,0.003787299


In [23]:
submission.to_csv('submission_lstm.csv',index=False)

In [25]:
x_train.todense().shape

(19579, 24992)

In [25]:
ttrain_x.shape

(19579, 300)

In [26]:
ttrain_y

array([[1, 0, 0],
       [0, 1, 0],
       [1, 0, 0],
       ..., 
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0]])

In [27]:
y_train

array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])