In [14]:
import os
import pickle

import numpy as np
import pandas as pd
from keras.layers import Dense, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import one_hot
from keras.regularizers import l1,l2
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

In [15]:
# Directory that holds train.csv and test.csv.
# The default is the original author's local path; set the SPOOKY_DATA_DIR
# environment variable to run the notebook on another machine without
# editing this cell.
DATA_DIR = os.environ.get('SPOOKY_DATA_DIR', 'D:/Program/dataset/Spooky_Author_Identification')

# Forward slashes are used so the default resolves to exactly the original paths.
train_df = pd.read_csv(DATA_DIR + '/train.csv')
test_df = pd.read_csv(DATA_DIR + '/test.csv')


# Encode the author labels as one-hot target vectors.

# Step 1: map each author string to an integer class id.
# `le` is kept because its `classes_` attribute is used later to label
# the prediction columns.
le = LabelEncoder()
y = le.fit_transform(train_df['author'])
print(le.classes_)

# Step 2: expand the integer ids into one-hot rows (one column per class).
y_train = np_utils.to_categorical(y)

# Display the encoded targets as the cell output.
y_train

['EAP' 'HPL' 'MWS']


array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])

In [16]:
# Load the pre-computed feature matrices for the train and test sets.
# NOTE: unpickling runs arbitrary code embedded in the file, so only load
# pickles that you generated yourself.
with open('keras_input_train.pkl', 'rb') as train_file:
    sequences = pickle.load(train_file)

with open('keras_input_test.pkl', 'rb') as test_file:
    test_sequences = pickle.load(test_file)

# Aliases used by the modelling cells below.
x_train, x_test = sequences, test_sequences

# Sanity check: print the shape of the train matrix, then the test matrix.
for feature_matrix in (x_train, x_test):
    print(feature_matrix.shape)

(19579, 24992)
(8392, 24992)


In [40]:
# Feed-forward classifier: three identical regularized hidden layers,
# each followed by dropout, then a 3-way softmax output.
hidden_layer_names = ("hidden", "hidden2", "hidden3")

model = Sequential()

# use regularizer to avoid overfitting
for position, layer_name in enumerate(hidden_layer_names):
    # Only the first layer declares the input width (number of feature columns).
    first_layer_kwargs = {"input_shape": (x_train.shape[1],)} if position == 0 else {}

    # Fresh regularizer objects are created for every layer, as in the
    # original hand-written version.
    model.add(
        Dense(
            units=200,
            activation="relu",
            kernel_regularizer=l2(0.00001),
            activity_regularizer=l1(0.0001),
            name=layer_name,
            **first_layer_kwargs
        )
    )
    model.add(Dropout(0.2))

# Output layer: one probability per class.
model.add(Dense(units=3, activation="softmax", name="output"))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hidden (Dense)               (None, 200)               4998600   
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
hidden2 (Dense)              (None, 200)               40200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
hidden3 (Dense)              (None, 200)               40200     
_________________________________________________________________
dropout_3 (Dropout)          (None, 200)               0         
_________________________________________________________________
output (Dense)               (None, 3)                 603       
Total para

In [41]:
# Configure training: cross-entropy over the one-hot targets, Adam optimizer,
# and accuracy reported alongside the loss.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [42]:
# Train the network for 5 epochs in mini-batches of 20, with 20% of the
# rows held out for validation via `validation_split`.
#
# Keras needs a dense input, but x_train is a
# <class 'scipy.sparse.csr.csr_matrix'>. `.toarray()` is used instead of
# `.todense()` because `.todense()` returns a `numpy.matrix`, a class NumPy
# discourages, while `.toarray()` returns a plain ndarray with the same values.
# The dense copy is built inline so it can be freed as soon as fit() returns.
# NOTE(review): the dense copy is large (rows x columns of x_train); casting
# the sparse matrix to float32 first would halve it -- confirm the feature
# dtype before doing so.
train_history = model.fit(x_train.toarray(), y_train, validation_split=0.2, epochs=5, batch_size=20, verbose=1)

Train on 15663 samples, validate on 3916 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [43]:
# Compute the compiled loss and metrics for the model.
# NOTE: this evaluates on x_train / y_train, the same data passed to fit(),
# so the reported numbers are optimistic and do not measure generalization.
#
# `.toarray()` (plain ndarray) replaces `.todense()` (numpy.matrix, a class
# NumPy discourages); the values passed to Keras are the same.
scores = model.evaluate(x_train.toarray(), y_train)
# Blank line so the scores start on their own line after evaluate()'s output.
print()
print(scores)

[0.2084850277925781, 0.95658613822757177]


In [44]:
%%time
# Predict class probabilities for every test row.
# `.toarray()` (plain ndarray) replaces `.todense()` (numpy.matrix, a class
# NumPy discourages); the values passed to Keras are the same.
prediction = model.predict(x_test.toarray())
print()
# Show the first ten rows and the overall shape as a quick sanity check.
print("Show 10 prediction result :")  
print(prediction[:10])
print(prediction.shape)


Show 10 prediction result :
[[  1.54014537e-03   1.06270122e-03   9.97397184e-01]
 [  9.94073212e-01   3.16292769e-03   2.76380521e-03]
 [  9.48968232e-01   4.24261577e-02   8.60555284e-03]
 [  9.29553509e-01   6.33916557e-02   7.05479272e-03]
 [  9.93693471e-01   2.83501716e-03   3.47146275e-03]
 [  8.38207185e-01   1.38528273e-01   2.32645106e-02]
 [  9.61651802e-01   3.19257788e-02   6.42235996e-03]
 [  5.11639230e-02   3.18004727e-01   6.30831361e-01]
 [  9.99853253e-01   6.82993777e-05   7.84516160e-05]
 [  9.97769117e-01   1.08538358e-03   1.14556402e-03]]
(8392, 3)
Wall time: 4.64 s


In [45]:
# Build the submission table: the `id` column first, followed by one
# probability column per author class (in the encoder's class order).

submission = pd.DataFrame(prediction, columns=le.classes_)
submission["id"] = test_df["id"]

# Move `id` to the front while keeping the class columns in their original order.
column_order = ["id"] + list(le.classes_)
submission = submission[column_order]

# Preview the first ten rows.
submission.head(10)

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.00154,0.001063,0.997397
1,id24541,0.994073,0.003163,0.002764
2,id00134,0.948968,0.042426,0.008606
3,id27757,0.929554,0.063392,0.007055
4,id04081,0.993693,0.002835,0.003471
5,id27337,0.838207,0.138528,0.023265
6,id24265,0.961652,0.031926,0.006422
7,id25917,0.051164,0.318005,0.630831
8,id04951,0.999853,6.8e-05,7.8e-05
9,id14549,0.997769,0.001085,0.001146


In [46]:
# Write the submission table to disk without the DataFrame index column.
submission.to_csv(path_or_buf='submission_NN.csv', index=False)