In [54]:
import pandas as pd
import numpy as np
import pickle

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.preprocessing.text import one_hot
from sklearn.preprocessing import LabelEncoder

In [55]:
# Load the competition's train / test CSV files.
train_df = pd.read_csv('D:/Program/dataset/Spooky_Author_Identification/train.csv')
test_df = pd.read_csv('D:/Program/dataset/Spooky_Author_Identification/test.csv')

# One-hot encode the labels: map each author string to an integer code,
# then expand the codes into one indicator column per class.
le = LabelEncoder()
y = le.fit_transform(train_df['author'])
print(le.classes_)

y_train = np_utils.to_categorical(y)

# Display the encoded label matrix as the cell output.
y_train

['EAP' 'HPL' 'MWS']


array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])

In [56]:
# Read the pre-built feature matrices for the train and test sets.
# NOTE(review): pickle.load can execute arbitrary code, so these files must
# come from a trusted source.
with open('keras_input_train.pkl', 'rb') as f:
    sequences = pickle.load(f)

with open('keras_input_test.pkl', 'rb') as f:
    test_sequences = pickle.load(f)

x_train, x_test = sequences, test_sequences

# Show (rows, features) for both matrices, one per line.
print(x_train.shape, x_test.shape, sep='\n')

(19579, 25068)
(8392, 25068)


In [57]:
# Fully connected classifier: three 200-unit ReLU layers followed by a
# 3-unit softmax output. The input width is the number of feature columns.
n_features = x_train.shape[1]

model = Sequential([
    Dense(units=200, input_shape=(n_features,), activation="relu", name="hidden"),
    Dense(units=200, activation="relu", name="hidden2"),
    Dense(units=200, activation="relu", name="hidden3"),
    Dense(units=3, activation="softmax", name="output"),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hidden (Dense)               (None, 200)               5013800   
_________________________________________________________________
hidden2 (Dense)              (None, 200)               40200     
_________________________________________________________________
hidden3 (Dense)              (None, 200)               40200     
_________________________________________________________________
output (Dense)               (None, 3)                 603       
Total params: 5,094,803
Trainable params: 5,094,803
Non-trainable params: 0
_________________________________________________________________


In [58]:
# Configure training: categorical cross-entropy loss (labels are one-hot),
# the Adam optimizer, and accuracy reported as an extra metric.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

In [59]:
# The input must be dense, but x_train is a scipy sparse matrix
# (<class 'scipy.sparse.csr.csr_matrix'>).
# Use ".toarray()" rather than ".todense()": it returns a plain numpy.ndarray,
# whereas ".todense()" returns the discouraged numpy.matrix type.
# validation_split=0.2 holds out 20% of the rows for validation during training.
train_history = model.fit(x_train.toarray(), y_train, validation_split=0.2, epochs=5, batch_size=20, verbose=1)

Train on 15663 samples, validate on 3916 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [60]:
# Evaluate the fitted model; with the compiled 'accuracy' metric the result
# is [loss, accuracy].
# NOTE: x_train / y_train are the same data that was passed to model.fit
# (including its validation split), so this is not a held-out estimate.
# ".toarray()" yields a plain numpy.ndarray instead of the numpy.matrix that
# ".todense()" returns.
scores = model.evaluate(x_train.toarray(), y_train)
print()  # start a fresh line before printing the scores
print(scores)

[0.2165531121138474, 0.96511568516071211]


In [61]:
%%time
prediction = model.predict(x_test.todense())
print()
print("Show 10 prediction result :")  
print(prediction[:10])
print(prediction.shape)


Show 10 prediction result :
[[  3.01727996e-04   1.76775120e-05   9.99680638e-01]
 [  1.00000000e+00   6.48853193e-09   9.70381220e-10]
 [  3.18958424e-04   9.99663949e-01   1.69844243e-05]
 [  9.99985218e-01   1.47614037e-05   1.25481963e-08]
 [  9.99953747e-01   3.40093502e-05   1.23074478e-05]
 [  9.99070585e-01   9.29360976e-04   1.27822830e-07]
 [  9.99671221e-01   1.51088549e-04   1.77640512e-04]
 [  6.76922619e-01   1.74331978e-01   1.48745343e-01]
 [  1.00000000e+00   2.24265949e-16   5.74485538e-18]
 [  9.99999881e-01   1.62129083e-08   1.76212225e-07]]
(8392, 3)
Wall time: 3.73 s


In [62]:
# submission

# One probability column per author class, named after the encoder's classes.
submission = pd.DataFrame(prediction, columns=le.classes_)

# Place the test-set ids in the first column, ahead of the class columns.
submission.insert(0, "id", test_df["id"])
cols = submission.columns.tolist()

submission.head(10)

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.000302,1.767751e-05,0.9996806
1,id24541,1.0,6.488532e-09,9.703812e-10
2,id00134,0.000319,0.9996639,1.698442e-05
3,id27757,0.999985,1.47614e-05,1.25482e-08
4,id04081,0.999954,3.400935e-05,1.230745e-05
5,id27337,0.999071,0.000929361,1.278228e-07
6,id24265,0.999671,0.0001510885,0.0001776405
7,id25917,0.676923,0.174332,0.1487453
8,id04951,1.0,2.242659e-16,5.744855e-18
9,id14549,1.0,1.621291e-08,1.762122e-07


In [63]:
# Write the submission table to the working directory, omitting the row index.
submission.to_csv(path_or_buf='submission.csv', index=False)