In [1]:
import numpy as np   
from sklearn.tree import DecisionTreeClassifier  
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ShuffleSplit
import time
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier

In [2]:
# Load the labelled training data (columns shown below: id, text, author)
train_csv_path = "C:/Users/user/Desktop/kaggle_Spooky_Author_Identification/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
# Target: the author label of every training row
Y = df["author"].values

# TfidfVectorizer bundles CountVectorizer + TfidfTransformer into a single model;
# it is fitted here on the training text and reused later for the test text.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["text"].values)
X

<19579x25068 sparse matrix of type '<class 'numpy.float64'>'
	with 429602 stored elements in Compressed Sparse Row format>

In [4]:
# Draw 4 random train/test splits, each holding out 25% of the rows for testing.
# ShuffleSplit re-shuffles for every split, so these are NOT 4 disjoint pieces:
# the test sets of different splits may overlap.
# A fixed random_state keeps the splits (and the scores below) reproducible
# across kernel restarts.
rs = ShuffleSplit(n_splits=4, test_size=.25, random_state=42)
# Materialise the (train_index, test_index) pairs so later cells can reuse them
rs_list = list(rs.split(X))

In [5]:
# Train and score a small MLP (two hidden layers of 10 units) on every split in rs_list.
# NOTE: the same estimator is re-fitted on each split, so after the loop `mlp`
# holds the model trained on the LAST split's training rows only; later cells
# use that model for their predictions.
mlp = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=200,learning_rate_init=0.1)
score_list = []
# time.clock() was removed in Python 3.8; time.perf_counter() is its replacement
# for measuring elapsed time.
tic = time.perf_counter()

for train_index, test_index in rs_list:
    cross_tic = time.perf_counter()
    mlp.fit(X[train_index],Y[train_index])
    y_pred_en = mlp.predict(X[test_index])
    s = accuracy_score(Y[test_index],y_pred_en)
    score_list.append(s)
    print("The cross validation score in part is {}".format(s))
    print("The training of part costs {} s".format(time.perf_counter()-cross_tic))
    print("=====================NEXT PART=====================")

# Mean accuracy over all splits
score = np.mean(score_list)
print("\nThe prediction accuracy score is {}".format(score))
print("Time for training spent {} secs".format(time.perf_counter() - tic))

The cross validation score in part is 0.7836567926455567
The training of part costs 16.30496479796316 s
The cross validation score in part is 0.7961184882533198
The training of part costs 11.103499854974416 s
The cross validation score in part is 0.7875383043922369
The training of part costs 17.959830170326082 s
The cross validation score in part is 0.7957099080694586
The training of part costs 15.64452650480964 s

The prediction accuracy score is 0.790755873340143
Time for training spent 61.01326235353339 secs


In [6]:
# Class probabilities on the last validation split, shown as 8-decimal strings.
print(mlp.classes_)

# Take the test indices of the last split explicitly instead of relying on the
# `test_index` loop variable leaking out of the cross-validation cell.
last_test_index = rs_list[-1][1]
predict_probability = mlp.predict_proba(X[last_test_index])

# Format every probability with 8 decimals; the output keeps the shape of the
# probability array, so the number of classes is no longer hard-coded to 3.
formatted_list = np.array(
    ['%.8f' % prob for prob in predict_probability.ravel()]
).reshape(predict_probability.shape)
print(formatted_list)

['EAP' 'HPL' 'MWS']
[['0.00592130' '0.99407741' '0.00000129']
 ['0.00674830' '0.99324794' '0.00000376']
 ['1.00000000' '0.00000000' '0.00000000']
 ..., 
 ['0.05163592' '0.94836390' '0.00000018']
 ['0.00322837' '0.99677162' '0.00000001']
 ['0.99475323' '0.00001968' '0.00522709']]


In [7]:
# TEST DATA PREDICT
# Load the unlabelled test set (columns shown below: id, text)
test_csv_path = "C:/Users/user/Desktop/kaggle_Spooky_Author_Identification/test.csv"
test_df = pd.read_csv(test_csv_path)
test_df.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [8]:
# Vectorise the test text with the vectorizer already fitted on the training
# text (transform only, no re-fit), then predict class probabilities.
X_test = vectorizer.transform(test_df["text"].values)
predict_probability = mlp.predict_proba(X_test)
print(mlp.classes_)

# Format every probability with 6 decimals; the output keeps the shape of the
# probability array, so the number of classes is no longer hard-coded to 3.
formatted_list = np.array(
    ['%.6f' % prob for prob in predict_probability.ravel()]
).reshape(predict_probability.shape)
print(formatted_list)

['EAP' 'HPL' 'MWS']
[['0.001409' '0.000000' '0.998591']
 ['0.999990' '0.000000' '0.000009']
 ['0.002514' '0.997486' '0.000000']
 ..., 
 ['0.474629' '0.000216' '0.525156']
 ['0.004724' '0.000000' '0.995276']
 ['0.003565' '0.996435' '0.000000']]


In [9]:
# submission
# One probability column per class (in mlp.classes_ order), with the test-set
# id placed as the first column.
submission = pd.DataFrame(formatted_list, columns=mlp.classes_)
submission.insert(0, "id", test_df["id"])
submission.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.001409,0.0,0.998591
1,id24541,0.99999,0.0,9e-06
2,id00134,0.002514,0.997486,0.0
3,id27757,0.999998,2e-06,0.0
4,id04081,0.996115,0.0,0.003885


In [10]:
# Write the submission file without the DataFrame index column
submission_path = 'submission_nn.csv'
submission.to_csv(submission_path, index=False)