In [138]:
import numpy as np   
from sklearn.tree import DecisionTreeClassifier  
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ShuffleSplit
import time
import pandas as pd
import matplotlib.pyplot as plt 

In [139]:
# Load the labelled training set and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data dir.
train_csv_path = "C:/Users/user/Desktop/kaggle/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [140]:
# Show how many samples each author contributes (class balance check).
author_counts = df['author'].value_counts()
print(author_counts)

# Raw sentences (features) and author labels (targets) as NumPy arrays.
X, Y = df["text"].values, df["author"].values

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64


In [141]:
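# This cell is roughly equivalent to the next cell and is kept for reference only.
# Note: it uses use_idf=False, whereas TfidfVectorizer() in the next cell defaults to
# use_idf=True, so the resulting feature weights are not identical.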

# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer

# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(X)
# tf_transformer = TfidfTransformer(use_idf=False).fit(X)
# X = tf_transformer.transform(X)

In [142]:
# CountVectorizer implements both tokenization and occurrence counting in a single class.
#
# In a large text corpus, some words will be very present (e.g. "the", "a", "is" in English),
# hence carrying very little meaningful information about the actual contents of the document.
# If we were to feed the direct count data directly to a classifier, those very frequent terms
# would shadow the frequencies of rarer yet more interesting terms.
#
# TfidfVectorizer combines all the options of CountVectorizer and TfidfTransformer in a single model.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# Fit on the raw text column instead of on `X` itself: the original `X = fit_transform(X)`
# consumed its own output, so re-running this cell passed a sparse matrix to the vectorizer
# and failed. Reading from `df` keeps the cell idempotent; `X` still ends up as the TF-IDF
# sparse matrix that the following cells index into.
X = vectorizer.fit_transform(df["text"].values)
X

<19579x25068 sparse matrix of type '<class 'numpy.float64'>'
	with 429602 stored elements in Compressed Sparse Row format>

In [143]:
# Build 4 random train/test splits, each holding out 25% of the rows for testing.
# ShuffleSplit draws every split independently, so the test sets of different splits may
# overlap: this is repeated random sub-sampling, not a partition of the data into 4 folds.
# random_state is fixed so the splits (and every score computed from them) are reproducible
# across kernel restarts.
rs = ShuffleSplit(n_splits=4, test_size=.25, random_state=100)

# Materialise the (train_index, test_index) pairs once so every model is scored on the same splits.
rs_list = list(rs.split(X))

In [144]:
# Evaluate an entropy-based decision tree (depth capped at 15) on the pre-computed splits.
clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100, max_depth=15)
score_list = []

# time.clock() was removed in Python 3.8; time.perf_counter() is the supported
# high-resolution timer for measuring elapsed intervals.
tic = time.perf_counter()

for train_index, test_index in rs_list:

    # Refit the tree from scratch on this split's training rows.
    clf_entropy.fit(X[train_index], Y[train_index])

    # Predict the held-out rows of this split.
    y_pred_en = clf_entropy.predict(X[test_index])

    # Record the accuracy for this split.
    s = accuracy_score(Y[test_index], y_pred_en)
    score_list.append(s)

# Mean accuracy over all splits.
score = np.mean(score_list)

# The elapsed time covers fitting, predicting and scoring for all splits.
print("Time for training spent {} secs".format(time.perf_counter() - tic))
print("The prediction accuracy score is {}".format(score))
print(clf_entropy.classes_)

# `test_index` is the value left over from the final loop iteration, so these are the
# class probabilities for the last split's test rows, from the model fitted on that split.
print(clf_entropy.predict_proba(X[test_index]))

Time for training spent 8.322034739678202 secs
The prediction accuracy score is 0.5280388151174668
['EAP' 'HPL' 'MWS']
[[ 0.          0.          1.        ]
 [ 0.53065074  0.2659541   0.20339516]
 [ 0.53065074  0.2659541   0.20339516]
 ..., 
 [ 0.33102041  0.4155102   0.25346939]
 [ 0.36363636  0.18181818  0.45454545]
 [ 1.          0.          0.        ]]


In [145]:
from sklearn.naive_bayes import MultinomialNB

# Evaluate a multinomial Naive Bayes classifier on the same pre-computed splits.
# (The variable is called `gnb`, but the model is MultinomialNB; the name is kept because
# later cells reference it.)
gnb = MultinomialNB()
score_list = []

# time.clock() was removed in Python 3.8; time.perf_counter() is the supported
# high-resolution timer for measuring elapsed intervals.
tic = time.perf_counter()

for train_index, test_index in rs_list:

    # Refit the Naive Bayes model on this split's training rows.
    gnb.fit(X[train_index], Y[train_index])

    # Predict the held-out rows of this split.
    y_pred_en = gnb.predict(X[test_index])

    # Record the accuracy for this split.
    s = accuracy_score(Y[test_index], y_pred_en)
    score_list.append(s)

# Mean accuracy over all splits.
score = np.mean(score_list)

# The elapsed time covers fitting, predicting and scoring for all splits.
print("Time for training spent {} secs".format(time.perf_counter() - tic))
print("The prediction accuracy score is {}".format(score))
print(gnb.classes_)

# `test_index` is the value left over from the final loop iteration, so these are the
# class probabilities for the last split's test rows. After this cell `gnb` remains
# fitted on the last split's training rows only.
predict_probability = gnb.predict_proba(X[test_index])
print(predict_probability)

Time for training spent 0.1909767421620927 secs
The prediction accuracy score is 0.8028600612870275
['EAP' 'HPL' 'MWS']
[[ 0.15727955  0.0929278   0.74979265]
 [ 0.81708369  0.02760237  0.15531394]
 [ 0.49135204  0.2913402   0.21730776]
 ..., 
 [ 0.29937187  0.2930788   0.40754932]
 [ 0.52755932  0.21394812  0.25849256]
 [ 0.66419773  0.18866107  0.1471412 ]]


In [146]:
# TEST DATA PREDICT

# Load the unlabelled test set and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data dir.
test_csv_path = "C:/Users/user/Desktop/kaggle/test.csv"
test_df = pd.read_csv(test_csv_path)
test_df.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [157]:
# Vectorise the test sentences with the vocabulary/IDF weights learned on the training text
# (transform only, no refitting), then get per-author probabilities from the fitted NB model.
# NOTE(review): `gnb` was last fitted inside the evaluation loop, i.e. on one split's training
# rows rather than on the full training set; consider refitting on all data before predicting.
X_test = vectorizer.transform(test_df["text"].values)
predict_probability = gnb.predict_proba(X_test)

# Column order of `predict_probability` follows `gnb.classes_`.
print(gnb.classes_)
print(predict_probability)

['EAP' 'HPL' 'MWS']
[[ 0.24306993  0.10767017  0.6492599 ]
 [ 0.74726846  0.13350911  0.11922243]
 [ 0.44088057  0.46265635  0.09646308]
 ..., 
 [ 0.79019247  0.08156215  0.12824538]
 [ 0.29597338  0.07158703  0.63243959]
 [ 0.43648689  0.45694563  0.10656748]]


In [158]:
# submission

# Target column layout: the sample id first, followed by one probability column per author
# in the order reported by the classifier.
cols = ["id"] + gnb.classes_.tolist()

# Build the probability frame, attach the ids (aligned on the default integer index),
# and reorder so that `id` is the leading column.
submission = pd.DataFrame(predict_probability, columns=gnb.classes_)
submission = submission.assign(id=test_df["id"])[cols]
submission.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.24307,0.10767,0.64926
1,id24541,0.747268,0.133509,0.119222
2,id00134,0.440881,0.462656,0.096463
3,id27757,0.561594,0.361322,0.077084
4,id04081,0.699153,0.119951,0.180896


In [154]:
# Write the submission table to the working directory, omitting the DataFrame index column.
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)