In [19]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from keras.utils import np_utils

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

In [20]:
# Read the competition train/test CSVs in one pass; the first path yields
# train_df and the second yields test_df.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'D:/Program/dataset/Spooky_Author_Identification/train.csv',
        'D:/Program/dataset/Spooky_Author_Identification/test.csv',
    )
)

# Target labels: the raw values of the "author" column of the training frame.
y_train = train_df.loc[:, "author"].values

In [21]:
# Load the pre-computed feature matrices that were pickled to disk.
# NOTE(review): pickle.load executes arbitrary code from the file, so these
# .pkl files must come from a trusted source.
with open('keras_input_train.pkl', 'rb') as train_file:
    x_train = pickle.load(train_file)
with open('keras_input_test.pkl', 'rb') as test_file:
    x_test = pickle.load(test_file)

# Keep the original names available as aliases of the same objects.
sequences, test_sequences = x_train, x_test

# Show the (rows, columns) shape of the train and test matrices, in that order.
for feature_matrix in (x_train, x_test):
    print(feature_matrix.shape)

(19579, 24992)
(8392, 24992)


In [22]:
%%time
# use cross validation

SPLIT = 10
train_pred = 0

print("The number of splitting iterations in the cross-validator : {}".format(SPLIT))
print()

clf_linearsvc = LinearSVC(random_state=0, verbose=1)
kf = KFold(n_splits=SPLIT,shuffle=True)
for train_index, test_index in kf.split(x_train):
    clf_linearsvc.fit(x_train[train_index].todense(), y_train[train_index])
    
    y_pred = clf_linearsvc.predict(x_train[test_index].todense())
    s = accuracy_score(y_train[test_index],y_pred)
    print("The accuracy score of cross validation is {}".format(s))
    
    train_pred += s/SPLIT

print()    
print("The mean of the cross validation score is {}".format(train_pred))

The number of splitting iterations in the cross-validator : 10

[LibLinear]The accuracy score of cross validation is 0.8130745658835546
[LibLinear]The accuracy score of cross validation is 0.8038815117466803
[LibLinear]The accuracy score of cross validation is 0.8130745658835546
[LibLinear]The accuracy score of cross validation is 0.8192032686414709
[LibLinear]The accuracy score of cross validation is 0.8100102145045965
[LibLinear]The accuracy score of cross validation is 0.8227783452502554
[LibLinear]The accuracy score of cross validation is 0.8227783452502554
[LibLinear]The accuracy score of cross validation is 0.8232890704800817
[LibLinear]The accuracy score of cross validation is 0.8074565883554647
[LibLinear]The accuracy score of cross validation is 0.8129790495656617

The mean of the cross validation score is 0.8148525525561577
Wall time: 30.9 s


In [23]:
# Accuracy of clf_linearsvc on the full training matrix.
# NOTE: clf_linearsvc was last fitted inside the cross-validation loop, i.e. on
# the training split of the final fold, so most of these rows were seen during
# fitting. Read this number as (mostly) training accuracy, not as an estimate
# of generalisation.
# x_train is passed without .todense(): densifying the whole matrix is
# memory-hungry and yields an np.matrix that recent scikit-learn versions
# reject; the next cells already call predict()/fit() on the raw matrices.
y_pred = clf_linearsvc.predict(x_train)
s = accuracy_score(y_train, y_pred)
print("The accuracy score {}".format(s))

The accuracy score 0.967056540170591


In [24]:
# LinearSVC doesn't have a "predict_proba" function, so despite its name
# `predict_probability` holds the output of predict() on the test matrix,
# not probabilities.
known_classes = clf_linearsvc.classes_
print(known_classes)

predict_probability = clf_linearsvc.predict(x_test)
print(np.shape(predict_probability))

['EAP' 'HPL' 'MWS']
(8392,)


In [25]:
"""
I use a linear SVM from scikit-learn (LinearSVC) for a binary classification problem.
I understand that LinearSVC can give me the predicted labels
and the decision scores, but I want probability estimates (confidence in the label).
I want to continue using LinearSVC because of its speed (compared to sklearn.svm.SVC with a linear kernel).
Is it reasonable to use a logistic function to convert the decision scores to probabilities?


scikit-learn provides CalibratedClassifierCV, which can be used to solve this problem:
it adds probability output to LinearSVC or any other classifier that implements the decision_function method.
"""

'\nI use a linear SVM from scikit-learn (LinearSVC) for a binary classification problem.\nI understand that LinearSVC can give me the predicted labels\nand the decision scores, but I want probability estimates (confidence in the label).\nI want to continue using LinearSVC because of its speed (compared to sklearn.svm.SVC with a linear kernel).\nIs it reasonable to use a logistic function to convert the decision scores to probabilities?\n\n\nscikit-learn provides CalibratedClassifierCV, which can be used to solve this problem:\nit adds probability output to LinearSVC or any other classifier that implements the decision_function method.\n'

In [26]:
%%time
clf = CalibratedClassifierCV(clf_linearsvc) 
clf.fit(x_train, y_train) 
predicted_test_proba = clf.predict_proba(x_test) 
predicted_test_proba[:10]

[LibLinear][LibLinear][LibLinear]Wall time: 723 ms


In [27]:
# submission
# Final column order: "id" first, followed by one probability column per class
# in the order reported by the calibrated classifier.
cols = ["id"] + clf.classes_.tolist()

# Build the probability table, attach the test ids, then reorder the columns.
submission = (
    pd.DataFrame(predicted_test_proba, columns=clf.classes_)
    .assign(id=test_df["id"])
    [cols]
)
submission.head(10)

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.239326,0.032669,0.728004
1,id24541,0.804704,0.184019,0.011277
2,id00134,0.524274,0.413373,0.062353
3,id27757,0.763704,0.223888,0.012408
4,id04081,0.902076,0.055704,0.042221
5,id27337,0.618611,0.370156,0.011233
6,id24265,0.567026,0.429518,0.003456
7,id25917,0.056053,0.297341,0.646606
8,id04951,0.948434,0.047442,0.004124
9,id14549,0.865714,0.085618,0.048668


In [28]:
# Write the submission table to 'submission_SVM.csv' in the working directory;
# index=False keeps the DataFrame's row index out of the file.
submission.to_csv('submission_SVM.csv',index=False)