In [1]:
import numpy as np

from sklearn import metrics
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the preprocessed train and test splits; every comma-separated field is kept as a string.
train_data, test_data = (
    np.genfromtxt(path, delimiter=',', dtype=str)
    for path in ('./AGNews_data/proc_train.txt', './AGNews_data/proc_test.txt')
)

In [3]:
# Shuffle the training rows in place (along the first axis).
# The global NumPy RNG is seeded first so that a fresh "Restart & Run All"
# always produces the same row order, and therefore the same downstream results.
SEED = 42  # arbitrary fixed value; change it to draw a different permutation
np.random.seed(SEED)
np.random.shuffle(train_data)

In [4]:
# Split each dataset into inputs (column 0, later fed to the vectorizer)
# and integer class labels (column 1, cast from string to int).
train_x, train_y = train_data[:, 0], train_data[:, 1].astype(int)

test_x, test_y = test_data[:, 0], test_data[:, 1].astype(int)

In [5]:
# Bag-of-words features: word-level TF-IDF with English stop words removed.
# Alternative kept for reference: CountVectorizer(analyzer='word', stop_words='english'),
# optionally with ngram_range=(1, 2), for raw term counts instead of TF-IDF weights.
vectorizer = TfidfVectorizer(stop_words='english', analyzer='word')

# Fit on the training inputs only, then apply the same fitted vectorizer to the test inputs.
train_vectors = vectorizer.fit_transform(train_x)
test_vectors = vectorizer.transform(test_x)

In [9]:
# Linear SVM: sweep the regularization parameter C over 100 evenly spaced
# values in [0.01, 1] and remember the model with the highest accuracy.
#
# NOTE(review): accuracy is measured on (test_vectors, test_y) and the same
# numbers are used to pick the best C, so the "best" accuracy printed at the
# end is an optimistic estimate. Consider choosing C on a validation split
# held out from the training data and scoring the test split only once.
best_model = None  # replaced by the first classifier whose accuracy exceeds best_acc
best_c = 0
best_acc = 0

c_list = np.linspace(0.01, 1, 100)

for c in c_list:
    svc_clf = svm.LinearSVC(C=c)
    svc_clf.fit(train_vectors, train_y)
    # Predict on the vectorizer output as-is: converting it to a dense array
    # with .toarray() on every iteration only costs memory and time.
    pred_y = svc_clf.predict(test_vectors)
    acc = metrics.accuracy_score(test_y, pred_y)

    # Strict '>' keeps the smallest C among ties, because c_list is ascending.
    if acc > best_acc:
        best_model = svc_clf
        best_c = c
        best_acc = acc

    print("(c={}): accuracy={}".format(c, acc))

print("Best model: (c={}): accuracy={}".format(best_c, best_acc))

(c=0.01): accuracy=0.8990789473684211
(c=0.02): accuracy=0.9053947368421053
(c=0.03): accuracy=0.9093421052631578
(c=0.04): accuracy=0.9114473684210527
(c=0.05): accuracy=0.9139473684210526
(c=0.060000000000000005): accuracy=0.9153947368421053
(c=0.06999999999999999): accuracy=0.9178947368421052
(c=0.08): accuracy=0.9184210526315789
(c=0.09): accuracy=0.9193421052631578
(c=0.09999999999999999): accuracy=0.92
(c=0.11): accuracy=0.9202631578947369
(c=0.12): accuracy=0.9207894736842105
(c=0.13): accuracy=0.9206578947368421
(c=0.14): accuracy=0.9210526315789473
(c=0.15000000000000002): accuracy=0.9219736842105263
(c=0.16): accuracy=0.9223684210526316
(c=0.17): accuracy=0.9228947368421052
(c=0.18000000000000002): accuracy=0.9223684210526316
(c=0.19): accuracy=0.9221052631578948
(c=0.2): accuracy=0.9221052631578948
(c=0.21000000000000002): accuracy=0.9222368421052631
(c=0.22): accuracy=0.9221052631578948
(c=0.23): accuracy=0.9223684210526316
(c=0.24000000000000002): accuracy=0.9225
(c=0.25):