In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from code.utils import *
from code.binormal_separation import bns

## Data

Load and join the training and test sets.

In [2]:
train = data(features('data/training/'), labels('data/training-class'))
test = data(features('data/test/'), labels('data/test-class'))

## Featurize

In [3]:
bnary = True
incld = 'none'
tknzr = tokenizer().tokenize
vocab = vocabulary(train.text.values, test.text.values, tknzr)

In [4]:
X_train, X_test = represent(train, test, as_binary=bnary,
                            tokenizer=tknzr, vocabulary=vocab)

In [5]:
X_train = append_features(X_train, train, include=incld, tokenizer=tknzr)
X_test = append_features(X_test, test, include=incld, tokenizer=tknzr)

y_train = train.label.values
y_test = test.label.values

In [6]:
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

## Model

### Fit and Predict

In [7]:
clf = LinearSVC()

In [8]:
clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [9]:
y_hat = clf.predict(X_test)

### Score

In [10]:
accuracy_score(y_test, y_hat)

0.97178683385579934

In [11]:
precision_score(y_test, y_hat)

0.8666666666666667

In [12]:
recall_score(y_test, y_hat)

0.65000000000000002

In [13]:
f1_score(y_test, y_hat)

0.74285714285714288