In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, recall_score

from code.utils import *

## Data

In [2]:
# Training split: load the feature source and the label source separately,
# then hand both to the project's `data` helper. Later cells read the
# result's `.text` and `.label` attributes.
train_features = features('data/training/')
train_labels = labels('data/training-class')
train = data(train_features, train_labels)

In [3]:
# Held-out split: load the feature source and the label source separately,
# then hand both to the project's `data` helper. Later cells read the
# result's `.text` and `.label` attributes.
test_features = features('data/test/')
test_labels = labels('data/test-class')
test = data(test_features, test_labels)

## Featurize

In [4]:
vocabulary = vocabulary(train.text.values, test.text.values)

In [5]:
cv = CountVectorizer(vocabulary=vocabulary)

In [6]:
# Token-count matrix for the training documents, using the vocabulary that
# was passed to `cv` when it was constructed.
X_train = cv.fit_transform(train.text.values)
# Training targets, taken from the same `train` object as the documents above.
y_train = train.label.values

In [7]:
X_train.shape

(2638, 46421)

In [8]:
# Held-out data goes through `transform`, never `fit_transform`: the
# vectorizer must not be refit on the test set. Because `cv` was built with a
# fixed vocabulary, the columns line up with X_train.
X_test = cv.transform(test.text.values)
# Test targets, taken from the same `test` object as the documents above.
y_test = test.label.values

In [9]:
X_test.shape

(1595, 46421)

## Model

### Fit and Predict

In [10]:
# Linear support-vector classifier with all library defaults.
# NOTE(review): no random_state is passed; consider pinning one if exact
# run-to-run reproducibility of the scores below matters.
clf = LinearSVC()

In [11]:
# Train on the training split; the fitted estimator is the cell's displayed value.
clf.fit(X=X_train, y=y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [12]:
# Predicted labels for the held-out split.
y_hat = clf.predict(X=X_test)

### Score

In [13]:
# Fraction of held-out predictions that match the true labels.
accuracy_score(y_true=y_test, y_pred=y_hat)

0.96677115987460815

In [14]:
# F1 on the held-out split, using the metric's default averaging and
# positive-class settings (none are overridden here).
f1_score(y_true=y_test, y_pred=y_hat)

0.69364161849710992

In [15]:
# Recall on the held-out split, using the metric's default averaging and
# positive-class settings (none are overridden here).
recall_score(y_true=y_test, y_pred=y_hat)

0.59999999999999998