In [143]:
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import coo_matrix, vstack, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, recall_score

from code.utils import *
experiment = False
train_path = 'data/rstraining/' if experiment else 'data/training/'
train_label_path = 'data/rstraining-class' if experiment else 'data/training-class'
test_path = 'data/rstest/' if experiment else 'data/test/'
test_label_path = 'data/rstest-class' if experiment else 'data/test-class'
slang_path = 'data/slang.txt'

## Data

In [145]:
train = data(features(train_path), labels(train_label_path))

In [146]:
test = data(features(test_path), labels(test_label_path))

## Featurize

In [147]:
vocab = vocabulary(train.text.values, test.text.values)

In [148]:
slang = slangwords(slang_path)

In [158]:
cv = CountVectorizer(vocabulary=vocab)
slang_cv = CountVectorizer(vocabulary=slang)

In [167]:
X_train = cv.fit_transform(train.text.values)
y_train = train.label.values
slang_train = slang_cv.fit_transform(train.text.values)
X_train = hstack([X_train, np.divide(slang_train.sum(axis=1), X_train.sum(axis=1))])

In [168]:
X_test = cv.fit_transform(test.text.values)
y_test = test.label.values
slang_test = slang_cv.fit_transform(test.text.values)
X_test = hstack([X_test, np.divide(slang_test.sum(axis=1), X_test.sum(axis=1))])

In [169]:
X_test.shape

(1595, 46422)

In [171]:
X_train.shape

(2638, 46422)

## Model

### Fit and Predict

In [172]:
clf = LinearSVC()

In [173]:
clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [175]:
y_hat = clf.predict(X_test)

### Score

In [176]:
accuracy_score(y_test, y_hat)

0.96677115987460815

In [177]:
f1_score(y_test, y_hat)

0.69364161849710992

In [178]:
recall_score(y_test, y_hat)

0.59999999999999998