In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from code.utils import *

experiment = False
train_path = 'data/rstraining/' if experiment else 'data/training/'
train_label_path = 'data/rstraining-class' if experiment else 'data/training-class'
test_path = 'data/rstest/' if experiment else 'data/test/'
test_label_path = 'data/rstest-class' if experiment else 'data/test-class'
slang_path = 'data/slang.txt'

## Data

Load and join the training and test sets.

In [2]:
train = data(features(train_path), labels(train_label_path))
test = data(features(test_path), labels(test_label_path))

train['test'] = 0
test['test'] = 1

In [3]:
data_set = train.append(test, ignore_index=True)

In [4]:
data_set.shape

(4233, 4)

## Featurize

Add binary indicators for profane language and slang words.

In [5]:
profane = word_lists('data/profane.txt')
slang = word_lists('lists/slang.txt')
semantic_validity = parse_precomputed_features('lists/semantic_validity.txt')

In [6]:
data_set['profane'] = data_set.text.apply(lambda text: contains(profane, text))
data_set['slang'] = data_set.text.apply(lambda text: contains(slang, text))
data_set['validity'] = data_set.file.apply(lambda file: float(semantic_validity[file]))

Add slang feature. *(Add description.)*

In [7]:
cv = CountVectorizer(vocabulary=slang, tokenizer=tokenizer().tokenize)
data_slang = cv.fit_transform(data_set.text.values)

Create the binary document term matrix and label array.

In [8]:
cv = CountVectorizer(tokenizer=tokenizer().tokenize, binary=True)

X = cv.fit_transform(data_set.text.values)
y = data_set.label.values

Include profane language and slang words indicators as well as slang feature.

In [9]:
X = hstack([X,
            csr_matrix(data_set.profane.tolist()).T,
            csr_matrix(data_set.slang.tolist()).T,
            np.divide(data_slang.sum(axis=1), X.sum(axis=1)),
            csr_matrix(data_set.validity.tolist()).T],
           format='csr')

Split into training and testing matrices.

In [10]:
mask_train = data_set.test.values == 0
mask_test = data_set.test.values == 1

X_train = X[mask_train, :]
X_test = X[mask_test, :]

y_train = y[mask_train]
y_test = y[mask_test]

In [11]:
X_train.shape

(2638, 61046)

In [12]:
X_test.shape

(1595, 61046)

## Model

Note: on **all** current features.

### Fit and Predict

In [13]:
clf = LinearSVC()

In [14]:
clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [15]:
y_hat = clf.predict(X_test)

### Score

In [16]:
accuracy_score(y_test, y_hat)

0.9736677115987461

In [17]:
precision_score(y_test, y_hat)

0.89189189189189189

In [18]:
recall_score(y_test, y_hat)

0.66000000000000003

In [19]:
f1_score(y_test, y_hat)

0.75862068965517249