In [4]:
import os
import pandas as pd

In [5]:
# Data from https://github.com/t-davidson/hate-speech-and-offensive-language
data_dir = "hatedata/"

In [6]:
hspeech = pd.read_csv(os.path.join(data_dir, 'labeled_data.csv'))
y = list(hspeech['class'].values)
x = list(hspeech['tweet'].values)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
vectorizer = TfidfVectorizer()
X_v = vectorizer.fit_transform(x)

In [30]:
X_v

<24783x35852 sparse matrix of type '<class 'numpy.float64'>'
	with 324058 stored elements in Compressed Sparse Row format>

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_v, y)

In [7]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier()
dummy_clf.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=None, strategy='stratified')

In [8]:
y_dummy_pred = dummy_clf.predict(X_test)

In [10]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_dummy_pred, digits=3))

             precision    recall  f1-score   support

          0      0.069     0.072     0.070       348
          1      0.774     0.778     0.776      4789
          2      0.182     0.175     0.178      1059

avg / total      0.633     0.635     0.634      6196



In [11]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [40]:
y_pred = clf.predict(X_test)

In [47]:
y_train_pred = clf.predict(X_train)

In [43]:
from sklearn.metrics import classification_report

In [46]:
from collections import Counter
print(Counter(y))

Counter({1: 19190, 2: 4163, 0: 1430})


In [44]:
print(classification_report(y_test, y_pred, digits=3))

             precision    recall  f1-score   support

          0      0.000     0.000     0.000       361
          1      0.783     0.999     0.878      4789
          2      0.953     0.077     0.143      1046

avg / total      0.766     0.786     0.703      6196



  'precision', 'predicted', average, warn_for)


In [48]:
print(classification_report(y_train, y_train_pred, digits=3))

             precision    recall  f1-score   support

          0      0.000     0.000     0.000      1069
          1      0.800     1.000     0.889     14401
          2      0.983     0.187     0.314      3117

avg / total      0.785     0.806     0.741     18587



  'precision', 'predicted', average, warn_for)


In [19]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jeffpyke/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [29]:
# https://stackoverflow.com/questions/26195699/sklearn-how-to-speed-up-a-vectorizer-eg-tfidfvectorizer
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: (ps.stem(w) for w in analyzer(doc))

tfidf = StemmedTfidfVectorizer(ngram_range=(1,3))

In [30]:
Xv = tfidf.fit_transform(x)



In [31]:
Xv[0]

<1x455214 sparse matrix of type '<class 'numpy.float64'>'
	with 61 stored elements in Compressed Sparse Row format>

In [32]:
from sklearn.model_selection import train_test_split
Xv_train, Xv_test, yv_train, yv_test = train_test_split(Xv, y)

In [33]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(Xv_train, yv_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [34]:
yv_pred = lr.predict(Xv_test)

In [37]:
from sklearn.metrics import classification_report
print(classification_report(yv_test, yv_pred, digits=3))

             precision    recall  f1-score   support

          0      0.714     0.028     0.054       359
          1      0.827     0.993     0.903      4828
          2      0.894     0.341     0.494      1009

avg / total      0.832     0.831     0.787      6196

