In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import pickle

In [2]:
# Load in the data:
def load_data(text_filename, label_filename, nrows=None):
    # Import data\hate\test_text.txt txt file into dataframe
    df_text = pd.read_csv(text_filename, sep='\t', header=None, nrows=nrows)
    # Import corresponding labels into dataframe
    df_labels = pd.read_csv(label_filename, sep='\t', header=None, nrows=nrows)
    # Combine the two datasets into one with header
    df = pd.concat([df_text, df_labels], axis=1)
    # Rename columns
    df.columns = ['text', 'label']
    return df

df_train = load_data('data/hate/train_text.txt', 'data/hate/train_labels.txt', nrows=5000)
df_test = load_data('data/hate/test_text.txt', 'data/hate/test_labels.txt', nrows=1000) # For some reason the test data can't load in all 2970 lines correctly
df_test

Unnamed: 0,text,label
0,"@user , you are correct that Reid certainly is...",0
1,Whoever just unfollowed me you a bitch,1
2,@user @user Those People Invaded Us!!! They DO...,1
3,"stop JUDGING bitches by there cover, jus cuz s...",1
4,how about i knock heads off and send them gift...,1
...,...,...
995,Bitch shut the fuck with your muggle ass...ava...,1
996,My cousins and dude made me slow down on being...,0
997,Shakespeare was really wildin' back in the day...,0
998,@user Are those not grapes bitch?,0


## Vectorizer:

In [3]:
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df_train['text'])
X

<5000x11862 sparse matrix of type '<class 'numpy.int64'>'
	with 54540 stored elements in Compressed Sparse Row format>

## Model:

In [4]:
model = SGDClassifier(loss='log')
model.fit(X, df_train['label'])

SGDClassifier(loss='log')

## Evaluate:

In [8]:
pickle.dump( vectorizer, open( "vec.p", "wb" ) )
pickle.dump( model, open( "model.p", "wb" ) )


In [11]:
# Measure the performance
X_test = vectorizer.transform(df_test['text'])
y_pred = model.predict(X_test)
target_names = ['not hat', 'hate']
print(classification_report(df_test['label'], y_pred, target_names=target_names))

              precision    recall  f1-score   support

     not hat       0.63      0.53      0.58       579
        hate       0.47      0.57      0.51       421

    accuracy                           0.55      1000
   macro avg       0.55      0.55      0.54      1000
weighted avg       0.56      0.55      0.55      1000

