In [61]:
import pandas as pd
import re
import numpy as np
import string
from textstat.textstat import textstat
from textblob import TextBlob
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib

%matplotlib inline

In [33]:
class PorterTokenizer(object):
    """Custom PorterTokenizer for TfidfVectorizer"""

    def __init__(self):
        self.stemmer = PorterStemmer()

    def __call__(self, doc):
        translate_table = dict((ord(char), None) for char in string.punctuation)
        return [self.stemmer.stem(t) for t in word_tokenize(doc.translate(translate_table))]

class Vectorizer(object):
    """Vecotizer wrapper for sklearn TfidfVectorizer.

    Allows passing of custom tokenizer

    TODO: add more custom tokenizers"""

    def __init__(self,
                 tokenizer=None,
                 encoding='utf-8',
                 stop_words='english',
                 min_df=1,
                 ngram_range=None):
        self.tokenizers = {'porter': PorterTokenizer()}
        self.vectorizer = TfidfVectorizer(tokenizer=self.tokenizers[tokenizer],
                                          encoding=encoding,
                                          stop_words=stop_words,
                                          min_df=min_df,
                                          ngram_range=ngram_range)

    def fit(self, X):
        self.vectorizer.fit(X)
        return self

    def fit_transform(self, X):
        return self.vectorizer.fit_transform(X)

    def transform(self, X):
        return self.vectorizer.transform(x)

In [62]:
# feature extraction
def remove_handles(content):
    return ' '.join(re.sub("(@[A-Za-z0-9]+)"," ",content).split())

def count_handles(content):
    return len(re.findall("(@[A-Za-z0-9]+)",content))

def bool_handles(content):
    match = re.search("(@[A-Za-z0-9]+)", content)
    if match:
        return 1
    else: return 0

def count_hashtags(content):
    return len(re.findall("(#[A-Za-z0-9]+)",content))

def bool_hashtags(content):
    match = re.search("(#[A-Za-z0-9]+)", content)
    if match:
        return 1
    else: return 0

def is_retweet(content):
    return int("RT " in content)

def has_url(content):
    return int("https://" in content or "http://" in content)

def build_POS_list(content):
    content = content.decode('latin-1')
    return ' '.join([item[1] for item in pos_tag(word_tokenize(content))])

def create_features(df, feature_cols, vec, pos_vectorizer):
    features = df[feature_cols].values
    n_gram_vector = vec.vectorizer.transform(df['tweet_no_handle'].values)
    pos_ngram_vector = pos_vectorizer.transform(df['pos_tags'].values)
    feature_vector = np.concatenate((n_gram_vector.todense(), features, pos_ngram_vector.todense()), axis=1)
    
    return feature_vector

# Model specifications for tweets
def test_model(base_model, param_grid):
    grid_clf = GridSearchCV(base_model, param_grid, cv=5)
    grid_clf.fit(train_features, y_train)
    preds = grid_clf.predict(test_features)
    print(classification_report(y_test, preds))
    return grid_clf

def top_words(clf, label, top):
    data = []
    for i in clf.best_estimator_.coef_[label, :].argsort()[::-1][:top]:
        top_words = (i, clf.best_estimator_.coef_[0, i], vec.vectorizer.get_feature_names()[i])
        return_data.append("{}".format(top_words[2]))
    return data

In [63]:
text_only_df = pd.read_csv('data/labels_and_text_only.csv', index_col=0)

In [None]:
## Features/ Columns

In [None]:
text_only_df['tweet_no_handle'] = text_only_df['tweet_text'].apply(remove_handles)
text_only_df['reading_ease'] = text_only_df['tweet_no_handle'].apply(textstat.flesch_reading_ease)
text_only_df['reading_grade'] = text_only_df['tweet_no_handle'].apply(textstat.flesch_kincaid_grade)
text_only_df['sentiment'] = text_only_df['tweet_no_handle'].map(lambda x: TextBlob(x.decode('latin-1')).polarity)
text_only_df['subjectivity'] = text_only_df['tweet_no_handle'].map(lambda x: TextBlob(x.decode('latin-1')).subjectivity)
text_only_df['mentions_count'] = text_only_df['tweet_text'].apply(count_handles)
text_only_df['mentions_bool'] = text_only_df['tweet_text'].apply(bool_handles)
text_only_df['hashtag_count'] = text_only_df['tweet_text'].apply(count_hashtags)
text_only_df['hashtag_bool'] = text_only_df['tweet_text'].apply(bool_hashtags)
text_only_df['has_url'] = text_only_df['tweet_text'].apply(is_retweet)
text_only_df['tweet_length'] = text_only_df['tweet_no_handle'].apply(len)
text_only_df['word_count'] = text_only_df['tweet_no_handle'].apply(textstat.lexicon_count)
text_only_df['syllable_count'] = text_only_df['tweet_no_handle'].apply(textstat.syllable_count)
text_only_df['pos_tags'] = text_only_df['tweet_no_handle'].apply(build_POS_list)

In [None]:
# Determine logisitic model with x and y

In [None]:
X = text_only_df.drop(['tweet_text', 'labels'], axis=1)
y = text_only_df['labels']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)

In [None]:
vec = Vectorizer(tokenizer='porter',
                   encoding='latin-1',
                   min_df=5,
                   ngram_range=(1,3))
pos_vectorizer = CountVectorizer(ngram_range=(1,3), min_df=5)
text_only_df['pos_tag_ngrams'] = pos_vectorizer.fit_transform(text_only_df['pos_tags']).todense().tolist()

In [None]:
#text_only_df.columns

In [None]:
feature_cols = [u'reading_ease',
                u'reading_grade', 
                u'sentiment', 
                u'subjectivity', 
                u'mentions_count',
                u'mentions_bool', 
                u'hashtag_count', 
                u'hashtag_bool', 
                u'has_url',
                u'tweet_length', 
                u'word_count', 
                u'syllable_count']

In [None]:
train_text = X_train['tweet_no_handle'] 
vec.fit(train_text);
pos_vectorizer.fit(X_train['pos_tags'].tolist())

In [None]:
train_features = create_features(X_train, feature_cols, vec, pos_vectorizer)

In [None]:
test_features = create_features(X_test, feature_cols, vec, pos_vectorizer)

In [None]:
test_features.shape

## Modeling

In [None]:
lr_model = LogisticRegression(class_weight='balanced', C=1)

In [None]:
lr_model.fit(train_features, y_train)

In [None]:
param_grid = [
  {'C': [1, 10, 100, 1000]}
]

log_r = LogisticRegression(class_weight='balanced')

In [None]:
my_model = test_model(log_r, param_grid)

In [55]:
## Dump the model and best fit

In [56]:
twitter_top_words = top_words(my_model, 0, 100)

nigger
faggot
queer
wetback
kill
fag
monkey
dyke
faggot like
sand nigger
retard
littl faggot
nigga
white trash
jew
homo
towel head
like faggot
faggot ass
spic
porch
faggot bitch
muzzi
trump
ur
dick


IndexError: list index out of range

In [57]:
joblib.dump(my_model, 'model/hate-speech-classifier.pkl') 

['model/hate-speech-classifier.pkl']

In [60]:
joblib.dump(vec, 'model/hate-speech-vector.pkl') 

['model/hate-speech-vector.pkl']