In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


%matplotlib inline

In [2]:
# Load the labeled tweet dataset; the path is relative to the notebook's working directory.
tweet_df = pd.read_csv('../data/labeled-twitter-hate-speech.csv')

## Checking Things Out

In [3]:
tweet_df.columns

Index([u'_unit_id', u'_golden', u'_unit_state', u'_trusted_judgments',
       u'_last_judgment_at', u'does_this_tweet_contain_hate_speech',
       u'does_this_tweet_contain_hate_speech:confidence', u'_created_at',
       u'orig__golden', u'orig__last_judgment_at', u'orig__trusted_judgments',
       u'orig__unit_id', u'orig__unit_state', u'_updated_at',
       u'orig_does_this_tweet_contain_hate_speech',
       u'does_this_tweet_contain_hate_speech_gold',
       u'does_this_tweet_contain_hate_speech_gold_reason',
       u'does_this_tweet_contain_hate_speechconfidence', u'tweet_id',
       u'tweet_text'],
      dtype='object')

In [9]:
tweet_df.orig__golden.unique()

array([True, nan], dtype=object)

In [5]:
tweet_df.orig__golden.value_counts()

True    67
Name: orig__golden, dtype: int64

In [7]:
tweet_df.does_this_tweet_contain_hate_speech.value_counts()

The tweet is not offensive                               7274
The tweet uses offensive language but not hate speech    4836
The tweet contains hate speech                           2399
Name: does_this_tweet_contain_hate_speech, dtype: int64

In [8]:
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14509 entries, 0 to 14508
Data columns (total 20 columns):
_unit_id                                           14509 non-null int64
_golden                                            14509 non-null bool
_unit_state                                        14509 non-null object
_trusted_judgments                                 14509 non-null int64
_last_judgment_at                                  14442 non-null object
does_this_tweet_contain_hate_speech                14509 non-null object
does_this_tweet_contain_hate_speech:confidence     14509 non-null float64
_created_at                                        0 non-null float64
orig__golden                                       67 non-null object
orig__last_judgment_at                             0 non-null float64
orig__trusted_judgments                            67 non-null float64
orig__unit_id                                      67 non-null float64
orig__unit_state               

In [10]:
tweet_df.does_this_tweet_contain_hate_speechconfidence.value_counts()

1.0    67
Name: does_this_tweet_contain_hate_speechconfidence, dtype: int64

In [13]:
# Keep only the rows flagged as golden; missing (NaN) entries do not equal True and are excluded.
golden_df = tweet_df[tweet_df['orig__golden'].eq(True)]

In [14]:
golden_df.does_this_tweet_contain_hate_speech.value_counts()

The tweet uses offensive language but not hate speech    29
The tweet is not offensive                               21
The tweet contains hate speech                           17
Name: does_this_tweet_contain_hate_speech, dtype: int64

In [15]:
golden_df._trusted_judgments.value_counts()

91    8
93    7
90    7
88    7
94    6
92    6
95    5
89    5
87    5
98    3
86    3
96    2
97    1
85    1
84    1
Name: _trusted_judgments, dtype: int64

## Golden Tweets

The `golden` tweets were each reviewed by a large number of users (84–98 trusted judgments apiece, versus far fewer for ordinary tweets presumably), so it seems worthwhile to keep the `orig__golden` column as a boolean flag. The null values in that column are therefore filled with `False`.

In [17]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# attribute-accessed Series: the in-place form acts on an intermediate object
# and is not guaranteed to update tweet_df (it does not under pandas
# Copy-on-Write). The cast makes the column a real boolean dtype rather than
# an object column holding True/False.
tweet_df['orig__golden'] = tweet_df['orig__golden'].fillna(False).astype(bool)

In [18]:
tweet_df.orig__golden.value_counts()

False    14442
True        67
Name: orig__golden, dtype: int64

## NLP on Tweet Text

Separate out the tweet text and labels to focus on the NLP techniques first, before returning to the other values provided with the data.

In [21]:
tweet_df.columns

Index([u'_unit_id', u'_golden', u'_unit_state', u'_trusted_judgments',
       u'_last_judgment_at', u'does_this_tweet_contain_hate_speech',
       u'does_this_tweet_contain_hate_speech:confidence', u'_created_at',
       u'orig__golden', u'orig__last_judgment_at', u'orig__trusted_judgments',
       u'orig__unit_id', u'orig__unit_state', u'_updated_at',
       u'orig_does_this_tweet_contain_hate_speech',
       u'does_this_tweet_contain_hate_speech_gold',
       u'does_this_tweet_contain_hate_speech_gold_reason',
       u'does_this_tweet_contain_hate_speechconfidence', u'tweet_id',
       u'tweet_text'],
      dtype='object')

In [27]:
# Rename the long target column to `labels`. rename() returns a new frame, so
# the result is bound back to tweet_df rather than mutated with inplace=True.
tweet_df = tweet_df.rename(columns={'does_this_tweet_contain_hate_speech': 'labels'})

In [29]:
tweet_df.columns

Index([u'_unit_id', u'_golden', u'_unit_state', u'_trusted_judgments',
       u'_last_judgment_at', u'labels',
       u'does_this_tweet_contain_hate_speech:confidence', u'_created_at',
       u'orig__golden', u'orig__last_judgment_at', u'orig__trusted_judgments',
       u'orig__unit_id', u'orig__unit_state', u'_updated_at',
       u'orig_does_this_tweet_contain_hate_speech',
       u'does_this_tweet_contain_hate_speech_gold',
       u'does_this_tweet_contain_hate_speech_gold_reason',
       u'does_this_tweet_contain_hate_speechconfidence', u'tweet_id',
       u'tweet_text'],
      dtype='object')

In [30]:
# Columns kept for the text-only frame: the class label and the raw tweet text.
text_only_cols = ['labels', 'tweet_text']

In [44]:
# Select the two columns first and copy only that slice. Copying the whole
# frame before selecting duplicated every column for no benefit and left
# text_only_df as a derived selection rather than an explicit, independent copy.
text_only_df = tweet_df[text_only_cols].copy()

In [45]:
text_only_df['labels'].value_counts()

The tweet is not offensive                               7274
The tweet uses offensive language but not hate speech    4836
The tweet contains hate speech                           2399
Name: labels, dtype: int64

In [46]:
# Raw annotation strings from the dataset mapped to short class names.
_LABEL_MAP = {
    'The tweet is not offensive': 'not offensive',
    'The tweet uses offensive language but not hate speech': 'offensive',
    'The tweet contains hate speech': 'hate',
}


def convert_labels(row):
    """Map a raw annotation string to its short class name.

    Parameters
    ----------
    row : str
        A single value of the label column (despite the name, one cell value,
        not a DataFrame row), e.g. ``'The tweet is not offensive'``.

    Returns
    -------
    str
        ``'not offensive'``, ``'offensive'`` or ``'hate'``.

    Raises
    ------
    ValueError
        If ``row`` is not one of the three known annotation strings. The
        previous catch-all ``else`` silently labelled every unrecognised value
        (NaN, a typo, or an already-converted label when the cell is re-run)
        as ``'hate'``.
    """
    try:
        return _LABEL_MAP[row]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs; either way it is not a known label.
        raise ValueError('Unrecognised label: {!r}'.format(row))

In [47]:
# Collapse the verbose annotation strings into short class names, one value at a time.
text_only_df['labels'] = text_only_df['labels'].map(convert_labels)

In [49]:
text_only_df['labels'].value_counts()

not offensive    7274
offensive        4836
hate             2399
Name: labels, dtype: int64

In [50]:
# Save the two-column frame for reuse. `index` is left at its default, so the
# row index is written as the first, unnamed column of the CSV.
text_only_df.to_csv('../data/labels_and_text_only.csv')

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.stem import PorterStemmer

In [54]:
class PorterTokenizer(object):
    """Callable tokenizer: split a document with NLTK, then Porter-stem each token."""

    def __init__(self):
        """Create the stemmer once so every call reuses the same instance."""
        self.stemmer = PorterStemmer()

    def __call__(self, doc):
        """Return the list of stemmed tokens for ``doc``.

        ``doc`` is passed straight to ``word_tokenize``; each resulting token
        is run through ``self.stemmer.stem`` and the stems are returned in
        their original order.
        """
        tokens = word_tokenize(doc)
        stem = self.stemmer.stem
        return [stem(token) for token in tokens]

In [71]:
# TF-IDF features over stemmed tokens, using unigrams through trigrams.
# NOTE(review): the built-in 'english' stop list is not stemmed, while the
# tokenizer emits stems, so some stop words presumably slip through - confirm
# against the fitted vocabulary.
vectorizer = TfidfVectorizer(
    tokenizer=PorterTokenizer(),
    encoding='latin-1',
    ngram_range=(1, 3),
    stop_words='english',
)

In [72]:
# Features are the raw tweet text and targets are the short class labels,
# both extracted from the frame via `.values`.
X = text_only_df['tweet_text'].values
y = text_only_df['labels'].values

In [73]:
# Hold out 10% of the tweets for testing. random_state pins the shuffle so the
# split, and everything fitted on it, is identical on every fresh run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [74]:
# Fit the vectorizer on the training split only and transform that split in the same call.
X_trans = vectorizer.fit_transform(X_train)

In [75]:
X_trans.shape

(13058, 217020)