In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


%matplotlib inline

ImportError: No module named pandas

In [None]:
# Load the labeled tweet dataset; the path is relative to the kernel's working directory.
tweet_df = pd.read_csv('../data/labeled-twitter-hate-speech.csv')

## Checking Things Out

In [None]:
tweet_df.columns

In [None]:
tweet_df.orig__golden.unique()

In [None]:
tweet_df.orig__golden.value_counts()

In [None]:
tweet_df.does_this_tweet_contain_hate_speech.value_counts()

In [None]:
tweet_df.info()

In [None]:
tweet_df.does_this_tweet_contain_hate_speechconfidence.value_counts()

In [None]:
# Keep only the rows whose `orig__golden` flag equals True; rows where the
# flag is missing compare unequal and are therefore excluded.
golden_df = tweet_df.loc[tweet_df['orig__golden'] == True]

In [None]:
golden_df.does_this_tweet_contain_hate_speech.value_counts()

In [None]:
golden_df._trusted_judgments.value_counts()

## Golden Tweets

The `golden` tweets appear to be tweets that were reviewed by a large number of users, so it seems worthwhile to use the column as a boolean flag. To that end, the null values in that column are filled with `False`.

In [None]:
# Assign the filled column back explicitly. Calling fillna(..., inplace=True)
# on a column obtained through attribute access is chained assignment: under
# pandas Copy-on-Write it updates a temporary and leaves tweet_df unchanged.
tweet_df['orig__golden'] = tweet_df['orig__golden'].fillna(False)

In [None]:
tweet_df.orig__golden.value_counts()

## NLP on Tweet Text

Separate out the tweet text and labels to focus on the NLP techniques first, before returning to the other values provided with the data.

In [None]:
tweet_df.columns

In [None]:
tweet_df.rename?

In [None]:
# Shorten the annotation column's name to 'labels'; inplace=True mutates
# tweet_df itself, and the cells below refer to the column by the new name.
tweet_df.rename(columns={'does_this_tweet_contain_hate_speech':'labels'}, inplace=True)

In [None]:
tweet_df.columns

In [None]:
text_only_cols = ['labels', 'tweet_text']

In [None]:
# Select the two columns first and copy only that slice. This avoids
# duplicating every column of tweet_df just to discard most of them, and it
# makes text_only_df an explicit, independent copy before its 'labels' column
# is reassigned further down.
text_only_df = tweet_df[text_only_cols].copy()

In [None]:
text_only_df['labels'].value_counts()

In [None]:
def convert_labels(row):
    """Collapse a verbose annotation label into a short class name.

    Args:
        row: A single value from the labels column.

    Returns:
        'not offensive' or 'offensive' when ``row`` equals the corresponding
        full annotation sentence; 'hate' for every other value.
    """
    if row == 'The tweet is not offensive':
        return 'not offensive'
    if row == 'The tweet uses offensive language but not hate speech':
        return 'offensive'
    # Catch-all: anything that is not one of the two sentences above.
    return 'hate'

In [None]:
# Replace each verbose annotation string with the short class name returned
# by convert_labels.
text_only_df['labels'] = text_only_df['labels'].apply(convert_labels)

In [None]:
text_only_df['labels'].value_counts()

In [None]:
# Persist the label/text pairs for reuse outside this notebook.
# NOTE(review): no `index=` argument is given, so the row index is presumably
# written as an extra leading column — confirm downstream readers expect that.
text_only_df.to_csv('../data/labels_and_text_only.csv')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.stem import PorterStemmer

In [None]:
class PorterTokenizer(object):
    """Callable tokenizer that word-tokenizes a document and Porter-stems each token.

    An instance is handed to ``TfidfVectorizer`` as its ``tokenizer`` argument.
    """

    def __init__(self):
        # A single stemmer is created up front and reused for every document.
        self.stemmer = PorterStemmer()

    def __call__(self, doc):
        """Return the list of stemmed tokens produced from ``doc``."""
        stem = self.stemmer.stem
        stemmed_tokens = []
        for token in word_tokenize(doc):
            stemmed_tokens.append(stem(token))
        return stemmed_tokens

In [None]:
# TF-IDF features over 1- to 3-grams, with PorterTokenizer splitting and
# stemming each document.
# NOTE(review): stop_words='english' is presumably matched against the
# already-stemmed tokens, so some stop words may no longer match — confirm
# this is intended.
# NOTE(review): encoding='latin-1' presumably only matters when the input is
# bytes or files; the documents here come from a pandas column — verify
# whether it has any effect.
vectorizer = TfidfVectorizer(tokenizer=PorterTokenizer(), encoding='latin-1', ngram_range=(1,3), stop_words='english')

In [None]:
# Features are the tweet text values; targets are the collapsed class names.
X = text_only_df['tweet_text'].values
y = text_only_df['labels'].values

In [None]:
# Hold out 10% of the rows for testing. random_state pins the shuffle so the
# split, and every number derived from it, is reproducible after a kernel
# restart instead of changing on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Fit the vectorizer on the training split only and transform that same split
# in one step; the test split is not seen during fitting.
X_trans = vectorizer.fit_transform(X_train)

In [None]:
X_trans.shape