In [5]:
import pandas as pd
import numpy as np
import json

import re
import string
import unidecode
import demoji

# nltk
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer


In [2]:
!pip install unidecode

Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/9e/25/723487ca2a52ebcee88a34d7d1f5a4b80b793f179ee0f62d5371938dfa01/Unidecode-1.2.0-py2.py3-none-any.whl (241kB)
[K     |█▍                              | 10kB 11.6MB/s eta 0:00:01[K     |██▊                             | 20kB 17.0MB/s eta 0:00:01[K     |████                            | 30kB 21.4MB/s eta 0:00:01[K     |█████▍                          | 40kB 17.3MB/s eta 0:00:01[K     |██████▉                         | 51kB 16.6MB/s eta 0:00:01[K     |████████▏                       | 61kB 18.2MB/s eta 0:00:01[K     |█████████▌                      | 71kB 14.0MB/s eta 0:00:01[K     |██████████▉                     | 81kB 12.2MB/s eta 0:00:01[K     |████████████▏                   | 92kB 12.5MB/s eta 0:00:01[K     |█████████████▋                  | 102kB 11.2MB/s eta 0:00:01[K     |███████████████                 | 112kB 11.2MB/s eta 0:00:01[K     |████████████████▎               | 12

In [3]:
!pip install demoji

Collecting demoji
  Downloading https://files.pythonhosted.org/packages/88/6a/34379abe01c9c36fe9fddc4181dd935332e7d0159ec3fae76f712e49bcea/demoji-0.4.0-py2.py3-none-any.whl
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Installing collected packages: colorama, demoji
Successfully installed colorama-0.4.4 demoji-0.4.0


In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Load Data

In [7]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Load MMHS150K_GT**

In [8]:
with open('/content/drive/My Drive/Colab Notebooks/Group Project/MMHS150K_GT.json') as f:
    data_json = json.load(f)

In [9]:
mmhs = pd.DataFrame(data_json).T.reset_index(drop=True)

In [10]:
print('The MMHS150K dataset has ' + str(len(mmhs)) + ' data points.' )

The MMHS150K dataset has 149823 data points.


**Data Formatting**

In [12]:
NotHate = [1 if 0 in i else 0 for i in mmhs['labels']]
Racist = [1 if 1 in i else 0 for i in mmhs['labels']]
Sexist = [1 if 2 in i else 0 for i in mmhs['labels']]
Homophobe = [1 if 3 in i else 0 for i in mmhs['labels']]
Religion = [1 if 4 in i else 0 for i in mmhs['labels']]
OtherHate = [1 if 5 in i else 0 for i in mmhs['labels']]

In [13]:
labels_list = [NotHate, Racist, Sexist, Homophobe, Religion, OtherHate]
labels_name_list = ['NotHate', 'Racist', 'Sexist', 'Homophobe', 'Religion', 'OtherHate']

In [14]:
for i in range(len(labels_list)):
    mmhs[labels_name_list[i]] = labels_list[i] 

**Load NAACL_SRW**

In [15]:
naacl = pd.read_excel("/content/drive/My Drive/Colab Notebooks/Group Project/NAACL_SRW_2016.xlsx", index_col = 0)

In [16]:
naacl.reset_index(drop=True, inplace=True)

In [17]:
print('The NAACL_SRW_2016 dataset has ' + str(len(naacl)) + ' data points.' )

The NAACL_SRW_2016 dataset has 16035 data points.


In [18]:
naacl['Label'].value_counts()

none      10933
sexism     3167
racism     1935
Name: Label, dtype: int64

In [19]:
NotHate_2 = [1 if 'none' in i else 0 for i in naacl['Label']]
Racist_2 = [1 if 'racism' in i else 0 for i in naacl['Label']]
Sexist_2 = [1 if 'sexism' in i else 0 for i in naacl['Label']]

In [20]:
naacl['NotHate'] = NotHate_2
naacl['Racist'] = Racist_2
naacl['Sexist'] = Sexist_2

### Joining the 2 datasets

In [21]:
mmhs.rename(columns={'tweet_text':'Tweets'}, inplace=True)

In [22]:
full_df = pd.concat([mmhs[['Tweets', 'NotHate', 'Racist', 'Sexist', 'Homophobe', 'Religion', 'OtherHate']], naacl[['Tweets', 'NotHate', 'Racist', 'Sexist']]], ignore_index=True).fillna(0)

In [23]:
full_df[labels_name_list] = full_df[labels_name_list].astype(int)

In [24]:
print('The full dataset has ' + str(len(full_df)) + ' data points.' )

The full dataset has 165858 data points.


# Data Pre-Processing
Basic cleaning: Remove punctuation, hyperlinks, usernames (words after @), word accents. Lowercase tweets.


In [25]:
shrink_whitespace_reg = re.compile(r'\s{2,}')

In [26]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"(won't|won’t)", "will not", phrase)
    phrase = re.sub(r"(can't|can’t)", "can not", phrase)

    # general
    phrase = re.sub(r"n(\'|\’)t", " not", phrase)
    phrase = re.sub(r"(\'|\’)re", " are", phrase)
    phrase = re.sub(r"(\'|\’)s", " is", phrase)
    phrase = re.sub(r"(\'|\’)d", " would", phrase)
    phrase = re.sub(r"(\'|\’)ll", " will", phrase)
    phrase = re.sub(r"(\'|\’)t", " not", phrase)
    phrase = re.sub(r"(\'|\’)ve", " have", phrase)
    phrase = re.sub(r"(\'|\’)m", " am", phrase)
    return phrase

Retaining Emojis in our Pre-Processing steps for Hypothesis testing.
They will be removed right before model training.

In [27]:
emoji_pat = '[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]'

def clean_tweets_emoji(tweet):
    clean = str(tweet).lower() # lowercase tweets
    clean = decontracted(clean) # decontract tweets
    clean = re.sub(r'@[\w+]*', '', clean) # remove usernames
    clean = re.sub(r"http\S+", "", clean) # remove hyperlinks

    reg = re.compile(r'({})|[^a-zA-Z]'.format(emoji_pat)) 
    clean = reg.sub(lambda x: ' {} '.format(x.group(1)) if x.group(1) else ' ', clean)
    clean = clean.strip()
    
    return shrink_whitespace_reg.sub(' ', clean)

In [28]:
full_df['clean_tweets_emoji'] = full_df['Tweets'].apply(lambda x: clean_tweets_emoji(x))

### Tokenize and Lemmatize Tweets

- Tokenize: Using TweetTokenizer
- Lemmatize: By mapping tokens to their POS tag, and getting the base POS (using first alphabet)

In [29]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [30]:
tknzr = TweetTokenizer()
lemmatizer = WordNetLemmatizer()

In [34]:
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [35]:
tweets_emoji_lemma = [[lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tknzr.tokenize(tweet)] for tweet in full_df['clean_tweets_emoji']]

### Remove Stopwords
* Adding "rt" (retweet) to stopword list as it has no predictive power

In [37]:
stopword_list = stopwords.words('english')

In [38]:
stopword_list.append('rt')

In [39]:
# Remove stopwords
tweets_emoji_train = [[w for w in t if w not in stopword_list] for t in tweets_emoji_lemma]

In [None]:
# Joining back to df

In [40]:
full_df['tweets_emoji_train'] = tweets_emoji_train
full_df['tweets_emoji_train'] = full_df['tweets_emoji_train'].apply(lambda x: ' '.join([w for w in x]))

**Replace Emojis with Emoji name**

In [41]:
demoji.download_codes()

Downloading emoji data ...
... OK (Got response in 0.52 seconds)
Writing emoji data to /root/.demoji/codes.json ...
... OK


In [42]:
def replace_emoji(tweet):
    for emoji, word in demoji.findall(tweet).items():
        tweet = tweet.replace(emoji, re.sub(' ', '_', word))
    return tweet

In [43]:
full_df['tweets_emoji_train'] = full_df['tweets_emoji_train'].apply(lambda x: replace_emoji(x))

**Remove Emojis**

In [67]:
def remove_emoji(tweet):
  reg = re.compile(r'({})|[^a-zA-Z]'.format(emoji_pat)) 
  clean = reg.sub(r' ', tweet)
  clean = clean.strip()
  return shrink_whitespace_reg.sub(' ', clean)

In [70]:
full_df.rename(columns={'clean_tweets_emoji': 'tweets_train'}, inplace=True)

In [72]:
full_df['tweets_train'] = tweets_emoji_train
full_df['tweets_train'] = full_df['tweets_train'].apply(lambda x: ' '.join([w for w in x]))
full_df['tweets_train'] = full_df['tweets_train'].apply(lambda x: remove_emoji(x))

# Remove tweets with length <= 3

In [75]:
length = []

for i in full_df['tweets_train'].str.split(' '):
    length.append(len(i))

full_df['length'] = length

full_clean_df = full_df[full_df['length'] >= 3].reset_index(drop=True)

In [79]:
full_clean_df.drop(labels='length', axis=1, inplace=True)

# Create Train Data Hypothesis Testing of removing "nigga"

In [80]:
full_clean_df

Unnamed: 0,Tweets,NotHate,Racist,Sexist,Homophobe,Religion,OtherHate,tweets_train,tweets_emoji_train
0,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,1,0,0,0,0,0,nigga momma youngboy spit real shit nigga,nigga momma youngboy spit real shit nigga
1,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,1,1,0,0,0,0,xxsugvngxx ran holy nigga today,xxsugvngxx ran holy nigga today loudly_crying_...
2,“EVERYbody calling you Nigger now!” https://t....,1,1,0,0,0,0,everybody call nigger,everybody call nigger
3,“ real ass bitch give a fuck boutta nigga” htt...,1,0,0,0,0,0,real bitch give fuck boutta nigga,real bitch give fuck boutta nigga
4,@WhiteHouse @realDonaldTrump Fuck ice. White s...,0,1,0,0,0,1,fuck ice white supremacist trash racist garbage,fuck ice white supremacist trash racist garbage
...,...,...,...,...,...,...,...,...,...
143272,RT @niamh_bull: #katandandre #mkr far out if s...,0,0,1,0,0,0,katandandre mkr far someone let whoop u grrrrrr,katandandre mkr far someone let whoop u grrrrrr
143273,BAHAHAHA! OMG.. Kat has no idea.... NONE!!! ...,0,0,1,0,0,0,bahahaha omg kat idea none mkr,bahahaha omg kat idea none mkr
143274,RT @daniel_kaye: It's not Kat and Andre - it's...,0,0,1,0,0,0,kat andre cuntandandre tweeps get hashtag go mkr,kat andre cuntandandre tweeps get hashtag go mkr
143275,"Never mind nails down a chalk board, all you n...",0,0,1,0,0,0,never mind nail chalk board need annie voice s...,never mind nail chalk board need annie voice s...


In [85]:
full_clean_df['tweets_nig_train'] = full_clean_df['tweets_train'].apply(lambda x: re.sub('nigga', '', x).strip())

In [86]:
full_clean_df.head(10)

Unnamed: 0,Tweets,NotHate,Racist,Sexist,Homophobe,Religion,OtherHate,tweets_train,tweets_emoji_train,tweets_nig_train
0,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,1,0,0,0,0,0,nigga momma youngboy spit real shit nigga,nigga momma youngboy spit real shit nigga,momma youngboy spit real shit
1,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,1,1,0,0,0,0,xxsugvngxx ran holy nigga today,xxsugvngxx ran holy nigga today loudly_crying_...,xxsugvngxx ran holy today
2,“EVERYbody calling you Nigger now!” https://t....,1,1,0,0,0,0,everybody call nigger,everybody call nigger,everybody call nigger
3,“ real ass bitch give a fuck boutta nigga” htt...,1,0,0,0,0,0,real bitch give fuck boutta nigga,real bitch give fuck boutta nigga,real bitch give fuck boutta
4,@WhiteHouse @realDonaldTrump Fuck ice. White s...,0,1,0,0,0,1,fuck ice white supremacist trash racist garbage,fuck ice white supremacist trash racist garbage,fuck ice white supremacist trash racist garbage
5,@DefNotJerm So.... you turn to twitter for it ...,1,0,0,0,0,0,turn twitter instead beating nigga contact police,turn twitter instead beating nigga contact police,turn twitter instead beating contact police
6,@WhatUpJT I swear I was waiting for her to mou...,1,0,0,0,0,0,swear wait mouth word nigger,swear wait mouth word nigger face_with_tears_o...,swear wait mouth word nigger
7,I’m 💯 behind you nigga u my thug brother🖤 http...,1,0,0,0,0,0,behind nigga u thug brother,hundred_points behind nigga u thug brother bla...,behind u thug brother
8,bf: move your bighead 😅😂 gf: NIGGA WHAT?? THA...,0,1,1,0,0,0,bf move bighead gf nigga daddy left,bf move bighead grinning_face_with_sweat face_...,bf move bighead gf daddy left
9,@OriginalSlimC This fat nigga slander is getti...,1,0,0,1,0,0,fat nigga slander get outta hand,fat nigga slander get outta hand loudly_crying...,fat slander get outta hand


In [88]:
full_clean_df.to_excel("/content/drive/My Drive/Colab Notebooks/Group Project/full_clean_df.xlsx")