# Hate Speech Detector

## Helper methods

In [1]:
import pandas as pd
import numpy as np

import os
import csv
import pickle
import re

from tqdm.notebook import tqdm

In [2]:
def sanitize_text(text, preserve_hashtags=False):
    text = text.replace('\n', '')  # remove newline character
    text = text.replace('\r', '')  # remove return character
    text = text.replace('NEWLINE_TOKEN', '')  # remove NEWLINE_TOKEN text
    words = text.split(' ')
    words = [w for w in words if not w.startswith('http')]  # remove links
    words = [w for w in words if not w.startswith('@')]  # remove user mentions
    if not preserve_hashtags:
        words = [w for w in words if not w.startswith('#')]  # remove hashtags
    words = [w for w in words if not w.startswith('&') or not w.endswith(';')]  # remove html entities
    text = ' '.join(words)
    
    return text

def sanitize_label(label):
    label = str(label)
    label = label.replace('\n', '')  # remove newline character
    label = label.replace('\r', '')  # remove return character
    
    return label

In [3]:
def cardinality(data):
    return {'data': len(data)}

def class_frequencies(data):
    class_freq = {}
    for d in data:
        if d[-1] not in class_freq:
            class_freq[d[-1]] = 1
        else:
            class_freq[d[-1]] += 1
    return class_freq

def class_balance(data):
    freq = class_frequencies(data)
    total = sum([v for k, v in freq.items()])
    return {k: round(float(v)*100/total, 2) for k, v in freq.items()}

In [4]:
def hashtags(loader, lowercase=False, top_count=200):
    data = loader()
    cl_hts_dict = {}
    
    for d in data:
        text = d[0].lower() if lowercase else d[0]
        hts = re.findall(r"#(\w+)", text)
        cl = d[1]
        if cl not in cl_hts_dict:
            cl_hts_dict[cl] = {}
        for ht in hts:
            if ht not in cl_hts_dict[cl]:
                cl_hts_dict[cl][ht] = 1
            else:
                cl_hts_dict[cl][ht] += 1
    top_hts_dict = {c: sorted(hd.items(), key=lambda x: x[1], reverse=True)[:top_count] for c, hd in cl_hts_dict.items()}
    
    return top_hts_dict

In [5]:
def visualize_quality(loader):
    data = loader()
    cls = [d[-1] for d in data]
    print('Sample data:')
    print(data[0])

    print('Total: {}'.format(cardinality(data)))
    print('Freq: {}'.format(class_frequencies(data)))
    print('Balance: {}'.format(class_balance(data)))

## Data Quality Analysis

### Poleval 2019

In [6]:
def preprocess_poleval2019():
    with open('hsd/Poleval2019/train_texts.txt', 'r') as f:
        texts = f.readlines()
    with open('hsd/Poleval2019/test_texts.txt', 'r') as f:
        texts.extend(f.readlines())
    
    with open('hsd/Poleval2019/train_labels.txt', 'r') as f:
        labels = f.readlines()
    with open('hsd/Poleval2019/test_labels.txt', 'r') as f:
        labels.extend(f.readlines())
    
    data = [[sanitize_text(t, preserve_hashtags=True), sanitize_label(l)] for t, l in tqdm(zip(texts, labels))]
    
    with open('hsd/Poleval2019/data.pkl', 'w') as f:
        pickle.dump(data, f)

if not os.path.exists('hsd/Poleval2019/data.pkl'):
    preprocess_poleval2019()

In [7]:
def load_poleval2019():
    with open('hsd/Poleval2019/data.pkl', 'r') as f:
        data = pickle.load(f)
    return data

In [8]:
visualize_quality(load_poleval2019)

Sample data:
['Dla mnie faworytem do tytu\xc5\x82u b\xc4\x99dzie Cracovia. Zobaczymy, czy typ si\xc4\x99 sprawdzi.', '0']
Total: {'data': 11041}
Freq: {'1': 278, '0': 10056, '2': 707}
Balance: {'1': 2.52, '0': 91.08, '2': 6.4}


In [16]:
data = load_poleval2019()
h_list = []
for d in data:
    words = d[0].split(' ')
    h_list += [w for w in words if w.startswith('#')]
len(h_list)

426

In [15]:
len(np.unique(h_list))

203

### StormfrontWS

In [9]:
def preprocess_stormfrontws():
    texts = []
    for txt in os.listdir('hsd/StormfrontWS/all_files'):
        with open('hsd/StormfrontWS/all_files/' + txt, 'r') as f:
            texts.append([txt.replace('.txt', ''), f.read()])
    with open('hsd/StormfrontWS/labels.csv', 'r') as f:
        labels = list(csv.reader(f))
        labels = [[label[0], label[-1]] for label in labels[1:]]
    
    data = []
    for text in tqdm(texts):
        cl = filter(lambda l: l[0] == text[0], labels)[0][-1]
        data.append([sanitize_text(text[1], preserve_hashtags=True), sanitize_label(cl)])
        
    with open('hsd/StormfrontWS/data.pkl', 'w') as f:
        pickle.dump(data, f)

if not os.path.exists('hsd/StormfrontWS/data.pkl'):
    preprocess_stormfrontws()

In [10]:
def load_stormfrontws():
    with open('hsd/StormfrontWS/data.pkl', 'r') as f:
        data = pickle.load(f)
    return data

In [11]:
visualize_quality(load_stormfrontws)

Sample data:
['I bet all the Ted Pike stuff is on the ADL hit list , look for that to start disappearing down the memory hole .', 'noHate']
Total: {'data': 10944}
Freq: {'idk/skip': 73, 'noHate': 9507, 'hate': 1196, 'relation': 168}
Balance: {'hate': 10.93, 'noHate': 86.87, 'idk/skip': 0.67, 'relation': 1.54}


### Davidson et al.

In [12]:
def preprocess_davidson():
    with open('hsd/DavidsonEtAl/labeled_data.csv') as f:
        raw = list(csv.reader(f))[1:]
        data = [[sanitize_text(r[6], preserve_hashtags=True), sanitize_label(r[5])] for r in raw]
        
    with open('hsd/DavidsonEtAl/data.pkl', 'w') as f:
        pickle.dump(data, f)


if not os.path.exists('hsd/DavidsonEtAl/data.pkl'):
    preprocess_davidson()

In [13]:
def load_davidson():
    with open('hsd/DavidsonEtAl/data.pkl', 'r') as f:
        data = pickle.load(f)
    return data

In [14]:
visualize_quality(load_davidson)

Sample data:
["!!! RT As a woman you shouldn't complain about cleaning up your house. as a man you should always take the trash out...", '2']
Total: {'data': 24783}
Freq: {'1': 19190, '0': 1430, '2': 4163}
Balance: {'1': 77.43, '0': 5.77, '2': 16.8}


### Impermium

In [18]:
# def preprocess_impermium():
#     data = []
#     with open('hsd/Impermium/train.csv') as f:
#         for row in list(csv.reader(f))[1:]:
#             data.append([sanitize_text(row[2], preserve_hashtags=True), sanitize_label(row[0])])
#     with open('hsd/Impermium/test.csv') as f:
#         for row in list(csv.reader(f))[1:]:
#             data.append([sanitize_text(row[2], preserve_hashtags=True), sanitize_label(row[0])])
#     with open('hsd/Impermium/verification.csv') as f:
#         for row in list(csv.reader(f))[1:]:
#             data.append([sanitize_text(row[2], preserve_hashtags=True), sanitize_label(row[1])])
    
#     with open('hsd/Impermium/data.pkl', 'w') as f:
#         pickle.dump(data, f)

# if not os.path.exists('hsd/Impermium/data.pkl'):
#     preprocess_impermium()

FileNotFoundError: [Errno 2] No such file or directory: 'hsd/Impermium/train.csv'

In [13]:
# def load_impermium():
#     with open('hsd/Impermium/data.pkl', 'r') as f:
#         data = pickle.load(f)
#     return data

In [14]:
# visualize_quality(load_impermium)

FileNotFoundError: [Errno 2] No such file or directory: 'hsd/Impermium/data.pkl'

In [6]:
def preprocess_reddit():
    data = []
    with open('hsd/Reddit/train.csv') as f:
        for row in list(csv.reader(f))[1:]:
            data.append([sanitize_text(row[1], preserve_hashtags=True), sanitize_label(row[2])])
    with open('hsd/Reddit/test.csv') as f:
        for row in list(csv.reader(f))[1:]:
            data.append([sanitize_text(row[1], preserve_hashtags=True), sanitize_label(row[2])])
    
    with open('hsd/Reddit/data.pkl', 'wb') as f:
        pickle.dump(data, f)

if not os.path.exists('hsd/Reddit/data.pkl'):
    preprocess_reddit()

In [7]:
def load_reddit():
    with open('hsd/Reddit/data.pkl', 'rb') as f:
        data = pickle.load(f)
    return data

In [8]:
visualize_quality(load_reddit)

Sample data:
["i joined gab to remind myself how retarded jew haters are. You wouldn't be typing on your abacus without them you retard.", '0']
Total: {'data': 55132}
Freq: {'0': 47053, '1': 8079}
Balance: {'0': 85.35, '1': 14.65}


### Toxic Comment Classification Challenge

In [18]:
def preprocess_toxicccc():
    equal = lambda a,b: len(a)==len(b) and len(a)==sum([1 for i,j in zip(a,b) if i==j])
    data = []
    with open('hsd/ToxicCCC/train.csv', 'r') as f:
        for row in list(csv.reader(f))[1:]:
            cl = 0 if equal(row[-6:], ['0']*6) else 1
    
    with open('hsd/ToxicCCC/test_texts.csv', 'r') as f:
        with open('hsd/ToxicCCC/test_labels.csv', 'r') as ff:
            for row_t, row_l in zip(list(csv.reader(f))[1:], list(csv.reader(ff))[1:]):
                if not equal(row_l[-6:], ['-1']*6):
                    cl = 0 if equal(row_l[-6:], ['0']*6) else 1
                    data.append([sanitize_text(row_t[1], preserve_hashtags=True), sanitize_label(cl)])
    
    with open('hsd/ToxicCCC/data.pkl', 'w') as f:
        pickle.dump(data, f)

if not os.path.exists('hsd/ToxicCCC/data.pkl'):
    preprocess_toxicccc()

In [19]:
def load_toxicccc():
    with open('hsd/ToxicCCC/data.pkl', 'r') as f:
        data = pickle.load(f)
    return data

In [20]:
visualize_quality(load_toxicccc)

Sample data:
['Thank you for understanding. I think very highly of you and would not revert without discussion.', '0']
Total: {'data': 63978}
Freq: {'1': 6243, '0': 57735}
Balance: {'1': 9.76, '0': 90.24}


### Wikipedia Detox

In [21]:
def preprocess_wikidetox():
    data = []
    with open('hsd/WikiDetox/attack_annotated_comments.tsv', 'r') as f:
        ann_comments = list(csv.reader(f, delimiter='\t'))[1:]
        with open('hsd/WikiDetox/attack_annotations.tsv', 'r') as ff:
            annotations = list(csv.reader(ff, delimiter='\t'))[1:]
            for ac in tqdm(ann_comments):
                cl = filter(lambda ann: ann[0] == ac[0], annotations)[0][-1]
                data.append([sanitize_text(ac[1], preserve_hashtags=True), sanitize_label(cl)])
    
    with open('hsd/WikiDetox/data.pkl', 'w') as f:
        pickle.dump(data, f)

if not os.path.exists('hsd/WikiDetox/data.pkl'):
    preprocess_wikidetox()

In [22]:
def load_wikidetox():
    with open('hsd/WikiDetox/data.pkl', 'r') as f:
        data = pickle.load(f)
    return data

In [23]:
visualize_quality(load_wikidetox)

Sample data:
["`-This is not ``creative``.  Those are the dictionary definitions of the terms ``insurance`` and ``ensurance`` as properly applied to ``destruction``.  If you don't understand that, fine, legitimate criticism, I'll write up ``three man cell`` and ``bounty hunter`` and then it will be easy to understand why ``ensured`` and ``insured`` are different - and why both differ from ``assured``.The sentence you quote is absolutely neutral.  You just aren't familiar with the underlying theory of strike-back (e.g. submarines as employed in nuclear warfare) guiding the insurance, nor likely the three man cell structure that kept the IRA from being broken by the British.  If that's my fault, fine, I can fix that to explain.  But ther'es nothing ``personal`` or ``creative`` about it.I'm tired of arguing with you.  Re: the other article, ``multi-party`` turns up plenty, and there is more use of ``mutually`` than ``mutual``.  If I were to apply your standard I'd be moving ``Mutual Assur

## Hashtags Analysis

In [24]:
datasets = ['Poleval 2019', 'StormfrontWS', 'Davidson at al.', 'Imperium', 'Toxic Comment Classification Challenge', 'Wikipedia Detox']
loaders = [load_poleval2019, load_stormfrontws, load_davidson, load_impermium, load_toxicccc, load_wikidetox]

for loader, dataset in zip(loaders, datasets):
    hts = hashtags(loader, top_count=10)
    
    print(dataset)
    for c, hs in hts.items():
        print('{}: {}'.format(c, sorted([h[0] for h in hs])))
    print('')

Poleval 2019
1: ['Minela20', 'PiSkomuna']
0: ['BezRetuszu', 'NAME', 'TheVoiceOfPoland', 'UstawaDegradacyjna', 'Woronicza17', 'Wypozyczeni', 'dividetourwarsaw', 'tweetme', 'wieszwiecej', 'woronicza17']
2: ['BezRetuszu', 'Morawiecki', 'NBAVote', 'PiSkomuna', 'Woronicza17', 'delegalizacja', 'odjebanie', 'pytamboniewiem', 's', 'woronicza17']

StormfrontWS
hate: []
noHate: ['1', '11', '2', '366388', '39', '4', 'post696651', 'post700257', 'x202a', 'x202c']
idk/skip: []
relation: ['students']

Davidson at al.
1: ['12288', '128514', '128530', '128553', '128557', '65039', '8217', '8220', '8221', '8230']
0: ['128514', '128557', '8217', '8220', '8221', '8230', 'ISIS', 'LosAngeles', 'faggots', 'tcot']
2: ['128514', '8217', '8220', '8221', '8230', 'ISIS', 'Yankees', 'hoosiers', 'iubb', 'tcot']

Imperium
1: ['333333', '38', 'BASED', 'HopeAndAlgae', 'LL', 'cking', 'nWo4Life', 'skitfuckindaddle', 't', 'trollssuck']
0: ['1', '2', '3', '333333', 'FuckSummer', 'WeAreInControl', 'iSheep', 'ixzz1yIC3WHXV',