# Hate Speech Detector

## Helper methods

In [1]:
import pandas as pd
import numpy as np

import os
import csv
import pickle
import re

from tqdm.notebook import tqdm

In [2]:
def sanitize_text(text, preserve_hashtags=False):
    text = text.replace('\n', '')  # remove newline character
    text = text.replace('\r', '')  # remove return character
    text = text.replace('NEWLINE_TOKEN', '')  # remove NEWLINE_TOKEN text
    words = text.split(' ')
    words = [w for w in words if not w.startswith('http')]  # remove links
    words = [w for w in words if not w.startswith('@')]  # remove user mentions
    if not preserve_hashtags:
        words = [w for w in words if not w.startswith('#')]  # remove hashtags
    words = [w for w in words if not w.startswith('&') or not w.endswith(';')]  # remove html entities
    text = ' '.join(words)
    
    return text

def sanitize_label(label):
    label = str(label)
    label = label.replace('\n', '')  # remove newline character
    label = label.replace('\r', '')  # remove return character
    
    return label

In [3]:
def cardinality(data):
    return {'data': len(data)}

def class_frequencies(data):
    class_freq = {}
    for d in data:
        if d[-1] not in class_freq:
            class_freq[d[-1]] = 1
        else:
            class_freq[d[-1]] += 1
    return class_freq

def class_balance(data):
    freq = class_frequencies(data)
    total = sum([v for k, v in freq.items()])
    return {k: round(float(v)*100/total, 2) for k, v in freq.items()}

In [4]:
def hashtags(loader, lowercase=False, top_count=200):
    data = loader()
    cl_hts_dict = {}
    
    for d in data:
        text = d[0].lower() if lowercase else d[0]
        hts = re.findall(r"#(\w+)", text)
        cl = d[1]
        if cl not in cl_hts_dict:
            cl_hts_dict[cl] = {}
        for ht in hts:
            if ht not in cl_hts_dict[cl]:
                cl_hts_dict[cl][ht] = 1
            else:
                cl_hts_dict[cl][ht] += 1
    top_hts_dict = {c: sorted(hd.items(), key=lambda x: x[1], reverse=True)[:top_count] for c, hd in cl_hts_dict.items()}
    
    return top_hts_dict

In [5]:
def visualize_quality(loader):
    data = loader()
    cls = [d[-1] for d in data]
    print('Sample data:')
    print(data[0])

    print('Total: {}'.format(cardinality(data)))
    print('Freq: {}'.format(class_frequencies(data)))
    print('Balance: {}'.format(class_balance(data)))

## Data Quality Analysis

### Poleval 2019

In [6]:
def preprocess_poleval2019():
    with open('hsd/Poleval2019/train_texts.txt', 'r') as f:
        texts = f.readlines()
    with open('hsd/Poleval2019/test_texts.txt', 'r') as f:
        texts.extend(f.readlines())
    
    with open('hsd/Poleval2019/train_labels.txt', 'r') as f:
        labels = f.readlines()
    with open('hsd/Poleval2019/test_labels.txt', 'r') as f:
        labels.extend(f.readlines())
    
    data = [[sanitize_text(t, preserve_hashtags=True), sanitize_label(l)] for t, l in tqdm(zip(texts, labels))]
    
    with open('hsd/Poleval2019/data.pkl', 'w') as f:
        pickle.dump(data, f)

if not os.path.exists('hsd/Poleval2019/data.pkl'):
    preprocess_poleval2019()

In [7]:
def load_poleval2019():
    with open('hsd/Poleval2019/data.pkl', 'rb') as f:
        u = pickle._Unpickler(f)
        u.encoding = 'utf-8'
        data = u.load()
    return data

In [8]:
visualize_quality(load_poleval2019)

Sample data:
['Dla mnie faworytem do tytułu będzie Cracovia. Zobaczymy, czy typ się sprawdzi.', '0']
Total: {'data': 11041}
Freq: {'0': 10056, '2': 707, '1': 278}
Balance: {'0': 91.08, '2': 6.4, '1': 2.52}


In [9]:
data = load_poleval2019()
h_list = []
for d in data:
    words = d[0].split(' ')
    h_list += [w for w in words if w.startswith('#')]
len(h_list)

426

In [10]:
len(np.unique(h_list))

203

### StormfrontWS

In [11]:
# def preprocess_stormfrontws():
#     texts = []
#     for txt in os.listdir('hsd/StormfrontWS/all_files'):
#         with open('hsd/StormfrontWS/all_files/' + txt, 'r') as f:
#             texts.append([txt.replace('.txt', ''), f.read()])
#     with open('hsd/StormfrontWS/labels.csv', 'r') as f:
#         labels = list(csv.reader(f))
#         labels = [[label[0], label[-1]] for label in labels[1:]]
    
#     data = []
#     for text in tqdm(texts):
#         cl = filter(lambda l: l[0] == text[0], labels)[0][-1]
#         data.append([sanitize_text(text[1], preserve_hashtags=True), sanitize_label(cl)])
        
#     with open('hsd/StormfrontWS/data.pkl', 'w') as f:
#         pickle.dump(data, f)

# if not os.path.exists('hsd/StormfrontWS/data.pkl'):
#     preprocess_stormfrontws()

In [12]:
# def load_stormfrontws():
#     with open('hsd/StormfrontWS/data.pkl', 'r') as f:
#         data = pickle.load(f)
#     return data

In [13]:
# visualize_quality(load_stormfrontws)

### Davidson et al.

In [14]:
def preprocess_davidson():
    with open('hsd/DavidsonEtAl/labeled_data.csv') as f:
        raw = list(csv.reader(f))[1:]
        data = [[sanitize_text(r[6], preserve_hashtags=True), sanitize_label(r[5])] for r in raw]
        
    with open('hsd/DavidsonEtAl/data.pkl', 'w') as f:
        pickle.dump(data, f)


if not os.path.exists('hsd/DavidsonEtAl/data.pkl'):
    preprocess_davidson()

In [15]:
def load_davidson():
    with open('hsd/DavidsonEtAl/data.pkl', 'rb') as f:
        u = pickle._Unpickler(f)
        u.encoding = 'utf-8'
        data = u.load()
    return data

In [16]:
visualize_quality(load_davidson)

Sample data:
["!!! RT As a woman you shouldn't complain about cleaning up your house. as a man you should always take the trash out...", '2']
Total: {'data': 24783}
Freq: {'2': 4163, '1': 19190, '0': 1430}
Balance: {'2': 16.8, '1': 77.43, '0': 5.77}


### Impermium

In [17]:
# def preprocess_impermium():
#     data = []
#     with open('hsd/Impermium/train.csv') as f:
#         for row in list(csv.reader(f))[1:]:
#             data.append([sanitize_text(row[2], preserve_hashtags=True), sanitize_label(row[0])])
#     with open('hsd/Impermium/test.csv') as f:
#         for row in list(csv.reader(f))[1:]:
#             data.append([sanitize_text(row[2], preserve_hashtags=True), sanitize_label(row[0])])
#     with open('hsd/Impermium/verification.csv') as f:
#         for row in list(csv.reader(f))[1:]:
#             data.append([sanitize_text(row[2], preserve_hashtags=True), sanitize_label(row[1])])
    
#     with open('hsd/Impermium/data.pkl', 'w') as f:
#         pickle.dump(data, f)

# if not os.path.exists('hsd/Impermium/data.pkl'):
#     preprocess_impermium()

In [18]:
# def load_impermium():
#     with open('hsd/Impermium/data.pkl', 'r') as f:
#         data = pickle.load(f)
#     return data

In [19]:
# visualize_quality(load_impermium)

In [20]:
def preprocess_reddit():
    data = []
    with open('hsd/Reddit/clean_data.csv') as f:
        for row in list(csv.reader(f))[1:]:
            data.append([sanitize_text(row[1], preserve_hashtags=True), sanitize_label(row[2])])
    
    with open('hsd/Reddit/data.pkl', 'wb') as f:
        pickle.dump(data, f)

if not os.path.exists('hsd/Reddit/data.pkl'):
    preprocess_reddit()

In [21]:
def load_reddit():
    with open('hsd/Reddit/data.pkl', 'rb') as f:
        data = pickle.load(f)
    return data

In [22]:
visualize_quality(load_reddit)

Sample data:
["i joined gab to remind myself how retarded jew haters are. You wouldn't be typing on your abacus without them you retard.", '1']
Total: {'data': 55132}
Freq: {'1': 19860, '2': 35272}
Balance: {'1': 36.02, '2': 63.98}


### Toxic Comment Classification Challenge

In [23]:
# def preprocess_toxicccc():
#     equal = lambda a,b: len(a)==len(b) and len(a)==sum([1 for i,j in zip(a,b) if i==j])
#     data = []
#     with open('hsd/ToxicCCC/train.csv', 'r') as f:
#         for row in list(csv.reader(f))[1:]:
#             cl = 0 if equal(row[-6:], ['0']*6) else 1
    
#     with open('hsd/ToxicCCC/test_texts.csv', 'r') as f:
#         with open('hsd/ToxicCCC/test_labels.csv', 'r') as ff:
#             for row_t, row_l in zip(list(csv.reader(f))[1:], list(csv.reader(ff))[1:]):
#                 if not equal(row_l[-6:], ['-1']*6):
#                     cl = 0 if equal(row_l[-6:], ['0']*6) else 1
#                     data.append([sanitize_text(row_t[1], preserve_hashtags=True), sanitize_label(cl)])
    
#     with open('hsd/ToxicCCC/data.pkl', 'w') as f:
#         pickle.dump(data, f)

# if not os.path.exists('hsd/ToxicCCC/data.pkl'):
#     preprocess_toxicccc()

In [24]:
# def load_toxicccc():
#     with open('hsd/ToxicCCC/data.pkl', 'r') as f:
#         data = pickle.load(f)
#     return data

In [25]:
# visualize_quality(load_toxicccc)

### Wikipedia Detox

In [26]:
# def preprocess_wikidetox():
#     data = []
#     with open('hsd/WikiDetox/attack_annotated_comments.tsv', 'r') as f:
#         ann_comments = list(csv.reader(f, delimiter='\t'))[1:]
#         with open('hsd/WikiDetox/attack_annotations.tsv', 'r') as ff:
#             annotations = list(csv.reader(ff, delimiter='\t'))[1:]
#             for ac in tqdm(ann_comments):
#                 cl = filter(lambda ann: ann[0] == ac[0], annotations)[0][-1]
#                 data.append([sanitize_text(ac[1], preserve_hashtags=True), sanitize_label(cl)])
    
#     with open('hsd/WikiDetox/data.pkl', 'w') as f:
#         pickle.dump(data, f)

# if not os.path.exists('hsd/WikiDetox/data.pkl'):
#     preprocess_wikidetox()

In [27]:
# def load_wikidetox():
#     with open('hsd/WikiDetox/data.pkl', 'r') as f:
#         data = pickle.load(f)
#     return data

In [28]:
# visualize_quality(load_wikidetox)

## Hashtags Analysis

In [29]:
datasets = ['Poleval 2019', 'Davidson at al.', 'Reddit']
loaders = [load_poleval2019, load_davidson, load_reddit]

for loader, dataset in zip(loaders, datasets):
    hts = hashtags(loader, top_count=10)
    
    print(dataset)
    for c, hs in hts.items():
        print('{}: {}'.format(c, sorted([h[0] for h in hs])))
    print('')

Poleval 2019
0: ['BezRetuszu', 'NAME', 'TheVoiceOfPoland', 'UstawaDegradacyjna', 'Woronicza17', 'Wypozyczeni', 'dividetourwarsaw', 'tweetme', 'wieszwiecej', 'woronicza17']
2: ['BezRetuszu', 'Kropka', 'Morawiecki', 'NBAVote', 'PiSkomuna', 'Woronicza', 'Woronicza17', 'pytamboniewiem', 'słowoNaNiedzielę', 'woronicza17']
1: ['Minela20', 'PiSkomuna']

Davidson at al.
2: ['128514', '8217', '8220', '8221', '8230', 'ISIS', 'Yankees', 'hoosiers', 'iubb', 'tcot']
1: ['12288', '128514', '128530', '128553', '128557', '65039', '8217', '8220', '8221', '8230']
0: ['128514', '128557', '8217', '8220', '8221', '8230', 'ISIS', 'LosAngeles', 'faggots', 'tcot']

Reddit
1: ['GIFWAR', 'GabFam', 'InfoWars', 'JobsNotMobs', 'KAG', 'MAGA', 'RedWave', 'SpeakFreely', 'Trump', 'WalkAway']
2: ['GIFWAR', 'GabFam', 'KAG', 'MAGA', 'Q', 'QAnon', 'RedWave', 'Trump', 'WWG1WGA', 'WalkAway']

