# Hate Speech Detector - EN - Features extraction for SVM & Dense model

Based on [this notebook](https://github.com/t-davidson/hate-speech-and-offensive-language/blob/master/classifier/final_classifier.ipynb).

In [1]:
import os
import pandas as pd
import csv
import numpy as np
import pickle
from klepto.archives import dir_archive
import sys
import nltk
import string
import re
import fasttext
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
import syllables as sylla
from pymagnitude import *

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from tqdm.notebook import tqdm
%matplotlib inline

In [2]:
MODEL = 'svm'
dim = 10 if MODEL == 'svm' else 200

## Davidson et al. data loading

### Classes pre:
    0 - hate speech
    1 - offensive language
    2 - neither

In [3]:
if not os.path.exists('hsd/Reddit/perfect_data.pkl'):
    tweets, labels = [], []
    with open('hsd/Reddit/labeled_data.csv', 'r') as f:
        for d in tqdm(list(csv.reader(f))[1:]):
            tweets.append(d[6])  # tweet
            labels.append(d[5])  # class
    with open('hsd/Reddit/perfect_data.pkl', 'w') as f:
        def chcl(c):
            return 0 if c=='0' else 1
        labels = list(map(chcl, labels))
        pickle.dump((tweets, labels), f)
else:
    with open('hsd/Reddit/perfect_data.pkl', 'rb') as f:
        tweets, labels = pickle.load(f)

### Classes post:
    0 - no hate
    1 - hate speech

In [4]:
print('Tweets: {}'.format(len(tweets)))
print('Labels: {}'.format(len(labels)))

Tweets: 55132
Labels: 55132


In [5]:
list(zip(tweets[:5], labels[:5]))

[("i joined gab to remind myself how retarded jew haters are. You wouldn't be typing on your abacus without them you retard.",
  1),
 ('This is what the left is really scared of.    https://redd.it/9rfkts', 0),
 ('That literally looks like a monkey. Why are we supposed to pretend it’s a person bc it’s wearing a red hat?',
  0),
 ('Dumb Cunt', 1),
 ('It makes you an asshole.', 0)]

## Features extraction

In [6]:
stopwords=stopwords = nltk.corpus.stopwords.words("english")

other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

sentiment_analyzer = VS()

In [7]:
def simple_preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    #hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    #parsed_text = re.sub(hashtag_regex, '', parsed_text)
    return parsed_text

def pos_tagger(nltk_tag): 
    if nltk_tag.startswith('J'): 
        return wordnet.ADJ 
    elif nltk_tag.startswith('V'): 
        return wordnet.VERB 
    elif nltk_tag.startswith('N'): 
        return wordnet.NOUN 
    elif nltk_tag.startswith('R'): 
        return wordnet.ADV 
    else:           
        return None

def word_tokenization(tweet):
    lemmatizer = WordNetLemmatizer() 
    tokens = word_tokenize(tweet)
    words = [word for word in tokens if word.isalpha()]
    # stop_words = set(stopwords.words('english'))
    # words = [w for w in words if not w in stop_words]
    tags = nltk.pos_tag(words)
    # words = [lemmatizer.lemmatize(w[0]) if pos_tagger(w[1]) is None else lemmatizer.lemmatize(w[0], pos_tagger(w[1])) for w in tags]
    tags = [x[1] for x in tags]
    return words, tags

def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = text_string.encode('ascii', 'ignore').decode('ascii')
    parsed_text = re.sub(space_pattern, ' ', parsed_text)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    parsed_text = parsed_text.strip('#')
    list_words, tag_list = word_tokenization(parsed_text)
    parsed_text = " ".join(list_words)
    tag_str = ' '.join(tag_list)
    return parsed_text, tag_str

def basic_tokenize(tweet):
    tweet = " ".join(re.split(" ", tweet.lower())).strip()
    return tweet.split()

# def get_pos_string(tweet):
#     text = preprocess(tweet)
#     tokens = word_tokenize(text)
#     tags = nltk.pos_tag(tokens)
#     tag_list = [x[1] for x in tags]
#     tag_str = ' '.join(tag_list)
    
    # return tag_str

def pad_words(words, length):
    if len(words) >= length:
        return words[:length]
    else:
        additional = length - len(words)
        return words + ['EMPTY']*additional

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    words = preprocess(tweet)[0] #Get text only
    
    syllables = sylla.estimate(words)
    num_chars = sum(len(w) for w in words)
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59, 1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)), 2)
    
    twitter_objs = count_twitter_objs(tweet)
    retweet = 0 if "rt" in words else 1
    features = [FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet]
    return features

### Supervised fastText wordtokens training

In [8]:
if not os.path.exists('hsd/Reddit/fasttext.ft'):
    with open('hsd/Reddit/fasttext.ft', 'a') as f:
        for t, l in list(zip(tweets, labels)):
            text = preprocess(t)[0]
            if len(text) > 0:
                f.write('__label__{} {}\n'.format(l, text))

# load fasttext model or train & save if none
if os.path.exists('hsd/Reddit/fasttext_{}.bin'.format(MODEL)):
    ft_model = fasttext.load_model('hsd/Reddit/fasttext_{}.bin'.format(MODEL))
else:
    ft_model = fasttext.train_supervised('hsd/Reddit/fasttext.ft',
                                         lr=0.5, epoch=50, wordNgrams=3, dim=dim)
    ft_model.save_model('hsd/Reddit/fasttext_{}.bin'.format(MODEL))

### Wordtoken features

In [9]:
def get_wordtoken_fts(data):
    
    sentences_words = []
    for d in data:
        sentence = preprocess(d)[0]
        sentences_words.append(sentence.split(' '))
    
    opt_length = int(np.median([len(sw) for sw in sentences_words]))
    sentences_words = [pad_words(sw, opt_length) for sw in sentences_words]
    
    ft_vectors = []
    for sw in sentences_words:
        ft_vector = []
        for w in sw:
            ft_vector.extend(ft_model[w])
        ft_vectors.append(ft_vector)
    
    return ft_vectors

In [10]:
wordtoken_features = get_wordtoken_fts(tweets)

In [11]:
wordtoken_features[0]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.009640693,
 0.0067152996,
 0.0598391,
 0.060052037,
 0.027157854,
 -0.07011807,
 0.02078991,
 -0.08584728,
 0.012917716,
 0.070992336,
 -0.5416798,
 0.29994857,
 -0.7409683,
 -0.18130288,
 -0.42622593,
 0.08789844,
 0.30140865,
 -0.018459707,
 -0.19386247,
 0.42800558,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.63417816,
 0.4362548,
 -0.90651417,
 -0.30027047,
 -0.5111813,
 -0.03127592,
 0.3363991,
 -0.17830524,
 -0.05456752,
 0.5377789,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 9.820641,
 -5.5694637,
 12.352773,
 4.0737033,
 5.9522233,
 0.3452986,
 -4.803345,
 1.2176367,
 1.8868251,
 -5.9742723,
 3.0824692,
 -1.768424,
 3.9607162,
 1.2683365,
 1.6864979,
 0.06917644,
 -1.4328097,
 0.478131,
 0.59928036,
 -1.7303797,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.044294905,
 0.08971748,
 -0.07666792,
 -0.034

### Supervised fastText pos training

In [12]:
if not os.path.exists('hsd/Reddit/fasttext_pos.ft'):
    with open('hsd/Reddit/fasttext_pos.ft', 'a') as f:
        for t, l in list(zip(tweets, labels)):
            f.write('__label__{} {}\n'.format(l, preprocess(t)[1]))

# load fasttext pos model or train & save if none
if os.path.exists('hsd/Reddit/fasttext_pos_{}.bin'.format(MODEL)):
    ft_pos_model = fasttext.load_model('hsd/Reddit/fasttext_pos_{}.bin'.format(MODEL))
else:
    ft_pos_model = fasttext.train_supervised('hsd/Reddit/fasttext_pos.ft',
                                             lr=0.5, epoch=50, wordNgrams=3, dim=dim)
    ft_pos_model.save_model('hsd/Reddit/fasttext_pos_{}.bin'.format(MODEL))

### Part of speech (PoS) features

In [13]:
def get_pos_fts(data):

    #Get POS tags for tweets and save as a string
    pos_sentences = []
    for d in data:
        pos_string = preprocess(d)[1]
        pos_sentences.append(pos_string)
        
        
    pos_tags = []
    for ps in pos_sentences:
        pos_tags.append(ps.split(' '))
    
    opt_length = int(np.median([len(pt) for pt in pos_tags]))
    pos_tags = [pad_words(pt, opt_length) for pt in pos_tags]
    
    ft_vectors = []
    for pt in pos_tags:
        ft_vector = []
        for t in pt:
            ft_vector.extend(ft_pos_model[t])
        ft_vectors.append(ft_vector)
    
    return ft_vectors

In [14]:
pos_features = get_pos_fts(tweets)

In [15]:
pos_features[0]

[-0.3552974,
 0.17120244,
 3.0223274,
 0.28159243,
 -1.7013808,
 -0.9112481,
 0.3820561,
 0.2949178,
 -0.68541044,
 -0.24389629,
 0.56111044,
 -0.22296062,
 -0.17741354,
 -0.4266051,
 0.49082732,
 -0.47689247,
 -0.042696957,
 -0.7325433,
 0.14173636,
 0.89051956,
 -0.3552974,
 0.17120244,
 3.0223274,
 0.28159243,
 -1.7013808,
 -0.9112481,
 0.3820561,
 0.2949178,
 -0.68541044,
 -0.24389629,
 -0.1884019,
 0.18592413,
 0.99887717,
 0.048742313,
 -0.7376654,
 -0.6006107,
 -0.6597169,
 0.6042834,
 0.08046319,
 -0.65350616,
 -0.05441922,
 -0.09492745,
 0.6491668,
 -0.8366038,
 0.26320815,
 -0.5242677,
 -0.2310701,
 -0.33418062,
 0.58393586,
 0.32620114,
 0.042382233,
 -0.12674437,
 -0.020608384,
 0.37767172,
 0.33082724,
 -0.08046584,
 -0.47676143,
 0.38759142,
 0.18853232,
 -0.7913379,
 -0.64299756,
 0.2933613,
 1.1777091,
 0.10103384,
 0.0038934315,
 0.3675759,
 -0.16113637,
 0.6470854,
 1.1688254,
 -0.0013943309,
 -0.68914145,
 -0.19697,
 0.32705945,
 0.26792356,
 0.033401813,
 0.35274333

### Other features

In [16]:
other_features = np.array([other_features(t) for t in tweets])

In [17]:
other_features[:5]

array([[ 1.1200e+01,  5.3760e+01,  3.4000e+01,  1.5454e+00,  1.1600e+02,
         1.2100e+02,  2.2000e+01,  2.2000e+01,  2.2000e+01,  2.4100e-01,
         9.7000e-02,  6.6300e-01, -6.2780e-01,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  1.0000e+00],
       [ 2.3000e+00,  9.4300e+01,  1.1000e+01,  1.2222e+00,  4.1000e+01,
         6.8000e+01,  1.0000e+01,  9.0000e+00,  8.0000e+00,  2.6200e-01,
         0.0000e+00,  7.3800e-01, -4.9270e-01,  0.0000e+00,  0.0000e+00,
         1.0000e+00,  1.0000e+00],
       [ 1.0000e+01,  6.0630e+01,  3.1000e+01,  1.4762e+00,  1.0300e+02,
         1.0700e+02,  2.1000e+01,  2.1000e+01,  1.8000e+01,  6.1000e-02,
         1.0900e-01,  8.3000e-01,  2.7320e-01,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  1.0000e+00],
       [-3.0000e+00,  1.2021e+02,  2.0000e+00,  1.0000e+00,  9.0000e+00,
         9.0000e+00,  2.0000e+00,  2.0000e+00,  2.0000e+00,  1.0000e+00,
         0.0000e+00,  0.0000e+00, -7.5790e-01,  0.0000e+00,  0.0000e+00,
         0.0000e+00

In [18]:
np.array(wordtoken_features).shape

(55132, 190)

In [19]:
np.array(pos_features).shape

(55132, 190)

In [20]:
np.array(other_features).shape

(55132, 17)

### All features and feature names

In [21]:
#Now join them all up
features = np.concatenate([wordtoken_features, pos_features, other_features],axis=1)

In [22]:
features.shape

(55132, 397)

## Save features & labels

In [23]:
archive = dir_archive('hsd/Reddit/X_y_{}'.format(MODEL), {'features': features, 'labels': labels}, serialized=True)
archive.dump()
del archive