# Hate Speech Detector - PL - Features extraction for SVM & Dense model

Based on [this notebook](https://github.com/t-davidson/hate-speech-and-offensive-language/blob/master/classifier/final_classifier.ipynb).

In [1]:
import os
import csv
import pandas as pd
import numpy as np
import pickle
from klepto.archives import dir_archive

import sys
import nltk
import string
import re
import fasttext
from polyglot.text import Text
import syllables as sylla

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
%matplotlib inline

In [2]:
MODEL = 'svm'
dim = 10 if MODEL == 'svm' else 200

## Poleval 2019 data loading

### Classes pre:
    0 - non-harmful
    1 - cyberbullying
    2 - hate speech

In [3]:
if not os.path.exists('hsd/Poleval2019/perfect_data.pkl'):
    with open('hsd/Poleval2019/train_texts.txt', 'r') as f:
        tweets = f.readlines()
    with open('hsd/Poleval2019/test_texts.txt', 'r') as f:
        tweets.extend(f.readlines())
    
    with open('hsd/Poleval2019/train_labels.txt', 'r') as f:
        labels = f.readlines()
    with open('hsd/Poleval2019/test_labels.txt', 'r') as f:
        labels.extend(f.readlines())
    
    with open('hsd/Poleval2019/perfect_data.pkl', 'w') as f:
        def chcl(c):
            return 0 if c=='0\r\n' else 1
        labels = list(map(chcl, labels))
        pickle.dump((tweets, labels), f)
else:
    with open('hsd/Poleval2019/perfect_data.pkl', 'r') as f:
        tweets, labels = pickle.load(f)

### Classes post:
    0 - no hate
    1 - hate speech

In [4]:
list(zip(tweets[:5], labels[:5]))

[('Dla mnie faworytem do tytu\xc5\x82u b\xc4\x99dzie Cracovia. Zobaczymy, czy typ si\xc4\x99 sprawdzi.\r\n',
  0),
 ('@anonymized_account @anonymized_account Brawo ty Daria kibic ma by\xc4\x87 na dobre i z\xc5\x82e\r\n',
  0),
 ('@anonymized_account @anonymized_account Super, polski premier sk\xc5\x82ada kwiaty na grobach kolaborant\xc3\xb3w. Ale doczekali\xc5\x9bmy czas\xc3\xb3w.\r\n',
  0),
 ('@anonymized_account @anonymized_account Musi. Innej drogi nie mamy.\r\n',
  0),
 ('Odrzut natychmiastowy, kwa\xc5\x9bna mina, mam problem\r\n', 0)]

## Features extraction

In [5]:
def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    parsed_text = re.sub(hashtag_regex, '', parsed_text)
    return parsed_text

def pos(text):
    import morfeusz2
    morf = morfeusz2.Morfeusz()

    analysis = morf.analyse(line)
    
    return [interp[2] for i, j, interp in analysis]
    

def pad_words(words, length):
    if len(words) >= length:
        return words[:length]
    else:
        additional = length - len(words)
        return words + ['PUSTY']*additional

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    sent_analysis = Text(tweet)
    sentiment = {}
    sentiment['neg_cnt'] = 0
    sentiment['neu_cnt'] = 0
    sentiment['pos_cnt'] = 0
    for w in sent_analysis.words:
        mapping = {-1: 'neg_cnt', 0: 'neu_cnt', 1: 'pos_cnt'}
        try:
            sentiment[mapping[w.polarity]] += 1
        except ValueError, UnicodeError:
            sentiment['neu_cnt'] += 1
    
    words = preprocess(tweet) #Get text only
    
    syllables = sylla.estimate(words)
    num_chars = sum(len(w) for w in words)
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59, 1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)), 2)
    
    twitter_objs = count_twitter_objs(tweet)
    retweet = 0 if "rt" in words else 1
    features = [FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words, num_unique_terms,
                sentiment['neg_cnt'], sentiment['neu_cnt'], sentiment['pos_cnt'],
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet]
    return features

### Supervised fastText wordtokens training

In [6]:
if not os.path.exists('hsd/Poleval2019/fasttext.ft'):
    with open('hsd/Poleval2019/fasttext.ft', 'a') as f:
        for t, l in list(zip(tweets, labels)):
            f.write('__label__{} {}\n'.format(l, preprocess(t)))

# load fasttext model or train & save if none
if os.path.exists('hsd/Poleval2019/fasttext_{}.bin'.format(MODEL)):
    ft_model = fasttext.load_model('hsd/Poleval2019/fasttext_{}.bin'.format(MODEL))
else:
    ft_model = fasttext.train_supervised('hsd/Poleval2019/fasttext.ft',
                                         lr=0.5, epoch=50, wordNgrams=3, dim=dim)
    ft_model.save_model('hsd/Poleval2019/fasttext_{}.bin'.format(MODEL))

### Wordtoken features

In [7]:
def get_wordtoken_fts(data):
    
    sentences_words = []
    for d in tqdm(data):
        sentence = preprocess(d)
        sentences_words.append(sentence.split(' '))
    
    opt_length = int(np.median([len(sw) for sw in sentences_words]))
    sentences_words = [pad_words(sw, opt_length) for sw in sentences_words]
    
    ft_vectors = []
    for sw in tqdm(sentences_words):
        ft_vector = []
        for w in sw:
            ft_vector.extend(ft_model[w])
        ft_vectors.append(ft_vector)
    
    return ft_vectors

In [8]:
wordtoken_features = get_wordtoken_fts(tweets)

HBox(children=(IntProgress(value=0, max=11041), HTML(value=u'')))




HBox(children=(IntProgress(value=0, max=11041), HTML(value=u'')))




In [9]:
wordtoken_features[0]

[-0.25972363,
 0.6093256,
 0.0930879,
 -0.15155588,
 -0.10487659,
 0.06611673,
 0.13856022,
 -0.04793478,
 0.10335264,
 0.16084248,
 -0.56134087,
 0.8714383,
 0.074199535,
 -0.19224817,
 0.05787058,
 0.028133845,
 0.40526202,
 -0.21943921,
 0.23110494,
 0.4049238,
 -0.06258048,
 0.05566072,
 -0.052368417,
 -0.06970625,
 -0.06115733,
 0.016731113,
 -0.010257306,
 -0.04316763,
 0.010475652,
 -0.0587221,
 -0.36280036,
 0.79676443,
 0.18718208,
 -0.27790293,
 0.007267143,
 -0.07893273,
 0.38540623,
 -0.073079444,
 0.12955764,
 0.26674655,
 -0.030550629,
 0.07951717,
 0.053388935,
 0.06686291,
 -0.05846211,
 -0.05604955,
 0.09254257,
 0.06846073,
 -0.07168886,
 0.118393034,
 -0.40160927,
 0.7123825,
 0.052546557,
 -0.23500583,
 -0.041932467,
 -0.021902254,
 0.35851163,
 -0.15595633,
 0.2748768,
 0.29753724,
 -0.09069144,
 0.0479362,
 0.08462821,
 -0.0060301884,
 -0.06289325,
 -0.08564071,
 0.09997467,
 0.033521224,
 0.0127636725,
 -0.0061767288,
 -0.10216817,
 -0.009165661,
 -0.06572169,
 0

### Supervised fastText wordtokens training

In [10]:
# if no morfeusz2 installed then save preprocessed tweets and load pos strings from outer source
sentences = [preprocess(t) for t in tweets]
with open('hsd/Poleval2019/preprocessed.pkl', 'w') as f:
    pickle.dump(sentences, f)

In [11]:
if not os.path.exists('hsd/Poleval2019/fasttext_pos.ft'):
    # only if morfeusz2 is installed
    '''with open('hsd/Poleval2019/fasttext_pos.ft', 'a') as f:
        for t, l in list(zip(tweets, labels)):
            f.write('__label__{} {}\n'.format(l, pos(t)))'''
    # otherwise load pos strings from outer source
    with open('hsd/Poleval2019/pos_sentences.pkl', 'r') as f:
        pos_sentences = pickle.load(f)
    with open('hsd/Poleval2019/fasttext_pos.ft', 'a') as f:
        for ps, l in list(zip(pos_sentences, labels)):
            f.write('__label__{} {}\n'.format(l, ps))
        

# load fasttext pos model or train & save if none
if os.path.exists('hsd/Poleval2019/fasttext_pos_{}.bin'.format(MODEL)):
    ft_pos_model = fasttext.load_model('hsd/Poleval2019/fasttext_pos_{}.bin'.format(MODEL))
else:
    ft_pos_model = fasttext.train_supervised('hsd/Poleval2019/fasttext_pos.ft',
                                             lr=0.5, epoch=50, wordNgrams=3, dim=dim)
    ft_pos_model.save_model('hsd/Poleval2019/fasttext_pos_{}.bin'.format(MODEL))

### Part of speech (PoS) features

In [12]:
def get_pos_fts(data):
    
    # only if morfeusz2 is installed
    '''pos_sentences = [pos(sentence) for sentence in tqdm(sentences)]'''
    # otherwise load pos strings from outer source
    with open('hsd/Poleval2019/pos_sentences.pkl', 'r') as f:
        pos_sentences = pickle.load(f)
    
    
    pos_tags = []
    for ps in pos_sentences:
        pos_tags.append(ps.split(' '))
    
    opt_length = int(np.median([len(pt) for pt in pos_tags]))
    pos_tags = [pad_words(pt, opt_length) for pt in pos_tags]
    
    ft_vectors = []
    for pt in tqdm(pos_tags):
        ft_vector = []
        for t in pt:
            ft_vector.extend(ft_pos_model[t])
        ft_vectors.append(ft_vector)
    
    return ft_vectors

In [13]:
pos_features = get_pos_fts(tweets)

HBox(children=(IntProgress(value=0, max=11041), HTML(value=u'')))




In [14]:
pos_features[0]

[-0.68847364,
 1.2415278,
 0.29382572,
 -0.93917364,
 0.066950694,
 0.30000833,
 -0.1931048,
 0.52006835,
 0.4964681,
 0.58200437,
 -0.11428785,
 0.025025072,
 0.076467164,
 -0.10548172,
 -0.051385183,
 -0.19466363,
 0.123377524,
 -0.11968064,
 -0.0775263,
 -0.10946142,
 -0.09061464,
 0.08752341,
 0.03792799,
 -0.2066791,
 -0.027338302,
 -0.015878374,
 0.027957069,
 -0.033831187,
 0.060454987,
 0.015502732,
 -0.07118461,
 0.14237219,
 0.10834007,
 -0.05040214,
 -0.047159214,
 0.11034108,
 0.011591187,
 -0.0011767191,
 0.14348975,
 0.11537423,
 -0.08136844,
 0.22632498,
 -0.032268047,
 -0.0692468,
 -0.019915791,
 0.112384774,
 0.03064721,
 0.14133495,
 -0.033671804,
 0.0579389,
 -0.093241684,
 0.19488356,
 0.0826606,
 -0.16631353,
 0.060278412,
 0.060881287,
 -0.015351265,
 0.03486257,
 0.011026373,
 0.10180261,
 0.18740678,
 -0.353895,
 -0.03661656,
 0.33652523,
 -0.048303,
 -0.17948602,
 0.1320285,
 -0.17392011,
 -0.16467643,
 -0.23173858,
 0.07813326,
 -0.2308999,
 -0.04021206,
 0.18

### Other features

In [15]:
other_features = np.array([other_features(t) for t in tqdm(tweets)])

HBox(children=(IntProgress(value=0, max=11041), HTML(value=u'')))

No handlers could be found for logger "polyglot.detect.base"





In [16]:
other_features[:5]

array([[ 11.7   ,  32.51  ,  23.    ,   1.9166,  82.    ,  83.    ,
         12.    ,  12.    ,  12.    ,   0.    ,  15.    ,   0.    ,
          0.    ,   0.    ,   0.    ,   1.    ],
       [  4.8   ,  78.25  ,  14.    ,   1.4   ,  47.    ,  86.    ,
         12.    ,  10.    ,  10.    ,   1.    ,  12.    ,   1.    ,
          0.    ,   2.    ,   0.    ,   1.    ],
       [ 14.4   ,  11.1   ,  24.    ,   2.1817,  92.    , 131.    ,
         13.    ,  11.    ,  11.    ,   0.    ,  17.    ,   1.    ,
          0.    ,   2.    ,   0.    ,   1.    ],
       [  5.2   ,  66.41  ,   8.    ,   1.5999,  30.    ,  69.    ,
          7.    ,   5.    ,   5.    ,   0.    ,  11.    ,   0.    ,
          0.    ,   2.    ,   0.    ,   1.    ],
       [ 12.3   ,  17.46  ,  13.    ,   2.1665,  49.    ,  50.    ,
          6.    ,   6.    ,   6.    ,   1.    ,   6.    ,   1.    ,
          0.    ,   0.    ,   0.    ,   1.    ]])

### All features and feature names

In [17]:
#Now join them all up
features = np.concatenate([wordtoken_features, pos_features, other_features],axis=1)

In [18]:
features.shape

(11041, 416)

## Save features & labels

In [19]:
archive = dir_archive('hsd/Poleval2019/X_y_{}'.format(MODEL), {'features': features, 'labels': labels}, serialized=True)
archive.dump()
del archive