# Hate Speech Detector 2.0
---
**Hate speech prediction for unannotated tweets** - performed on whole tweets

1. Load the unannotated, lemmatized tweets.
2. Load the best classification model (**clf**) together with its vectorization model (**vec**), if applicable.
3. Predict hate speech for the data loaded above.
4. Save the prediction results to a .csv file.
5. Repeat the above steps for each of the three best classifiers.

In [1]:
import numpy as np
import pandas as pd
import csv

from tqdm.notebook import tqdm

from src.dataframes.utils import combine_row_wisely

from src.vectorizers.TextOwnTrainedFTVectorizer import TextOwnTrainedFTVectorizer
from src.nn.models.RecurrentNet import RecurrentNet
from src.nn.models.Conv1dRecurrentNet import Conv1dRecurrentNet
from sklearn.linear_model import SGDClassifier
from src.classifiers.SimpleMLVectorClassifier import SimpleMLVectorClassifier
from src.classifiers.DLVectorClassifier import DLVectorClassifier
import torch

from src.constants import (ALL_SANITIZED_PATH,
                           ALL_POC_SCORES_PATH, ALL_TOPIC_POC_SCORES_PATH, ALL_OTHER_SCORES_PATH,
                           PREDICTION_RESULTS_DIR, LABELS)

## Data loading

### Tweet POC scores for each class

In [2]:
# Read the per-tweet POC score table and discard the 'id' column.
df_poc_scores = pd.read_csv(ALL_POC_SCORES_PATH).drop(columns=['id'])
df_poc_scores.head(2)

Unnamed: 0,wyz_POC_min,wyz_POC_mean,wyz_POC_max,groz_POC_min,groz_POC_mean,groz_POC_max,wyk_POC_min,wyk_POC_mean,wyk_POC_max,odcz_POC_min,...,pon_POC_max,styg_POC_min,styg_POC_mean,styg_POC_max,szan_POC_min,szan_POC_mean,szan_POC_max,vulg_POC_min,vulg_POC_mean,vulg_POC_max
0,0.0,0.0,0.0,-0.5,-0.001731,0.0,0.0,0.0,0.0,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,-0.5,-0.001385,0.25,0.0,0.0,0.0,-0.333333,...,0.333333,-0.5,-0.000152,0.5,0.0,0.0,0.0,0.0,0.0,0.0


### Tweet topic POC scores for each class

In [3]:
# Read the per-tweet topic POC score table and discard the 'id' column.
df_topic_poc_scores = pd.read_csv(ALL_TOPIC_POC_SCORES_PATH).drop(columns=['id'])
df_topic_poc_scores.head(2)

Unnamed: 0,wyz_topic_POC_min,wyz_topic_POC_mean,wyz_topic_POC_max,groz_topic_POC_min,groz_topic_POC_mean,groz_topic_POC_max,wyk_topic_POC_min,wyk_topic_POC_mean,wyk_topic_POC_max,odcz_topic_POC_min,...,pon_topic_POC_max,styg_topic_POC_min,styg_topic_POC_mean,styg_topic_POC_max,szan_topic_POC_min,szan_topic_POC_mean,szan_topic_POC_max,vulg_topic_POC_min,vulg_topic_POC_mean,vulg_topic_POC_max
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.052632,...,0.052632,-0.052632,-0.002632,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Tweet other scores

In [4]:
# Read the remaining per-tweet score table and discard the 'id' column.
df_other_scores = pd.read_csv(ALL_OTHER_SCORES_PATH).drop(columns=['id'])
df_other_scores.head(2)

Unnamed: 0,s_neg,s_neu,s_pos,n_chars,n_sylls,n_words,nu_words,nl_chars,nl_sylls,nl_words,nlu_words
0,0,15,0,73,28,13,13,66,23,12,12
1,1,15,0,81,28,15,13,76,24,17,14


### Lexical data

In [5]:
# Columns left out of the lexical feature set: the min/mean/max POC scores of the 'vulg' class.
unused_fields = [f'vulg_POC_{stat}' for stat in ('min', 'mean', 'max')]

# Lexical data = POC scores without the columns listed above.
df_lex_data = df_poc_scores.drop(columns=unused_fields)
df_lex_data.head(2)

Unnamed: 0,wyz_POC_min,wyz_POC_mean,wyz_POC_max,groz_POC_min,groz_POC_mean,groz_POC_max,wyk_POC_min,wyk_POC_mean,wyk_POC_max,odcz_POC_min,...,odcz_POC_max,pon_POC_min,pon_POC_mean,pon_POC_max,styg_POC_min,styg_POC_mean,styg_POC_max,szan_POC_min,szan_POC_mean,szan_POC_max
0,0.0,0.0,0.0,-0.5,-0.001731,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000864,0.5,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,-0.5,-0.001385,0.25,0.0,0.0,0.0,-0.333333,...,0.0,0.0,0.000576,0.333333,-0.5,-0.000152,0.5,0.0,0.0,0.0


### Simple feature ML data

In [6]:
# Merge the three score tables into a single feature frame via the project helper.
score_frames = [df_poc_scores, df_topic_poc_scores, df_other_scores]
df_simpleML_data = combine_row_wisely(score_frames)
df_simpleML_data.head(2)

Unnamed: 0,wyz_POC_min,wyz_POC_mean,wyz_POC_max,groz_POC_min,groz_POC_mean,groz_POC_max,wyk_POC_min,wyk_POC_mean,wyk_POC_max,odcz_POC_min,...,s_neu,s_pos,n_chars,n_sylls,n_words,nu_words,nl_chars,nl_sylls,nl_words,nlu_words
0,0.0,0.0,0.0,-0.5,-0.001731,0.0,0.0,0.0,0.0,0.0,...,15,0,73,28,13,13,66,23,12,12
1,0.0,0.0,0.0,-0.5,-0.001385,0.25,0.0,0.0,0.0,-0.333333,...,15,0,81,28,15,13,76,24,17,14


### Lemmatized tweets

In [7]:
# Only the 'tweet' column is used downstream, so parse just that column instead of
# loading the whole file and slicing afterwards. This skips type inference on the
# unused columns.
# NOTE(review): the warning previously emitted by this cell looks like a pandas
# DtypeWarning about mixed-type columns; if it came from columns other than 'tweet',
# it no longer appears - confirm on the next run.
df_data = pd.read_csv(ALL_SANITIZED_PATH, usecols=['tweet'])
df_data.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,tweet
0,w czwartek muszę poprawić sądy i trybunały
1,Żale Nałęcza i riposta Macierewicza: Pan był w...


## Models loading

In [8]:
# Three parallel lists paired by position: the i-th classifier class is instantiated
# with the i-th kwargs dict, and SHORT_NAMES[i] is the label used for its output file.
CLF_CLASSES = [DLVectorClassifier, SimpleMLVectorClassifier, DLVectorClassifier]
SHORT_NAMES = ['RNN', 'SGDVC', '1dCNN+GRU-HP']
CLF_KWARGS = [
    # 'RNN'
    {
        'short_name': '5-1-0',
        'k_folds': 5,
        'vec_class': TextOwnTrainedFTVectorizer,
        'nn_class': RecurrentNet,
        'nn_type': 'recurrent_w2',
        'vec_params': {'model_type': 's', 'short_name': 'super'},
        'nn_params': {'n_layers': 5, 'drop_prob': 0.1, 'bidirectional': False},
    },
    # 'SGDVC' - the trailing 'penalty' / 'class_weight' entries sit at the top level of the kwargs.
    {
        'k_folds': 5,
        'short_name': 'SGD-l2',
        'vec_class': TextOwnTrainedFTVectorizer,
        'clf_class': SGDClassifier,
        'vec_kwargs': {'length': 300, 'model_type': 's', 'short_name': 'super', 'verbose': 0},
        'penalty': 'l2',
        'class_weight': 'balanced',
    },
    # '1dCNN+GRU-HP'
    {
        'short_name': '1dcgru_adamw-ams-rop',
        'k_folds': 5,
        'vec_class': TextOwnTrainedFTVectorizer,
        'nn_class': Conv1dRecurrentNet,
        'nn_type': 'hparams_conv1d_w2',
        'nn_hparams': {
            '_epochs': 50,
            '_optim': torch.optim.AdamW,
            '_optim_params': {'amsgrad': True},
            '_sched': torch.optim.lr_scheduler.ReduceLROnPlateau,
            '_sched_params': {'patience': 5, 'factor': 0.97},
        },
        'vec_params': {'model_type': 's', 'short_name': 'super'},
        'nn_params': {
            'nn_type': 'gru',
            'out_channels': 8,
            'hidden_size': 100,
            'bidirectional': True,
        },
    },
]

In [9]:
# For every configured classifier: pick its input frame, load the stored model,
# predict on the unannotated tweets and dump the predictions to a per-classifier CSV.
for clf_class, short_name, clf_kwargs in tqdm(zip(CLF_CLASSES, SHORT_NAMES, CLF_KWARGS),
                                              total=len(SHORT_NAMES), leave=False):
    # Choose the input frame by classifier name; any other name gets the lemmatized tweets.
    # NOTE(review): 'Lexical' and 'LRFC' do not occur in SHORT_NAMES as defined in this
    # notebook, so only the fallback branch runs here. The two branches are kept in case
    # the classifier list is swapped - confirm whether they are still needed.
    if short_name == 'Lexical':
        X_data = df_lex_data
    elif short_name == 'LRFC':
        X_data = df_simpleML_data
    else:
        X_data = df_data

    clf = clf_class(**clf_kwargs)
    clf.load()

    y_pred = clf.predict(X_data)

    out_path = PREDICTION_RESULTS_DIR.replace('{}', f'predictions_{short_name}')
    # newline='' is required by the csv module; without it the writer's '\r\n' row
    # terminator gets translated again on Windows and every other row comes out blank.
    with open(out_path, 'w', newline='') as f:
        # One writer per file: header row first, then one row per prediction.
        writer = csv.writer(f)
        writer.writerow(LABELS)
        writer.writerows(y_pred)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=7647.0), HTML(value='')))

  "num_layers={}".format(dropout, num_layers))


HBox(children=(FloatProgress(value=0.0, max=7647.0), HTML(value='')))

