# Hate Speech Detector 2.0
---
**Text vectorization models**

1. Load lemmatized tweets from the appropriate .csv file.
2. Implement text- (sentence-)level vectorizers:
    1. TF-IDF text vectorizer
    2. pretrained FastText text vectorizer
    3. manually trained FastText text vectorizer (unsupervised and supervised)
    4. pretrained and manually retrained BERT text vectorizer
    5. pretrained and manually retrained RoBERTa text vectorizer
3. Visualize results as confusion matrices: one for each of the 7 hate-speech labels plus 1 overall.
4. Perform some tests on several example lemmatized tweets for each vectorizer.

In [1]:
import numpy as np
import pandas as pd

from src.vectorizers.TextTFIDFVectorizer import TextTFIDFVectorizer
from src.vectorizers.TextPretrainedFTVectorizer import TextPretrainedFTVectorizer
from src.vectorizers.TextOwnTrainedFTVectorizer import TextOwnTrainedFTVectorizer
from src.vectorizers.TextPretrainedBERTVectorizer import TextPretrainedBERTVectorizer
from src.vectorizers.TextOwnTrainedBERTVectorizer import TextOwnTrainedBERTVectorizer
from src.dataframes.utils import combine_row_wisely
from src.constants import LABELS, DUPLICATED_PATH, LEMMAS_PATH

## Sanitized and lemmatized tweets (with class labels for supervised FastText)

In [2]:
# Read the CSV at DUPLICATED_PATH and keep only the 'tweet' column (one-column DataFrame).
# NOTE(review): the introduction mentions lemmatized tweets, yet LEMMAS_PATH is imported
# and not used in the cells shown here — confirm DUPLICATED_PATH is the intended source.
texts = pd.read_csv(DUPLICATED_PATH)[['tweet']]
# Preview the first two rows.
texts[:2]

Unnamed: 0,tweet
0,w czwartek muszę poprawić sądy i trybunały
1,Żale Nałęcza i riposta Macierewicza: Pan był w...


In [3]:
# Select the label columns named in LABELS from the CSV at DUPLICATED_PATH
# (the file is read from disk again here rather than reusing an earlier read).
classes = pd.read_csv(DUPLICATED_PATH)[LABELS]
# Preview the first two rows.
classes[:2]

Unnamed: 0,wyzywanie,grożenie,wykluczanie,odczłowieczanie,poniżanie,stygmatyzacja,szantaż
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0


In [4]:
# Combine the tweet frame and the label frame with the project helper; the recorded
# preview shows the tweet column followed by the label columns.
# `combined` is the input for the supervised FastText vectorizer later on.
combined = combine_row_wisely([texts, classes])
combined[:2]

Unnamed: 0,tweet,wyzywanie,grożenie,wykluczanie,odczłowieczanie,poniżanie,stygmatyzacja,szantaż
0,w czwartek muszę poprawić sądy i trybunały,0,0,0,0,0,0,0
1,Żale Nałęcza i riposta Macierewicza: Pan był w...,0,0,0,0,0,0,0


## Vectorization models

### TF text vectorizer

In [5]:
# TF variant: TextTFIDFVectorizer configured with model_type='tf'.
# NOTE(review): short_name='TF' presumably labels this model when saved — confirm.
vec = TextTFIDFVectorizer(model_type='tf', short_name='TF')

In [6]:
# Fit on the tweets and keep the resulting vectors for inspection.
vectors = vec.fit_transform(texts)
# NOTE(review): presumably persists the fitted vectorizer; the destination is not visible here.
vec.save()



In [7]:
vectors[:2]

array([[0.51075392, 0.        , 0.        , 0.08512565, 0.        ,
        0.        , 0.08512565, 0.        , 0.        , 0.        ,
        0.        , 0.08512565, 0.08512565, 0.        , 0.08512565,
        0.08512565, 0.08512565, 0.08512565, 0.08512565, 0.08512565,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.25537696, 0.        , 0.        , 0.        ,
        0.08512565, 0.08512565, 0.08512565, 0.        , 0.08512565,
        0.08512565, 0.08512565, 0.08512565, 0.        , 0.08512565,
        0.        , 0.        , 0.        , 0.        , 0.17025131,
        0.08512565, 0.        , 0.        , 0.        , 0.        ,
        0.08512565, 0.        , 0.        , 0.        , 0.08512565,
        0.08512565, 0.08512565, 0.        , 0.        , 0.        ,
        0.08512565, 0.        , 0.        , 0.17025131, 0.08512565,
        0.08512565, 0.25537696, 0.08512565, 0.        , 0.        ,
        0.17025131, 0.        , 0.08512565, 0.08

In [8]:
vectors[:2].shape

(2, 100)

### TF-IDF text vectorizer

In [9]:
# TF-IDF variant: constructed with the class defaults (no model_type/short_name overrides).
vec = TextTFIDFVectorizer()

In [10]:
# Fit on the tweets and keep the resulting vectors; `vectors` is rebound on every section.
vectors = vec.fit_transform(texts)
# NOTE(review): presumably persists the fitted vectorizer; the destination is not visible here.
vec.save()

In [11]:
vectors[:2]

array([[0.43500937, 0.        , 0.        , 0.13552643, 0.        ,
        0.        , 0.13655211, 0.        , 0.        , 0.        ,
        0.        , 0.09149785, 0.12941759, 0.        , 0.07549624,
        0.08624552, 0.08768593, 0.08862602, 0.10902097, 0.11104721,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.22026295, 0.        , 0.        , 0.        ,
        0.14339631, 0.10204362, 0.08111512, 0.        , 0.12378373,
        0.07249697, 0.07250156, 0.09059659, 0.        , 0.07383131,
        0.        , 0.        , 0.        , 0.        , 0.14776553,
        0.11151752, 0.        , 0.        , 0.        , 0.        ,
        0.08174343, 0.        , 0.        , 0.        , 0.08857442,
        0.07683385, 0.12320858, 0.        , 0.        , 0.        ,
        0.07429101, 0.        , 0.        , 0.16246826, 0.11551853,
        0.114348  , 0.23463452, 0.11579328, 0.        , 0.        ,
        0.14499394, 0.        , 0.13884269, 0.07

In [12]:
vectors[:2].shape

(2, 100)

### Pretrained FastText text vectorizer

In [13]:
# Constructed with defaults.
# NOTE(review): which pretrained FastText model gets loaded is not visible here — confirm.
vec = TextPretrainedFTVectorizer()

In [14]:
# Vectorize the tweets with the pretrained FastText vectorizer.
vectors = vec.fit_transform(texts)
# NOTE(review): presumably persists the vectorizer state; the destination is not visible here.
vec.save()



In [15]:
vectors[:2]

array([[ 6.35566860e-02, -3.80448437e-05, -6.05443157e-02,
         3.61447781e-02,  7.22364476e-03,  6.26421208e-03,
        -6.61555380e-02,  5.58974817e-02,  7.45544881e-02,
        -4.48520146e-02, -3.00023947e-02, -1.56874657e-02,
         1.02093823e-01, -7.96100795e-02,  5.82791977e-02,
         2.59558633e-02,  1.06699660e-01, -1.71161778e-02,
         7.81706497e-02, -9.18626843e-04, -7.34722847e-03,
         8.88879038e-03,  7.81900287e-02,  4.83358055e-02,
        -5.05870692e-02,  6.05397858e-02, -2.08692681e-02,
        -1.02740414e-01,  9.61084217e-02,  5.48792966e-02,
         4.03101258e-02,  8.74003693e-02,  1.28479928e-01,
         1.38761429e-02, -4.81189601e-03, -9.23408568e-02,
         8.32420439e-02, -5.98926051e-03,  9.11056623e-02,
         2.87376270e-02,  2.71206349e-02,  1.19929299e-01,
        -1.09568626e-01, -1.49388343e-01,  1.33022010e-01,
         7.30473399e-02,  7.81429857e-02,  4.55358438e-03,
        -7.31640905e-02, -1.25313960e-02,  6.91416711e-0

In [16]:
vectors[:2].shape

(2, 100)

### Manually trained FastText text vectorizer (unsupervised)

In [17]:
# Default model_type is used here (the unsupervised variant, per the section heading).
# NOTE(review): epochs=1 looks like a quick-run setting — confirm it is intended
# before relying on the saved model.
vec = TextOwnTrainedFTVectorizer(epochs=1)

In [18]:
# Train on the tweets alone (no label columns are passed) and keep the resulting vectors.
vectors = vec.fit_transform(texts)
# NOTE(review): presumably persists the trained model; the destination is not visible here.
vec.save()

In [19]:
vectors[:2]

array([[-3.10492609e-02,  2.90360637e-02,  5.45730963e-02,
         2.61704437e-02,  9.26428009e-03,  9.66150779e-03,
        -6.05744077e-03, -3.97101827e-02, -4.34304737e-02,
        -1.16285877e-02,  1.12360213e-02,  1.00422604e-02,
        -1.31002534e-02, -3.08832526e-02, -4.50352952e-02,
        -5.57801267e-03,  7.21032091e-04, -2.84234807e-02,
        -8.13584775e-03,  2.01089233e-02, -1.60313416e-02,
        -6.72431514e-02, -4.91997832e-03, -1.87500496e-03,
         1.16548976e-02, -1.50554189e-02, -1.66342733e-03,
        -3.45856622e-02,  6.45815134e-02,  1.20730149e-02,
        -4.27548550e-02,  3.85371409e-02,  1.75699517e-02,
         8.30805302e-03,  3.90101370e-04, -1.10131251e-02,
        -2.17567012e-02, -2.53346507e-02,  1.73892677e-02,
        -2.41160523e-02, -3.53921438e-03, -2.65146699e-02,
        -1.92603562e-02,  3.33755575e-02,  4.01627301e-04,
        -5.36503457e-03, -6.49354085e-02,  8.73947330e-03,
        -5.63754626e-02,  2.02454696e-03, -3.39481840e-0

In [20]:
vectors[:2].shape

(2, 300)

### Manually trained FastText text vectorizer (supervised)

In [21]:
# model_type='s' selects the supervised variant (per the section heading); a distinct
# short_name ('super') is passed, unlike the unsupervised cell, which uses the default.
# NOTE(review): epochs=1 looks like a quick-run setting — confirm it is intended.
vec = TextOwnTrainedFTVectorizer(model_type='s', short_name='super', epochs=1)

In [22]:
# The supervised variant is fed `combined` (tweets plus label columns) instead of `texts`.
vectors = vec.fit_transform(combined)
# NOTE(review): presumably persists the trained model; the destination is not visible here.
vec.save()

In [23]:
vectors[:2]

array([[-3.14897858e-02,  9.89979729e-02, -3.75098456e-03,
        -3.16894837e-02,  7.19760684e-03,  4.84380312e-02,
        -2.50619762e-02, -6.02988852e-03,  3.06613632e-02,
         3.44553813e-02, -3.36788632e-02,  3.62291746e-02,
         3.75699773e-02,  2.57996302e-02, -5.88292070e-02,
         7.26068765e-02, -1.34168118e-02, -2.70003509e-02,
         3.47434990e-02, -8.92759394e-03,  1.58274230e-02,
         9.39892307e-02, -4.43975208e-03, -1.72029696e-02,
        -4.55200998e-03, -1.07462937e-02,  7.36345947e-02,
        -3.47669236e-02, -1.15567688e-02,  5.61653599e-02,
        -8.45235884e-02,  4.72616032e-02, -1.44777801e-02,
        -7.18957186e-03, -2.38179099e-02,  3.69875096e-02,
         3.56100313e-02, -7.20345601e-02, -3.02258376e-02,
        -4.21552509e-02,  3.95618305e-02,  8.98763537e-02,
         5.38320951e-02, -6.82078451e-02,  9.53124650e-03,
        -2.09752489e-02, -3.29320792e-05,  5.67323237e-04,
        -4.23786826e-02,  4.36317176e-02, -6.39965236e-0

In [24]:
vectors[:2].shape

(2, 300)

### Pretrained BERT text vectorizer

In [25]:
# Default model_type is used here (BERT, per the section heading).
# NOTE(review): verbose=1 presumably enables the progress-bar widget seen in the fit output.
vec = TextPretrainedBERTVectorizer(verbose=1)

In [26]:
# Vectorize the tweets; the recorded run shows a single progress bar (max=79).
vectors = vec.fit_transform(texts)
# NOTE(review): presumably persists the vectorizer state; the destination is not visible here.
vec.save()

HBox(children=(FloatProgress(value=0.0, max=79.0), HTML(value='')))




In [27]:
vectors[:2]

array([[ 0.05386081,  0.29680243,  1.0124377 , ..., -0.01944965,
        -0.03733633,  0.49210665],
       [-0.08791272,  0.13680823,  1.1463429 , ..., -0.19564423,
        -0.0947031 ,  0.60817933]], dtype=float32)

In [28]:
vectors[:2].shape

(2, 768)

### Manually retrained BERT text vectorizer

In [29]:
# Retrained BERT vectorizer with the default model_type and short_name.
# NOTE(review): verbose=1 presumably enables the progress-bar widgets seen in the fit output.
vec = TextOwnTrainedBERTVectorizer(verbose=1)

In [30]:
# The recorded run shows two progress bars (max=158, then max=79).
# NOTE(review): the first two vectors displayed after this cell are identical to the
# ones printed for the pretrained BERT vectorizer — verify that the retrained weights
# are actually used when producing the vectors.
vectors = vec.fit_transform(texts)
# NOTE(review): presumably persists the retrained model; the destination is not visible here.
vec.save()

HBox(children=(FloatProgress(value=0.0, max=158.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=79.0), HTML(value='')))




In [31]:
vectors[:2]

array([[ 0.05386081,  0.29680243,  1.0124377 , ..., -0.01944965,
        -0.03733633,  0.49210665],
       [-0.08791272,  0.13680823,  1.1463429 , ..., -0.19564423,
        -0.0947031 ,  0.60817933]], dtype=float32)

In [32]:
vectors[:2].shape

(2, 768)

### Pretrained RoBERTa text vectorizer

In [33]:
# Same class as the pretrained BERT section, switched to RoBERTa via model_type='roberta'.
vec = TextPretrainedBERTVectorizer(model_type='roberta', short_name='RoBERTa', verbose=1)

In [34]:
# Vectorize the tweets; the recorded run shows a single progress bar (max=79).
vectors = vec.fit_transform(texts)
# NOTE(review): presumably persists the vectorizer state; the destination is not visible here.
vec.save()

HBox(children=(FloatProgress(value=0.0, max=79.0), HTML(value='')))




In [35]:
vectors[:2]

array([[ 1.1633935 ,  0.11483085,  0.501669  , ..., -0.07505671,
         0.12143123,  0.32040703],
       [ 0.66352725, -0.05653151,  0.22963758, ...,  0.0068558 ,
         0.06846575,  0.7916516 ]], dtype=float32)

In [36]:
vectors[:2].shape

(2, 768)

### Manually retrained RoBERTa text vectorizer

In [37]:
# Retrained variant switched to RoBERTa via model_type='roberta'.
# NOTE(review): short_name='RoBERTa' is the same value passed to the pretrained RoBERTa
# vectorizer; if save() derives its output location from short_name, one saved model
# may overwrite the other — confirm.
vec = TextOwnTrainedBERTVectorizer(model_type='roberta', short_name='RoBERTa', verbose=1)

In [38]:
# The recorded run shows two progress bars (max=158, then max=79).
# NOTE(review): the first two vectors displayed after this cell are identical to the
# ones printed for the pretrained RoBERTa vectorizer — verify that the retrained
# weights are actually used when producing the vectors.
vectors = vec.fit_transform(texts)
# NOTE(review): presumably persists the retrained model; the destination is not visible here.
vec.save()

HBox(children=(FloatProgress(value=0.0, max=158.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=79.0), HTML(value='')))




In [39]:
vectors[:2]

array([[ 1.1633935 ,  0.11483085,  0.501669  , ..., -0.07505671,
         0.12143123,  0.32040703],
       [ 0.66352725, -0.05653151,  0.22963758, ...,  0.0068558 ,
         0.06846575,  0.7916516 ]], dtype=float32)

In [40]:
vectors[:2].shape

(2, 768)