# Hate Speech Detector 2.0
---
**Text vectorization models**

1. Load lemmatized tweets from appropriate .csv.
2. Implement text- (sentence-)level vectorizers:
    1. TF-IDF text vectorizer
    2. pretrained FastText text vectorizer
    3. manually trained (unsupervised and supervised) FastText text vectorizer
    4. pretrained and manually retrained BERT text vectorizer
    5. pretrained and manually retrained RoBERTa text vectorizer
3. Visualize results with confusion matrices: one for each of the 7 hate-speech labels plus 1 overall.
4. Run some tests on several examples of lemmatized tweets for each vectorizer.

In [1]:
import numpy as np
import pandas as pd

from src.vectorizers.TextTFIDFVectorizer import TextTFIDFVectorizer
from src.vectorizers.TextPretrainedFTVectorizer import TextPretrainedFTVectorizer
from src.vectorizers.TextOwnTrainedFTVectorizer import TextOwnTrainedFTVectorizer
from src.vectorizers.TextPretrainedBERTVectorizer import TextPretrainedBERTVectorizer
from src.vectorizers.TextOwnTrainedBERTVectorizer import TextOwnTrainedBERTVectorizer
from src.dataframes.utils import combine_row_wisely
from src.constants import LABELS, LEMMAS_PATH, DUPLICATED_PATH

## Lemmatized tweets (with classes for FT)

In [2]:
# Read the lemmatized tweets and discard the 'id' column, then show the
# first two rows as the cell output.
texts = pd.read_csv(LEMMAS_PATH).drop(columns=['id'])
texts.head(2)

Unnamed: 0,lemmatized
0,w czwartek musieć poprawić sąd i trybunał
1,żale nałęcz i riposta macierewicz pan być w ko...


In [3]:
# Read the annotated data and keep only the columns named in LABELS,
# then show the first two rows as the cell output.
classes = pd.read_csv(DUPLICATED_PATH)[LABELS]
classes.head(2)

Unnamed: 0,wyzywanie,grożenie,wykluczanie,odczłowieczanie,poniżanie,stygmatyzacja,szantaż
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0


In [4]:
# Join the tweet texts with their label columns via the project helper;
# the recorded output shows one row per tweet: 'lemmatized' plus the labels.
# `combined` is only referenced by the (commented-out) supervised FastText fit.
combined = combine_row_wisely([texts, classes])
combined[:2]

Unnamed: 0,lemmatized,wyzywanie,grożenie,wykluczanie,odczłowieczanie,poniżanie,stygmatyzacja,szantaż
0,w czwartek musieć poprawić sąd i trybunał,0,0,0,0,0,0,0
1,żale nałęcz i riposta macierewicz pan być w ko...,0,0,0,0,0,0,0


## Vectorization models

### TF text vectorizer

In [5]:
# Term-frequency configuration of the TF-IDF vectorizer wrapper.
# NOTE(review): short_name='TF' presumably distinguishes the saved model
# from the default TF-IDF one — confirm in TextTFIDFVectorizer.
vec = TextTFIDFVectorizer(model_type='tf', short_name='TF')

In [6]:
# One-off fitting step, left commented out; the following cell calls
# vec.load() instead — presumably restoring what vec.save() persisted
# (TODO confirm). Uncomment to refit.
#vectors = vec.fit_transform(texts)
#vec.save()

In [9]:
# Load the TF vectorizer state, then transform the lemmatized tweets.
vec.load()
vectors = vec.transform(texts)

In [10]:
# Preview the first two TF vectors.
vectors[:2]

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.57

In [11]:
# Shape of the two-row preview; the recorded output is (2, 100).
vectors[:2].shape

(2, 100)

### TF-IDF text vectorizer

In [12]:
# Default-configured vectorizer.
# NOTE(review): the section heading implies the defaults select TF-IDF
# weighting — confirm in TextTFIDFVectorizer.
vec = TextTFIDFVectorizer()

In [13]:
# One-off fitting step, left commented out; the following cell calls
# vec.load() instead — presumably restoring what vec.save() persisted
# (TODO confirm). Uncomment to refit.
#vectors = vec.fit_transform(texts)
#vec.save()

In [16]:
# Load the TF-IDF vectorizer state, then transform the lemmatized tweets.
vec.load()
vectors = vec.transform(texts)

In [17]:
# Preview the first two TF-IDF vectors.
vectors[:2]

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.67504268, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.16

In [18]:
# Shape of the two-row preview; the recorded output is (2, 100).
vectors[:2].shape

(2, 100)

### Pretrained FastText text vectorizer

In [19]:
# Default-configured vectorizer backed by a pretrained FastText model.
vec = TextPretrainedFTVectorizer()

In [20]:
# One-off fitting step, left commented out; the following cell calls
# vec.load() instead — presumably restoring what vec.save() persisted
# (TODO confirm). Uncomment to refit.
#vectors = vec.fit_transform(texts)
#vec.save()



In [23]:
# Load the pretrained FastText vectorizer state, then transform the tweets.
vec.load()
vectors = vec.transform(texts)



In [24]:
# Preview the first two pretrained-FastText vectors.
vectors[:2]

array([[ 0.02361084,  0.00204726, -0.0719777 ,  0.01944675,  0.01211925,
         0.01201868, -0.04772362,  0.05569402,  0.08455206, -0.03439712,
        -0.02759823, -0.01871043,  0.12676221, -0.09190037,  0.0570388 ,
         0.02830316,  0.09739498, -0.02951306,  0.06637428, -0.03983064,
        -0.03086204, -0.01965543,  0.07382791,  0.04622262, -0.04390278,
         0.11121772,  0.00314307, -0.08318941,  0.11192329,  0.03691766,
         0.01806635,  0.05060876,  0.11110332, -0.02886265,  0.04345432,
        -0.06407159,  0.08301959,  0.0164693 ,  0.08296771,  0.05840302,
         0.01965283,  0.10445993, -0.13008796, -0.1792348 ,  0.13693285,
         0.08974626,  0.07586031,  0.03370782, -0.05875361, -0.00616176,
         0.06963425, -0.06344458, -0.03752139, -0.0128589 , -0.09305156,
        -0.01488448,  0.0224688 , -0.11307816, -0.07442889, -0.02304844,
        -0.06240419, -0.04357524,  0.02506868,  0.11605044, -0.05153962,
        -0.00136478, -0.01013218, -0.06065181, -0.0

In [25]:
# Shape of the two-row preview; the recorded output is (2, 100).
vectors[:2].shape

(2, 100)

### Manually unsupervisedly trained FastText text vectorizer

In [5]:
# Default-configured self-trained FastText vectorizer.
# NOTE(review): per the section heading the defaults select unsupervised
# training — confirm in TextOwnTrainedFTVectorizer.
vec = TextOwnTrainedFTVectorizer()

In [6]:
# One-off training step, left commented out; the following cell calls
# vec.load() instead — presumably restoring what vec.save() persisted
# (TODO confirm). Uncomment to retrain.
#vectors = vec.fit_transform(texts)
#vec.save()

In [9]:
# Load the self-trained FastText vectorizer state, then transform the tweets.
vec.load()
vectors = vec.transform(texts)



In [10]:
# Preview the first two vectors from the unsupervised FastText model.
vectors[:2]

array([[-5.22933318e-04,  8.18915199e-04, -8.73970333e-04,
        -6.64916297e-04, -6.90489658e-04,  3.35097866e-04,
         4.41892480e-04, -7.97317130e-04,  1.33618538e-03,
         9.92082409e-04,  2.20413029e-04, -3.30409384e-05,
         1.67179242e-04,  1.74681860e-04,  9.80488025e-04,
         7.07365398e-04, -6.45703985e-04,  1.57014787e-04,
         1.09059911e-04, -8.93630146e-04,  2.23706535e-04,
         2.87789182e-04, -8.66617658e-04,  6.81798556e-04,
        -4.16020164e-04,  3.77403223e-04,  2.63652997e-04,
        -1.21999870e-03, -6.99133030e-04, -5.43784350e-04,
        -5.15011256e-04,  4.25449543e-05,  1.73010194e-04,
        -1.18115044e-03, -6.06657239e-04, -5.98550891e-04,
        -6.45079999e-04, -3.32209631e-04, -8.00077163e-04,
         3.11277137e-04,  1.45480677e-04,  5.63161739e-04,
         7.67733261e-04,  5.92063472e-04, -2.66509654e-04,
        -4.58210154e-04,  5.73791913e-04,  3.60232312e-04,
         1.97092042e-04,  7.77677167e-04, -1.41555374e-0

In [11]:
# Shape of the two-row preview; the recorded output is (2, 300).
vectors[:2].shape

(2, 300)

### Manually supervisedly trained FastText text vectorizer

In [12]:
# Supervised configuration (model_type='s') of the self-trained FastText
# vectorizer.
# NOTE(review): short_name='super' presumably keeps its saved model apart
# from the unsupervised one — confirm in TextOwnTrainedFTVectorizer.
vec = TextOwnTrainedFTVectorizer(model_type='s', short_name='super')

In [13]:
# One-off training step, left commented out. Unlike the other sections it
# is fed `combined` (texts plus label columns), while the transform cell
# that follows passes `texts` only.
# NOTE(review): confirm fit_transform/transform accept these two different
# input layouts.
#vectors = vec.fit_transform(combined)
#vec.save()

In [16]:
# Load the supervised FastText vectorizer state, then transform the tweets
# (labels are not passed here).
vec.load()
vectors = vec.transform(texts)



In [17]:
# Preview the first two vectors from the supervised FastText model.
vectors[:2]

array([[-5.59509099e-02, -2.09344868e-02, -6.59329146e-02,
         4.30810032e-03,  2.11720373e-02, -3.45987976e-02,
        -5.21129705e-02, -1.01960711e-02,  3.11876591e-02,
         1.00542158e-01,  1.10454569e-02,  5.86889125e-02,
         1.07031204e-02,  2.51459889e-02,  2.51590833e-02,
         3.23834568e-02, -6.09717965e-02,  8.47988110e-03,
        -3.64774652e-02, -4.66671698e-02, -1.44341663e-02,
        -6.65498972e-02, -7.27060661e-02,  1.93975773e-02,
        -5.08875400e-02,  3.98128591e-02,  2.43356749e-02,
        -4.74352762e-02, -7.63674313e-03, -4.60581370e-02,
         2.37436946e-02,  9.10943281e-03, -1.09577123e-02,
        -5.88044636e-02, -3.56189981e-02, -4.80895955e-03,
         2.44870549e-03,  1.21317003e-02,  1.68787148e-02,
         7.92089924e-02, -7.48452544e-03, -6.90491945e-02,
        -5.30759292e-03,  1.42640555e-02, -2.53610946e-02,
        -5.29255020e-03, -3.43370736e-02,  5.08870035e-02,
         2.48513315e-02,  1.71067212e-02, -2.23500337e-0

In [18]:
# Shape of the two-row preview; the recorded output is (2, 300).
vectors[:2].shape

(2, 300)

### Pretrained BERT text vectorizer

In [3]:
# Pretrained BERT vectorizer with default model settings.
# NOTE(review): verbose=1 presumably enables the progress bar seen in the
# transform cell's output — confirm.
vec = TextPretrainedBERTVectorizer(verbose=1)

In [4]:
# One-off fitting step, left commented out; the following cell calls
# vec.load() instead — presumably restoring what vec.save() persisted
# (TODO confirm). Uncomment to refit.
#vectors = vec.fit_transform(texts)
#vec.save()

In [5]:
# Load the pretrained BERT vectorizer state, then transform the tweets.
vec.load()
vectors = vec.transform(texts)

HBox(children=(FloatProgress(value=0.0, max=79.0), HTML(value='')))




In [6]:
# Preview the first two pretrained-BERT vectors (float32 in the recorded output).
vectors[:2]

array([[ 0.02960951,  0.31538028,  1.3917639 , ...,  0.1266931 ,
        -0.10536456,  0.4475092 ],
       [-0.22364973,  0.20266272,  1.2020434 , ..., -0.09781446,
        -0.05888037,  0.7153386 ]], dtype=float32)

In [7]:
# Shape of the two-row preview; the recorded output is (2, 768).
vectors[:2].shape

(2, 768)

### Manually retrained BERT text vectorizer

In [3]:
# Manually retrained BERT vectorizer with default model settings.
# NOTE(review): verbose=1 presumably enables the progress bar seen in the
# transform cell's output — confirm.
vec = TextOwnTrainedBERTVectorizer(verbose=1)

In [4]:
# One-off retraining step, left commented out; the following cell calls
# vec.load() instead — presumably restoring what vec.save() persisted
# (TODO confirm). Uncomment to retrain.
#vectors = vec.fit_transform(texts)
#vec.save()

In [5]:
# Load the retrained BERT vectorizer state, then transform the tweets.
# NOTE(review): the vectors recorded for this section are identical, digit
# for digit, to those recorded in the "Pretrained BERT" section. Verify that
# load() really restores the retrained weights and not the pretrained ones
# (or that the retraining step was actually run and saved).
vec.load()
vectors = vec.transform(texts)

HBox(children=(FloatProgress(value=0.0, max=79.0), HTML(value='')))




In [6]:
# Preview the first two retrained-BERT vectors.
vectors[:2]

array([[ 0.02960951,  0.31538028,  1.3917639 , ...,  0.1266931 ,
        -0.10536456,  0.4475092 ],
       [-0.22364973,  0.20266272,  1.2020434 , ..., -0.09781446,
        -0.05888037,  0.7153386 ]], dtype=float32)

In [7]:
# Shape of the two-row preview; the recorded output is (2, 768).
vectors[:2].shape

(2, 768)

### Pretrained RoBERTa text vectorizer

In [8]:
# Same pretrained vectorizer class as for BERT, switched to RoBERTa via
# model_type='roberta'.
# NOTE(review): short_name='RoBERTa' presumably separates its saved
# artifacts from the BERT ones — confirm.
vec = TextPretrainedBERTVectorizer(model_type='roberta', short_name='RoBERTa', verbose=1)

In [9]:
# One-off fitting step, left commented out; the following cell calls
# vec.load() instead — presumably restoring what vec.save() persisted
# (TODO confirm). Uncomment to refit.
#vectors = vec.fit_transform(texts)
#vec.save()

In [10]:
# Load the pretrained RoBERTa vectorizer state, then transform the tweets.
vec.load()
vectors = vec.transform(texts)

HBox(children=(FloatProgress(value=0.0, max=79.0), HTML(value='')))




In [11]:
# Preview the first two pretrained-RoBERTa vectors.
vectors[:2]

array([[ 1.0318856 ,  0.24790911,  0.2972364 , ..., -0.07765526,
         0.13350219,  0.26021168],
       [ 0.8310168 , -0.0420981 ,  0.30857626, ..., -0.36685878,
         0.238936  ,  0.9289238 ]], dtype=float32)

In [12]:
# Shape of the two-row preview; the recorded output is (2, 768).
vectors[:2].shape

(2, 768)

### Manually retrained RoBERTa text vectorizer

In [17]:
# Retrained vectorizer class switched to RoBERTa via model_type='roberta'.
# NOTE(review): short_name='RoBERTa' is the same value the pretrained
# RoBERTa vectorizer is constructed with. If short_name feeds into the
# save/load path, the two may share artifacts — confirm in
# TextOwnTrainedBERTVectorizer.
vec = TextOwnTrainedBERTVectorizer(model_type='roberta', short_name='RoBERTa', verbose=1)

In [18]:
# One-off retraining step, left commented out; the following cell calls
# vec.load() instead — presumably restoring what vec.save() persisted
# (TODO confirm). Uncomment to retrain.
#vectors = vec.fit_transform(texts)
#vec.save()

In [19]:
# Load the retrained RoBERTa vectorizer state, then transform the tweets.
# NOTE(review): the vectors recorded for this section are identical, digit
# for digit, to those recorded in the "Pretrained RoBERTa" section. Verify
# that load() really restores the retrained weights and not the pretrained
# ones (or that the retraining step was actually run and saved).
vec.load()
vectors = vec.transform(texts)

HBox(children=(FloatProgress(value=0.0, max=79.0), HTML(value='')))




In [20]:
# Preview the first two retrained-RoBERTa vectors.
vectors[:2]

array([[ 1.0318856 ,  0.24790911,  0.2972364 , ..., -0.07765526,
         0.13350219,  0.26021168],
       [ 0.8310168 , -0.0420981 ,  0.30857626, ..., -0.36685878,
         0.238936  ,  0.9289238 ]], dtype=float32)

In [21]:
# Shape of the two-row preview; the recorded output is (2, 768).
vectors[:2].shape

(2, 768)