# Hate Speech Detector 2.0
---
**Character and word vectorization models**

1. Load the original and lemmatized tweets from their respective .csv files.
2. Implement character- and word-level vectorizers:
    1. character vectorizer
    2. simple BoW word vectorizer
    3. pretrained Word2Vec word vectorizer
    4. manually trained Word2Vec word vectorizer

In [1]:
import numpy as np
import pandas as pd

from src.vectorizers.CharacterVectorizer import CharacterVectorizer
from src.vectorizers.WordSimpleBoWVectorizer import WordSimpleBoWVectorizer
from src.vectorizers.WordPretrainedVectorizer import WordPretrainedVectorizer
from src.vectorizers.WordOwnTrainedVectorizer import WordOwnTrainedVectorizer
from src.constants import DUPLICATED_PATH, LEMMAS_PATH

## Sanitized and lemmatized tweets

In [2]:
# Read the CSV at DUPLICATED_PATH, keep only its 'tweet' column as a NumPy
# array, and preview the first two entries.
texts = pd.read_csv(DUPLICATED_PATH)
texts = texts['tweet'].values
texts[:2]

array(['w czwartek muszę poprawić sądy i trybunały',
       'Żale Nałęcza i riposta Macierewicza: Pan był w kompartii, czy ma prawo wy\xadgła\xadszać takie sądy? | niezalezna.pl '],
      dtype=object)

In [2]:
# Read the CSV at LEMMAS_PATH, keep only its 'lemmatized' column as a NumPy
# array, and preview the first two entries.
texts_lemmas = pd.read_csv(LEMMAS_PATH)
texts_lemmas = texts_lemmas['lemmatized'].values
texts_lemmas[:2]

array(['w czwartek musieć poprawić sąd i trybunał',
       'żale nałęcz i riposta macierewicz pan być w kompartia czy mieć prawo wyżgłaćszać taki sąd niezalezna.pl'],
      dtype=object)

## Vectorization models

### Character vectorizer

In [4]:
# Instantiate the character vectorizer with its default constructor arguments.
vec = CharacterVectorizer()

In [5]:
# Fit the vectorizer on the lemmatized tweets and vectorize them in one call.
vectors = vec.fit_transform(texts_lemmas)
# NOTE(review): save() presumably persists the fitted state so that load()
# can restore it later — confirm in CharacterVectorizer.
vec.save()



In [6]:
# NOTE(review): load() presumably restores the state written by save() — confirm.
vec.load()
# Vectorize the lemmatized tweets again; this overwrites the earlier `vectors`.
vectors = vec.transform(texts_lemmas)

In [7]:
# Preview the vectors produced for the first two tweets.
vectors[:2]

array([[0.12 , 0.033, 0.1  , 0.123, 0.12 , 0.098, 0.115, 0.117, 0.102,
        0.108, 0.033, 0.11 , 0.118, 0.116, 0.106, 0.102, 0.264, 0.033,
        0.113, 0.112, 0.113, 0.115, 0.098, 0.12 , 0.106, 0.264, 0.033,
        0.116, 0.262, 0.101, 0.033, 0.106, 0.033, 0.117, 0.115, 0.122,
        0.099, 0.118, 0.111, 0.098, 0.323, 0.   , 0.   , 0.   , 0.   ,
        0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
        0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
        0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
        0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
        0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
        0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
        0.   ],
       [0.381, 0.098, 0.109, 0.102, 0.033, 0.111, 0.098, 0.323, 0.282,
        0.1  , 0.123, 0.033, 0.106, 0.033, 0.115, 0.106, 0.113, 0.112,
        0.116, 0.117, 0.098, 0.033, 0.11 , 0.098, 0.1  , 0.10

In [8]:
# Shape of the two-row preview; the recorded output is (2, 100).
vectors[:2].shape

(2, 100)

### Simple BoW word vectorizer

In [9]:
# Instantiate the simple BoW word vectorizer with its default constructor
# arguments (rebinds `vec`, replacing the character vectorizer).
vec = WordSimpleBoWVectorizer()

In [10]:
# Fit the vectorizer on the lemmatized tweets and vectorize them in one call.
vectors = vec.fit_transform(texts_lemmas)
# NOTE(review): save() presumably persists the fitted state so that load()
# can restore it later — confirm in WordSimpleBoWVectorizer.
vec.save()

HBox(children=(FloatProgress(value=0.0, max=15791.0), HTML(value='')))




In [11]:
# NOTE(review): load() presumably restores the state written by save() — confirm.
vec.load()
# Vectorize the lemmatized tweets again; this overwrites the earlier `vectors`.
vectors = vec.transform(texts_lemmas)

In [12]:
# Preview the vectors produced for the first two tweets.
vectors[:2]

array([[5.00926714e-05, 1.00185343e-04, 1.50278014e-04, 2.00370686e-04,
        2.50463357e-04, 3.00556029e-04, 3.50648700e-04, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.000000

In [13]:
# Shape of the two-row preview; the recorded output is (2, 100).
vectors[:2].shape

(2, 100)

### Pretrained Word2Vec word vectorizer

#### CBoW

In [14]:
# Instantiate the pretrained word vectorizer with its default constructor
# arguments; this notebook section uses the defaults for the CBoW variant.
vec = WordPretrainedVectorizer()

In [15]:
# Fit the vectorizer on the lemmatized tweets and vectorize them in one call.
vectors = vec.fit_transform(texts_lemmas)
# NOTE(review): save() presumably persists the fitted state so that load()
# can restore it later — confirm in WordPretrainedVectorizer.
vec.save()

In [16]:
# NOTE(review): load() presumably restores the state written by save() — confirm.
vec.load()
# Vectorize the lemmatized tweets again; this overwrites the earlier `vectors`.
vectors = vec.transform(texts_lemmas)

In [17]:
# Preview the vectors produced for the first two tweets.
vectors[:2]

array([[-2.73642337e+00, -3.16107280e+00,  1.53068082e+01,
         2.34499172e+01,  3.72586754e+00,  9.62982203e+00,
         1.44165031e+01,  9.67240969e+00, -5.02584845e+00,
         3.33383573e-02, -1.56756365e+00,  7.07226849e+00,
        -1.68556889e+00,  3.05178004e+00, -2.38518098e-01,
         6.61891770e+00, -5.13522209e-01, -1.15926978e+00,
         2.73793169e+00,  1.52650151e+00,  1.65862322e+00,
         2.23981401e+00,  2.04474429e+00,  4.99797179e+00,
        -2.98040781e+00, -8.55365474e-01,  2.07447397e+00,
         1.80908176e+00, -1.92068578e+00, -9.01087220e-01,
         2.30057813e+00,  8.68159613e+00, -3.52260568e+00,
         5.82729957e+00, -1.43409736e+00,  6.91335186e-01,
        -3.26171875e+00,  1.70125692e+00,  1.45427542e-02,
         1.00037745e+00, -2.41598377e+00, -2.62574468e+00,
         5.11525895e-01,  1.57580065e+00,  9.44868484e-01,
        -9.60086695e-01, -5.23668434e-01,  2.02216312e+00,
         3.09540854e-01, -1.93631376e+00, -1.76947671e+0

In [18]:
# Shape of the two-row preview; the recorded output is (2, 100).
vectors[:2].shape

(2, 100)

#### SkipGram

In [19]:
# Same class as the CBoW section, but constructed with model_type='skipg' and
# short_name='SkipGram' instead of the constructor defaults.
vec = WordPretrainedVectorizer(model_type='skipg', short_name='SkipGram')

In [20]:
# Fit the vectorizer on the lemmatized tweets and vectorize them in one call.
vectors = vec.fit_transform(texts_lemmas)
# NOTE(review): save() presumably persists the fitted state so that load()
# can restore it later — confirm in WordPretrainedVectorizer.
vec.save()

In [21]:
# NOTE(review): load() presumably restores the state written by save() — confirm.
vec.load()
# Vectorize the lemmatized tweets again; this overwrites the earlier `vectors`.
vectors = vec.transform(texts_lemmas)

In [22]:
# Preview the vectors produced for the first two tweets.
vectors[:2]

array([[-1.91233314, -0.99749175, -0.19720769, -0.58724424, -1.56671805,
        -2.48956985,  0.9436126 ,  0.81045999,  1.11740016,  0.57724825,
        -0.11768081, -0.01455227,  0.0289995 ,  0.22767953,  0.52767133,
         0.05393096, -0.21253961,  0.07796972,  0.01667675, -0.01127327,
         0.14539806,  0.05456427,  0.46563494, -0.25340685,  0.15811815,
         0.3012914 ,  0.42581784,  0.48258321,  0.17247243, -0.18116713,
         0.53229956, -0.1481135 , -0.08884873, -0.26337746, -0.466543  ,
         0.09848219,  0.12317307, -0.34668819,  0.05143846, -0.48277303,
        -0.33915145,  0.09456529,  0.27298907,  0.18593807, -0.13411911,
        -0.08031714,  0.4241037 ,  0.11593739, -0.07027649, -0.10758047,
        -0.17061629, -0.24493884,  0.05219293,  0.08356102,  0.25988212,
        -0.14557204,  0.37500627, -0.28622569,  0.04693436,  0.16641836,
         0.00495267, -0.19294736,  0.17235411,  0.27998443,  0.42496389,
         0.17888331,  0.08552632, -0.19955849,  0.0

In [23]:
# Shape of the two-row preview; the recorded output is (2, 100).
vectors[:2].shape

(2, 100)

### Manually trained Word2Vec word vectorizer

#### CBoW

In [3]:
# Instantiate the own-trained word vectorizer with its default constructor
# arguments; this notebook section uses the defaults for the CBoW variant.
vec = WordOwnTrainedVectorizer()

In [5]:
# NOTE(review): no fit_transform()/save() cell is shown for this vectorizer, so
# load() presumably relies on state saved by an earlier run — confirm.
vec.load()
# Vectorize the lemmatized tweets with the loaded vectorizer.
vectors = vec.transform(texts_lemmas)

In [6]:
# Preview the vectors produced for the first two tweets.
vectors[:2]

array([[-4.67275190e-01,  1.33451838e-02,  7.16628595e-01,
         1.68201124e-01,  4.34761619e-01, -2.41831024e-01,
         2.71594397e-01,  1.58673249e-01, -5.40265505e-02,
        -1.76863601e-01, -1.76800497e-02,  1.34360303e-01,
        -8.86824947e-02, -1.24174652e-01, -3.96129353e-02,
        -3.55980722e-02, -7.33559293e-03,  3.79158617e-02,
        -4.96520857e-01,  5.38011259e-01, -5.21339007e-02,
        -3.57621324e-02,  2.02253397e-01, -4.81970727e-02,
        -2.77386185e-02, -2.48352264e-02, -6.26500319e-02,
        -1.62724198e-02,  2.88070411e-02, -5.60104938e-03,
        -2.74778120e-02,  1.14638083e-02, -1.82762565e-03,
         4.46378489e-02,  8.27367383e-02, -1.20742675e-01,
        -1.43164877e-02,  1.04700172e-02, -9.90827045e-03,
         7.30550181e-03,  1.12310258e-02,  1.44159504e-02,
        -6.00602735e-02, -7.70412955e-02, -3.28102739e-02,
         1.62861636e-02,  2.82508283e-02, -1.34108828e-02,
        -7.63525703e-03,  1.37527500e-02,  4.89449903e-0

In [7]:
# Shape of the two-row preview; the recorded output is (2, 100).
vectors[:2].shape

(2, 100)

#### SkipGram

In [8]:
# Same class as the CBoW section, but constructed with model_type='skipg' and
# short_name='SkipGram' instead of the constructor defaults.
vec = WordOwnTrainedVectorizer(model_type='skipg', short_name='SkipGram')

In [10]:
# NOTE(review): no fit_transform()/save() cell is shown for this vectorizer, so
# load() presumably relies on state saved by an earlier run — confirm.
vec.load()
# Vectorize the lemmatized tweets with the loaded vectorizer.
vectors = vec.transform(texts_lemmas)

In [11]:
# Preview the vectors produced for the first two tweets.
vectors[:2]

array([[-5.06693474e-01,  8.82547425e-02,  8.47404921e-01,
         2.12177093e-01,  4.76715256e-01, -1.82331741e-01,
         3.76418701e-01,  3.43945296e-02, -1.12337108e-01,
        -2.16374251e-01,  4.74382106e-02,  1.46697628e-01,
        -7.44319935e-02, -1.50203490e-01, -5.72400707e-02,
        -2.59965194e-02, -4.54033970e-03, -1.88472458e-02,
        -1.56251774e-02, -1.14747204e-02, -1.21565673e-02,
        -1.63710201e-02, -2.50101927e-01,  4.31523304e-02,
         2.10678759e-01,  4.81426386e-02,  2.49087114e-01,
         1.71168347e-01,  5.11504259e-03,  3.50123000e-02,
        -8.10051196e-02, -3.78530643e-02, -6.01773047e-03,
        -1.30215161e-03, -5.37794207e-04, -5.76822841e-02,
         6.42614050e-02, -1.64149033e-02,  4.55223402e-03,
        -5.19071068e-03, -4.74112250e-03, -1.53665341e-02,
         7.69191435e-02, -6.21476901e-02, -1.18282416e-02,
        -2.18451857e-02,  1.49203991e-03,  1.90085407e-02,
         2.68984797e-03, -1.22495605e-03,  9.69028406e-0

In [12]:
# Shape of the two-row preview; the recorded output is (2, 100).
vectors[:2].shape

(2, 100)