# Hate Speech Detector 2.0
---
**Character and word vectorization models**

1. Load the lemmatized tweets from the appropriate .csv file (`LEMMAS_PATH`).
2. Apply the character- and word-level vectorizers (implemented in `src.vectorizers`):
    1. character vectorizer
    2. simple BoW word vectorizer
    3. pretrained Word2Vec word vectorizer
    4. manually trained Word2Vec word vectorizer

In [1]:
import numpy as np
import pandas as pd

from src.vectorizers.CharacterVectorizer import CharacterVectorizer
from src.vectorizers.WordSimpleBoWVectorizer import WordSimpleBoWVectorizer
from src.vectorizers.WordPretrainedVectorizer import WordPretrainedVectorizer
from src.vectorizers.WordOwnTrainedVectorizer import WordOwnTrainedVectorizer
from src.constants import LEMMAS_PATH

## Lemmatized tweets

In [2]:
# Read the CSV at LEMMAS_PATH and keep only its 'lemmatized' column as a NumPy array.
texts = pd.read_csv(LEMMAS_PATH)['lemmatized'].values
# Display the first two texts as a quick sanity check.
texts[:2]

array(['w czwartek musieć poprawić sąd i trybunał',
       'żale nałęcz i riposta macierewicz pan być w kompartia czy mieć prawo wyżgłaćszać taki sąd niezalezna.pl'],
      dtype=object)

## Vectorization models

### Character vectorizer

In [3]:
# Create the character-level vectorizer with its default arguments.
vec = CharacterVectorizer()

In [4]:
# Disabled one-off step: fit the vectorizer on `texts` and save it.
# Left commented out; the following cell calls vec.load() instead
# (presumably restoring what save() wrote - TODO confirm).
#vectors = vec.fit_transform(texts)
#vec.save()

In [5]:
# Load the vectorizer's stored state instead of refitting it
# (presumably produced by an earlier fit_transform()/save() run - TODO confirm),
# then vectorize every text.
vec.load()
vectors = vec.transform(texts)



In [6]:
# Inspect the first two rows. In the recorded output the first row has 41
# non-zero entries (the first text is 41 characters long) followed by zeros,
# so rows appear to hold one value per character, zero-padded to a fixed
# length - TODO confirm against CharacterVectorizer.
vectors[:2]

array([[0.12 , 0.033, 0.1  , 0.123, 0.12 , 0.098, 0.115, 0.117, 0.102,
        0.108, 0.033, 0.11 , 0.118, 0.116, 0.106, 0.102, 0.264, 0.033,
        0.113, 0.112, 0.113, 0.115, 0.098, 0.12 , 0.106, 0.264, 0.033,
        0.116, 0.262, 0.101, 0.033, 0.106, 0.033, 0.117, 0.115, 0.122,
        0.099, 0.118, 0.111, 0.098, 0.323, 0.   , 0.   , 0.   , 0.   ,
        0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
        0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
        0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
        0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
        0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
        0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
        0.   ],
       [0.381, 0.098, 0.109, 0.102, 0.033, 0.111, 0.098, 0.323, 0.282,
        0.1  , 0.123, 0.033, 0.106, 0.033, 0.115, 0.106, 0.113, 0.112,
        0.116, 0.117, 0.098, 0.033, 0.11 , 0.098, 0.1  , 0.10

In [7]:
# Shape of the two-row preview; the recorded output is (2, 100), i.e. 100 values per text.
vectors[:2].shape

(2, 100)

### Simple BoW word vectorizer

In [8]:
# Create the simple BoW word vectorizer with its default arguments (rebinds `vec`).
vec = WordSimpleBoWVectorizer()

In [9]:
# Disabled one-off step: fit the BoW vectorizer on `texts` and save it.
# Left commented out; the following cell calls vec.load() instead
# (presumably restoring what save() wrote - TODO confirm).
#vectors = vec.fit_transform(texts)
#vec.save()

In [10]:
# Load the BoW vectorizer's stored state rather than refitting it
# (presumably written by an earlier save() - TODO confirm), then transform all texts.
vec.load()
vectors = vec.transform(texts)

In [11]:
# Inspect the first two rows. In the recorded output the first row holds seven
# non-zero values (the first text has seven words), evenly spaced multiples of
# ~5.0e-05, followed by zeros. NOTE(review): this looks like scaled word indices
# padded to a fixed length rather than word counts - confirm against
# WordSimpleBoWVectorizer.
vectors[:2]

array([[5.00926714e-05, 1.00185343e-04, 1.50278014e-04, 2.00370686e-04,
        2.50463357e-04, 3.00556029e-04, 3.50648700e-04, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.000000

In [12]:
# Shape of the two-row preview; the recorded output is (2, 100), i.e. 100 values per text.
vectors[:2].shape

(2, 100)

### Pretrained Word2Vec word vectorizer

#### CBoW

In [13]:
# Create the pretrained Word2Vec vectorizer with its default arguments;
# per the section heading this is the CBoW variant.
vec = WordPretrainedVectorizer()

In [14]:
# Disabled one-off step: fit the pretrained-embedding vectorizer on `texts` and save it.
# Left commented out; the following cell calls vec.load() instead
# (presumably restoring what save() wrote - TODO confirm).
#vectors = vec.fit_transform(texts)
#vec.save()

In [15]:
# Load the vectorizer's stored state rather than refitting it
# (presumably written by an earlier save() - TODO confirm), then transform all texts.
vec.load()
vectors = vec.transform(texts)

In [16]:
# Inspect the first two texts' matrices. The recorded output contains all-zero
# rows both at the end of a matrix and ahead of non-zero rows - presumably
# padding and words without a pretrained embedding, respectively; TODO confirm.
vectors[:2]

array([[[ 0.195236  ,  3.22168589, -2.17071509, ..., -1.27675104,
          1.71531999,  1.26954699],
        [-1.43476999, -1.61899602, -2.33411908, ..., -1.52191603,
          1.09365404,  0.91099298],
        [ 1.28305697,  2.59738708, -1.24323595, ..., -0.89167303,
         -1.94784904, -0.34189001],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 1.88104498, -2.75636601,  0.84484601, ..., -0.24384999,
          1.36423194, -0.43391201],
        [ 0.51864803,  1.62978899, -0.75086701, ..., -0.242217  ,
          0.124918  ,  1.22108603],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  

In [17]:
# Shape of the two-text preview; the recorded output is (2, 100, 100) -
# presumably (texts, token slots, embedding size); TODO confirm.
vectors[:2].shape

(2, 100, 100)

#### SkipGram

In [18]:
# Create the pretrained Word2Vec vectorizer for the skip-gram variant
# (model_type='skipg'); short_name is presumably a label for this
# configuration - TODO confirm how WordPretrainedVectorizer uses it.
vec = WordPretrainedVectorizer(model_type='skipg', short_name='SkipGram')

In [19]:
# Disabled one-off step: fit the skip-gram pretrained vectorizer on `texts` and save it.
# Left commented out; the following cell calls vec.load() instead
# (presumably restoring what save() wrote - TODO confirm).
#vectors = vec.fit_transform(texts)
#vec.save()

In [20]:
# Load the vectorizer's stored state rather than refitting it
# (presumably written by an earlier save() - TODO confirm), then transform all texts.
vec.load()
vectors = vec.transform(texts)

In [21]:
# Inspect the first two texts' matrices. As in the recorded output, all-zero
# rows occur both at the end of a matrix and ahead of non-zero rows -
# presumably padding and words without a pretrained embedding; TODO confirm.
vectors[:2]

array([[[-0.084514  , -0.07819   , -0.25412199, ...,  0.121503  ,
         -0.17207   , -0.32889199],
        [ 0.27118301, -0.30184001, -0.29915899, ...,  0.26894701,
          0.02697   , -0.25220001],
        [-0.150782  , -0.20196301,  0.085562  , ...,  0.106482  ,
         -0.362986  , -0.113698  ],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.09879   , -0.26662499,  0.010122  , ...,  0.26540601,
          0.39098099,  0.47208601],
        [ 0.013882  , -0.068228  , -0.03464   , ...,  0.097421  ,
         -0.18700901, -0.126609  ],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  

In [22]:
# Shape of the two-text preview; the recorded output is (2, 100, 100) -
# presumably (texts, token slots, embedding size); TODO confirm.
vectors[:2].shape

(2, 100, 100)

### Manually trained Word2Vec word vectorizer

#### CBoW

In [23]:
# Create the own-trained Word2Vec vectorizer with its default arguments;
# per the section heading this is the CBoW variant.
vec = WordOwnTrainedVectorizer()

In [24]:
# Disabled one-off step: fit (i.e. train, per the section title) the vectorizer
# on `texts` and save it. Left commented out; the following cell calls
# vec.load() instead (presumably restoring what save() wrote - TODO confirm).
#vectors = vec.fit_transform(texts)
#vec.save()

In [25]:
# Load the vectorizer's stored state rather than retraining it
# (presumably written by an earlier save() - TODO confirm), then transform all texts.
vec.load()
vectors = vec.transform(texts)

In [26]:
# Inspect the first two texts' matrices. In the recorded output only the first
# row of the first text is non-zero among the rows shown, so several leading
# words apparently have no vector in the own-trained model.
# NOTE(review): confirm whether that is expected (e.g. a vocabulary cutoff).
vectors[:2]

array([[[ 0.2443385 ,  0.05622153, -0.08155499, ..., -0.21032465,
          0.18424031,  0.02308981],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.04889179,  0.12211271,  0.09825716, ..., -0.19097608,
         -0.02399425,  0.09755542],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  

In [27]:
# Shape of the two-text preview; the recorded output is (2, 100, 100) -
# presumably (texts, token slots, embedding size); TODO confirm.
vectors[:2].shape

(2, 100, 100)

#### SkipGram

In [28]:
# Create the own-trained Word2Vec vectorizer for the skip-gram variant
# (model_type='skipg'); short_name is presumably a label for this
# configuration - TODO confirm how WordOwnTrainedVectorizer uses it.
vec = WordOwnTrainedVectorizer(model_type='skipg', short_name='SkipGram')

In [29]:
# Disabled one-off step: fit (i.e. train, per the section title) the skip-gram
# vectorizer on `texts` and save it. Left commented out; the following cell calls
# vec.load() instead (presumably restoring what save() wrote - TODO confirm).
#vectors = vec.fit_transform(texts)
#vec.save()

In [30]:
# Load the vectorizer's stored state rather than retraining it
# (presumably written by an earlier save() - TODO confirm), then transform all texts.
vec.load()
vectors = vec.transform(texts)

In [31]:
# Inspect the first two texts' matrices. The recorded output shows the same
# zero/non-zero row pattern as the own-trained CBoW preview, which suggests the
# same words lack vectors in both own-trained models - TODO confirm.
vectors[:2]

array([[[ 0.01029263, -0.00768218,  0.03274252, ..., -0.00079988,
          0.05054524,  0.09930298],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.12434429,  0.10393275,  0.08005554, ..., -0.02417017,
         -0.0043561 ,  0.23059891],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  

In [32]:
# Shape of the two-text preview; the recorded output is (2, 100, 100) -
# presumably (texts, token slots, embedding size); TODO confirm.
vectors[:2].shape

(2, 100, 100)