In [4]:
from base_model import BaseModel

import numpy as np
from typing import List
from gensim.models import KeyedVectors
import unicodedata



MODEL_DIR = "../../../glove/glove_100_3_polish.txt"
GLOVE_SIZE = 100


In [5]:
word2vec = KeyedVectors.load_word2vec_format(MODEL_DIR)

In [6]:
class GloveModel(BaseModel):

    def remove_polish_chars(self, word):
        """
        Copyright: Tymoteusz Makowski
        """
        unicodedata.normalize('NFKD', word).encode("ascii", "ignore").decode("ascii")

    def process(self, sentences: List[List[str]]) -> List[np.ndarray]:
        

        def get_embedding(word):
            try:
                return word2vec.get_vector(word)
            except KeyError:
                try:
                    return word2vec.get_vector(self.remove_polish_chars(word))
                except KeyError:
                    return np.zeros(100, dtype=np.float32)

        return [np.mean([get_embedding(word) for word in sentence]) for sentence in sentences]


if __name__ == "__main__":
    glove = GloveModel()
    glove.process(["mam na imie tomek".split(), "lubie pizze".split()])

In [7]:
glove = GloveModel()


In [8]:
glove.process(["mam na imie tomek".split(), "lubie pizze".split()])

[-0.010311005, -0.0038530447]

In [19]:
def remove_polish_chars(word):
        """
        Copyright: Tymoteusz Makowski
        """
        unicodedata.normalize('NFKD', word).encode("ascii", "ignore").decode("ascii")


def get_embedding(word):
            try:
                return word2vec.get_vector(word)
            except KeyError:
                try:
                    return word2vec.get_vector(remove_polish_chars(word))
                except KeyError:
                    return np.zeros(100, dtype=np.float32)

In [20]:
get_embedding("ala")

array([-0.364655, -0.615625,  0.088845,  0.177926,  0.641963, -0.215127,
       -0.263899,  0.569959, -0.245611, -1.323174,  0.173055, -0.507045,
        0.138067, -0.528148, -0.595162, -0.340827,  0.11991 ,  0.056864,
        0.339928,  0.023663, -0.898189,  0.212992,  0.15253 , -0.029365,
       -0.702257,  0.078425,  0.347781,  0.278559,  0.160578, -0.14816 ,
       -0.290856, -0.644811,  0.980013,  0.254869,  0.299113,  0.067625,
       -0.40612 , -0.065691,  0.805405,  0.261632,  0.367573,  0.506243,
        0.329231, -0.200104, -0.113384, -0.489792, -0.057236, -0.474307,
        0.033125,  0.43053 , -0.262311,  0.07528 ,  0.060128, -0.094774,
        0.22237 ,  0.181473,  0.464303, -0.118389,  0.755448,  0.629063,
        0.439528, -0.313537,  0.423948, -0.409659, -0.167523,  0.304791,
        0.070676, -0.137024,  0.343803, -0.144327, -0.160714,  0.10304 ,
       -0.376636, -0.668168, -0.708677, -0.09034 ,  0.111334,  0.299989,
        0.313365, -0.748744, -0.610146, -0.148986, 

In [21]:
sentence = "jestem tomek i lubie pizze".split()

In [30]:
np.mean(np.array([get_embedding(word) for word in sentence]), axis=0)

array([ 9.31363925e-02, -5.78222051e-02,  4.37389985e-02,  1.42157406e-01,
       -1.60184413e-01,  9.07424092e-02,  3.45121622e-01, -2.33384017e-02,
        3.82601991e-02, -1.46198153e+00, -2.56196797e-01, -3.00292611e-01,
        3.37863594e-01, -1.65835813e-01, -1.30983606e-01,  5.00338003e-02,
       -2.06394792e-01,  1.49282040e-02, -8.51607993e-02,  8.18345994e-02,
        2.45150998e-01, -1.13848999e-01,  1.09585999e-02,  3.29848588e-01,
       -3.66087966e-02, -5.14360005e-03,  1.05044004e-02,  1.21612594e-01,
       -5.63719980e-02,  1.53470799e-01, -5.66600240e-04, -2.36378789e-01,
        4.73800808e-01, -2.20481995e-02,  1.27048403e-01,  2.37543195e-01,
       -6.90371990e-02,  1.71159804e-01, -2.38770604e-01, -8.80642012e-02,
       -1.49504811e-01,  2.08794802e-01, -3.28110009e-02, -2.41627008e-01,
        9.99981984e-02, -9.51928049e-02,  1.41481608e-01,  1.97258994e-01,
       -1.09594800e-01, -6.86437935e-02, -2.75682025e-02,  6.60200045e-03,
        1.05365798e-01,  

In [23]:
[get_embedding(word) for word in sentence]

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       dtype=float32),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       dtype=float32),
 array([ 1.177360e-01,  2.948380e-01,  4.011440e-01,  7.