#### Определения

In [None]:
!pip install gensim
!pip install compress-fasttext

[0mCollecting compress-fasttext
  Downloading compress-fasttext-0.1.3.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: compress-fasttext
  Building wheel for compress-fasttext (setup.py) ... [?25ldone
[?25h  Created wheel for compress-fasttext: filename=compress_fasttext-0.1.3-py3-none-any.whl size=14583 sha256=04cbc4dea0a926fb954d9155c32073c2cfb5b3e7ac7e5a974b6abefce065523a
  Stored in directory: /home/m_apishev/.cache/pip/wheels/c7/63/9f/39db0410175167cee5eeae4fde2405d957cd05c1d8811a51cf
Successfully built compress-fasttext
Installing collected packages: compress-fasttext
Successfully installed compress-fasttext-0.1.3
[0m

In [None]:
from gensim import models

In [None]:
from numpy import dot
from numpy.linalg import norm

def cos_sim(x, y):
    """Return the cosine similarity between two vectors.

    Computed as the dot product of ``x`` and ``y`` divided by the product of
    their norms (``numpy.linalg.norm`` with default arguments).

    Args:
        x: First vector, as an array-like (list, tuple or array).
        y: Second vector, as an array-like.

    Returns:
        The similarity as a NumPy floating-point scalar.
    """
    scale = norm(x) * norm(y)
    return dot(x, y) / scale

# Sanity check: compare the unit vector [1, 0, 0] with an identical, an
# orthogonal and an opposite vector, printing one similarity per line.
for other in ((1, 0, 0), (0, 1, 0), (-1, 0, 0)):
    print(cos_sim([1, 0, 0], other))

1.0
0.0
-1.0


#### Обучим модель word2vec

In [None]:
# Read the corpus into a list of token lists, one list per line of the file.
#
# * The encoding is pinned to UTF-8 so the Cyrillic text is decoded the same
#   way on every platform instead of depending on the locale default
#   (assumes the corpus file is UTF-8 - confirm if decoding fails).
# * str.split() with no argument splits on any run of whitespace (spaces,
#   tabs, newlines). The previous split(' ') left the trailing '\n' attached
#   to the last token of every line and produced empty-string tokens wherever
#   two spaces were adjacent, polluting the vocabulary.
with open('rus-ru_web-public_2019_100K-sentences.txt', encoding='utf-8') as fin:
    sentences = [line.split() for line in fin]

In [None]:
# Create a Word2Vec model with library-default hyperparameters (no arguments
# are passed), then train it in two explicit steps.
model = models.Word2Vec()
# Step 1: scan the tokenised sentences to build the vocabulary.
model.build_vocab(sentences)
# Step 2: run the training itself. total_examples and epochs are read back from
# the model object (corpus_count is presumably populated by build_vocab above -
# confirm against the gensim docs). This call is the last expression in the
# cell, so its return value - a 2-tuple of counts - is what the notebook shows
# as the cell output.
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

(4079335, 6874035)

In [None]:
# Print the cosine similarity for a few word pairs using the vectors in
# model.wv. The last pair contains 'интенет', a misspelled form of 'интернет';
# any lookup failure is caught and its message printed, which also stops the
# remaining comparisons.
word_pairs = (
    ('сеть', 'интернет'),
    ('человек', 'интернет'),
    ('интернет', 'интенет'),
)
try:
    for first, second in word_pairs:
        print(cos_sim(model.wv[first], model.wv[second]))
except Exception as err:
    print(err)

0.9710866
0.5534083
"Key 'интенет' not present"


#### Обучим модель FastText

In [None]:
# Train a FastText model on the same tokenised corpus in a single constructor
# call. NOTE: this rebinds `model`, so the Word2Vec model trained earlier is no
# longer reachable under that name.
# sg=1 is understood to select skip-gram training and vector_size=50 the
# embedding dimensionality (per the gensim API - confirm for the installed version).
model = models.FastText(sentences, sg=1, vector_size=50)

In [None]:
# Repeat the same word-pair comparison with the vectors of the current `model`
# (looked up through model.wv). The last pair again includes the misspelled
# 'интенет'; any exception is caught and its message printed.
word_pairs = (
    ('сеть', 'интернет'),
    ('человек', 'интернет'),
    ('интернет', 'интенет'),
)
try:
    for first, second in word_pairs:
        print(cos_sim(model.wv[first], model.wv[second]))
except Exception as err:
    print(err)

0.60630214
0.3068741
0.87997293


#### Загрузим оптимизированную модель FastText

In [None]:
import compress_fasttext
# Load pre-trained compressed FastText vectors from a GitHub release URL
# (presumably fetched over the network when the cell runs).
# NOTE: `model` is rebound once more, this time to a keyed-vectors object
# rather than a full model, which is why the next cell indexes `model[...]`
# directly instead of going through `model.wv[...]`.
model = compress_fasttext.models.CompressedFastTextKeyedVectors.load(
    'https://github.com/avidale/compress-fasttext/releases/download/gensim-4-draft/geowac_tokens_sg_300_5_2020-100K-20K-100.bin'
)

In [None]:
# Same word-pair comparison, this time indexing `model` directly (it holds
# keyed vectors here, so there is no `.wv` attribute access). Any exception is
# caught and its message printed.
word_pairs = (
    ('сеть', 'интернет'),
    ('человек', 'интернет'),
    ('интернет', 'интенет'),
)
try:
    for first, second in word_pairs:
        print(cos_sim(model[first], model[second]))
except Exception as err:
    print(err)

0.5231468445248225
0.27971834039337506
0.6044784950143409


## Домашнее задание
1. Скачать датасет с фразами Симпсонов — https://github.com/sujanjoejacob/Text-mining-with-Simpsons-Data
2. Обучить FastText/Word2vec на фразах персонажей
3. Визуализировать embeddings по самым частотным словам (top 1000)
4. Найти самые близкие слова для:
   - `homer - marge + bart`
   - `bart - lisa + school`
   - `marge - homer + home`
5. Попробовать построить классификатор bart/lisa с использованием этих эмбеддингов