#  Обучаем Word2Wec

## Подготовка dataset-а

In [14]:
import numpy as np
import pandas as pd


train = pd.read_parquet('data_fusion_train.parquet', engine='pyarrow')
train = train[train.category_id == -1].drop_duplicates('item_name')


In [21]:
product_names = train['item_name']

In [25]:
import pymorphy2
import re
from pymorphy2 import MorphAnalyzer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


patterns = "[A-Za-z0-9!#$%&№'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"
stopwords_ru = stopwords.words("russian")
morph = MorphAnalyzer()
def lemmatize(doc):
    doc = re.sub(patterns, ' ', doc)
    tokens = []
    for token in doc.split():
        if token and token not in stopwords_ru:
            token = token.strip()
            token = morph.normal_forms(token)[0]
            
            tokens.append(token)
    if len(tokens) > 2:
        return tokens
    return None

[nltk_data] Downloading package stopwords to /home/web/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Обрабатываем 1/10 от всех данных

In [42]:
from tqdm import tqdm

result = []
num_iters = int(len(product_names)/1000)
last_iter = 0


for i in tqdm(range(int(num_iters/10))):
    i+=1
    if i == num_iters:
        result+=product_names[num_iters*1000:].apply(lemmatize).tolist()
    else:
        result+=product_names[last_iter*1000:i*1000].apply(lemmatize).tolist()
    last_iter += 1

100%|██████████| 310/310 [07:39<00:00,  1.48s/it]


In [77]:
data = [[_ for _ in ws if len(_) > 2] for ws in result if ws is not None]

In [78]:
from collections import defaultdict

word_freq = defaultdict(int)
for tokens in result:
    for token in tokens:
        word_freq[token] += 1

In [79]:
len(word_freq)

46838

In [80]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['далее',
 'хлеб',
 'вес',
 'пиво',
 'таб',
 'салат',
 'смотреть',
 'сыр',
 'напиток',
 'печение']

## Обучение

In [81]:
from gensim.models import Word2Vec


w2v_model = Word2Vec(
    min_count=10,
    window=2,
    size=300,
    negative=10,
    alpha=0.03,
    min_alpha=0.0007,
    sample=6e-5,
    sg=1)

In [82]:
w2v_model.build_vocab(data)

In [83]:
w2v_model.train(data, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

(14696543, 27931980)

In [84]:
w2v_model.init_sims(replace=True)

In [87]:
w2v_model.wv.most_similar(positive=["хлеб"])

[('подовый', 0.70289146900177),
 ('черновский', 0.6585055589675903),
 ('схк', 0.6546527147293091),
 ('хби', 0.6544870138168335),
 ('паляница', 0.6496161222457886),
 ('ахлеб', 0.647832989692688),
 ('охк', 0.6436166763305664),
 ('бхк', 0.6404373645782471),
 ('нарезна', 0.6393171548843384),
 ('кишинёвский', 0.6391841769218445)]