#  Обучаем Word2Wec

## Подготовка dataset-а

In [1]:
import numpy as np
import pandas as pd


train = pd.read_parquet('data_fusion_train.parquet', engine='pyarrow')
train = train[train.category_id == -1].drop_duplicates('item_name')


In [2]:
product_names = train['item_name']

In [3]:
import pymorphy2
import re
from pymorphy2 import MorphAnalyzer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


patterns = "[A-Za-z0-9!#$%&№'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"
stopwords_ru = stopwords.words("russian")
morph = MorphAnalyzer()
def lemmatize(doc):
    doc = re.sub(patterns, ' ', doc)
    tokens = []
    for token in doc.split():
        if token and token not in stopwords_ru:
            token = token.strip()
            token = morph.normal_forms(token)[0]
            
            tokens.append(token)
    if len(tokens) > 2:
        return tokens
    return None

[nltk_data] Downloading package stopwords to /home/web/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Обрабатываем 1/10 от всех данных

In [4]:
from tqdm import tqdm

result = []
num_iters = int(len(product_names)/1000)
last_iter = 0


for i in tqdm(range(num_iters)):
    i+=1
    if i == num_iters:
        result+=product_names[num_iters*1000:].apply(lemmatize).tolist()
    else:
        result+=product_names[last_iter*1000:i*1000].apply(lemmatize).tolist()
    last_iter += 1

100%|██████████| 3107/3107 [1:13:02<00:00,  1.41s/it]


In [7]:
data = [[_ for _ in ws if len(_) > 2] for ws in result if ws is not None]

In [8]:
from collections import defaultdict

word_freq = defaultdict(int)
for tokens in data:
    for token in tokens:
        word_freq[token] += 1

In [9]:
len(word_freq)

150971

In [10]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['далее',
 'смотреть',
 'таб',
 'пиво',
 'вес',
 'белый',
 'печение',
 'напиток',
 'салат',
 'сыр']

## Обучение

In [11]:
from gensim.models import Word2Vec


w2v_model = Word2Vec(
    min_count=10,
    window=2,
    size=300,
    negative=10,
    alpha=0.03,
    min_alpha=0.0007,
    sample=6e-5,
    sg=1)

In [12]:
w2v_model.build_vocab(data)

In [None]:
w2v_model.train(data, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

In [None]:
w2v_model.init_sims(replace=True)

In [None]:
w2v_model.wv.most_similar(positive=["хлеб"])

## Проверка качества модели

In [None]:
test = pd.read_parquet('data_fusion_train.parquet', engine='pyarrow')


In [None]:
test = test[test.category_id != -1].drop_duplicates('item_name')

In [None]:
test

In [None]:
product_names = test['item_name']

In [None]:
product_names

In [None]:
test_words = product_names.apply(lemmatize).tolist()

In [None]:
words_arrs = [[_ for _ in ws if len(_) > 2] if ws is not None else [] for ws in test_words]


In [None]:
embendding_len = len(w2v_model.wv.word_vec("хлеб"))
embendding_len

In [None]:
test_embenddings = []
for words in words_arrs:
    element_embenddings = []
    for word in words:
        try:
            element_embenddings.append(w2v_model.wv.word_vec(word))
        except KeyError:
            pass
    if len(element_embenddings) == 0:
        test_embenddings.append([0]*embendding_len)
    else:
        test_embenddings.append(sum(element_embenddings))
        
        

In [None]:
len(test_embenddings)

In [None]:
len(words_arrs)

In [None]:
data = test_embenddings
labels = test["category_id"].tolist()
print(len(data))
print(len(labels))

In [None]:
from sklearn.neighbors import KNeighborsClassifier


classifier = KNeighborsClassifier()
classifier.fit(data, labels)

## Оценка качества

In [None]:
from sklearn.metrics import classification_report
report = classification_report(classifier.predict(data), labels)


In [None]:
print(report)

## Визуализация

In [None]:
from sklearn import manifold

tsne = manifold.TSNE(n_components = 2, init = 'pca', random_state = 0)
data_2d_tsne = tsne.fit_transform(data)

In [None]:
from matplotlib import pyplot as PLT
%pylab inline

In [None]:
pylab.figure(figsize = (10, 6))
pylab.scatter(data_2d_tsne[:, 0], data_2d_tsne[:, 1], c = labels)

## Количество нулевых векторов

In [None]:
count = 0
for embendding in tqdm(data):
    if np.array_equal(embendding, [0]*300):
        count += 1


In [None]:
print(count*100/len(data), "% элементов выборки не обрабатываются моделью", sep='')

## Примеры необрабатываемых объектов

In [None]:
count = 0

for embendding, words_arr, product_name in zip(data, words_arrs, product_names.tolist()):
    if np.array_equal(embendding, [0]*300):
        print(product_name)
#         print(words_arr)
        count += 1
    if count >= 20:
        break

## Нужно обучать на всей выборке