In [74]:
from toolz import *
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import swifter
import plotly.graph_objects as go
# %matplotlib widget


import pathlib
from lenses import lens

from collections import Counter, OrderedDict

import re
import nltk
import pymorphy2
import fasttext.util

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

import pickle

In [2]:
nltk.download('popular')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_ru')
nltk.download('tagsets')
nltk.download('stopwords')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to

True

In [3]:
swifter.set_defaults(
    force_parallel=True,
)

In [4]:
lmap = compose(list, map)
ltake = compose(list, take)

In [6]:
data_path = pathlib.Path('data')

## Загружаем датафрейм с данными

In [7]:
df = pd.read_json(data_path.joinpath('ready_dataframe.json'))
df.set_index('id', inplace=True)
df.sort_index(inplace=True)
df['ingredients'] = df['ingredients'].swifter.apply(lens.Each().modify(compose(str.lower, str.strip)))
df

Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

Unnamed: 0_level_0,title,course,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14247,Зеленый горошек по-французски,Закуски,Французская кухня,"[зеленый салат, зеленый горошек, сливочное мас..."
14248,Арбулястра,Завтраки,Французская кухня,"[куриное яйцо, зелень, тертый сыр пармезан, сл..."
14249,Зеленый салат с лимонной заправкой,Салаты,Европейская кухня,"[зеленый салат, уксус, растительное масло, сол..."
14250,Салат с сыром и зеленью,Салаты,Французская кухня,"[лимон, редис, огурцы, сыр, укроп, петрушка]"
14251,Картофельный салат с чесноком,Салаты,Итальянская кухня,"[картофель, петрушка, чеснок, винный уксус, мо..."
...,...,...,...,...
153048,Хлебный суп с яблоками,Супы,Еврейская кухня,"[яблоко, вода, изюм без косточек, черствый ржа..."
153049,Куриный бульон с кнейдлах,Супы,Еврейская кухня,"[куриный жир, мука из мацы, куриное яйцо, рубл..."
153050,Чечевичная похлебка,Супы,Еврейская кухня,"[репчатый лук, стебель сельдерея, морковь, чес..."
153051,Холодный суп из щавеля,Супы,Еврейская кухня,"[куриное яйцо, картофель, куриный бульон, щаве..."


## Работа с ингридиентами

In [8]:
ingredients = df['ingredients']
list(ingredients.iloc[:3])

[['зеленый салат',
  'зеленый горошек',
  'сливочное масло',
  'репчатый лук',
  'вода',
  'сахар',
  'соль',
  'кервель'],
 ['куриное яйцо',
  'зелень',
  'тертый сыр пармезан',
  'сливочное масло',
  'тертый имбирь'],
 ['зеленый салат',
  'уксус',
  'растительное масло',
  'соль',
  'молотый черный перец',
  'горчица']]

### Самые часто встречающиеся ингридиенты в рецептах

In [9]:
ingredients_counter = Counter()
for lst in ingredients:
    ingredients_counter.update(lst)
ingredients_counter.most_common(10)

[('соль', 20427),
 ('куриное яйцо', 11621),
 ('сахар', 10953),
 ('молотый черный перец', 10877),
 ('сливочное масло', 10323),
 ('пшеничная мука', 9950),
 ('чеснок', 9558),
 ('оливковое масло', 7932),
 ('репчатый лук', 7272),
 ('растительное масло', 6456)]

In [10]:
morph = pymorphy2.MorphAnalyzer()
def lemmatize(word):
    return morph.parse(word)[0].normal_form

### Токенизация

In [11]:
ingredients_tokenized = ingredients.swifter.apply(lens.Each().modify(nltk.word_tokenize))
ingredients_tokenized[:4]

Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

id
14247    [[зеленый, салат], [зеленый, горошек], [сливоч...
14248    [[куриное, яйцо], [зелень], [тертый, сыр, парм...
14249    [[зеленый, салат], [уксус], [растительное, мас...
14250    [[лимон], [редис], [огурцы], [сыр], [укроп], [...
Name: ingredients, dtype: object

### Стоп-слова и фильтрация

In [12]:
stopwords = nltk.corpus.stopwords.words('russian')

In [13]:
rx = re.compile(r'[a-zA-Zа-яА-Я][a-zA-Zа-яА-Я\d\-]*')

In [None]:
ingredients_filtered = ingredients_tokenized.swifter.apply(
    lens.Each().modify(
        lambda tokenized: tuple(filter(
            lambda word: word not in stopwords and rx.match(word),
            tokenized))
    )
)

In [136]:
ingredients_filtered = ingredients_filtered.swifter.apply(compose(list, unique))
ingredients_filtered

Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

id
14247     [(зеленый, салат), (зеленый, горошек), (сливоч...
14248     [(куриное, яйцо), (зелень,), (тертый, сыр, пар...
14249     [(зеленый, салат), (уксус,), (растительное, ма...
14250     [(лимон,), (редис,), (огурцы,), (сыр,), (укроп...
14251     [(картофель,), (петрушка,), (чеснок,), (винный...
                                ...                        
153048    [(яблоко,), (вода,), (изюм, косточек), (черств...
153049    [(куриный, жир), (мука, мацы), (куриное, яйцо)...
153050    [(репчатый, лук), (стебель, сельдерея), (морко...
153051    [(куриное, яйцо), (картофель,), (куриный, буль...
153081    [(дорада,), (помидоры,), (каперсы,), (петрушка...
Name: ingredients, Length: 41438, dtype: object

### Лемматизация

In [138]:
ingredients_lemmatized = ingredients_filtered.swifter.apply(lens.Each().Each().modify(lemmatize))
ingredients_lemmatized[:4]

Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
all_words_lemmatized = Counter(concat(ingredients_lemmatized.swifter.apply(lens.Each().Each().collect())))
all_words_lemmatized_list = list(all_words_lemmatized)

#Добавил UNK и PAD
all_words_lemmatized_list = ['PAD', 'UNK']+all_words_lemmatized_list
all_words_lemmatized_list.sort()
all_words_lemmatized_list[:10]
# dict(all_words.items())

In [None]:
#ДОЛГО + много оперативки
fasttext.util.download_model('ru', if_exists='ignore')

ft = fasttext.load_model('cc.ru.300.bin')
# fasttext.util.reduce_model(ft, 100)
dim = ft.get_dimension()
dim

### Получаем векторы для слов 

In [21]:
word_embeddings_dict = {}
for word in all_words_lemmatized_list:
    if word  == 'PAD':
        word_embeddings_dict[word] = np.concatenate((np.zeros(dim),[1, 0]))
    elif word  == 'UNK':
        word_embeddings_dict[word] = np.concatenate((np.zeros(dim),[0, 1]))
    else:
        word_embeddings_dict[word] = np.concatenate((ft.get_word_vector(word), [0, 0]))

In [23]:
first(word_embeddings_dict.items())

('PAD',
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
    

Расширяем разметрости для PAD и UNK

In [24]:
dim += 2

In [25]:
# Освободаем оперативку
del ft

### Векторы для ингредиентов

In [29]:
all_ingredients_list =  list(set(concat(ingredients_lemmatized.swifter.apply(lens.Each().collect()))))
#Добавил UNK и PAD
all_ingredients_list = [('PAD',), ('UNK',)]+all_ingredients_list
all_ingredients_list 
ingredients_numbering_dict = {ingredient: i for i, ingredient in enumerate(all_ingredients_list)}
ltake(6, ingredients_numbering_dict.items())

Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

[(('PAD',), 0),
 (('UNK',), 1),
 (('копчёный', 'ветчина', 'кость'), 2),
 (('кукурузный', 'хлопья'), 3),
 (('ржаной', 'хлеб', 'семечко'), 4),
 (('картофельный', 'чипсы'), 5)]

Для получения вектора для ингридиента просто суммируем векторы для каждого слова в этом ингридиенте:

In [30]:
ingredients_embeddings_dict = {}
for ingr in all_ingredients_list:
    ingredients_embeddings_dict[ingr] = np.zeros(dim)
    for word in ingr:
#         ingredients_embeddings_dict[ingr] += ft.get_word_vector(word) 
        ingredients_embeddings_dict[ingr] += word_embeddings_dict[word] # Должно заработать, когда 
                                                                         # ингредиенты будут лемматизированны
ltake(10, ingredients_embeddings_dict.items())[3]


(('кукурузный', 'хлопья'),
 array([ 0.08196878,  0.04994004, -0.08463618,  0.0283025 ,  0.05823767,
        -0.0159271 ,  0.10261862, -0.06118131, -0.01537847, -0.02303754,
        -0.05866228,  0.00229412,  0.06107598,  0.01127146, -0.00357291,
        -0.01133985,  0.02073701, -0.0178669 , -0.10418483, -0.11567056,
        -0.04456828,  0.07991009,  0.0560222 , -0.0025593 ,  0.10632645,
         0.01183371, -0.00435265, -0.03686194, -0.00286871,  0.04305854,
        -0.05926874, -0.06816797,  0.06792417, -0.0961192 ,  0.08346977,
        -0.04744818,  0.01714684, -0.15220929, -0.03040184, -0.02816062,
         0.11446448, -0.00387371,  0.04253486,  0.08256276, -0.01307493,
         0.11927472,  0.02814195,  0.0215842 ,  0.11699558,  0.01901858,
         0.12087446,  0.10483214, -0.04647234,  0.00520997, -0.15311343,
         0.10049783, -0.07267921, -0.03155889,  0.02527906, -0.11071557,
         0.07243429,  0.01520709,  0.1995794 ,  0.06411422, -0.1804648 ,
         0.12839152, -0.

Наконец, для каждого рецепта возьмем его векторизацию как сумму векторов его ингридиентов

In [31]:
recipe_embedding_sum = [
    sum(ingredients_embeddings_dict[ingredient]
        for ingredient in ingredient_list) 
    for ingredient_list in ingredients_lemmatized
]
len(recipe_embedding_sum)

41438

In [53]:
recipe_embedding_sum = ingredients_lemmatized.apply(lambda recipe:
    np.sum(lens.Each().modify(ingredients_embeddings_dict.__getitem__)(recipe), axis=0)
)
recipe_embedding_sum

id
14247     [0.7618384752422571, 0.08853759150952101, 0.15...
14248     [0.13557584770023823, 0.3944497350603342, -0.0...
14249     [0.5860096383839846, 0.07307140016928315, 0.18...
14250     [0.4274589493870735, -0.17271707206964493, -0....
14251     [0.436641464009881, 0.20584300439804792, 0.027...
                                ...                        
153048    [0.38410141598433256, 0.13397947791963816, -0....
153049    [0.9013344636186957, 0.3460128325968981, -0.23...
153050    [1.264339747838676, -0.08623092842753977, 0.37...
153051    [0.8304240852594376, -0.011165377218276262, -0...
153081    [0.5803682114928961, 0.40145984827540815, 0.12...
Name: ingredients, Length: 41438, dtype: object

In [57]:
recipe_embedding_avg = ingredients_lemmatized.apply(lambda recipe:
    np.average(lens.Each().modify(ingredients_embeddings_dict.__getitem__)(recipe), axis=0)
)
recipe_embedding_avg

id
14247     [0.09522980940528214, 0.011067198938690126, 0....
14248     [0.027115169540047647, 0.07888994701206684, -0...
14249     [0.09766827306399743, 0.012178566694880525, 0....
14250     [0.07124315823117892, -0.028786178678274155, -...
14251     [0.062377352001411576, 0.029406143485435417, 0...
                                ...                        
153048    [0.05487163085490465, 0.019139925417091166, -0...
153049    [0.10014827373541063, 0.03844587028854423, -0....
153050    [0.09030998198847685, -0.006159352030538555, 0...
153051    [0.09226934280660418, -0.0012405974686973626, ...
153081    [0.05276074649935419, 0.03649634984321892, 0.0...
Name: ingredients, Length: 41438, dtype: object

## Данные для нейросети

Всевозможные метки (course):

In [68]:
all_courses = {name: num for num, name in enumerate(df['course'][~df['course'].duplicated()].sort_values())}
all_courses

{'Выпечка и десерты': 0,
 'Завтраки': 1,
 'Закуски': 2,
 'Напитки': 3,
 'Основные блюда': 4,
 'Паста и пицца': 5,
 'Салаты': 6,
 'Соусы и маринады': 7,
 'Супы': 8}

Группируем кухни:

In [79]:
df_cuisines = pd.read_excel(data_path.joinpath('Сортировка_кухонь.xlsx'))
cuisines_groups = {}
for column in df_cuisines.columns:
    series = df_cuisines[column]
    cuisines_groups[str(column).strip()] = set(map(str.strip, list(series[series.notnull()])))
del cuisines_groups['Unnamed: 1']
inverse_cuisines_groups = {cuisine: cuisine_group
                           for (cuisine_group, cuisines) in cuisines_groups.items()
                           for cuisine in cuisines}
# inverse_cuisines_groups = OrderedDict(sorted(inverse_cuisines_groups.items()))
inverse_cuisines_groups

{'Дагестанская кухня': 'Кавказская',
 'Молдавская кухня': 'Европейская',
 'Британская кухня': 'Европейская',
 'Хорватская кухня': 'Балканская',
 'Туркменская кухня': 'Центральноазиатская',
 'Белорусская кухня': 'Славянская',
 'Сицилийская кухня': 'Европейская',
 'Аргентинская кухня': 'Южноамериканская',
 'Украинская кухня': 'Славянская',
 'Мировая кухня': 'Неклассифицируемое',
 'Сингапурская кухня': 'Юго-Восточная Азия',
 'Авторская кухня': 'Неклассифицируемое',
 'Румынская кухня': 'Балканская',
 'Американская кухня': 'Североамериканская',
 'Эстонская кухня': 'Европейская',
 'Русская кухня': 'Славянская',
 'Алжирская кухня': 'Африканская',
 'Восточно-индийская кухня': 'Юго-Восточная Азия',
 'Немецкая кухня': 'Европейская',
 'Одесская кухня': 'Славянская',
 'Марокканская кухня': 'Африканская',
 'Чеченская кухня': 'Кавказская',
 'Средиземноморская кухня': 'Европейская',
 'Австрийская кухня': 'Европейская',
 'Шведская кухня': 'Скандинавская',
 'Сербская кухня': 'Балканская',
 'Европейская

In [81]:
cuisines_set = sorted(set(inverse_cuisines_groups.values()))
# print(cuisines_set)
all_cuisines = {name: num for num, name in enumerate(cuisines_set)}
all_cuisines

{'Африканская': 0,
 'Балканская': 1,
 'Ближний Восток': 2,
 'Восточноазиатская': 3,
 'Европейская': 4,
 'Кавказская': 5,
 'Неклассифицируемое': 6,
 'Североамериканская': 7,
 'Скандинавская': 8,
 'Славянская': 9,
 'Центральноазиатская': 10,
 'Юго-Восточная Азия': 11,
 'Южноамериканская': 12}

### Делаем tf-idf

In [87]:
try:
    del tf
finally:
    tf = pd.DataFrame(index=df.index.rename('recipe_id'), 
                      columns=all_ingredients_list,
                      data=0,
                      dtype=np.float64)
for index, count in tf.iterrows():
    for ingredient in ingredients_lemmatized.loc[index]:
        count[ingredient] += 1
    count /= len(ingredients_lemmatized.loc[index])
tf

Unnamed: 0_level_0,"(PAD,)","(UNK,)","(копчёный, ветчина, кость)","(кукурузный, хлопья)","(ржаной, хлеб, семечко)","(картофельный, чипсы)","(глазурь, тёмный, шоколад, dr.oetker)","(лимонный, цедра)","(зефир,)","(французский, горчица)",...,"(кунжут,)","(зернистый, дижонскай, горчица)","(кунжутный, паста)","(сухой, дрожжи, dr.oetker)","(карамельный, соус)","(жареный, фундук)","(рислинг,)","(трюфельный, паста)","(вишнёвый, водка, кирша)","(ракушка, гребешков)"
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
idf = dict(zip(tf.columns, iterate(lambda _: 1, 1)))
for recipe in ingredients_lemmatized.values:
    for ingredient in recipe:
        idf[ingredient] += 1
idf = pd.Series(index=pd.Index(idf.keys(), tupleize_cols=False), data=idf.values(), dtype=np.float64)
idf /= len(tf.index)
idf = -idf.apply(np.log)
idf

(PAD,)                        10.631954
(UNK,)                        10.631954
(копчёный, ветчина, кость)     8.840194
(кукурузный, хлопья)           7.135446
(ржаной, хлеб, семечко)        9.533341
                                ...    
(жареный, фундук)              7.048435
(рислинг,)                     8.686043
(трюфельный, паста)            8.147047
(вишнёвый, водка, кирша)       8.840194
(ракушка, гребешков)           9.533341
Length: 4406, dtype: float64

In [91]:
try:
    del tfidf
finally:
    tfidf = tf*idf
del tf

Unnamed: 0_level_0,"(PAD,)","(UNK,)","(копчёный, ветчина, кость)","(кукурузный, хлопья)","(ржаной, хлеб, семечко)","(картофельный, чипсы)","(глазурь, тёмный, шоколад, dr.oetker)","(лимонный, цедра)","(зефир,)","(французский, горчица)",...,"(кунжут,)","(зернистый, дижонскай, горчица)","(кунжутный, паста)","(сухой, дрожжи, dr.oetker)","(карамельный, соус)","(жареный, фундук)","(рислинг,)","(трюфельный, паста)","(вишнёвый, водка, кирша)","(ракушка, гребешков)"
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [92]:
tfidf

Unnamed: 0_level_0,"(PAD,)","(UNK,)","(копчёный, ветчина, кость)","(кукурузный, хлопья)","(ржаной, хлеб, семечко)","(картофельный, чипсы)","(глазурь, тёмный, шоколад, dr.oetker)","(лимонный, цедра)","(зефир,)","(французский, горчица)",...,"(кунжут,)","(зернистый, дижонскай, горчица)","(кунжутный, паста)","(сухой, дрожжи, dr.oetker)","(карамельный, соус)","(жареный, фундук)","(рислинг,)","(трюфельный, паста)","(вишнёвый, водка, кирша)","(ракушка, гребешков)"
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
# Из tfidf берется строка соответствующая рецепту и ингредиенты только из рецепта
# Их значение умножается на эмбеддинги этих ингредиентов и складываются (т.е. вектор умнож. на матрицу)
recipe_embedding_sum_tf_idf = [
    tfidf.iloc[i][ingredient_list]@np.array([ingredients_embeddings_dict[ingredient]
        for ingredient in ingredient_list]) 
    for i,ingredient_list in enumerate(ingredients_lemmatized)
]
len(recipe_embedding_sum_tf_idf)



41438

In [127]:
for _, it in ingredients_lemmatized.reset_index().iterrows():
    tfidf.loc[it['id']][it['ingredients']]
    # print(it)



KeyboardInterrupt: 

In [117]:
recipe_embedding_avg_tf_idf = pd.Series(ingredients_lemmatized.reset_index().apply(
    lambda it:
        tfidf.loc[it['id']][it['ingredients']] @
        np.array(lens.Each().modify(ingredients_embeddings_dict.__getitem__)(it['ingredients'])),
    axis=1).values, index=ingredients_lemmatized.index)
recipe_embedding_avg_tf_idf



id
14247     [0.2312899307317464, -0.03798629201875053, 0.0...
14248     [-0.047695562428056684, 0.13134068914467661, 0...
14249     [0.26862193160372755, -0.07282666881717992, 0....
14250     [0.23934587523872478, -0.08555719879431489, -0...
14251     [0.09671493660172885, 0.027734709358337214, -0...
                                ...                        
153048    [0.15586152526899738, 0.04400364822517483, -0....
153049    [0.33346141437470733, 0.06992642450949106, -0....
153050    [0.2597255528548992, -0.028301624387459163, 0....
153051    [0.2130407644419675, -0.05759126959011176, -0....
153081    [0.15346220902734345, 0.08630516904926387, 0.0...
Length: 41438, dtype: object

In [None]:
pd.Series(ingredients_lemmatized.reset_index().apply(
    identity,
    axis=1).values, index=ingredients_lemmatized.index)
ingredients_lemmatized.apply(lambda lst:
    tfidf
)

id
14247     [(зелёный, салат), (зелёный, горошек), (сливоч...
14248     [(куриный, яйцо), (зелень,), (тёртый, сыр, пар...
14249     [(зелёный, салат), (уксус,), (растительный, ма...
14250     [(лимон,), (редис,), (огурец,), (сыр,), (укроп...
14251     [(картофель,), (петрушка,), (чеснок,), (винный...
                                ...                        
153048    [(яблоко,), (вода,), (изюм, косточка), (чёрств...
153049    [(куриный, жир), (мука, маца), (куриный, яйцо)...
153050    [(репчатый, лук), (стебель, сельдерей), (морко...
153051    [(куриный, яйцо), (картофель,), (куриный, буль...
153081    [(дорада,), (помидор,), (каперс,), (петрушка,)...
Name: ingredients, Length: 41438, dtype: object

In [111]:
recipe_embedding_avg_tf_idf = [
    tfidf.iloc[i][ingredient_list]@np.array([ingredients_embeddings_dict[ingredient]
        for ingredient in ingredient_list])/len(ingredient_list)
    for i,ingredient_list in enumerate(ingredients_lemmatized)
]
len(recipe_embedding_avg_tf_idf)



id
14247     [0.0289112413414683, -0.004748286502343816, 0....
14248     [-0.009539112485611336, 0.026268137828935324, ...
14249     [0.04477032193395459, -0.012137778136196653, 0...
14250     [0.03989097920645413, -0.014259533132385815, -...
14251     [0.013816419514532692, 0.003962101336905316, -...
                                ...                        
153048    [0.02226593218128534, 0.006286235460739261, -0...
153049    [0.03705126826385637, 0.007769602723276784, -0...
153050    [0.01855182520392137, -0.002021544599104226, 0...
153051    [0.0236711960491075, -0.006399029954456862, -0...
153081    [0.013951109911576677, 0.007845924459023988, 0...
Length: 41438, dtype: object

In [95]:
recipe_embedding_sum_tf_idf[0]

array([ 0.23128993, -0.03798629,  0.08602118,  0.12128009,  0.20274098,
       -0.29488352,  0.1511563 ,  0.16057011, -0.11122028, -0.20191662,
       -0.18900785,  0.12102213,  0.02355172, -0.0696775 , -0.0135034 ,
        0.08100771,  0.11578235, -0.24325664, -0.26948965, -0.17337811,
        0.19466821,  0.03421407, -0.2401844 , -0.181274  ,  0.1476598 ,
        0.11675763,  0.093238  ,  0.24194781,  0.0595658 ,  0.21012272,
        0.00580437,  0.06114275,  0.09545048, -0.09652855,  0.07809559,
       -0.04611486, -0.0769235 , -0.37303522, -0.15487498, -0.19957317,
        0.17810278, -0.07397115,  0.0041414 , -0.03305336,  0.02980138,
        0.11846393,  0.0098151 ,  0.02653244,  0.3364836 ,  0.01805439,
        0.03241376,  0.0986062 , -0.12623576,  0.0430819 , -0.30774169,
        0.33443025, -0.14449177, -0.18636496, -0.12615105, -0.29690667,
        0.1001099 , -0.01152412,  0.18944638,  0.05395495,  0.00454956,
        0.09138972,  0.20337467,  0.18198763, -0.11686654,  0.43

### Все нужные данные переводим в pickle

In [83]:
df_for_learning = pd.DataFrame(data={
    'id': df['id'],
    'recipe_embedding_sum': recipe_embedding_sum,
    'recipe_embedding_avg': recipe_embedding_avg,
    'recipe_embedding_sum_tf_idf': recipe_embedding_sum_tf_idf,
    'recipe_embedding_avg_tf_idf': recipe_embedding_avg_tf_idf,
    'course_mark': [all_courses[course] for course in df['course']],
    'cuisine_group_mark': [all_cuisines[inverse_cuisines_groups[cuisine]] for cuisine in df['cuisine']]
})
df_for_learning

Unnamed: 0,id,recipe_embedding_sum,recipe_embedding_avg,recipe_embedding_sum_tf_idf,recipe_embedding_avg_tf_idf,course_mark,cuisine_group_mark
0,28195,"[0.611611146479845, -0.0743011748418212, -0.09...","[0.15290278661996126, -0.0185752937104553, -0....","[0.5152516388568951, -0.09051781286938537, -0....","[0.12881290971422377, -0.022629453217346343, -...",0,10
1,28141,"[0.8909059994039126, -0.11706455610692501, -0....","[0.17818119988078251, -0.023412911221385002, -...","[0.7177284127096286, -0.13393777995143036, -0....","[0.14354568254192573, -0.026787555990286073, -...",0,10
2,27929,"[1.08976399153471, -0.4261779775843024, -0.389...","[0.15568057021924428, -0.06088256822632892, -0...","[0.7291096075148575, -0.37962093342836795, -0....","[0.10415851535926536, -0.05423156191833828, -0...",0,10
3,28192,"[1.1749160811305046, -0.5175759345293045, -0.3...","[0.1958193468550841, -0.08626265575488408, -0....","[0.9335582594569222, -0.5268387025674657, -0.2...","[0.15559304324282036, -0.08780645042791095, -0...",0,10
4,28463,"[0.6190221486613154, -0.14342674519866705, -0....","[0.12380442973226309, -0.02868534903973341, -0...","[0.5538295072472644, -0.1975480368662923, -0.3...","[0.11076590144945289, -0.03950960737325846, -0...",0,10
...,...,...,...,...,...,...,...
41433,43380,"[1.3148583127185702, -0.05140630202367902, -0....","[0.08765722084790468, -0.003427086801578601, -...","[0.2399726786919129, -0.04387937383021033, -0....","[0.01599817857946086, -0.002925291588680689, -...",8,2
41434,80446,"[1.0693077351897955, 0.24955729965586215, -0.1...","[0.07128718234598637, 0.01663715331039081, -0....","[0.2214572326364108, 0.01731656636259433, -0.0...","[0.014763815509094053, 0.0011544377575062887, ...",8,9
41435,136820,"[0.7934096870012581, 0.5892392117530107, -0.44...","[0.056672120500089865, 0.042088515125215054, -...","[0.07969059200268491, 0.10006368778428605, -0....","[0.005692185143048922, 0.007147406270306147, -...",8,9
41436,18014,"[0.834478304721415, 0.4054728071205318, 0.2355...","[0.07586166406558319, 0.03686116428368471, 0.0...","[0.15685011466058457, 0.1580715665107838, -0.0...","[0.014259101332780416, 0.014370142410071255, -...",8,9


In [84]:
dict_to_pickle = {
    'df_for_learning': df_for_learning,
    'all_courses': all_courses,
    'all_cuisines': all_cuisines,
    'word_embeddings_dict': word_embeddings_dict,
    'ingredients_numbering_dict': ingredients_numbering_dict,
    'dim': dim,
    'ingredients_lemmatized': ingredients_lemmatized,
    'all_ingredients_list': all_ingredients_list,
    'ingredients_embeddings_dict': ingredients_embeddings_dict,
    # 'tfidf': tfidf
}
with data_path.joinpath('processed_data_dict.pkl').open(mode='wb') as f:
    pickle.dump(dict_to_pickle, f)