In [1]:
from toolz import *
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import swifter
import plotly.graph_objects as go
# %matplotlib widget


import pathlib
from lenses import lens

from collections import Counter, OrderedDict

import re
import nltk
import pymorphy2
import fasttext.util

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

import pickle

In [2]:
nltk.download('popular')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_ru')
nltk.download('tagsets')
nltk.download('stopwords')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\aakomlev\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\aakomlev\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\aakomlev\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\aakomlev\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\aakomlev\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to

True

In [3]:
swifter.set_defaults(
    force_parallel=True,
)

In [4]:
lmap = compose(list, map)
ltake = compose(list, take)

In [5]:
data_path = pathlib.Path('data')

## Загружаем датафрейм с данными

In [6]:
df = pd.read_json(data_path.joinpath('ready_dataframe.json'))
df.set_index('id', inplace=True)
df.sort_index(inplace=True)
df['ingredients'] = df['ingredients'].apply(lens.Each().modify(compose(str.lower, str.strip)))
df

Unnamed: 0_level_0,title,course,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14247,Зеленый горошек по-французски,Закуски,Французская кухня,"[зеленый салат, зеленый горошек, сливочное мас..."
14248,Арбулястра,Завтраки,Французская кухня,"[куриное яйцо, зелень, тертый сыр пармезан, сл..."
14249,Зеленый салат с лимонной заправкой,Салаты,Европейская кухня,"[зеленый салат, уксус, растительное масло, сол..."
14250,Салат с сыром и зеленью,Салаты,Французская кухня,"[лимон, редис, огурцы, сыр, укроп, петрушка]"
14251,Картофельный салат с чесноком,Салаты,Итальянская кухня,"[картофель, петрушка, чеснок, винный уксус, мо..."
...,...,...,...,...
153048,Хлебный суп с яблоками,Супы,Еврейская кухня,"[яблоко, вода, изюм без косточек, черствый ржа..."
153049,Куриный бульон с кнейдлах,Супы,Еврейская кухня,"[куриный жир, мука из мацы, куриное яйцо, рубл..."
153050,Чечевичная похлебка,Супы,Еврейская кухня,"[репчатый лук, стебель сельдерея, морковь, чес..."
153051,Холодный суп из щавеля,Супы,Еврейская кухня,"[куриное яйцо, картофель, куриный бульон, щаве..."


## Работа с ингридиентами

In [7]:
ingredients = df['ingredients']
list(ingredients.iloc[:3])

[['зеленый салат',
  'зеленый горошек',
  'сливочное масло',
  'репчатый лук',
  'вода',
  'сахар',
  'соль',
  'кервель'],
 ['куриное яйцо',
  'зелень',
  'тертый сыр пармезан',
  'сливочное масло',
  'тертый имбирь'],
 ['зеленый салат',
  'уксус',
  'растительное масло',
  'соль',
  'молотый черный перец',
  'горчица']]

### Самые часто встречающиеся ингридиенты в рецептах

In [8]:
ingredients_counter = Counter()
for lst in ingredients:
    ingredients_counter.update(lst)
ingredients_counter.most_common(10)

[('соль', 20427),
 ('куриное яйцо', 11621),
 ('сахар', 10953),
 ('молотый черный перец', 10877),
 ('сливочное масло', 10323),
 ('пшеничная мука', 9950),
 ('чеснок', 9558),
 ('оливковое масло', 7932),
 ('репчатый лук', 7272),
 ('растительное масло', 6456)]

In [9]:
morph = pymorphy2.MorphAnalyzer()
def lemmatize(word):
    return morph.parse(word)[0].normal_form

### Токенизация

In [10]:
ingredients_tokenized = ingredients.swifter.apply(lens.Each().modify(nltk.word_tokenize))
ingredients_tokenized[:4]

Dask Apply:   0%|          | 0/24 [00:00<?, ?it/s]

id
14247    [[зеленый, салат], [зеленый, горошек], [сливоч...
14248    [[куриное, яйцо], [зелень], [тертый, сыр, парм...
14249    [[зеленый, салат], [уксус], [растительное, мас...
14250    [[лимон], [редис], [огурцы], [сыр], [укроп], [...
Name: ingredients, dtype: object

### Стоп-слова и фильтрация

In [11]:
stopwords = nltk.corpus.stopwords.words('russian')

In [12]:
rx = re.compile(r'[a-zA-Zа-яА-Я][a-zA-Zа-яА-Я\d\-]*')

In [13]:
ingredients_filtered = ingredients_tokenized.swifter.apply(
    lens.Each().modify(
        lambda tokenized: tuple(filter(
            lambda word: word not in stopwords and rx.match(word),
            tokenized))
    )
)
ingredients_filtered

Dask Apply:   0%|          | 0/24 [00:00<?, ?it/s]

id
14247     [(зеленый, салат), (зеленый, горошек), (сливоч...
14248     [(куриное, яйцо), (зелень,), (тертый, сыр, пар...
14249     [(зеленый, салат), (уксус,), (растительное, ма...
14250     [(лимон,), (редис,), (огурцы,), (сыр,), (укроп...
14251     [(картофель,), (петрушка,), (чеснок,), (винный...
                                ...                        
153048    [(яблоко,), (вода,), (изюм, косточек), (черств...
153049    [(куриный, жир), (мука, мацы), (куриное, яйцо)...
153050    [(репчатый, лук), (стебель, сельдерея), (морко...
153051    [(куриное, яйцо), (картофель,), (куриный, буль...
153081    [(дорада,), (помидоры,), (каперсы,), (петрушка...
Name: ingredients, Length: 41438, dtype: object

### Лемматизация

In [14]:
ingredients_lemmatized = ingredients_filtered.swifter.apply(lens.Each().Each().modify(lemmatize))
ingredients_lemmatized = ingredients_lemmatized.swifter.apply(compose(list, unique))
ingredients_lemmatized[:4]

Dask Apply:   0%|          | 0/24 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/24 [00:00<?, ?it/s]

id
14247    [(зелёный, салат), (зелёный, горошек), (сливоч...
14248    [(куриный, яйцо), (зелень,), (тёртый, сыр, пар...
14249    [(зелёный, салат), (уксус,), (растительный, ма...
14250    [(лимон,), (редис,), (огурец,), (сыр,), (укроп...
Name: ingredients, dtype: object

In [15]:
all_words_lemmatized = Counter(concat(ingredients_lemmatized.swifter.apply(lens.Each().Each().collect())))
all_words_lemmatized_list = list(all_words_lemmatized)

#Добавил UNK и PAD
all_words_lemmatized_list = ['PAD', 'UNK']+all_words_lemmatized_list
all_words_lemmatized_list.sort()
all_words_lemmatized_list[:10]
# dict(all_words.items())

Dask Apply:   0%|          | 0/24 [00:00<?, ?it/s]

['PAD',
 'UNK',
 'añejo',
 'baby',
 'baileys',
 'bambino',
 'bean',
 'biogourmet',
 'biscoff',
 'blue']

In [16]:
#ДОЛГО + много оперативки
fasttext.util.download_model('ru', if_exists='ignore')

ft = fasttext.load_model('cc.ru.300.bin')
# fasttext.util.reduce_model(ft, 100)
dim = ft.get_dimension()
dim

300

### Получаем векторы для слов 

In [17]:
word_embeddings_dict = {}
for word in all_words_lemmatized_list:
    if word  == 'PAD':
        word_embeddings_dict[word] = np.concatenate((np.zeros(dim),[1, 0]))
    elif word  == 'UNK':
        word_embeddings_dict[word] = np.concatenate((np.zeros(dim),[0, 1]))
    else:
        word_embeddings_dict[word] = np.concatenate((ft.get_word_vector(word), [0, 0]))

In [18]:
first(word_embeddings_dict.items())

('PAD',
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
    

Расширяем разметрости для PAD и UNK

In [19]:
dim += 2

In [20]:
# Освободаем оперативку
del ft

### Векторы для ингредиентов

In [21]:
all_ingredients_list =  list(set(concat(ingredients_lemmatized.swifter.apply(lens.Each().collect()))))
#Добавил UNK и PAD
all_ingredients_list = [('PAD',), ('UNK',)]+all_ingredients_list
all_ingredients_list 
ingredients_numbering_dict = {ingredient: i for i, ingredient in enumerate(all_ingredients_list)}
ltake(6, ingredients_numbering_dict.items())

Dask Apply:   0%|          | 0/24 [00:00<?, ?it/s]

[(('PAD',), 0),
 (('UNK',), 1),
 (('мизуна',), 2),
 (('боровик',), 3),
 (('свиной', 'бескостный', 'шея'), 4),
 (('зернистый', 'горчица'), 5)]

Для получения вектора для ингридиента просто суммируем векторы для каждого слова в этом ингридиенте:

In [22]:
ingredients_embeddings_dict = {}
for ingr in all_ingredients_list:
    ingredients_embeddings_dict[ingr] = np.zeros(dim)
    for word in ingr:
#         ingredients_embeddings_dict[ingr] += ft.get_word_vector(word) 
        ingredients_embeddings_dict[ingr] += word_embeddings_dict[word] # Должно заработать, когда 
                                                                         # ингредиенты будут лемматизированны
ltake(10, ingredients_embeddings_dict.items())[3]


(('боровик',),
 array([-4.33373675e-02,  7.37025738e-02, -1.84198003e-02,  3.93941998e-02,
         1.68087780e-02,  1.12212468e-02,  2.44383290e-02, -2.26427075e-02,
         3.29825878e-02, -6.08613789e-02, -5.05867712e-02,  4.36928794e-02,
        -4.04145531e-02, -5.02390554e-03, -1.26134101e-02,  6.29273206e-02,
         1.19863093e-01, -2.31544301e-02, -6.71497881e-02, -2.71325465e-02,
        -3.14931199e-03, -5.84771065e-03,  2.35398151e-02, -5.11627719e-02,
        -5.06936572e-02, -7.42959082e-02,  8.47465694e-02,  1.19349426e-02,
         3.35531272e-02,  1.97096989e-01,  2.74171364e-02, -4.73382249e-02,
         4.44328487e-02, -2.47115679e-02,  1.35144517e-02, -4.14040834e-02,
        -2.01089159e-02, -9.73712653e-02, -8.91408026e-02, -2.77488362e-02,
        -3.62676308e-02, -4.00171205e-02, -3.98198068e-02, -3.54603573e-04,
        -1.97979491e-02,  3.42309847e-02, -1.73696261e-02,  3.99916023e-02,
         3.62960622e-02,  4.13284451e-02,  1.71173997e-02,  5.46981804e-0

Наконец, для каждого рецепта возьмем его векторизацию как сумму векторов его ингридиентов

In [23]:
ingredients_lemmatized

id
14247     [(зелёный, салат), (зелёный, горошек), (сливоч...
14248     [(куриный, яйцо), (зелень,), (тёртый, сыр, пар...
14249     [(зелёный, салат), (уксус,), (растительный, ма...
14250     [(лимон,), (редис,), (огурец,), (сыр,), (укроп...
14251     [(картофель,), (петрушка,), (чеснок,), (винный...
                                ...                        
153048    [(яблоко,), (вода,), (изюм, косточка), (чёрств...
153049    [(куриный, жир), (мука, маца), (куриный, яйцо)...
153050    [(репчатый, лук), (стебель, сельдерей), (морко...
153051    [(куриный, яйцо), (картофель,), (куриный, буль...
153081    [(дорада,), (помидор,), (каперс,), (петрушка,)...
Name: ingredients, Length: 41438, dtype: object

In [24]:
def ingredient_to_embedding(ingredient):
    return ingredients_embeddings_dict[ingredient]

## Данные для нейросети

Всевозможные метки (course):

In [25]:
all_courses = {name: num for num, name in enumerate(df['course'][~df['course'].duplicated()].sort_values())}
all_courses

{'Выпечка и десерты': 0,
 'Завтраки': 1,
 'Закуски': 2,
 'Напитки': 3,
 'Основные блюда': 4,
 'Паста и пицца': 5,
 'Салаты': 6,
 'Соусы и маринады': 7,
 'Супы': 8}

Группируем кухни:

In [26]:
df_cuisines = pd.read_excel(data_path.joinpath('Сортировка_кухонь.xlsx'))
cuisines_groups = {}
for column in df_cuisines.columns:
    series = df_cuisines[column]
    cuisines_groups[str(column).strip()] = set(map(str.strip, list(series[series.notnull()])))
del cuisines_groups['Unnamed: 1']
inverse_cuisines_groups = {cuisine: cuisine_group
                           for (cuisine_group, cuisines) in cuisines_groups.items()
                           for cuisine in cuisines}
# inverse_cuisines_groups = OrderedDict(sorted(inverse_cuisines_groups.items()))
inverse_cuisines_groups

{'Немецкая кухня': 'Европейская',
 'Крымская кухня': 'Славянская',
 'Филиппинская кухня': 'Юго-Восточная Азия',
 'Индонезийская кухня': 'Юго-Восточная Азия',
 'Японская кухня': 'Восточноазиатская',
 'Тайская кухня': 'Юго-Восточная Азия',
 'Черкесская кухня': 'Кавказская',
 'Польская кухня': 'Европейская',
 'Молдавская кухня': 'Европейская',
 'Украинская кухня': 'Славянская',
 'Казахская кухня': 'Центральноазиатская',
 'Ирландская кухня': 'Европейская',
 'Ливанская кухня': 'Ближний Восток',
 'Белорусская кухня': 'Славянская',
 'Югославская кухня': 'Балканская',
 'Норвежская кухня': 'Скандинавская',
 'Чеченская кухня': 'Кавказская',
 'Итальянская кухня': 'Европейская',
 'Кавказская кухня': 'Кавказская',
 'Бурятская кухня': 'Центральноазиатская',
 'Скандинавская кухня': 'Скандинавская',
 'Австрийская кухня': 'Европейская',
 'Средиземноморская кухня': 'Европейская',
 'Шотландская кухня': 'Европейская',
 'Одесская кухня': 'Славянская',
 'Аргентинская кухня': 'Южноамериканская',
 'Португальс

In [38]:
cuisines_set = sorted(set(inverse_cuisines_groups.values()))
# cuisines_set = sorted(set(df['cuisine']))
# print(cuisines_set)
all_cuisines = {name: num for num, name in enumerate(cuisines_set)}
all_cuisines

{'Африканская': 0,
 'Балканская': 1,
 'Ближний Восток': 2,
 'Восточноазиатская': 3,
 'Европейская': 4,
 'Кавказская': 5,
 'Неклассифицируемое': 6,
 'Североамериканская': 7,
 'Скандинавская': 8,
 'Славянская': 9,
 'Центральноазиатская': 10,
 'Юго-Восточная Азия': 11,
 'Южноамериканская': 12}

### Разные эмбеддинги для рецептов:

In [39]:
recipe_embedding_sum = ingredients_lemmatized.apply(
    lambda recipe: sum(map(ingredient_to_embedding, recipe))
)
recipe_embedding_sum

id
14247     [0.7618384752422571, 0.08853759150952101, 0.15...
14248     [0.13557584770023823, 0.3944497350603342, -0.0...
14249     [0.5860096383839846, 0.07307140016928315, 0.18...
14250     [0.4274589493870735, -0.17271707206964493, -0....
14251     [0.436641464009881, 0.20584300439804792, 0.027...
                                ...                        
153048    [0.38410141598433256, 0.13397947791963816, -0....
153049    [0.9013344636186957, 0.3460128325968981, -0.23...
153050    [1.264339747838676, -0.08623092842753977, 0.37...
153051    [0.8304240852594376, -0.011165377218276262, -0...
153081    [0.5803682114928961, 0.40145984827540815, 0.12...
Name: ingredients, Length: 41438, dtype: object

In [40]:
recipe_embedding_avg = ingredients_lemmatized.apply(
    lambda recipe: np.average(lmap(ingredient_to_embedding, recipe), axis=0)
)
recipe_embedding_avg

id
14247     [0.09522980940528214, 0.011067198938690126, 0....
14248     [0.027115169540047647, 0.07888994701206684, -0...
14249     [0.09766827306399743, 0.012178566694880525, 0....
14250     [0.07124315823117892, -0.028786178678274155, -...
14251     [0.062377352001411576, 0.029406143485435417, 0...
                                ...                        
153048    [0.05487163085490465, 0.019139925417091166, -0...
153049    [0.10014827373541063, 0.03844587028854423, -0....
153050    [0.09030998198847685, -0.006159352030538555, 0...
153051    [0.09226934280660418, -0.0012405974686973626, ...
153081    [0.05276074649935419, 0.03649634984321892, 0.0...
Name: ingredients, Length: 41438, dtype: object

#### Делаем tf-idf

In [41]:
if 'tf' in globals():
    del tf
tf = pd.DataFrame(index=df.index.rename('recipe_id'), 
                  columns=all_ingredients_list,
                  data=0,
                  dtype=np.float64)
for index, count in tf.iterrows():
    for ingredient in ingredients_lemmatized.loc[index]:
        count[ingredient] += 1
    count /= len(ingredients_lemmatized.loc[index])
tf

Unnamed: 0_level_0,"(PAD,)","(UNK,)","(мизуна,)","(боровик,)","(свиной, бескостный, шея)","(зернистый, горчица)","(консервированный, краб)","(шафрановый, соус)","(булочка, кунжут)","(коктейль, морепродукт)",...,"(ржаной, хлеб, семечко)","(безе,)","(гороховый, смесь)","(соус, кабуля)","(безалкогольный, сидр)","(сыр, пармезан, корка)","(свежий, сердцевина, артишок)","(сливовый, вино)","(свежий, шампиньон)","(смесь, перец)"
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
idf = dict(zip(tf.columns, iterate(lambda _: 1, 1)))
for recipe in ingredients_lemmatized.values:
    for ingredient in recipe:
        idf[ingredient] += 1
idf = pd.Series(index=pd.Index(idf.keys(), tupleize_cols=False), data=idf.values(), dtype=np.float64)
idf /= len(tf.index)
idf = -idf.apply(np.log)
idf

(PAD,)                           10.631954
(UNK,)                           10.631954
(мизуна,)                         8.840194
(боровик,)                        9.938806
(свиной, бескостный, шея)         9.245659
                                   ...    
(сыр, пармезан, корка)            8.840194
(свежий, сердцевина, артишок)     9.938806
(сливовый, вино)                  8.840194
(свежий, шампиньон)               4.869902
(смесь, перец)                    5.455804
Length: 4406, dtype: float64

In [43]:
if 'tfidf' in globals():
    del tfidf
tfidf = tf*idf
tfidf

Unnamed: 0_level_0,"(PAD,)","(UNK,)","(мизуна,)","(боровик,)","(свиной, бескостный, шея)","(зернистый, горчица)","(консервированный, краб)","(шафрановый, соус)","(булочка, кунжут)","(коктейль, морепродукт)",...,"(ржаной, хлеб, семечко)","(безе,)","(гороховый, смесь)","(соус, кабуля)","(безалкогольный, сидр)","(сыр, пармезан, корка)","(свежий, сердцевина, артишок)","(сливовый, вино)","(свежий, шампиньон)","(смесь, перец)"
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Из tfidf берется строка соответствующая рецепту и ингредиенты только из рецепта
# Их значение умножается на эмбеддинги этих ингредиентов и складываются (т.е. вектор умнож. на матрицу)
recipe_embedding_sum_tf_idf = pd.Series([
    tfidf.iloc[i][ingredient_list]@np.array([ingredient_to_embedding(ingredient)
        for ingredient in ingredient_list]) 
    for i,ingredient_list in enumerate(ingredients_lemmatized)
], index=ingredients_lemmatized.index)
recipe_embedding_sum_tf_idf

id
14247     [0.2312899307317464, -0.037986292018750534, 0....
14248     [-0.047695562428056684, 0.13134068914467661, 0...
14249     [0.26862193160372755, -0.07282666881717992, 0....
14250     [0.23934587523872475, -0.08555719879431486, -0...
14251     [0.09671493660172885, 0.027734709358337214, -0...
                                ...                        
153048    [0.15586463496750225, 0.04398976159448573, -0....
153049    [0.3334614143747073, 0.06992642450949106, -0.0...
153050    [0.2597255528548992, -0.02830162438745916, 0.0...
153051    [0.21304318309636022, -0.05760207030286996, -0...
153081    [0.15346220902734348, 0.08630516904926387, 0.0...
Length: 41438, dtype: object

In [45]:
recipe_embedding_avg_tf_idf = pd.Series([
    tfidf.iloc[i][ingredient_list]@np.array([ingredients_embeddings_dict[ingredient]
        for ingredient in ingredient_list])/len(ingredient_list)
    for i,ingredient_list in enumerate(ingredients_lemmatized)
], index=ingredients_lemmatized.index)
recipe_embedding_avg_tf_idf

id
14247     [0.0289112413414683, -0.004748286502343817, 0....
14248     [-0.009539112485611336, 0.026268137828935324, ...
14249     [0.04477032193395459, -0.012137778136196653, 0...
14250     [0.039890979206454125, -0.01425953313238581, -...
14251     [0.013816419514532692, 0.003962101336905316, -...
                                ...                        
153048    [0.022266376423928893, 0.006284251656355104, -...
153049    [0.037051268263856366, 0.007769602723276784, -...
153050    [0.01855182520392137, -0.0020215445991042255, ...
153051    [0.02367146478848447, -0.006400230033652217, -...
153081    [0.01395110991157668, 0.007845924459023988, 0....
Length: 41438, dtype: object

### Все нужные данные переводим в pickle

In [47]:
df_for_learning = pd.DataFrame(data={
    'recipe_embedding_sum': recipe_embedding_sum,
    'recipe_embedding_avg': recipe_embedding_avg,
    'recipe_embedding_sum_tf_idf': recipe_embedding_sum_tf_idf,
    'recipe_embedding_avg_tf_idf': recipe_embedding_avg_tf_idf,
    'course_mark': [all_courses[course] for course in df['course']],
    'cuisine_group_mark': [all_cuisines[inverse_cuisines_groups[cuisine]] for cuisine in df['cuisine']]
#     'cuisine_group_mark': [all_cuisines[cuisine] for cuisine in df['cuisine']]
}, index=df.index)
df_for_learning

Unnamed: 0_level_0,recipe_embedding_sum,recipe_embedding_avg,recipe_embedding_sum_tf_idf,recipe_embedding_avg_tf_idf,course_mark,cuisine_group_mark
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
14247,"[0.7618384752422571, 0.08853759150952101, 0.15...","[0.09522980940528214, 0.011067198938690126, 0....","[0.2312899307317464, -0.037986292018750534, 0....","[0.0289112413414683, -0.004748286502343817, 0....",2,4
14248,"[0.13557584770023823, 0.3944497350603342, -0.0...","[0.027115169540047647, 0.07888994701206684, -0...","[-0.047695562428056684, 0.13134068914467661, 0...","[-0.009539112485611336, 0.026268137828935324, ...",1,4
14249,"[0.5860096383839846, 0.07307140016928315, 0.18...","[0.09766827306399743, 0.012178566694880525, 0....","[0.26862193160372755, -0.07282666881717992, 0....","[0.04477032193395459, -0.012137778136196653, 0...",6,4
14250,"[0.4274589493870735, -0.17271707206964493, -0....","[0.07124315823117892, -0.028786178678274155, -...","[0.23934587523872475, -0.08555719879431486, -0...","[0.039890979206454125, -0.01425953313238581, -...",6,4
14251,"[0.436641464009881, 0.20584300439804792, 0.027...","[0.062377352001411576, 0.029406143485435417, 0...","[0.09671493660172885, 0.027734709358337214, -0...","[0.013816419514532692, 0.003962101336905316, -...",6,4
...,...,...,...,...,...,...
153048,"[0.38410141598433256, 0.13397947791963816, -0....","[0.05487163085490465, 0.019139925417091166, -0...","[0.15586463496750225, 0.04398976159448573, -0....","[0.022266376423928893, 0.006284251656355104, -...",8,2
153049,"[0.9013344636186957, 0.3460128325968981, -0.23...","[0.10014827373541063, 0.03844587028854423, -0....","[0.3334614143747073, 0.06992642450949106, -0.0...","[0.037051268263856366, 0.007769602723276784, -...",8,2
153050,"[1.264339747838676, -0.08623092842753977, 0.37...","[0.09030998198847685, -0.006159352030538555, 0...","[0.2597255528548992, -0.02830162438745916, 0.0...","[0.01855182520392137, -0.0020215445991042255, ...",8,2
153051,"[0.8304240852594376, -0.011165377218276262, -0...","[0.09226934280660418, -0.0012405974686973626, ...","[0.21304318309636022, -0.05760207030286996, -0...","[0.02367146478848447, -0.006400230033652217, -...",8,2


In [48]:
dict_to_pickle = {
    'df_for_learning': df_for_learning,
    'all_courses': all_courses,
    'all_cuisines': all_cuisines,
    'word_embeddings_dict': word_embeddings_dict,
    'ingredients_numbering_dict': ingredients_numbering_dict,
    'dim': dim,
    'ingredients_lemmatized': ingredients_lemmatized,
    'all_ingredients_list': all_ingredients_list,
    'ingredients_embeddings_dict': ingredients_embeddings_dict,
}
with data_path.joinpath('processed_data_dict.pkl').open(mode='wb') as f:
    pickle.dump(dict_to_pickle, f)