# TODO
- レシピデータをCSV出力（低次元化用）
- インタラクション・ユーザデータを含むグラフデータを出力（Graph Embedding用）

In [13]:
import os,datetime
import pandas as pd
import numpy as np
from ast import literal_eval

folder = '../../data/foodcom/'

In [14]:
raw_recipes = pd.read_csv(folder+"RAW_recipes.csv")
pp_recipes = pd.read_csv(folder+"PP_recipes.csv")

raw_recipes[['calories','fat','sugar','sodium','protein','saturated fat','carbohydrates']] = raw_recipes.nutrition.str.split(",",expand=True)
raw_recipes['calories'] =  raw_recipes['calories'].apply(lambda x: x.replace('[',''))
raw_recipes['carbohydrates'] =  raw_recipes['carbohydrates'].apply(lambda x: x.replace(']',''))
raw_recipes['submitted'] = raw_recipes['submitted'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))
raw_recipes[['calories','fat','sugar','sodium','protein','saturated fat','carbohydrates']] = raw_recipes[['calories','fat','sugar','sodium','protein','saturated fat','carbohydrates']].astype('float')
raw_recipes['tags'] = raw_recipes['tags'].apply(lambda x: literal_eval(x))
pp_recipes['ingredient_ids'] = pp_recipes['ingredient_ids'].apply(lambda x: literal_eval(x))
pp_recipes['techniques'] = pp_recipes['techniques'].apply(lambda x: literal_eval(x))
technique_data = [
    'bake','barbecue','blanch','blend','boil','braise','brine','broil','caramelize','combine','crock pot',
    'crush','deglaze','devein','dice','distill','drain','emulsify','ferment','freez','fry','grate','griddle',
    'grill','knead','leaven','marinate','mash','melt','microwave','parboil','pickle','poach','pour','pressure cook',
    'puree','refrigerat','roast','saute','scald','scramble','shred','simmer','skillet','slow cook','smoke','smooth',
    'soak','sous-vide','steam','stew','strain','tenderize','thicken','toast','toss','whip','whisk'
]
technique_data = pd.DataFrame([[i,t] for i,t in enumerate(technique_data)], columns=['technique_id','technique_name'])
def mapping_tech(x):
    return [t for v,t in (zip(x,technique_data.loc[:,'technique_id'])) if v == 1]
pp_recipes['technique_ids'] = pp_recipes['techniques'].apply(mapping_tech)
recipe_data = pp_recipes.merge(right=raw_recipes, left_on="id", right_on="id")
# display(recipe_data.head())
recipe_data = recipe_data[[
    'id', 'name', 'description', 'n_steps', 'n_ingredients',
    'ingredient_ids', 'technique_ids', 'steps',
    'minutes', 'contributor_id', 'submitted', 'tags',
    'calories', 'fat', 'sugar', 'sodium', 'protein', 'saturated fat', 'carbohydrates'
]].rename(columns={'id':'recipe_id','name':'recipe_name'})
mapping = pd.read_pickle(folder+"ingr_map.pkl")
ingredient_data = mapping[~mapping.duplicated(subset='id')][['id','replaced']].rename(columns={'id':'ingredient_id','replaced':'ingredient_name'})

display('recipe_data',len(recipe_data),recipe_data.head(1))
display('ingredient_data',len(ingredient_data),ingredient_data.head(1))
display('technique_data',len(technique_data),technique_data.head(1))

'recipe_data'

178265

Unnamed: 0,recipe_id,recipe_name,description,n_steps,n_ingredients,ingredient_ids,technique_ids,steps,minutes,contributor_id,submitted,tags,calories,fat,sugar,sodium,protein,saturated fat,carbohydrates
0,424415,aromatic basmati rice rice cooker,from the ultimate rice cooker cookbook. the a...,6,5,"[389, 7655, 6270, 1527, 3406]","[9, 16, 51]","['rinse the rice in a fine strainer , then dra...",61,496803,2010-05-10,"[weeknight, time-to-make, course, main-ingredi...",228.2,2.0,2.0,8.0,9.0,1.0,15.0


'ingredient_data'

8023

Unnamed: 0,ingredient_id,ingredient_name
0,4308,lettuce


'technique_data'

58

Unnamed: 0,technique_id,technique_name
0,0,bake


In [15]:
# 辞書を作成して、ルックアップを高速化
ingredient_id_to_name = ingredient_data.set_index('ingredient_id')['ingredient_name'].to_dict()
technique_id_to_name = technique_data.set_index('technique_id')['technique_name'].to_dict()

def replace_ingredient_ids_with_names(ingredient_ids_list):
    return [ingredient_id_to_name.get(id) for id in ingredient_ids_list]

def replace_technique_ids_with_names(technique_ids_list):
    return [technique_id_to_name.get(id) for id in technique_ids_list]

# recipe_dataのingredient_idsとtechnique_idsを置換
recipe_data['ingredient_names'] = recipe_data['ingredient_ids'].apply(replace_ingredient_ids_with_names)
recipe_data['technique_names'] = recipe_data['technique_ids'].apply(replace_technique_ids_with_names)

In [16]:
recipe_data.head(1)

Unnamed: 0,recipe_id,recipe_name,description,n_steps,n_ingredients,ingredient_ids,technique_ids,steps,minutes,contributor_id,...,tags,calories,fat,sugar,sodium,protein,saturated fat,carbohydrates,ingredient_names,technique_names
0,424415,aromatic basmati rice rice cooker,from the ultimate rice cooker cookbook. the a...,6,5,"[389, 7655, 6270, 1527, 3406]","[9, 16, 51]","['rinse the rice in a fine strainer , then dra...",61,496803,...,"[weeknight, time-to-make, course, main-ingredi...",228.2,2.0,2.0,8.0,9.0,1.0,15.0,"[basmati rice, water, salt, cinnamon stick, gr...","[combine, drain, strain]"


In [17]:
interaction_data = pd.read_csv(folder+'RAW_interactions.csv')
interaction_data['date'] = interaction_data['date'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d'))
interaction_data = interaction_data.sort_values(by="date")

display('interaction_data',len(interaction_data),interaction_data.head(1))
# display('user_data',len(user_data),user_data.head(1))

'interaction_data'

1132367

Unnamed: 0,user_id,recipe_id,date,rating,review
514604,2008,992,2000-01-25,5,better than any you can get at a restaurant!


In [18]:
recipe_data['average_rating'] = recipe_data['recipe_id'].apply(
    lambda x: interaction_data[interaction_data['recipe_id'] == x]['rating'].mean() if x in interaction_data['recipe_id'].values else np.nan
)

recipe_data['n_ratings'] = recipe_data['recipe_id'].apply(
    lambda x: len(interaction_data[interaction_data['recipe_id'] == x]) if x in interaction_data['recipe_id'].values else 0
)

In [19]:
# 評価数が10以上のレシピデータから平均評価と評価数が多い上位1000件をフラグ付けするコード
top_rated_recipes = recipe_data[recipe_data['n_ratings'] >= 10].nlargest(1000, ['average_rating', 'n_ratings'])
top_rated_recipes['top_1000'] = True

# 上位1000件にフラグが立っていないレシピにはFalseを設定
recipe_data = recipe_data.merge(top_rated_recipes[['recipe_id', 'top_1000']], on='recipe_id', how='left')
recipe_data['top_1000'] = recipe_data['top_1000'].fillna(False).astype(bool)

  recipe_data['top_1000'] = recipe_data['top_1000'].fillna(False).astype(bool)


In [20]:
recipe_data = recipe_data.astype({col: int for col in recipe_data.select_dtypes(['int64']).columns})
recipe_data = recipe_data.astype({col: float for col in recipe_data.select_dtypes(['float64']).columns})
recipe_data['submitted'] = recipe_data['submitted'].dt.strftime('%Y-%m-%d')

In [21]:
interaction_data['date'] = interaction_data['date'].dt.strftime('%Y-%m-%d')

In [22]:
recipe_data.to_csv('../../data/foodcom_recipes.csv', index=False)
interaction_data.to_csv('../../data/foodcom_interactions.csv', index=False)

In [23]:
from tqdm import tqdm
import json

# interaction_dataをレシピIDごとにグループ化し、辞書に格納します。
interaction_groups = interaction_data.groupby('recipe_id').apply(lambda df: df.to_dict(orient='records'), include_groups=False).to_dict()
# レシピデータに対して関数を適用し、必要な情報を含む辞書を作成します。
def create_recipe_dict(id, row):
    rid = row['recipe_id']
    interactions = interaction_groups.get(rid, [])
    data = {
        'id': rid,
        'name': row['recipe_name'],
        'description': row['description'],
        'tags': row['tags'],
        'minutes': row['minutes'],
        'contributor_id': row['contributor_id'],
        'techniques': {
            'names': row['technique_names'],
            'ids': row['technique_ids'],
        },
        'ingredients': {
            'num': row['n_ingredients'],
            'names': row['ingredient_names'],
            'ids': row['ingredient_ids']
        },
        'steps': {
            'num': row['n_steps'],
            'instructions': row['steps'],
        },
        'nutrition': {
            'calories': row['calories'],
            'protein': row['protein'],
            'carbohydrates': row['carbohydrates'],
            'fat': row['fat'],
            'sugar': row['sugar'],
            'sodium': row['sodium'],
            'saturated_fat': row['saturated fat'],
        },
        'rating': {
            'interactions': interactions,
            'average': row['average_rating'],
            'n': row['n_ratings']
        },
        'top_1000': row['top_1000']
    }
    return data

# tqdmを使用して進捗を表示しながら、apply関数を使用して各行に関数を適用します。
json_list = recipe_data.apply(lambda row: create_recipe_dict(row.name, row), axis=1).tolist()

In [24]:
import json

# jsonl形式で保存するための関数を定義
def save_as_jsonl(data_list, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        for item in data_list:
            json_record = json.dumps(item, ensure_ascii=False)
            file.write(json_record + '\n')

# json_listをjsonlファイルとして保存
save_as_jsonl(json_list, '../../data/foodcom_recipes.jsonl')