# TODO
- レシピデータをCSV出力（低次元化用）
- インタラクション・ユーザデータを含むグラフデータを出力（Graph Embedding用）

In [35]:
import os,datetime
import pandas as pd
import numpy as np
from ast import literal_eval

folder = '../../data/foodcom/'

In [36]:
# データの読み込みと前処理
raw_recipes = pd.read_csv(folder+"RAW_recipes.csv")
pp_recipes = pd.read_csv(folder+"PP_recipes.csv")

# nutrition列を分割して数値に変換
raw_recipes[['calories','fat','sugar','sodium','protein','saturated fat','carbohydrates']] = raw_recipes['nutrition']\
    .str.replace('[','').str.replace(']','').str.split(",",expand=True).astype('float')

raw_recipes['submitted'] = pd.to_datetime(raw_recipes['submitted'], format='%Y-%m-%d')
raw_recipes['tags'] = raw_recipes['tags'].apply(literal_eval)
raw_recipes['steps'] = raw_recipes['steps'].apply(literal_eval)

pp_recipes['ingredient_ids'] = pp_recipes['ingredient_ids'].apply(literal_eval)
pp_recipes['techniques'] = pp_recipes['techniques'].apply(literal_eval)

# technique_dataをDataFrameに変換
technique_data = [
    'bake','barbecue','blanch','blend','boil','braise','brine','broil','caramelize','combine','crock pot',
    'crush','deglaze','devein','dice','distill','drain','emulsify','ferment','freez','fry','grate','griddle',
    'grill','knead','leaven','marinate','mash','melt','microwave','parboil','pickle','poach','pour','pressure cook',
    'puree','refrigerat','roast','saute','scald','scramble','shred','simmer','skillet','slow cook','smoke','smooth',
    'soak','sous-vide','steam','stew','strain','tenderize','thicken','toast','toss','whip','whisk'
]
technique_data = pd.DataFrame({'technique_id': range(len(technique_data)), 'technique_name': technique_data})

# technique_idsをマッピング
pp_recipes['technique_ids'] = pp_recipes['techniques'].apply(lambda x: [i for i, t in zip(range(len(technique_data)), technique_data['technique_id']) if t in x])

# データのマージと選択カラムの抽出
recipe_data = pp_recipes.merge(right=raw_recipes, on="id")[['id', 'name', 'description', 'n_steps', 'n_ingredients', 'ingredient_ids', 'technique_ids', 'steps', 'minutes', 'contributor_id', 'submitted', 'tags', 'calories', 'fat', 'sugar', 'sodium', 'protein', 'saturated fat', 'carbohydrates']].rename(columns={'id':'recipe_id','name':'recipe_name'})

# ingr_map.pklからingredient_dataを取得
mapping = pd.read_pickle(folder+"ingr_map.pkl")
ingredient_data = mapping[~mapping.duplicated(subset='id')][['id','replaced']].rename(columns={'id':'ingredient_id','replaced':'ingredient_name'})

# 表示
display('recipe_data', len(recipe_data), recipe_data.head(1))
display('ingredient_data', len(ingredient_data), ingredient_data.head(1))
display('technique_data', len(technique_data), technique_data.head(1))

'recipe_data'

178265

Unnamed: 0,recipe_id,recipe_name,description,n_steps,n_ingredients,ingredient_ids,technique_ids,steps,minutes,contributor_id,submitted,tags,calories,fat,sugar,sodium,protein,saturated fat,carbohydrates
0,424415,aromatic basmati rice rice cooker,from the ultimate rice cooker cookbook. the a...,6,5,"[389, 7655, 6270, 1527, 3406]","[0, 1]","[rinse the rice in a fine strainer , then drai...",61,496803,2010-05-10,"[weeknight, time-to-make, course, main-ingredi...",228.2,2.0,2.0,8.0,9.0,1.0,15.0


'ingredient_data'

8023

Unnamed: 0,ingredient_id,ingredient_name
0,4308,lettuce


'technique_data'

58

Unnamed: 0,technique_id,technique_name
0,0,bake


In [37]:
# 辞書を作成して、ルックアップを高速化
ingredient_id_to_name = ingredient_data.set_index('ingredient_id')['ingredient_name'].to_dict()
technique_id_to_name = technique_data.set_index('technique_id')['technique_name'].to_dict()

def replace_ingredient_ids_with_names(ingredient_ids_list):
    return [ingredient_id_to_name.get(id) for id in ingredient_ids_list]

def replace_technique_ids_with_names(technique_ids_list):
    return [technique_id_to_name.get(id) for id in technique_ids_list]

# recipe_dataのingredient_idsとtechnique_idsを置換
recipe_data['ingredient_names'] = recipe_data['ingredient_ids'].apply(replace_ingredient_ids_with_names)
recipe_data['technique_names'] = recipe_data['technique_ids'].apply(replace_technique_ids_with_names)

In [38]:
recipe_data.head(1)

Unnamed: 0,recipe_id,recipe_name,description,n_steps,n_ingredients,ingredient_ids,technique_ids,steps,minutes,contributor_id,...,tags,calories,fat,sugar,sodium,protein,saturated fat,carbohydrates,ingredient_names,technique_names
0,424415,aromatic basmati rice rice cooker,from the ultimate rice cooker cookbook. the a...,6,5,"[389, 7655, 6270, 1527, 3406]","[0, 1]","[rinse the rice in a fine strainer , then drai...",61,496803,...,"[weeknight, time-to-make, course, main-ingredi...",228.2,2.0,2.0,8.0,9.0,1.0,15.0,"[basmati rice, water, salt, cinnamon stick, gr...","[bake, barbecue]"


In [39]:
interaction_data = pd.read_csv(folder+'RAW_interactions.csv')
interaction_data['date'] = interaction_data['date'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d'))
interaction_data = interaction_data.sort_values(by="date")

display('interaction_data',len(interaction_data),interaction_data.head(1))
# display('user_data',len(user_data),user_data.head(1))

'interaction_data'

1132367

Unnamed: 0,user_id,recipe_id,date,rating,review
514604,2008,992,2000-01-25,5,better than any you can get at a restaurant!


In [40]:
recipe_data['description'] = recipe_data['description'].fillna(value='')

In [41]:
# レシピデータとインタラクションデータをマージしてから計算することで高速化できます
recipe_interaction_data = recipe_data.merge(interaction_data, on='recipe_id', how='left')

# 平均評価と評価数を一度に計算する
recipe_data['average_rating'] = recipe_interaction_data.groupby('recipe_id')['rating'].mean()
recipe_data['n_ratings'] = recipe_interaction_data.groupby('recipe_id')['rating'].count()
recipe_data['average_rating'] = recipe_data['average_rating'].fillna(value=0)
recipe_data['n_ratings'] = recipe_data['n_ratings'].fillna(value=0)

In [42]:
# 評価数が10以上のレシピデータから平均評価と評価数が多い上位1000件をフラグ付けするコード
top_rated_recipes = recipe_data[recipe_data['n_ratings'] >= 10].nlargest(1000, ['average_rating', 'n_ratings'])
top_rated_recipes['top_1000'] = True

# 上位1000件にフラグが立っていないレシピにはFalseを設定
recipe_data = recipe_data.merge(top_rated_recipes[['recipe_id', 'top_1000']], on='recipe_id', how='left')
recipe_data['top_1000'] = recipe_data['top_1000'].fillna(False).astype(bool)

  recipe_data['top_1000'] = recipe_data['top_1000'].fillna(False).astype(bool)


In [43]:
recipe_data = recipe_data.astype({col: int for col in recipe_data.select_dtypes(['int64']).columns})
recipe_data = recipe_data.astype({col: float for col in recipe_data.select_dtypes(['float64']).columns})
recipe_data['submitted'] = recipe_data['submitted'].dt.strftime('%Y-%m-%d')

In [44]:
interaction_data['date'] = interaction_data['date'].dt.strftime('%Y-%m-%d')
interaction_data['review'] = interaction_data['review'].fillna(value='')

In [45]:
recipe_data.to_csv('../../data/foodcom_recipes.csv', index=False)
interaction_data.to_csv('../../data/foodcom_interactions.csv', index=False)

In [46]:
import pandas as pd

# サンプルDataFrameの作成
df = pd.DataFrame({
    'group': ['A', 'A', 'B', 'B'],
    'data1': [10, 20, 30, 40],
    'data2': [100, 200, 300, 400]
})

# groupカラムでグループ化し、各グループの各列を辞書化したもののリストを値とする辞書を作成
result_dict = {group: group_df.drop('group', axis=1).to_dict('records') for group, group_df in df.groupby('group')}

result_dict


{'A': [{'data1': 10, 'data2': 100}, {'data1': 20, 'data2': 200}],
 'B': [{'data1': 30, 'data2': 300}, {'data1': 40, 'data2': 400}]}

In [47]:
# result_dict = {group: group_df.drop('recipe_id', axis=1).to_dict('records') for group, group_df in interaction_data.groupby('recipe_id', sort=False)}


In [48]:
# for k,v in result_dict.items():
#     if len(v) >= 10:
#         print(k,len(v))

In [49]:
from tqdm import tqdm
import json

# interaction_dataをレシピIDごとにグループ化し、辞書に格納します。
interaction_groups = {group: group_df.drop('recipe_id', axis=1).to_dict('records') for group, group_df in interaction_data.groupby('recipe_id', sort=False)}
# レシピデータに対して関数を適用し、必要な情報を含む辞書を作成します。
def create_recipe_dict(id, row):
    rid = row['recipe_id']
    interactions = interaction_groups.get(rid, [])
    data = {
        'id': rid,
        'name': row['recipe_name'],
        'description': row['description'],
        'tags': row['tags'],
        'minutes': row['minutes'],
        'contributor_id': row['contributor_id'],
        'techniques': {
            'num': len(row['technique_ids']),
            'names': row['technique_names'],
            'ids': row['technique_ids'],
        },
        'ingredients': {
            'num': row['n_ingredients'],
            'names': row['ingredient_names'],
            'ids': row['ingredient_ids']
        },
        'steps': {
            'num': row['n_steps'],
            'instructions': row['steps'],
        },
        'nutrition': {
            'calories': row['calories'],
            'protein': row['protein'],
            'carbohydrates': row['carbohydrates'],
            'fat': row['fat'],
            'sugar': row['sugar'],
            'sodium': row['sodium'],
            'saturated_fat': row['saturated fat'],
        },
        'rating': {
            'interactions': interactions,
            'average': row['average_rating'],
            'n': row['n_ratings']
        },
        'top_1000': row['top_1000']
    }
    return data

# tqdmを使用して進捗を表示しながら、apply関数を使用して各行に関数を適用します。
json_list = recipe_data.apply(lambda row: create_recipe_dict(row.name, row), axis=1).tolist()

In [50]:
import json

# jsonl形式で保存するための関数を定義
def save_as_jsonl(data_list, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        for item in data_list:
            json_record = json.dumps(item, ensure_ascii=False)
            file.write(json_record + '\n')

# json_listをjsonlファイルとして保存
save_as_jsonl(json_list, '../../data/foodcom_recipes.jsonl')

In [51]:
interactions = interaction_groups.get(109432, [])
interactions

[{'user_id': 37449,
  'date': '2005-02-01',
  'rating': 4,
  'review': 'This definitely was a colorful slaw! I enjoyed the flavors of the cumin and chili powder. The balsamic vinegar was a nice touch too. Thanks Jewell!'}]