In [8]:
import pandas as pd
import re
import json
from concurrent.futures import ProcessPoolExecutor
from batch_processing import process_batch, RecipeProcessor

In [9]:
def process_in_batches(recipe_df, ingr_mapping_df, ingredient_dic, batch_size=10000):
    """Apply ``process_batch`` to ``recipe_df`` chunk by chunk in worker processes.

    The frame is cut into consecutive row slices of at most ``batch_size``
    rows. Each slice is copied and submitted to a ``ProcessPoolExecutor``
    together with ``ingr_mapping_df`` and ``ingredient_dic``; the per-batch
    results are collected in submission order and concatenated.

    Parameters
    ----------
    recipe_df : pandas.DataFrame
        Frame to process; sliced positionally with ``.iloc``.
    ingr_mapping_df, ingredient_dic
        Passed through unchanged to every ``process_batch`` call.
    batch_size : int, default 10000
        Maximum number of rows per batch. Must be positive.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all batch results with a fresh 0..n-1 index
        (``ignore_index=True``). For an empty ``recipe_df`` an empty copy of
        the input is returned and ``process_batch`` is never invoked, so any
        columns it would normally add are absent.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # Previously 0 surfaced as ZeroDivisionError and negatives as an
        # unrelated "No objects to concatenate" error from pd.concat.
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    total_rows = len(recipe_df)
    if total_rows == 0:
        # pd.concat([]) raises ValueError; there is nothing to process, so skip
        # spinning up the process pool entirely.
        return recipe_df.iloc[0:0].copy()

    with ProcessPoolExecutor() as executor:
        # Stepping by batch_size covers the trailing partial batch without a
        # separately computed batch count.
        futures = [
            executor.submit(
                process_batch,
                recipe_df.iloc[start:start + batch_size].copy(),
                ingr_mapping_df,
                ingredient_dic,
            )
            for start in range(0, total_rows, batch_size)
        ]
        # Collect in submission order so the output preserves the input row order.
        # NOTE(review): process_batch is assumed to return a DataFrame (pd.concat needs one) — confirm.
        processed_batches = [future.result() for future in futures]

    return pd.concat(processed_batches, ignore_index=True)

In [10]:
if __name__ == "__main__":
    # Location of the ingredient table and the column labels applied to it.
    ingr_path = 'data/flavor_network_data/ingr_comp/ingr_info.tsv'
    ingr_columns = ['ingredient_id', 'ingredient_name', 'ingredient_category']

    # Recipe table that is later split into batches.
    recipe_df = pd.read_csv("data/full_dataset.csv")

    # Tab-separated ingredient table; the labels read from the file are
    # replaced with ingr_columns below.
    ingr_tsv = pd.read_csv(ingr_path, sep='\t')
    ingr_mapping_df = pd.DataFrame(data=ingr_tsv)
    ingr_mapping_df.columns = ingr_columns

    # JSON object handed to process_in_batches alongside the mapping frame.
    with open('./ingredient_dic.json', 'r') as json_file:
        ingredient_dic = json.load(json_file)


In [11]:
# Run the batched processing with the function's default batch_size.
super_final_df = process_in_batches(
    recipe_df=recipe_df,
    ingr_mapping_df=ingr_mapping_df,
    ingredient_dic=ingredient_dic,
)

In [12]:
# Preview the first five processed rows.
super_final_df.head(5)

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER,total_ingredients,matched_ingredients,matched_percentage,...,rue_oil,roasted_coconut,cajeput_oil,star_anise,feijoa,tobacco_oil,cinnamon_leaf,green_tea,artemisia_porrecta_oil,munster_cheese
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[milk, butter, brown_sugar, nut, bite_size_shr...",6,4,66.666667,...,0,0,0,0,0,0,0,0,0,0
1,9,Millionaire Pie,"[""1 large container Cool Whip"", ""1 large can c...","[""Empty Cool Whip into a bowl."", ""Drain juice ...",www.cookbooks.com/Recipe-Details.aspx?id=794547,Gathered,"[condensed_milk, pecan, lemon, graham_cracker_...",5,3,60.0,...,0,0,0,0,0,0,0,0,0,0
2,22,Cuddy Farms Marinated Turkey,"[""2 c. 7-Up or Sprite"", ""1 c. vegetable oil"", ...","[""Buy whole turkey breast; remove all skin and...",www.cookbooks.com/Recipe-Details.aspx?id=9449,Gathered,"[soy_sauce, garlic, vegetable_oil]",3,3,100.0,...,0,0,0,0,0,0,0,0,0,0
3,24,Prize-Winning Meat Loaf,"[""1 1/2 lb. ground beef"", ""1 c. tomato juice"",...","[""Mix well."", ""Press firmly into an 8 1/2 x 4 ...",www.cookbooks.com/Recipe-Details.aspx?id=923674,Gathered,"[egg, tomato_juice, pepper, ground_beef, salt,...",7,5,71.428571,...,0,0,0,0,0,0,0,0,0,0
4,33,Potato And Cheese Pie,"[""3 eggs"", ""1 tsp. salt"", ""1/4 tsp. pepper"", ""...","[""Beat eggs, salt and pepper until well blende...",www.cookbooks.com/Recipe-Details.aspx?id=784386,Gathered,"[pepper, cheddar_cheese, egg, potato, salt, gr...",6,4,66.666667,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Relabel the 'Unnamed: 0' column as 'id', modifying the frame in place.
super_final_df.rename({'Unnamed: 0': 'id'}, axis='columns', inplace=True)

In [14]:
# Write the processed frame as a gzip-compressed CSV; floats are formatted
# with two decimals and the row index is not written.
super_final_df.to_csv(
    'data/super_final_df.csv.gz',
    index=False,
    compression='gzip',
    float_format='%.2f',
)

In [7]:
# Reload the processed frame from the gzip archive written by the export cell.
# The previous path ('data/super_final_df.csv') is never produced by this
# notebook, so a fresh Restart & Run All could not find it.
super_final_df = pd.read_csv('data/super_final_df.csv.gz', compression='gzip')