In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell runs, so edits to imported code (e.g. the scripts.scraper
# module used later in this notebook) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import pandas as pd
import opendatasets as od
import re

# --- Dataset location (local folder + remote source) ---
KAGGLE_DATASET_URL = "https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions"
DATA_DIR = "data"

# If the kernel was started from a path ending in 'notebooks', step up one
# level so that the relative paths below resolve from the parent directory.
launch_dir = os.getcwd()
if launch_dir.endswith('notebooks'):
    os.chdir('..')

print(f"Current Working Directory: {os.getcwd()}")

# Ensure the target folder exists, then fetch the dataset into it.
os.makedirs(DATA_DIR, exist_ok=True)
od.download(KAGGLE_DATASET_URL, data_dir=DATA_DIR)

Current Working Directory: /home/hsozer/Projects/DeepLearning/recipe_bot
Skipping, found downloaded files in "data/food-com-recipes-and-user-interactions" (use force=True to force download)


In [3]:
from scripts.scraper import sitemap_category_extract, recipe_harvester

# --- Scraper configuration ---
SITEMAP_URL = "https://www.allrecipes.com/sitemap.xml"
# Filter string handed to sitemap_category_extract to select category URLs.
CATEGORY_FILTER = "/appetizers-and-snacks/"
# Two URL shapes are accepted: ".../recipe/<digits>" anywhere in the URL, or
# "...-recipe-<digits>" at the very end (optionally followed by a slash).
RECIPE_PATTERN = re.compile(r'(/recipe/\d+)|(-recipe-\d+/?$)')
# Substrings passed to recipe_harvester as exclusions.
EXCLUDE_LIST = ['/gallery/', '/article/']

# Phase 1: collect category page URLs from the sitemap.
categories = sitemap_category_extract(SITEMAP_URL, CATEGORY_FILTER)

if categories:
    print(f"\n--- Moving to Harvest Phase ({len(categories)} Categories) ---")

    # Phase 2: collect recipe URLs from the category pages.
    recipe_url_dataset = recipe_harvester(
        category_urls=categories,
        url_pattern=RECIPE_PATTERN,
        exclude_substrings=EXCLUDE_LIST,
    )

    # Persist the harvested URLs as a single-column CSV for the scraping cell.
    df = pd.DataFrame(recipe_url_dataset, columns=["recipe_url"])
    os.makedirs(DATA_DIR, exist_ok=True)

    output_path = os.path.join(DATA_DIR, "harvested_recipe_urls.csv")
    df.to_csv(output_path, index=False)

    # The check-mark emoji was previously stored mis-encoded ("‚úÖ").
    print(f"\n✅ Success! {len(df)} Recipe URLs saved to: {output_path}")

else:
    print("No categories found with the specified filter.")

🗺️  Fetching Sitemap Index...
🔍 Found 4 sub-sitemaps. Scanning for Appetizers and Snacks...
✅ Found 92 URLs matching '/appetizers-and-snacks/'.

--- Moving to Harvest Phase (92 Categories) ---
🚜 Starting Harvest on 92 categories...
📂 Processing category: https://www.allrecipes.com/recipes/1281/appetizers-and-snacks/dips-and-spreads/hummus/
📂 Processing category: https://www.allrecipes.com/recipes/1118/appetizers-and-snacks/vegetable/mushrooms/
📂 Processing category: https://www.allrecipes.com/recipes/16580/appetizers-and-snacks/snacks/chips/kale-chips/
📂 Processing category: https://www.allrecipes.com/recipes/16423/holidays-and-events/big-game/appetizers-and-snacks/
📂 Processing category: https://www.allrecipes.com/recipes/435/appetizers-and-snacks/dips-and-spreads/pate/
📂 Processing category: https://www.allrecipes.com/recipes/22451/everyday-cooking/special-collections/food-wishes/appetizers-and-snacks/
üìÇ Processing category: https://www.allrecipes.co

In [None]:
import time
from scripts.scraper import fetch_recipe_data, process_recipes_to_final_format

# --- CONFIGURATION & CONSTANTS ---
# Maps each percent-daily-value output column to
# (source nutrient column, daily reference amount).
# NOTE(review): the PDV arithmetic itself lives in
# process_recipes_to_final_format (not visible here) — presumably
# amount / reference * 100; confirm in scripts.scraper.
FDA_DAILY_VALUES = {
    'total_fat_pdv': ('total_fat_g', 65.0),
    'sugar_pdv': ('sugar_g', 50.0),
    'sodium_pdv': ('sodium_mg', 2400.0),
    'protein_pdv': ('protein_g', 50.0),
    'sat_fat_pdv': ('sat_fat_g', 20.0),
    'carbs_pdv': ('carbs_g', 300.0)
}

# Raw tag text -> normalized tag; handed to process_recipes_to_final_format.
TAG_REPLACEMENTS = {
    'snacks': 'snack',
    'appetizers and snacks': 'snack',
    'appetizers': 'appetizer',
    'side dishes': 'side dish',
    'main dish': 'main-dish',
    'main dishes': 'main-dish',
    'desserts': 'dessert',
    'vegetables': 'vegetable',
    'condiments': 'condiment',
    'sauces': 'sauce'
}

# Pause after each request, in seconds.
REQUEST_DELAY_SECONDS = 1

# All paths derive from DATA_DIR so they stay in sync with the harvest cell,
# which writes harvested_recipe_urls.csv under the same directory.
urls_csv_path = os.path.join(DATA_DIR, "harvested_recipe_urls.csv")
processed_dir = os.path.join(DATA_DIR, "processed")
pkl_output_path = os.path.join(processed_dir, "full_scraped_recipes.pkl")
csv_output_path = os.path.join(processed_dir, "full_scraped_recipes.csv")

# --- EXECUTION ---

# Create the output directory BEFORE the long scrape: previously nothing
# created it, so on a fresh checkout the save step failed only after every
# URL had been fetched, discarding all results.
os.makedirs(processed_dir, exist_ok=True)

df_urls = pd.read_csv(urls_csv_path)
target_urls = df_urls['recipe_url'].tolist()
print(f"CSV loaded. Processing URLs. Total:{len(target_urls)}")

raw_results_list = []
print("\n--Starting Scraping & Formatting--")
for i, url in enumerate(target_urls):
    # Report progress on every tenth URL to keep the log short.
    if i % 10 == 0:
        print(f"Processing [{i+1}/{len(target_urls)}] ...")

    data = fetch_recipe_data(url)
    # Falsy results are skipped rather than stored.
    if data:
        raw_results_list.append(data)

    time.sleep(REQUEST_DELAY_SECONDS)

if raw_results_list:
    print(f"\nScraping finished. Formatting {len(raw_results_list)} recipes...")
    df_final = process_recipes_to_final_format(raw_results_list, FDA_DAILY_VALUES, TAG_REPLACEMENTS)

    if not df_final.empty:
        # 1. Save as Pickle (Preserves Python objects like lists)
        df_final.to_pickle(pkl_output_path)

        # 2. Save as CSV (Human readable / Excel compatible)
        df_final.to_csv(csv_output_path, index=False)

        print("\nSUCCESS! Process Completed.")
        print(f"Total Recipes: {len(df_final)}")
        print(f"Data saved to Pickle: {pkl_output_path}")
        print(f"Data saved to CSV: {csv_output_path}")

        # Display first few rows
        display(df_final.head())
    else:
        print("DataFrame is empty.")
else:
    print("No data retrieved.")

CSV loaded. Processing URLs. Total:2869

--Starting Scraping & Formatting--
Processing [1/2869] ...
Processing [11/2869] ...
Processing [21/2869] ...
Processing [31/2869] ...
Processing [41/2869] ...
Processing [51/2869] ...
Processing [61/2869] ...
Processing [71/2869] ...
Processing [81/2869] ...
Processing [91/2869] ...
Processing [101/2869] ...
Processing [111/2869] ...
Processing [121/2869] ...
Processing [131/2869] ...
Processing [141/2869] ...
Processing [151/2869] ...
Processing [161/2869] ...
Processing [171/2869] ...
Processing [181/2869] ...
Processing [191/2869] ...
Processing [201/2869] ...
Processing [211/2869] ...
Processing [221/2869] ...
Processing [231/2869] ...
Processing [241/2869] ...
Processing [251/2869] ...
Processing [261/2869] ...
Processing [271/2869] ...
Processing [281/2869] ...
Processing [291/2869] ...
Processing [301/2869] ...
Processing [311/2869] ...
Processing [321/2869] ...
Processing [331/2869] ...
Processing [341/2869] ...
Processing [351/2869] ...

Unnamed: 0,name,id,minutes,tags,n_steps,steps,description,ingredients,n_ingredients,calories,...,sodium_pdv,protein_pdv,sat_fat_pdv,carbs_pdv,total_fat_g,sugar_g,sodium_mg,protein_g,sat_fat_g,carbs_g
0,Smoked Salmon Tartlets,c39dc1c0-817c-4de0-9cf0-65ba54c359da,35,"[snack, appetizer, american, canapes and crost...",3,[Preheat the oven to 375 degrees F (190 degree...,Smoked salmon canapes filled with a mixture of...,"[3 ounces cream cheese, softened, 1/3 cup dice...",6,894.0,...,28.1,22.0,60.0,38.3,44.0,31.0,675.0,11.0,12.0,115.0
1,Christmas Tree Cheese Ball,89649cc2-691d-4cf1-835a-a2a10d3f1751,20,"[holidays and events, christmas appetizers, ch...",6,[Spray one side of a 12-inch square piece of p...,This Christmas tree cheese ball is quickly sha...,"[Plastic wrap, Cooking spray, 2 (7.5 ounce) pa...",10,182.0,...,19.5,18.0,30.0,2.3,14.0,2.0,467.0,9.0,6.0,7.0
2,Butternut Squash Fritters,27c2be49-5fb8-4bec-84dc-a5744e136677,15,"[vegetable, appetizer, squash, american, butte...",5,"[Gather all ingredients., Combine the squash, ...",These crispy butternut squash fritters are mad...,"[3 cups shredded butternut squash, 1 cup shred...",8,116.0,...,6.8,8.0,10.0,3.7,6.0,1.0,164.0,4.0,2.0,11.0
3,Sweet Curry Pumpkin Seeds,07a77449-6b36-4ae3-903b-79fc5b7fc182,45,"[snack, indian inspired, appetizer, nuts and s...",3,[Preheat oven to 325 degrees F (165 degrees C)...,"In this quick and easy recipe, pumpkin seeds a...","[1 1/2 cups fresh pumpkin seeds, washed and dr...",5,173.0,...,0.2,10.0,15.0,3.3,14.0,6.0,6.0,5.0,3.0,10.0
4,Air Fryer Honey Sriracha Salmon Bites,ad7d9aa1-2c23-4f77-9413-3ee66726b28d,25,"[seafood, salmon, fish, american, dinner, entree]",5,[Preheat an air fryer to 400 degrees F (200 de...,These air fryer honey Sriracha salmon bites ar...,"[1 tablespoon soy sauce, 1 tablespoon honey, 1...",12,280.0,...,21.6,52.0,15.0,2.3,16.0,6.0,518.0,26.0,3.0,7.0


In [5]:
# Print every field of the first few recipes, one value per line, so that
# list-valued columns (tags, steps, ingredients) can be read in full.
PREVIEW_ROWS = 9

column_names = df_final.columns.tolist()
list1 = column_names  # original name kept in case other cells reference it
print(column_names)

# Clamp to the frame length: a hard-coded range(0, 9) raised IndexError
# whenever fewer than nine recipes had been scraped.
for row_pos in range(min(PREVIEW_ROWS, len(df_final))):
    row = df_final.iloc[row_pos]  # fetch the row once, not once per column
    for column in column_names:
        print(row[column])


['name', 'id', 'minutes', 'tags', 'n_steps', 'steps', 'description', 'ingredients', 'n_ingredients', 'calories', 'total_fat_pdv', 'sugar_pdv', 'sodium_pdv', 'protein_pdv', 'sat_fat_pdv', 'carbs_pdv', 'total_fat_g', 'sugar_g', 'sodium_mg', 'protein_g', 'sat_fat_g', 'carbs_g']
Smoked Salmon Tartlets
c39dc1c0-817c-4de0-9cf0-65ba54c359da
35
['snack', 'appetizer', 'american', 'canapes and crostini', 'dinner']
3
['Preheat the oven to 375 degrees F (190 degrees C). Line a baking sheet with parchment paper.', 'Combine cream cheese, salmon, chives, garlic, salt, and pepper in a bowl; mix well. Divide equally into pastry shells and place on the prepared baking sheet.', 'Bake in the preheated oven until bubbling, 25 to 30 minutes. Remove and cool to room temperature before serving.']
Smoked salmon canapes filled with a mixture of cream cheese and chives are a perfect choice for parties, especially when they're this easy to make.
['3 ounces cream cheese, softened', '1/3 cup diced smoked salmon', '