In [17]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
import os

# If the kernel was launched from a directory whose path ends in 'notebooks',
# step up to its parent (the project root) before anything else runs.
# Presumably this is what lets the relative paths used later resolve — verify.
if os.getcwd().endswith('notebooks'):
    os.chdir(os.pardir)

# Report where we ended up so the reader can confirm the root was found.
print("Current Working Directory: {}".format(os.getcwd()))

Current Working Directory: /home/hsozer/Projects/DeepLearning/recipe_bot


In [19]:
import pandas as pd
from scripts.preprocessing import parse_nutrition, calculate_nutrition_mass, filter_outliers, parse_lists

# --- Configuration ---
# Both paths are relative, so they resolve against the current working directory.
# Input: raw recipes CSV, read with pd.read_csv in the loading cell.
RAW_DATA_PATH = "data/food-com-recipes-and-user-interactions/RAW_recipes.csv"
# Output: cleaned DataFrame, written with DataFrame.to_pickle in the final cell.
PROCESSED_PATH = "data/processed/clean_recipes.pkl"

In [20]:
# Load the raw recipes CSV from RAW_DATA_PATH into `df` and report its shape.
# The status icons were mis-encoded (UTF-8 emoji decoded as Mac Roman); they are
# restored here so the messages print as intended.
print("⏳ Loading raw data...")
df = pd.read_csv(RAW_DATA_PATH)

# Plain string: this message has no placeholders, so no f-prefix is needed.
print("✅ Loaded successfully!")
print(f"   Raw Shape: {df.shape}")

⏳ Loading raw data...
✅ Loaded successfully!
   Raw Shape: (231637, 12)


In [21]:
# Cleaning pipeline: raw `df` -> df_nutrition -> df_clean -> df_final.
# The intermediate result is bound to its own name instead of overwriting `df`,
# so the raw frame from the loading cell is left alone and re-running this cell
# always starts from the raw data rather than from already-processed output.
# NOTE(review): assumes the preprocessing helpers return a new DataFrame rather
# than mutating their argument in place — confirm in scripts/preprocessing.py.

# 1. Parse Nutrition Columns (Strings -> Floats)
print("⏳ Parsing nutrition columns...")
df_nutrition = parse_nutrition(df)

# 2. Calculate Mass (for Density Checks)
# Note: the weight column appears as 'estimated_weight_g' in the preview below;
# it is dropped again in the final cleanup cell once filtering is done.
print("⏳ Calculating nutrition mass...")
df_nutrition = calculate_nutrition_mass(df_nutrition)

# 3. Filter Junk (Physics Check)
print("⏳ Removing outliers (physics filter)...")
df_clean = filter_outliers(df_nutrition)
# Dropped count compares row counts immediately before and after the filter.
print(f"   Cleaned Shape: {df_clean.shape} (Dropped {len(df_nutrition) - len(df_clean)} rows)")

# 4. Parse Text Lists (Heavy Operation)
# We pass the specific columns we want to convert from strings to lists
print("⏳ Parsing text lists (ingredients, tags, steps)...")
df_final = parse_lists(df_clean, columns=['ingredients', 'tags', 'steps'])

# Preview
display(df_final.head(3))

⏳ Parsing nutrition columns...
⏳ Calculating nutrition mass...
⏳ Removing outliers (physics filter)...
   Cleaned Shape: (225343, 25) (Dropped 6294 rows)
⏳ Parsing text lists (ingredients, tags, steps)...


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,n_steps,steps,description,ingredients,...,protein_pdv,sat_fat_pdv,carbs_pdv,total_fat_g,sugar_g,sodium_mg,protein_g,sat_fat_g,carbs_g,estimated_weight_g
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",...,2.0,0.0,4.0,0.0,6.5,0.0,1.0,0.0,12.0,13.0
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"[30-minutes-or-less, time-to-make, course, mai...",9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",...,22.0,35.0,1.0,11.7,0.0,408.0,11.0,7.0,3.0,25.7
2,all in the kitchen chili,112140,130,196586,2005-02-25,"[time-to-make, course, preparation, main-dish,...",6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",...,39.0,27.0,5.0,14.3,16.0,1152.0,19.5,5.4,15.0,48.8


In [24]:
# Final Cleanup
# Drop the columns the app does not need, pickle the result to PROCESSED_PATH,
# and show the final column list plus a short preview.
cols_to_drop = [
    'contributor_id',  # User ID of uploader
    'submitted',       # Date uploaded
    'estimated_weight_g' # used for filtering
]

print(f"⏳ Dropping unused columns: {cols_to_drop}...")
# errors='ignore' skips columns that are already absent, so re-running this
# cell after the first drop does not raise.
df_final = df_final.drop(columns=cols_to_drop, errors='ignore')

# Save
print("💾 Saving to Pickle...")
# os.path.dirname returns '' when the path is a bare filename, and
# os.makedirs('') raises; only create the directory when there is one.
output_dir = os.path.dirname(PROCESSED_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df_final.to_pickle(PROCESSED_PATH)

print(f"✅ Pipeline Finished! Saved to {PROCESSED_PATH}")
print(f"   Final Columns: {list(df_final.columns)}")
display(df_final.head(3))

⏳ Dropping unused columns: ['contributor_id', 'submitted', 'estimated_weight_g']...
💾 Saving to Pickle...
✅ Pipeline Finished! Saved to data/processed/clean_recipes.pkl
   Final Columns: ['name', 'id', 'minutes', 'tags', 'n_steps', 'steps', 'description', 'ingredients', 'n_ingredients', 'calories', 'total_fat_pdv', 'sugar_pdv', 'sodium_pdv', 'protein_pdv', 'sat_fat_pdv', 'carbs_pdv', 'total_fat_g', 'sugar_g', 'sodium_mg', 'protein_g', 'sat_fat_g', 'carbs_g']


Unnamed: 0,name,id,minutes,tags,n_steps,steps,description,ingredients,n_ingredients,calories,...,sodium_pdv,protein_pdv,sat_fat_pdv,carbs_pdv,total_fat_g,sugar_g,sodium_mg,protein_g,sat_fat_g,carbs_g
0,arriba baked winter squash mexican style,137739,55,"[60-minutes-or-less, time-to-make, course, mai...",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7,51.5,...,0.0,2.0,0.0,4.0,0.0,6.5,0.0,1.0,0.0,12.0
1,a bit different breakfast pizza,31490,30,"[30-minutes-or-less, time-to-make, course, mai...",9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6,173.4,...,17.0,22.0,35.0,1.0,11.7,0.0,408.0,11.0,7.0,3.0
2,all in the kitchen chili,112140,130,"[time-to-make, course, preparation, main-dish,...",6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13,269.8,...,48.0,39.0,27.0,5.0,14.3,16.0,1152.0,19.5,5.4,15.0
