In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Import the Raw CSV File

In [36]:
# Load the raw recipe sample from disk into a DataFrame.
raw_csv_path = 'recipes_first_30.csv'
recipes_df = pd.read_csv(raw_csv_path)

# Data Cleaning and Prep

## Types of data needed
| Data Type                    | Description                                     | Example                                         |
| ---------------------------- | ----------------------------------------------- | ----------------------------------------------- |
| Dish Metadata                | Name, category, cuisine, description            | “Spaghetti Carbonara”, “Italian”, “Main Course” |
| Ingredients                  | List of components per dish                     | “Egg, Parmesan, Pancetta, Spaghetti”            |
| Pairings                     | Known good matches                              | “Spaghetti → Garlic Bread”, “Sushi → Miso Soup” |
| User Interactions (optional) | Co-selections, purchases, clicks, ratings       | User A selected “Curry” and “Naan”              |
| Images (optional)            | Photos of dishes (useful for multimodal models) | JPEG/PNG links                                  |
| Nutritional Data             | Calories, macros                                | Optional for health-based pairing logic         |

## Data Source
Food.com Recipes (Kaggle)
URL: https://www.kaggle.com/datasets/irkaal/foodcom-recipes-and-reviews

Fields: name, ingredients, steps, tags, rating

## Objective

Build a clean, enriched dataset from the raw Food.com CSV, optimized for:

- Embedding-based dish similarity
- NLP-based content filtering
- Potential future user-dish interactions

In [37]:
# Display the raw column names so the relevant fields can be picked out.
recipes_df.columns

Index(['RecipeId', 'Name', 'AuthorId', 'AuthorName', 'CookTime', 'PrepTime',
       'TotalTime', 'DatePublished', 'Description', 'Images', 'RecipeCategory',
       'Keywords', 'RecipeIngredientQuantities', 'RecipeIngredientParts',
       'AggregatedRating', 'ReviewCount', 'Calories', 'FatContent',
       'SaturatedFatContent', 'CholesterolContent', 'SodiumContent',
       'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent',
       'RecipeServings', 'RecipeYield', 'RecipeInstructions'],
      dtype='object')

## Step 1
SELECT KEY COLUMNS

We selected specific columns from the raw dataset that are relevant to the use case:

- RecipeId, Name, Description: Basic identifiers and textual content.
- RecipeIngredientParts, RecipeIngredientQuantities: Core content of each recipe.
- RecipeCategory, Keywords: Help classify dishes by type or tags.
- AggregatedRating, ReviewCount: Useful for quality filtering.
- Nutrition columns: Used to form a nutrient vector.
- We ignore irrelevant columns (e.g., AuthorId, DatePublished, Images) to reduce noise.

In [38]:
# Keep only the columns used downstream: identifiers and text, ingredient
# lists, category/keywords, instructions, rating signals, and four nutrition
# fields. The result is an independent copy of the selected raw columns.
selected_columns = [
    'RecipeId', 'Name', 'Description',
    'RecipeIngredientParts', 'RecipeIngredientQuantities',
    'RecipeCategory', 'Keywords', 'RecipeInstructions',
    'AggregatedRating', 'ReviewCount',
    'Calories', 'FatContent', 'ProteinContent', 'CarbohydrateContent',
]
df_clean = recipes_df.loc[:, selected_columns].copy()

In [33]:
# General sanity check: confirm the selected frame has the expected column count.
df_clean.shape[1]

14

## Step 2

Transform the Data
- Renamed Columns to normalized, lower-case, consistent field names (dish_id, name, etc.).
- Parsed list-like strings (including R-style vectors such as `c("chicken", "garlic")`) in columns (RecipeIngredientParts, RecipeIngredientQuantities, Keywords) into real Python lists using ast.literal_eval.

Renaming provides semantic clarity and consistency throughout the pipeline. The dataset includes list-like fields stored as strings (e.g., "['chicken', 'garlic']"), which must be parsed into Python lists so that they’re usable in ML and NLP workflows. These parsed lists are crucial for generating embeddings, filtering, and extracting ingredient features.

In [40]:
# Map the raw column names to short, lower-case field names. Columns not
# listed here (the nutrition fields) keep their original names.
column_aliases = {
    'RecipeId': 'dish_id',
    'Name': 'name',
    'Description': 'description',
    'RecipeIngredientParts': 'ingredients',
    'RecipeIngredientQuantities': 'ingredient_qty',
    'RecipeCategory': 'category',
    'Keywords': 'tags',
    'RecipeInstructions': 'instructions',
    'AggregatedRating': 'rating',
    'ReviewCount': 'review_count',
}
df_clean = df_clean.rename(columns=column_aliases)

In [41]:
# # Parse stringified lists
import ast
import re

# # def safe_literal_eval(val):
# #     try:
# #         return ast.literal_eval(val)
# #     except Exception as e:
# #         print(f"Failed to parse: {val}\nError: {e}")
# #         return None

# Matches a complete quoted string literal (either quote style, backslash
# escapes allowed) or a bare NA token. Scanning quoted strings as whole tokens
# lets NA be rewritten only when it appears outside of quotes.
_QUOTED_OR_BARE_NA = re.compile(
    r'''"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'|\bNA\b'''
)


def convert_r_list_to_python(val):
    """Convert a stringified R vector / Python literal into a Python list.

    Parameters
    ----------
    val : Any
        The cell value to convert. Handled inputs:
        - ``None`` or float NaN -> ``[]``
        - ``list`` / ``numpy.ndarray`` -> shallow-copied into a ``list``
        - ``str`` holding an R vector (``c("a", "b")``), the R empty vector
          ``character(0)``, or a Python literal (``"['a', 'b']"``)
        - anything else -> ``[]``

    Returns
    -------
    list
        Always a list. Unquoted ``NA`` entries become ``None`` elements; a
        string that parses to a single scalar (e.g. ``'"butter"'``) is wrapped
        in a one-element list; a string that parses to ``None`` yields ``[]``.
        Unparseable strings are reported via ``print`` and yield ``[]``.
    """
    # Handle NaN, None (the isinstance guard keeps np.isnan away from non-floats)
    if val is None or (isinstance(val, float) and np.isnan(val)):
        return []

    # Handle already-structured input
    if isinstance(val, (list, np.ndarray)):
        return list(val)

    # Anything that is not text at this point cannot be parsed
    if not isinstance(val, str):
        return []

    val = val.strip()
    # Empty markers, including R's empty character vector
    if val == "" or val.lower() in ("none", "nan") or val == "character(0)":
        return []

    # Convert R-style syntax. Slicing (rather than a '.'-based regex) also
    # handles vectors that are wrapped across several lines.
    if val.startswith("c(") and val.endswith(")"):
        val = "[" + val[2:-1] + "]"

    # Replace unquoted NA with None; quoted text such as "NA beans" is kept.
    val = _QUOTED_OR_BARE_NA.sub(
        lambda m: "None" if m.group(0) == "NA" else m.group(0), val
    )

    try:
        result = ast.literal_eval(val)
    except Exception as e:
        # Best-effort parsing: report the offending value and fall back to [].
        print(f"Failed to parse: {val}\nError: {e}")
        return []

    if result is None:
        return []
    if isinstance(result, (list, tuple, set)):
        return list(result)
    # A single scalar (e.g. a one-element R vector stored as a bare string)
    return [result]

# Parse the stringified list columns into Python lists.
# convert_r_list_to_python already returns [] for None/NaN, so every column
# goes through the same call with no extra per-column guard.
for list_col in ('ingredients', 'ingredient_qty', 'tags'):
    df_clean[list_col] = df_clean[list_col].apply(convert_r_list_to_python)

# Combine nutrients into a dictionary (one dict per recipe).
# Selecting the columns explicitly raises a KeyError if they are missing
# (e.g. this cell is re-run after the drop below) instead of silently
# storing None for every nutrient.
nutrient_keys = {
    'Calories': 'calories',
    'ProteinContent': 'protein',
    'FatContent': 'fat',
    'CarbohydrateContent': 'carbs',
}
df_clean['nutrients'] = (
    df_clean[list(nutrient_keys)]
    .rename(columns=nutrient_keys)
    .to_dict(orient='records')
)

# Drop individual nutrient cols now that they live in 'nutrients'
df_clean = df_clean.drop(columns=list(nutrient_keys))