In [1]:
# exploration.ipynb - Jupyter Notebook for inspecting model predictions on raw HTML

from pathlib import Path
from joblib import load
from html_parser import parse_html
from feature_extraction import extract_features

# Read one saved recipe page from disk (decoded as UTF-8) and hand the markup
# to the project parser, which returns the elements used by the cells below.
HTML_PATH = Path("../data/html/crab-cakes.html")
with HTML_PATH.open(encoding="utf-8") as page_file:
    html = page_file.read()
elements = parse_html(html)

# Derive the model inputs from the parsed elements, restore the persisted
# model from disk, and ask it for a label for each input.
# NOTE(review): joblib.load unpickles the file — only point it at trusted artifacts.
MODEL_PATH = "../models/model.joblib"
features = extract_features(elements)
model = load(MODEL_PATH)
predictions = model.predict(features)

# Bucket every element's text under the label predicted for it.
# The four expected categories are seeded up front so they are always present
# (and reported in this order) even when the model assigns nothing to them.
grouped = {"title": [], "ingredient": [], "direction": [], "none": []}
for el, label in zip(elements, predictions):
    # Normalise to a plain str key and create the bucket on demand: a label the
    # model emits outside the seeded set gets its own category instead of
    # aborting the cell with a KeyError.
    grouped.setdefault(str(label), []).append(el["text"])

# Report each category: a banner with the category name and block count,
# then one "- ..." line per block, clipped to the first 100 characters.
for category, snippets in grouped.items():
    report = [f"\n=== {category.upper()} ({len(snippets)} blocks) ==="]
    report.extend("- " + str(snippet[:100]) for snippet in snippets)
    print("\n".join(report))


=== TITLE (12 blocks) ===
- Maryland Crab Cakes
- Video Tutorital
- Maryland Crab Cakes
- Comments
- Asparagus Soup with Lemon and Parmesan
- Baked Salmon with Panko-Dill Crust
- Classic Chicken Salad
- Baked Ziti
- Homemade Pancake Recipe
- Homemade Caesar Salad Dressing
- Buy Now
- Buy Now

=== INGREDIENT (154 blocks) ===
- Home
- Dinner
- Fish & Seafood
- Jennifer Segal
- Updated January 23, 2025
- 4.83 (555 reviews)
- 1,116 Comments
- Leave a Review
- Share
- Pin
- Email
- Tweet
- Save Recipe
- full disclosure policy
- .
- peel-and-eat shrimp
- ,
- hush puppies
- cornbread
- .
- salmon cakes
- .)
- Salmon Cakes
- Shrimp Burgers
- Success!
- Go
- Comments
- (1116)
- Comment
- Print
- Jennifer Segal
- Servings:
- Makes 6 large crab cakes
- Prep Time:
- Total Time:
- Ingredients
- For the Crab Cakes
- 2
- large eggs
- 2½ tablespoons
- mayonnaise, best quality such as Hellmann's or Duke's
- 1½ teaspoons
- Dijon mustard
- 1 teaspoon
- Worcestershire sauce
- 1 teaspoon
- Old Bay seasoni

In [2]:
import sys
from pathlib import Path

# Let the notebook import the project modules that live in ../src.
# The entry is relative, so it resolves against the kernel's working directory
# (presumably the notebook's own folder — TODO confirm).
# NOTE(review): the first cell already imports html_parser / feature_extraction;
# on a fresh kernel that presumably only works if this path is set up first or
# the modules are importable some other way — verify with Restart & Run All.
SRC_DIR = str(Path("..") / "src")
# Guard the append so re-running this cell does not stack duplicate entries.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from html_parser import parse_html
from feature_extraction import extract_features
from joblib import load


In [3]:
# Page to inspect: change the path below to a newly downloaded HTML file.
HTML_PATH = Path("../data/html/crab-cakes.html")  # <- update this
# Read the whole page into memory as UTF-8 text.
with HTML_PATH.open(encoding="utf-8") as page_file:
    html = page_file.read()

In [9]:
# Full pipeline for the page loaded above: parse the markup into elements,
# derive the model inputs, restore the persisted model, and predict a label
# for each input.
# NOTE(review): joblib.load unpickles the file — only point it at trusted artifacts.
MODEL_PATH = "../models/model.joblib"
elements = parse_html(html)
features = extract_features(elements)
model = load(MODEL_PATH)
preds = model.predict(features)


In [10]:
# Assemble a recipe-shaped record from the per-element predictions:
#   - "title": text of the first element labelled "title" (later ones are
#     ignored once a non-empty title has been stored),
#   - "ingredients" / "directions": texts in document order.
# Elements with any other label are dropped.
structured = {"title": None, "ingredients": [], "directions": []}
for block, tag in zip(elements, preds):
    if tag == "title":
        # Only fill the title slot while it is still empty.
        if not structured["title"]:
            structured["title"] = block["text"]
        continue
    if tag == "ingredient":
        structured["ingredients"].append(block["text"])
    elif tag == "direction":
        structured["directions"].append(block["text"])


In [12]:
import pprint

# Pretty-print the structured recipe: pformat builds the same text that
# pprint.pprint would write, and print sends it to stdout with a trailing newline.
print(pprint.pformat(structured))


{'directions': ['22 Quick and Easy Recipes in 30 Minutes (or less) + 5 Chef '
                'Secrets To Make You A Better Cook!',
                'Find a Recipe',
                'Recipes',
                'Jump to Recipe',
                'By',
                'This post may contain affiliate links. Read my',
                'These homemade crab cakes bring a taste of Eastern shore '
                'right to your kitchen. Packed with fresh lump crabmeat and '
                'just enough filler to hold them together, they’re easy to '
                'make and even easier to enjoy!',
                'Photo by Alexandra Grablewski (Chronicle Books, 2018)',
                'When you live in Maryland, eating Chesapeake blue crabs and '
                'crab cakes is practically a religion—and, in my family, we '
                'are all loyal devotees. Every summer, we hit all of our '
                'favorite crab shacks, from local joints all the way to the '
                'Easte