# Hybrid Recipe Summarization for top 21 recipes (Directions → Preprocess → LexRank → BART)


Goal: use the directions column (which keeps punctuation and sentence boundaries), lightly preprocess it, extract key steps with LexRank, then rewrite with BART for clean, professional summaries. Saves to top_21_summaries.csv

Mount Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Installs

In [1]:
!pip -q install transformers==4.41.2 sumy==0.11.0 nltk==3.9.1
import nltk
nltk.download('punkt', quiet=True)

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m85.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m128.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m111.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for breadability (setup.py) ... [?25l[?25hdone
  Building wheel for docopt (setup.py) ... [?25l[?25hdone


True

Imports + GPU Configuration

In [3]:
import os, re, pandas as pd
from nltk.tokenize import sent_tokenize
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from transformers import pipeline

try:
    import torch
    DEVICE = 0 if torch.cuda.is_available() else -1
    FP16 = torch.cuda.is_available()
except ImportError:
    DEVICE, FP16 = -1, False

# BART tends to give balanced, coherent summaries for procedures.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=DEVICE,
    torch_dtype="auto" if FP16 else None
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [14]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

Load CSV

In [7]:
csv_path = "/content/drive/MyDrive/top_21_recommended_recipes.csv"
df = pd.read_csv(csv_path)

# Columns to use
title_col = "title"
instr_col  = "directions"

# Preview
pd.set_option("display.max_colwidth", None)
df[[title_col, instr_col]].head(3)

Unnamed: 0,title,directions
0,Spiced Coffee Affogato With Vanilla Ice Cream,"[""Preheat oven to 350\u00b0F. Toast pistachios on a rimmed baking sheet, tossing once, until golden brown, 5-7 minutes. Let cool, then coarsely chop."", ""Mix cinnamon and cardamom in a small bowl."", ""Divide ice cream among 4 small serving bowls or coffee cups. Sprinkle spice mixture over and pour 2 Tbsp. espresso into each bowl. Top with pistachios.""]"
1,Hot Pineapple Chutney,"[""In a bowl, combine pineapple and salt."", ""Set aside."", ""In another bowl, combine cayenne pepper, turmeric, fenugreek and asafetida."", ""In a medium saucepan, heat oil over high heat until a couple of mustard seeds thrown in start to sputter."", ""Add remaining mustard seeds and cover immediately."", ""Uncover in a few seconds when seeds stop popping."", ""Reduce heat to medium."", ""Add spice mix."", ""Stir-fry for 10 seconds."", ""Add pineapple."", ""Cook, maintaining a gentle boil the entire time, stirring frequently, until mixture is thick, about 10 minutes."", ""Cover and serve at room temperature or chill before serving."", ""(Chutney can be stored in an airtight container in the refrigerator for up to 2 weeks."", ""Do not freeze.)""]"
2,Creamy Pumpkin Soup,"[""Melt butter in a medium saucepan."", ""Add onion and cook until soft."", ""Add curry powder and cook 1-2 minutes more."", ""Place the onion mixture in food processor, add pumpkin and salt, process until smooth."", ""Add Half and Half, process until smooth."", ""Pour mixture back in saucepan."", ""Add chicken stock."", ""Heat over low heat, stir occasionally."", ""Combine sour cream, cinnamon and parsley."", ""Serve hot soup with a dollop of sour cream in the center."", ""Serve with crusty bread."", ""Serves 6.""]"


Initialize the abstractive model

In [8]:
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=DEVICE,
    torch_dtype="auto" if FP16 else None
)


Preprocessing
1. Unwraps JSON-like lists such as ["Step 1", "Step 2", ...] into a single paragraph

2. Normalizes whitespace and fixes common sentence-boundary issues

3. Keeps periods so we don’t lose sentence structure

In [9]:
def preprocess_recipe_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    # Unwrap ["a","b","c"] → "a. b. c."
    if text.startswith('["') and text.endswith('"]'):
        text = text[2:-2]
        steps = text.split('", "')
        text = '. '.join(steps) + '.'

    # Normalize whitespace and fix boundary artifacts
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'(\d+)\s*minutes?\.(\s*[a-z])', r'\1 minutes. \2', text, flags=re.IGNORECASE)
    text = re.sub(r'([a-z])\.([A-Z])', r'\1. \2', text)

    # Ensure terminal period
    if text and not text.endswith('.'):
        text += '.'
    return text


Extractive pass (LexRank + light cooking heuristics)
1. Uses LexRank to pick salient sentences

2. Re-ranks with a tiny domain heuristic: boost sentences with cooking actions or time/temperature, downweight pure prep.

In [10]:
def extract_key_recipe_steps(text: str, max_sentences: int = 4) -> str:
    sents = sent_tokenize(text)
    if len(sents) <= max_sentences:
        return text

    def score_sentence(sent: str) -> int:
        score = 0
        cook = ['heat','cook','bake','fry','saute','boil','mix','stir','add','combine','simmer','whisk','fold']
        score += sum(2 for v in cook if v in sent.lower())
        if re.search(r'\d+\s*(minutes?|hours?|degrees?)', sent.lower()):
            score += 3
        prep_only = ['chop','cut','dice','slice','trim','peel','mince','grate']
        if any(v in sent.lower() for v in prep_only) and not any(v in sent.lower() for v in cook):
            score -= 1
        return score

    try:
        parser = PlaintextParser.from_string(text, Tokenizer("english"))
        lex = LexRankSummarizer()
        pool = [str(s) for s in lex(parser.document, max_sentences + 2)]
        scored = sorted([(s, score_sentence(s)) for s in pool], key=lambda x: x[1], reverse=True)
        return ". ".join([s for s,_ in scored[:max_sentences]]) + "."
    except Exception:
        return ". ".join(sents[:max_sentences]) + "."


Abstractive pass (BART) and cleanup
1. Chooses length targets by bucket (short/medium/long).

2. Truncates very long inputs for stability.

3. Falls back to extractive if the model errors.

4. Cleans filler phrases and deduplicates similar sentences; outputs bullets when multiple steps remain.

In [11]:
def recipe_abstractive_summary(text: str, target_length: str = "medium") -> str:
    text = (text or "").strip()
    if not text:
        return "No instructions available."

    wc = len(text.split())
    if target_length == "short":
        max_len = max(15, min(50, wc));  min_len = max(10, min(20, max_len-10))
    elif target_length == "long":
        max_len = max(40, min(150, wc)); min_len = max(20, min(40, max_len-20))
    else:
        max_len = max(30, min(100, wc)); min_len = max(15, min(30, max_len-10))
    if min_len >= max_len:
        min_len = max(1, max_len-3)

    if len(text) > 2000:
        text = text[:2000]

    try:
        out = summarizer(text, max_length=max_len, min_length=min_len, do_sample=False, use_cache=True)
        return out[0]["summary_text"]
    except Exception:
        return extract_key_recipe_steps(text, 3)

def clean_recipe_summary(text: str) -> str:
    if not text or text.strip() == "":
        return "No summary available."

    patterns = [
        r'(?i)\b(serving suggestions?|how to serve|served? with [^.]*|makes? \d+ servings?)\b[^.]*\.?',
        r'(?i)\b(recipe serves? \d+|prep time|cook time|total time)[^.]*\.?',
        r'(?i)\b(this recipe|the dish|this dish)[^.]*\.?',
        r'(?i)\b(enjoy|perfect for|great for)[^.]*\.?'
    ]
    for p in patterns:
        text = re.sub(p, '', text)

    text = re.sub(r'^[[\'"]|[\'"]$', '', text.strip())
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\.+', '.', text)
    text = re.sub(r'\s*,\s*\.', '.', text)
    text = re.sub(r'([a-z])([A-Z])', r'\1. \2', text)

    sents = [s.strip() for s in sent_tokenize(text) if len(s.split()) >= 3]
    seen, uniq = set(), []
    for s in sents:
        key = re.sub(r'[^a-z0-9\s]', '', s.lower())
        if key not in seen:
            seen.add(key)
            if not s.endswith('.'): s += '.'
            s = s[0].upper() + s[1:] if len(s)>1 else s.upper()
            uniq.append(s)

    if len(uniq) > 1:
        return "\n".join(f"• {s}" for s in uniq)
    if len(uniq) == 1:
        return uniq[0]

    clean_text = text.strip()
    if not clean_text.endswith('.'): clean_text += '.'
    return clean_text[0].upper() + clean_text[1:] if len(clean_text)>1 else clean_text


Hybrid Policy
1. For very short directions, go straight to BART (short).

2. For long directions, LexRank first, then BART (long).

3. Otherwise, LexRank then BART (medium).

4. Always finish with the cleanup formatter.

In [12]:
def improved_hybrid_summarize(text: str) -> str:
    cleaned = preprocess_recipe_text(text or "")
    wc = len(cleaned.split())
    sc = len(sent_tokenize(cleaned))

    if wc < 20 or sc <= 2:
        summary = recipe_abstractive_summary(cleaned, "short")
    elif wc > 200 or sc > 8:
        key = extract_key_recipe_steps(cleaned, max_sentences=6)
        summary = recipe_abstractive_summary(key, "long")
    else:
        key = extract_key_recipe_steps(cleaned, max_sentences=4)
        summary = recipe_abstractive_summary(key, "medium")

    return clean_recipe_summary(summary)


Run on top 21 recipes and save

In [15]:
from tqdm.auto import tqdm
tqdm.pandas()

# Ensure no NaNs
df[instr_col] = df[instr_col].fillna("")

# Run
df["improved_summary"] = df[instr_col].progress_apply(improved_hybrid_summarize)

# Save
out_path = "top_21_summaries.csv"
df[[title_col, instr_col, "improved_summary"]].to_csv(out_path, index=False)
print(f"Saved → {out_path}")

#Preview
df[[title_col, "improved_summary"]].head(10)


  0%|          | 0/21 [00:00<?, ?it/s]

  text = re.sub(r'^[[\'"]|[\'"]$', '', text.strip())
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Saved → top_21_summaries.csv


Unnamed: 0,title,improved_summary
0,Spiced Coffee Affogato With Vanilla Ice Cream,"• Toast pistachios on a rimmed baking sheet, tossing once, until golden brown, 5-7 minutes.\n• Mix cinnamon and cardamom in a small bowl.\n• Divide ice cream among 4 small serving."
1,Hot Pineapple Chutney,"• In a medium saucepan, heat oil over high heat until a couple of mustard seeds thrown in start to sputter.\n• Add remaining mustard seeds and cover immediately.\n• In a bowl, combine pineapple and salt.\n• In another bowl, Combine cayenne pepper, turmeric, fenugreek and asafetida."
2,Creamy Pumpkin Soup,"• Melt butter in a medium saucepan.\n• Add onion and cook until soft.\n• Add curry powder and cook 1-2 minutes more.\n• Place the onion mixture in food processor, add pumpkin and salt, process until smooth.\n• Add Half and Half,process until smooth.\n• Pour mixture back in saucepan.\n• Add chicken stock.\n• Heat over low heat, stir occasionally."
3,Sweet Pineapple Yellow Rice,"• Ghee, pineapple, rice, saffron, sugar, water and turmeric.\n• Simmer on medium heat for 10 minutes.\n• Reduce heat and simmer on low until slightly moist."
4,Chai Latte,"• Bring milk, water, sugar and spices to simmer in medium saucepan.\n• Place tea bags in milk mixture.\n• Simmer 1 minutes."
5,Spiced Rice Pudding,"• Mix rice, milk, salt and spices in a saucepan.\n• Simmer 3 minutes.\n• Remove from heat and add extracts and Equal.\n• Cool to room temperature.\n• Transfer to servingware."
6,Broiled Ginger-Cinnamon Grapefruit,"• Broil on high for about 8 minutes (or until the sugar begins to caramelize) Mix seasonings and sugar together in a small bowl.\n• After the grapefruit has been sectioned, divide."
7,Mohallabia/Muhallabia,• Mix the rice mixture with the milk and keep stirring till the mixture is boiling and bubbling.\n• Boil thte rest of the milk with salt and sugar.\n• Finish off adding rose water and almond powder.
8,Spicy Vegetable Fritters,"• Pour oil to a depth of 3 to 4 inches into a deep-fryer or a large, heavy saucepan and heat over medium-high heat to 350F.\n• Combine flour, 2 tsp oil, cumin, coriander, cayenne, garlic, salt, baking powder, and ajwain (if using) in a medium bowl."
9,Orange-Ginger Sugar Snaps,Sugar snap peas and orange rind are the stars of Add ginger and sugar snap peas to a nonstick skillet and saute for 2 minutes.


In [17]:
from google.colab import files
files.download("top_21_summaries.csv")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>