In [125]:
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from PIL import Image
import os
import requests

* We are going to scrape https://www.delish.com/ for recipes. 

In [126]:
def getInfo(userRequest):
    """
    Build the delish.com recipe-search URL for a free-text user request.

    The request is split on whitespace (so leading, trailing and repeated
    spaces are collapsed) and the words are joined with '+'. Every word is
    percent-encoded so that characters such as '&', '#' or '=' cannot
    corrupt the query string.

    :param userRequest: search phrase typed by the user, e.g. 'chicken curry'
    :return: search URL restricted to recipes (``type=Recipes``)
    """
    # Local import keeps this function independent of the notebook's import cell.
    from urllib.parse import quote_plus

    # '+'.join already covers the single-word case, so no special branch is needed.
    query = '+'.join(quote_plus(word) for word in userRequest.split())
    return f'https://www.delish.com/search/?q={query}&type=Recipes'

In [127]:
def parseInfo(_url):
    """
    Download a page and parse it into a BeautifulSoup object.

    :param _url: absolute URL of the page to fetch
    :return: BeautifulSoup tree built from the response text with 'html.parser'
    :raises requests.RequestException: if the request fails, times out, or
        the server answers with an HTTP error status
    """
    # requests applies no timeout by default; without one a stalled server
    # would block this call indefinitely.
    page = requests.get(_url, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page as search results.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')
    return soup

In [128]:
def getPages(soup):
    """
    Collect the recipe links found on a parsed search-results page.

    Every ``<a>`` element is inspected; its ``href`` is kept when it is
    non-empty and contains ``cooking/recipe-ideas/``. The first two matches
    are dropped from the result.

    :param soup: parsed page exposing ``find_all('a')``
    :return: list of matching href strings, in page order, minus the first two
    """
    hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
    recipe_links = [
        target for target in hrefs
        if target and 'cooking/recipe-ideas/' in target
    ]
    return recipe_links[2:]

In [129]:
# Seconds to wait on each HTTP request; requests applies no timeout by default.
_REQUEST_TIMEOUT = 10


def _normalise_image(filename, index, sanitized_title):
    """
    Make sure the downloaded image is stored as JPEG or PNG.

    :param filename: path of the image file that was just written to disk
    :param index: zero-based position of the recipe, used in the new file name
    :param sanitized_title: file-name-safe recipe title
    :return: *filename* when the image already is JPEG/PNG, the path of a
        converted JPEG copy otherwise, or None when the file cannot be
        processed as an image
    """
    try:
        # The context manager closes the file in every branch, which also
        # guarantees it is closed before the original is deleted below
        # (deleting a still-open file fails on Windows).
        with Image.open(filename) as img:
            if img.format in ("JPEG", "PNG"):
                return filename
            new_filename = f"recipe_{index + 1}_{sanitized_title}_converted.jpg"
            img.convert("RGB").save(new_filename, "JPEG")
        os.remove(filename)
        return new_filename
    except Exception as e:
        # Deliberately broad: a bad image must not abort the whole scrape.
        print(f"Invalid image: {e}")
        return None


def fetchRecipes(urls, max_urls):
    """
    Download recipe pages and extract their title, ingredients, steps and image.

    For every processed page the lead image is saved in the current working
    directory as ``recipe_<n>_<title>.<ext>`` (converted to JPEG when it is
    neither JPEG nor PNG).

    A page is skipped, with a message printed, when it has no title or image
    tag or when the image download does not answer HTTP 200.

    :param urls: hrefs of recipe pages; site-relative hrefs are appended to
        ``https://www.delish.com/search``, absolute http(s) URLs are used as-is
    :param max_urls: maximum number of entries of *urls* to visit, or None
        for no limit
    :return: list of ``(ingredients, guides, filename, title, index)`` tuples;
        ``filename`` is None when the downloaded file was not a usable image,
        ``index`` is the zero-based position of the page in *urls*
    :raises requests.RequestException: if a page or image request fails or
        times out
    """
    all_recipes = []

    for index, url_part in enumerate(urls):
        if max_urls is not None and index >= max_urls:
            break

        if url_part.startswith(('http://', 'https://')):
            # Already absolute: prefixing the site would produce a broken URL.
            url = url_part
        else:
            url = 'https://www.delish.com/search' + url_part
        page = requests.get(url, timeout=_REQUEST_TIMEOUT)
        soup = BeautifulSoup(page.text, 'html.parser')

        title_tag = soup.find('meta', {'name': 'title'})
        title = title_tag.get('content') if title_tag is not None else None
        img_tag = soup.find('img', {'class': 'css-0 e1g79fud0'})
        image_url = img_tag.get('src') if img_tag is not None else None
        if not title or not image_url:
            # The page does not have the expected layout; skip it instead of
            # crashing on a None lookup.
            print(f"Skipping {url}: title or image not found")
            continue

        ingredients = [
            ' '.join(li.stripped_strings)
            for li in soup.find_all('li', class_='css-s5yyu3 e12sb1171')
        ]

        guides = []
        for position, li in enumerate(soup.select('li.css-21v28f ol > li'), start=1):
            number_tag = li.find('span', class_='e1241r8m0')
            if number_tag is not None:
                step_number = number_tag.get_text(strip=True)
            else:
                # No explicit number in the markup: fall back to list order.
                step_number = str(position)
            step_text = ' '.join(li.stripped_strings).replace(f'Step {step_number}', '').strip()
            guides.append(f'Step {step_number}: {step_text}')

        response = requests.get(image_url, timeout=_REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"Skipping {url}: image download returned {response.status_code}")
            continue

        # Pick the file extension from the Content-Type header, defaulting to jpg.
        content_type = response.headers.get('content-type', '')
        extension = 'jpg'
        if 'png' in content_type:
            extension = 'png'
        elif 'webp' in content_type:
            extension = 'webp'

        # Replace every non-alphanumeric character so the title is file-name safe.
        sanitized_title = "".join(c if c.isalnum() else "_" for c in title)
        filename = f"recipe_{index + 1}_{sanitized_title}.{extension}"

        with open(filename, 'wb') as f:
            f.write(response.content)

        filename = _normalise_image(filename, index, sanitized_title)
        all_recipes.append((ingredients, guides, filename, title, index))
    return all_recipes

In [130]:
def toDoc(ingredients, guides, filename, title, index):
    """
    Write one recipe to ``Recipe_<index + 1>.docx`` in the working directory.

    The document contains the title as a level-0 heading, the ingredients as
    a bulleted list, one justified paragraph per step, and the recipe picture
    (6 inches wide) when one is available.

    :param ingredients: list of ingredient strings
    :param guides: list of step strings
    :param filename: path of the recipe image, or None/empty when the recipe
        has no usable image (the picture is then omitted)
    :param title: recipe title used as the document heading
    :param index: zero-based recipe index; the output file uses ``index + 1``
    :return: None
    """
    document = Document()
    heading = document.add_heading(title, 0)
    # Bold is a run-level property; assigning it on the paragraph object
    # only creates an unused Python attribute.
    for run in heading.runs:
        run.bold = True

    for ingredient in ingredients:
        document.add_paragraph(ingredient, style='List Bullet')
    for guide in guides:
        document.add_paragraph(guide).alignment = WD_ALIGN_PARAGRAPH.JUSTIFY

    # fetchRecipes reports an unusable image as None; skip the picture then
    # instead of letting add_picture fail.
    if filename:
        document.add_picture(filename, width=Inches(6))
        # add_picture places the image in its own paragraph at the end of the
        # document; alignment belongs to that paragraph, not to the shape.
        document.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    document.save(f'Recipe_{index + 1}.docx')

In [131]:
def start(request, number):
    """
    Run the whole pipeline: search, collect recipe links, scrape, export.

    :param request: free-text search phrase passed to ``getInfo``
    :param number: limit on how many recipe links are visited, passed to
        ``fetchRecipes`` as ``max_urls``
    :return: the fixed confirmation string "Recipes generated successfully."
    """
    results_page = parseInfo(getInfo(request))
    scraped = fetchRecipes(getPages(results_page), number)
    # Each entry is one recipe tuple; one .docx file is written per entry.
    for ingredients, guides, filename, title, index in scraped:
        toDoc(ingredients, guides, filename, title, index)
    return "Recipes generated successfully."

An example of how this mini-scraper works:

In [133]:
# Demo run: search for 'curry' and visit at most 3 of the recipe links found.
# This performs live HTTP requests and writes image and .docx files to the
# current working directory.
start('curry', 3)

'Recipes generated successfully.'