In [1]:

import requests
from bs4 import BeautifulSoup
import spacy
from pathlib import Path



In [15]:

def scrape(url):
    try:
        # Make a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the entry title element and check if it exists before accessing its text content
        entry_title_element = soup.find('h1', class_='entry-title')
        recipe_name = entry_title_element.text.strip() if entry_title_element else "Recipe Title Not Available"

        # Extract other information based on the provided HTML structure
        ingredient_list = [ingredient.text.strip() for ingredient in soup.find_all('li', class_='wprm-recipe-ingredient')]

        # Find the Total Time element and check if it exists before accessing its next sibling
        total_time_label = soup.find('span', class_='wprm-recipe-details-label', text='Total Time:')
        total_time = total_time_label.find_next('span', class_='wprm-recipe-details').text.strip() if total_time_label else "Not available"

        # Find the Servings element and check if it exists before accessing its next sibling
        servings_label = soup.find('span', class_='wprm-recipe-details-label', text='Servings:')
        servings = servings_label.find_next('span', class_='wprm-recipe-details').text.strip() if servings_label else "Not available"

        # Find the main content div and check if it exists before accessing its text content
        main_content_div = soup.find('div', class_='entry-content')
        main_content = main_content_div.get_text(separator='\n').strip() if main_content_div else "Main content not available"

        # # Print the extracted information
        # print("Recipe Name:", recipe_name)
        # print("Ingredients:", ingredient_list)
        # print("Total Time:", total_time)
        # print("Servings:", servings)
        # print("Description:", main_content)

        return {
            "recipe_name": recipe_name,
            "ingredients": ingredient_list,
            "total_time": total_time,
            "servings": servings,
            "description": main_content
        }

    except requests.exceptions.RequestException as e:
        print(f"Error fetching content from {url}: {e}")

"""Websites with successfuls:
    url = "https://www.indianhealthyrecipes.com/"
    url ="https://www.vidhyashomecooking.com/"
    """
url = "https://www.vidhyashomecooking.com/panchmel-dal-rajasthani-panchratna-dal/"
out = scrape(url)
print("\n".join([str(out[key]) for key in out]))

Panchmel Dal | Rajasthani Panchratna Dal
['▢ 2 tsp oil', '▢ 1 bay leaf', '▢ 2 green chilies slit', '▢ ¼ tsp asafoetida', '▢ 1 cup onion chopped', '▢ 2 tsp ginger garlic paste', '▢ 2 tomatoes chopped 250 grams', '▢ ½ tsp red chili powder', '▢ ½ tsp turmeric powder', '▢ ½ tsp coriander powder', '▢ ¼ cup moth dal', '▢ ¼ cup whole black urad dal', '▢ ¼ cup chana dal', '▢ ¼ cup green moong dal', '▢ ¼ cup toor dal', '▢ 3 cups water divided', '▢ 2 tbsp cilantro finely chopped', '▢ 2 tsp salt', '▢ ½ tsp garam masala', '▢ 2 tsp ghee', '▢ 1 tsp cumin seeds', '▢ 2 dried red chilies']
Not available
Not available
Jump to Recipe Card


 


Delicious and healthy 
panchmel dal recipe (five-lentil-based curry) 
made in Instant Pot! Check out how to make this panchmel dal recipe in the electric pressure cooker (Instant Pot) with detailed step-wise pictures. 




 


My love for dal is nothing new. I have shared quite a few dal recipes, 
one from each state in India and made in Instant Pot
. Even then, t

  total_time_label = soup.find('span', class_='wprm-recipe-details-label', text='Total Time:')
  servings_label = soup.find('span', class_='wprm-recipe-details-label', text='Servings:')


In [2]:
from models import spacy_base_en
from ner_core import NER

In [44]:
ner = NER(spacy_base_en("small"), use_ruler = False)

In [45]:
from utils import re2pat
import json

with open("sample_texts.json") as f:
	texts = json.load(f)

print(texts)

{'archanas-kitchen': ["The Onion Vada Sambar Recipe is a simple healthy and nutritious recipe of the urad dal vada that is pan fried in a paniyaram pan.  In this recipe, I have made the Vada’s healthy by using the Kuzhi Paniyaram pan as opposed to the traditional method of deep frying the vada's.", "Serve the delicious Onion Vada's with a Mixed Vegetable Sambar topped with finely chopped onions.", 'To begin making the Onion Vada Sambar Recipe;  we will first soak the dal for about 3 hours.\n\nUsing a strainer, strain all the excess water from the urad dal. We need to use very little water to grind the urad dal, so that you can shape them well if you are deep frying.\n\nNext we will grind the dal, along with ginger and green chillies using a food processor or a mixer grinder into a very smooth batter. Make sure add very little water while grinding the dal to make a smooth batter. Transfer the batter to a bowl. \n\nAdd in the chopped onions, coriander leaves into the batter. Stir well to

In [46]:
ner = NER(spacy_base_en("large"), use_ruler = False)

lg = {}

for key in texts:
	print(key)
	for text in texts[key]:
		ner.eval(text)
	print(ner.entities)
	print(len(ner.entities))
	lg[key] = ner.entities

	ner = NER(spacy_base_en("large"), use_ruler = False)

archanas-kitchen
{'Vada Sambar Recipe': 'PERSON', 'the urad dal vada': 'ORG', 'paniyaram pan': 'ORG', 'Vada': 'ORG', 'Kuzhi Paniyaram': 'ORG', 'vada': 'GPE', "Onion Vada's": 'ORG', 'Sambar': 'PERSON', 'first': 'ORDINAL', 'about 3 hours': 'TIME', 'the Kuzhi Paniyaram Pan': 'ORG', 'a couple minutes': 'TIME', 'toor dal': 'ORG', '2': 'CARDINAL', '3 to 4': 'CARDINAL', 'tadka': 'GPE', 'Vada Sambar': 'PERSON'}
17
bharatz-kitchen
{'biryani': 'PERSON', 'gol gappe': 'PERSON', 'first': 'ORDINAL', 'Chicken Biryani': 'PERSON', '1': 'CARDINAL', '1.5 Tbsp': 'MONEY', 'Gms Chicken': 'PERSON', '16': 'CARDINAL', '1.5 tsp': 'QUANTITY', 'Red Chili Powder': 'FAC', 'Salt': 'GPE', '½ tsp': 'QUANTITY', 'Turmeric / Haldi Powder': 'WORK_OF_ART', 'Ginger Garlic Paste': 'PERSON', '2': 'CARDINAL', 'Curd': 'PERSON', 'Saffron': 'ORG', 'Biryani Masala Mix\r\n\r\n': 'PERSON', 'Bhaap': 'PERSON', '350': 'CARDINAL', '400': 'CARDINAL', 'Gms Basmati Rice': 'PERSON', 'Cinnamon Sticks': 'PERSON', 'Daal Chini': 'ORG', '4': 'CA

In [3]:
import spacy
print(spacy.__version__)

3.7.2


In [48]:
import spacy_transformers
import spacy_curated_transformers

ner = NER(spacy_base_en("transformer"), use_ruler = False)

trf = {}

for key in texts:
	print(key)
	for text in texts[key]:
		ner.eval(text)
	print(ner.entities)
	print(len(ner.entities))
	trf[key] = ner.entities

	ner = NER(spacy_base_en("transformer"), use_ruler = False)

archanas-kitchen
{'The Onion Vada Sambar Recipe': 'WORK_OF_ART', 'Kuzhi Paniyaram pan': 'PRODUCT', 'first': 'ORDINAL', 'about 3 hours': 'TIME', 'the Kuzhi Paniyaram Pan': 'PRODUCT', 'a couple minutes': 'TIME', 'the Mini Onion Vadas': 'PRODUCT', '2 cups': 'QUANTITY', '3 to 4 whistles': 'QUANTITY', '2 whistles': 'QUANTITY', 'the Onion Vada Sambar': 'PRODUCT'}
11
bharatz-kitchen
{'few long weeks': 'DATE', 'first': 'ORDINAL', ':\n\n': 'QUANTITY', '1 Liter': 'QUANTITY', '30 Gms': 'QUANTITY', '1.5 Tbsp': 'QUANTITY', '1 Tbsp': 'QUANTITY', '400 Gms': 'QUANTITY', '16': 'CARDINAL', '1.5 tsp': 'QUANTITY', '2 tsp': 'QUANTITY', '½ tsp': 'QUANTITY', '2': 'CARDINAL', '1/3 Cup': 'QUANTITY', '350 – 400  ': 'QUANTITY', '4': 'CARDINAL', '60 Gms': 'QUANTITY', '40': 'CARDINAL', '1-Inch': 'QUANTITY', '5': 'CARDINAL', '1/8 Cup': 'QUANTITY', '1/4 Cup': 'QUANTITY', '2 Tbsp': 'QUANTITY'}
23
zayka-ka-tadka
{'moong dal idli': 'PRODUCT', '1 cup': 'QUANTITY', '2 tablespoon': 'QUANTITY', '1/2': 'CARDINAL', '1 teaspo

In [49]:
import spacy

ner = NER(spacy.load("model-last"), use_ruler = False)

trained = {}

for key in texts:
	print(key)
	for text in texts[key]:
		ner.eval(text)
	print(ner.entities)
	print(len(ner.entities))
	trained[key] = ner.entities

	ner = NER(spacy.load("model-last"), use_ruler = False)

archanas-kitchen
{'Onion': 'DISH', 'Sambar': 'UTENSIL', 'simple': 'UTENSIL', 'urad dal': 'INGREDIENT', 'paniyaram pan': 'UTENSIL', 'Kuzhi Paniyaram pan': 'UTENSIL', 'vada': 'INGREDIENT', "'s": 'METHOD', 'Mixed': 'UTENSIL', 'onions.': 'INGREDIENT', ';': 'INGREDIENT', 'soak': 'METHOD', 'dal': 'UTENSIL', 'strainer': 'UTENSIL', 'urad dal.': 'INGREDIENT', 'ginger': 'INGREDIENT', 'green chillies': 'INGREDIENT', 'batter': 'DISH', 'add': 'METHOD', 'batter.': 'INGREDIENT', 'Add': 'METHOD', 'onions': 'INGREDIENT', 'coriander leaves': 'INGREDIENT', 'Stir': 'METHOD', 'Kuzhi Paniyaram': 'UTENSIL', 'Pan': 'METHOD', 'pan': 'UTENSIL', 'Cover': 'METHOD', 'couple minutes of': 'TIME', 'onion': 'INGREDIENT', 'oil': 'INGREDIENT', 'flip': 'METHOD', 'platter': 'DISH', '2': 'MEAUREMENT', '3 to 4 whistles and': 'TIME', 'pressure': 'DISH', 'mash': 'METHOD', 'toor dal': 'INGREDIENT', 'pressure cooker': 'UTENSIL', 'water': 'INGREDIENT', 'radish': 'METHOD', 'carrot': 'INGREDIENT', 'sambar powder': 'INGREDIENT', 's

In [50]:
small = {}

for key in texts:
	print(key)
	for text in texts[key]:
		ner.eval(text)
	print(ner.entities)
	print(len(ner.entities))
	small[key] = ner.entities

	ner = NER(spacy_base_en("small"), use_ruler = False)

archanas-kitchen
{'Onion': 'DISH', 'Sambar': 'UTENSIL', 'simple': 'UTENSIL', 'urad dal': 'INGREDIENT', 'paniyaram pan': 'UTENSIL', 'Kuzhi Paniyaram pan': 'UTENSIL', 'vada': 'INGREDIENT', "'s": 'METHOD', 'Mixed': 'UTENSIL', 'onions.': 'INGREDIENT', ';': 'INGREDIENT', 'soak': 'METHOD', 'dal': 'UTENSIL', 'strainer': 'UTENSIL', 'urad dal.': 'INGREDIENT', 'ginger': 'INGREDIENT', 'green chillies': 'INGREDIENT', 'batter': 'DISH', 'add': 'METHOD', 'batter.': 'INGREDIENT', 'Add': 'METHOD', 'onions': 'INGREDIENT', 'coriander leaves': 'INGREDIENT', 'Stir': 'METHOD', 'Kuzhi Paniyaram': 'UTENSIL', 'Pan': 'METHOD', 'pan': 'UTENSIL', 'Cover': 'METHOD', 'couple minutes of': 'TIME', 'onion': 'INGREDIENT', 'oil': 'INGREDIENT', 'flip': 'METHOD', 'platter': 'DISH', '2': 'MEAUREMENT', '3 to 4 whistles and': 'TIME', 'pressure': 'DISH', 'mash': 'METHOD', 'toor dal': 'INGREDIENT', 'pressure cooker': 'UTENSIL', 'water': 'INGREDIENT', 'radish': 'METHOD', 'carrot': 'INGREDIENT', 'sambar powder': 'INGREDIENT', 's

In [12]:

# Replace 'your_file_path.txt' with the path to your text file containing URLs
file_path = 'recipe.txt'
with open(file_path, 'r') as file:
    urls = [line.strip() for line in file.readlines()]


In [17]:
ner_model = spacy.load("model-last")

for url in urls:
    print(f"Processing URL: {url}")
    scraped_data = scrape(url)  # Assuming scrape function returns a dictionary

    # Check if scraped_data is None or if 'description' key is missing
    if not scraped_data or 'description' not in scraped_data:
        print(f"Warning: No valid data scraped from {url}")
        continue

    # Extract the text data from the dictionary
    text_data = scraped_data['description']

    ner = NER(ner_model, use_ruler=False)
    ner.eval(text_data)
    print("Identified Entities:", ner.entities)


Processing URL: https://www.indianhealthyrecipes.com/samosa-recipe-make-samosa/


  total_time_label = soup.find('span', class_='wprm-recipe-details-label', text='Total Time:')
  servings_label = soup.find('span', class_='wprm-recipe-details-label', text='Servings:')


Identified Entities: {'30': 'UTENSIL', '45K \nShares\n\n\n': 'TIME', 'potato': 'INGREDIENT', '&': 'UTENSIL', 'South': 'DISH', 'bowl': 'UTENSIL', 'Chutney\n ': 'METHOD', 'Chutney': 'DISH', 'potatoes': 'INGREDIENT', 'keema': 'UTENSIL', 'popular': 'INGREDIENT', '1': 'MEAUREMENT', 'Samosa\n\n\n\n\n \n\n\n': 'METHOD', '2\n\n\n': 'MEAUREMENT', 'How': 'METHOD', 'Photos': 'UTENSIL', '3\n\n\n': 'UTENSIL', 'Pro': 'DISH', 'Crispy & Flaky Samosa\n\n\n\n\n \n\n\n': 'TIME', '4\n\n\nFaqs': 'TIME', '5\n\n\n': 'MEAUREMENT', '6': 'MEAUREMENT', 'whole wheat flour': 'INGREDIENT', 'atta': 'INGREDIENT', '7\n\n\nRecipe': 'MEAUREMENT', 'Card': 'UTENSIL', '8\n\n\nWatch': 'TIME', 'health reasons': 'DISH', 'recipe': 'INGREDIENT', 'Punjabi': 'UTENSIL', 'For': 'METHOD', 'Falafel': 'METHOD', 'rolls\n': 'INGREDIENT', 'Pakora\nSandwiches': 'MEAUREMENT', 'Preparation': 'MEAUREMENT', 'boil': 'INGREDIENT', 'grams': 'INGREDIENT', '1.1 pounds': 'UTENSIL', '3 to': 'MEAUREMENT', '1 whistle': 'TIME', '8 mins.': 'TIME', '2.':