In [146]:
import csv
from urllib.parse import urlparse, urlunparse

import requests
from bs4 import BeautifulSoup

In [108]:
# Root of the site being scraped; site-relative links ("/...") found in pages
# are prefixed with this value to form absolute URLs.
BASE_URL = "https://www.archanaskitchen.com"

In [109]:
# Quick connectivity check: request the site's home page and display the
# resulting Response object as the cell output.
page= requests.get(BASE_URL)
page

<Response [200]>

In [110]:
def fetch_page_content(url, timeout=10):
    """Download a webpage and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Accepts
            anything ``requests.get`` accepts for ``timeout`` (a number or a
            ``(connect, read)`` tuple).

    Returns:
        The page parsed with ``html.parser``, or ``None`` when the request
        fails (connection error, timeout, or a 4xx/5xx status). Failures are
        reported with ``print`` rather than raised.
    """
    try:
        # requests has no default timeout: without one, a single stalled
        # connection would block the whole scrape indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses (4xx, 5xx)
        return BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException as e:
        # Timeout errors are RequestException subclasses, so they land here too.
        print(f"Error fetching {url}: {e}")
        return None

In [111]:
def extract_recipes_from_page(soup):
    """Collect ``(recipe_name, recipe_url)`` pairs from one listing page.

    Every ``div`` whose class attribute is ``blogRecipe col-md-3`` is treated
    as a recipe card. Cards without a usable link, or whose ``h3`` title is
    blank after stripping, are skipped; cards with no ``h3`` at all are kept
    under the placeholder name "No Title Found".
    """
    found = []
    for card in soup.find_all('div', class_='blogRecipe col-md-3'):
        anchor = card.find('a', href=True)
        heading = card.find('h3')

        # No link (or an empty href) means there is nothing to record.
        href = anchor['href'] if anchor else None
        if not href:
            continue

        # Site-relative links are made absolute against the site root.
        if href.startswith('/'):
            href = BASE_URL + href

        if heading:
            name = heading.text.strip()
        else:
            name = "No Title Found"

        if name:
            found.append((name, href))
    return found

In [147]:
def clean_url(url):
    """Return ``url`` with its query string dropped.

    All other components (scheme, host, path, path parameters, fragment)
    are carried over unchanged.
    """
    scheme, netloc, path, params, _query, fragment = urlparse(url)
    return urlunparse((scheme, netloc, path, params, "", fragment))

In [148]:
def get_next_page_url(soup, visited_urls):
    """Return the not-yet-visited URL of the following listing page, if any.

    Looks inside the ``ul.pagination`` element for an ``a.page-link`` anchor
    titled "Next". Its href is made absolute when it is site-relative and is
    stripped of query parameters. ``None`` is returned when there is no
    pagination block, no "Next" link, or the resulting URL is already in
    ``visited_urls``.
    """
    pager = soup.find('ul', class_='pagination')
    if not pager:
        return None

    next_link = pager.find('a', class_='page-link', href=True, title="Next")
    if not next_link:
        return None

    href = next_link['href']
    absolute = BASE_URL + href if href.startswith('/') else href

    # Normalise before the membership test so the query string cannot make
    # an already-seen page look new.
    candidate = clean_url(absolute)
    return None if candidate in visited_urls else candidate

In [152]:
def scrape_all_recipes(start_url):
    """Walk the paginated listing from ``start_url`` and gather every recipe.

    Pages are fetched one after another by following the "Next" link. The
    walk ends when there is no further page, when a page fails to load, or
    when a URL comes up a second time. Returns the accumulated list of
    ``(recipe_name, recipe_url)`` tuples.
    """
    collected = []
    seen = set()  # URLs already fetched, used to break pagination cycles
    page_url = start_url

    while page_url:
        print(f"Scraping page: {page_url}")
        if page_url in seen:
            print(f"Loop detected! Already visited: {page_url}")
            break
        seen.add(page_url)

        page_soup = fetch_page_content(page_url)
        if not page_soup:
            # A failed fetch ends the crawl; whatever was gathered is kept.
            break

        collected.extend(extract_recipes_from_page(page_soup))
        page_url = get_next_page_url(page_soup, seen)

    return collected


In [153]:
def save_recipes_to_csv(recipes, file_name='recipes.csv'):
    """Write recipes to a CSV file with a ``Recipe Name, URL`` header row.

    Args:
        recipes: Iterable of rows, each a ``(recipe_name, recipe_url)`` pair.
        file_name: Destination path; an existing file is overwritten.

    File-system, CSV-formatting and text-encoding failures are reported with
    ``print`` instead of being raised. Other exceptions (programming errors
    such as a missing name) propagate, so they are not mistaken for a save
    failure.
    """
    try:
        # newline='' hands line-ending control to the csv module, avoiding
        # blank rows on platforms that translate "\n".
        with open(file_name, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Recipe Name', 'URL'])
            writer.writerows(recipes)
        print(f"Recipes saved to {file_name}")
    except (OSError, csv.Error, UnicodeError) as e:
        print(f"Error saving to CSV: {e}")

In [154]:
def main():
    """Scrape the recipe listing under ``BASE_URL`` and save it to CSV.

    Prints the number of recipes collected before saving, or a notice when
    nothing was scraped (in which case no file is written).
    """
    recipes = scrape_all_recipes(f"{BASE_URL}/recipes")

    if not recipes:
        print("No recipes were scraped.")
        return

    print(f"Total recipes scraped: {len(recipes)}")
    save_recipes_to_csv(recipes)

# Start the scrape when this code runs as the top-level module.
if __name__ == "__main__":
    main()

Scraping page: https://www.archanaskitchen.com/recipes
Scraping page: https://www.archanaskitchen.com/recipes/page-2
Scraping page: https://www.archanaskitchen.com/recipes/page-3
Scraping page: https://www.archanaskitchen.com/recipes/page-4
Scraping page: https://www.archanaskitchen.com/recipes/page-5
Scraping page: https://www.archanaskitchen.com/recipes/page-6
Scraping page: https://www.archanaskitchen.com/recipes/page-7
Scraping page: https://www.archanaskitchen.com/recipes/page-8
Scraping page: https://www.archanaskitchen.com/recipes/page-9
Scraping page: https://www.archanaskitchen.com/recipes/page-10
Scraping page: https://www.archanaskitchen.com/recipes/page-11
Scraping page: https://www.archanaskitchen.com/recipes/page-12
Scraping page: https://www.archanaskitchen.com/recipes/page-13
Scraping page: https://www.archanaskitchen.com/recipes/page-14
Scraping page: https://www.archanaskitchen.com/recipes/page-15
Scraping page: https://www.archanaskitchen.com/recipes/page-16
Scraping