In [None]:
import requests  # For sending HTTP requests to websites
from bs4 import BeautifulSoup  # For parsing HTML content
import csv  # For saving data into CSV files
from datetime import datetime  # For handling dates and timestamps
import schedule  # For scheduling the script to run at a specific time
import time  # For adding delays in execution

# Page that publishes the daily domestic spice prices we scrape
url = 'https://www.indianspices.com/marketing/price/domestic/daily-price.html'

# Request headers sent with every fetch; the User-Agent mimics a desktop
# browser. The literal is split across lines purely for readability --
# adjacent string literals are concatenated into one value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

# Column headers written to each CSV file. The number of headers also decides
# how many cells are read from every table row (see _extract_table).
_SMALL_CARDAMOM_HEADERS = ['Date', 'Auctioneer', 'No. of Lots', 'Total Qty Arrived', 'Qty Sold', 'Max Price', 'Avg Price']
_LARGE_CARDAMOM_HEADERS = ['Date', 'Market', 'Type', 'Price']

# Seconds to wait for the server before giving up, so a stalled connection
# cannot block the scheduler loop forever.
_REQUEST_TIMEOUT = 30


def _find_table(soup, heading_text):
    """Return the first <table> following the <h2> whose text contains *heading_text*, or None."""
    heading = soup.find('h2', string=lambda t: t and heading_text in t)
    return heading.find_next('table') if heading else None


def _extract_table(table, column_names):
    """Pull the data rows out of *table*.

    The first two <tr> rows are skipped and, in every remaining row, the
    first cell is ignored (presumably a serial-number column -- confirm
    against the live page). The next ``len(column_names)`` cells are kept.
    Rows with too few cells are dropped.

    Returns:
        ``(column_names, rows)``, or ``([], [])`` when *table* is None or
        has fewer than three rows.
    """
    if table is None:
        return [], []

    rows = table.find_all('tr')
    if len(rows) < 3:
        return [], []

    wanted = len(column_names)
    data = []
    for row in rows[2:]:
        cols = row.find_all('td')
        # Need the ignored leading cell plus one cell per header.
        if len(cols) > wanted:
            data.append([col.get_text(strip=True) for col in cols[1:wanted + 1]])
    return column_names, data


def _save_csv(filename, column_names, data):
    """Write *column_names* as the header row followed by *data* to *filename*."""
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(column_names)
        writer.writerows(data)


def _print_sample(title, data):
    """Print *title* and the first two rows of *data*; print nothing if *data* is empty."""
    if not data:
        return
    print(title)
    for row in data[:2]:
        print(row)


def scrape_data():
    """Fetch the daily price page and save the cardamom tables as dated CSV files.

    Downloads ``url`` with ``headers``, locates the tables that follow the
    'Small Cardamom' and 'Large Cardamom' <h2> headings, writes each
    non-empty table to ``<kind>_cardamom_prices_<YYYYMMDD>.csv`` in the
    current working directory, and prints the first two rows of each.

    Network errors, timeouts and non-200 responses are reported with a
    printed message and the function returns without writing anything, so a
    transient failure does not propagate into the scheduler.

    Returns:
        None.
    """
    print("Scraping data...")

    # Send a request to the website; a timeout keeps a hung server from
    # blocking the job indefinitely.
    try:
        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to retrieve page: {exc}")
        return

    # Check if the request was successful (status code 200 means success)
    if response.status_code != 200:
        print(f"Failed to retrieve page. Status code: {response.status_code}")
        return  # Stop execution if the page can't be retrieved

    # Parse the HTML content of the webpage
    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate each section's table and extract its rows
    small_headers, small_data = _extract_table(
        _find_table(soup, 'Small Cardamom'), _SMALL_CARDAMOM_HEADERS)
    large_headers, large_data = _extract_table(
        _find_table(soup, 'Large Cardamom'), _LARGE_CARDAMOM_HEADERS)

    # Get the current date in YYYYMMDD format to use in filenames
    current_date = datetime.now().strftime('%Y%m%d')

    # Save Small Cardamom data to a CSV file
    if small_headers and small_data:
        small_filename = f"small_cardamom_prices_{current_date}.csv"
        _save_csv(small_filename, small_headers, small_data)
        print(f"Small Cardamom data saved to {small_filename}")

    # Save Large Cardamom data to a CSV file
    if large_headers and large_data:
        large_filename = f"large_cardamom_prices_{current_date}.csv"
        _save_csv(large_filename, large_headers, large_data)
        print(f"Large Cardamom data saved to {large_filename}")

    # Print some sample data from each table
    _print_sample("\nSample Small Cardamom Data:", small_data)
    _print_sample("\nSample Large Cardamom Data:", large_data)

# Register scrape_data as a daily job that fires at 08:00.
_daily_job = schedule.every().day.at("08:00")
_daily_job.do(scrape_data)

print("Scheduler started. The script will run every 24 hours.")

# Poll forever: run whatever job is due, then sleep for a minute before the
# next check. Nothing here breaks out of the loop; stop it with Ctrl+C.
while True:
    schedule.run_pending()
    time.sleep(60)

Scheduler started. The script will run every 24 hours.
