In [None]:
!pip install firecrawl


Collecting firecrawl
  Downloading firecrawl-1.6.4-py3-none-any.whl.metadata (10 kB)
Collecting python-dotenv (from firecrawl)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading firecrawl-1.6.4-py3-none-any.whl (16 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, firecrawl
Successfully installed firecrawl-1.6.4 python-dotenv-1.0.1


In [None]:
import getpass
import json
import os

from firecrawl import FirecrawlApp
from google.colab import drive
from pydantic import BaseModel

# Mount Google Drive
drive.mount('/content/drive')

# Define the path where the JSON file will be saved in Drive
final_output_path = '/content/drive/My Drive/final_scrape_21.json'

# Initialize the FirecrawlApp without embedding the secret in the notebook.
# The key is read from the FIRECRAWL_API_KEY environment variable; when that
# is not set, the user is prompted for it (input is not echoed or saved in
# the notebook output).
# SECURITY: a real key was previously hardcoded in this cell. Treat that key
# as leaked and revoke/rotate it in the Firecrawl dashboard.
firecrawl_api_key = os.environ.get("FIRECRAWL_API_KEY") or getpass.getpass("Firecrawl API key: ")
if not firecrawl_api_key:
    raise RuntimeError(
        "No Firecrawl API key provided. Set the FIRECRAWL_API_KEY environment "
        "variable or enter the key when prompted."
    )
app = FirecrawlApp(api_key=firecrawl_api_key)

# Define the schema for the scraped data.
# The JSON schema generated from this model (StoreSchema.model_json_schema())
# is what scrape_store_data sends to Firecrawl as the extraction schema, so
# the field names below are the keys expected in the extracted result.
# Every field is declared as a plain `str` with no default.
# NOTE(review): documented with `#` comments on purpose. pydantic is expected
# to copy a class docstring into the generated schema's "description", which
# would change the payload sent to the extractor — verify before adding one.
class StoreSchema(BaseModel):
    store_name: str  # display name of the store
    store_description: str  # free text; shortened by truncate_description after extraction
    store_location: str
    store_email_address: str
    store_phone_number: str
    url: str  # overwritten with the requested URL in scrape_store_data

# Shorten a description to a maximum number of whitespace-separated words
def truncate_description(description, max_words=80):
    """Return `description` limited to at most `max_words` words.

    Words are the whitespace-separated tokens of `description`. When the
    text is within the limit it is returned unchanged (original spacing
    included). Otherwise the first `max_words` tokens are joined with single
    spaces and "..." is appended.
    """
    tokens = description.split()

    # Short enough already: hand back the caller's string untouched.
    if len(tokens) <= max_words:
        return description

    kept = tokens[:max_words]
    return f"{' '.join(kept)}..."

# Scrape a single store page and return its extracted fields
def scrape_store_data(url):
    """Extract store details from `url` using the module-level Firecrawl client.

    Requests the 'extract' format with the JSON schema generated from
    StoreSchema, shortens a non-empty "store_description" with
    truncate_description, and sets the "url" key to the requested URL.

    Returns the extracted mapping, or None if anything raised along the way
    (the error is printed, not re-raised).
    """
    try:
        scrape_params = {
            'formats': ['extract'],
            'extract': {
                'schema': StoreSchema.model_json_schema(),
            },
        }
        response = app.scrape_url(url, scrape_params)
        store = response.get("extract", {})

        # Only touch the description when the key exists and holds a non-empty value
        if "store_description" in store:
            if store["store_description"]:
                store["store_description"] = truncate_description(store["store_description"])

        # Always record the URL that was actually requested
        store["url"] = url
    except Exception as exc:
        print(f"Error while scraping {url}: {exc}")
        return None
    else:
        return store

# Your list of URLs goes here
urls = [

]

# De-duplicate while preserving input order: dict.fromkeys keeps first-seen
# order, whereas set() would make the scrape order vary between runs.
unique_urls = list(dict.fromkeys(urls))

# Keys written for every saved record, in output order. store_email_address
# is part of the extraction schema and is persisted along with the rest.
record_fields = (
    "url",
    "store_name",
    "store_description",
    "store_location",
    "store_email_address",
    "store_phone_number",
)

# If the file already exists and contains data, load it first
scraped_data = []
existing_urls = set()
# Set when the existing file cannot be reused, so that it is backed up
# (instead of silently destroyed) before the first write below.
existing_file_unusable = False

if os.path.exists(final_output_path) and os.path.getsize(final_output_path) > 0:
    with open(final_output_path, 'r', encoding='utf-8') as f:
        try:
            existing_data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            existing_data = None
            print(f"Warning: could not parse '{final_output_path}' ({e}); starting fresh.")

    if isinstance(existing_data, list):
        scraped_data = existing_data
        # Collect existing URLs to prevent duplicates; skip anything that is
        # not a record dict rather than crashing on it.
        existing_urls = {
            item.get("url")
            for item in existing_data
            if isinstance(item, dict) and "url" in item
        }
    else:
        existing_file_unusable = True
        if existing_data is not None:
            print(f"Warning: '{final_output_path}' does not contain a JSON list; starting fresh.")

# Filter out URLs that have already been scraped
new_urls = [url for url in unique_urls if url not in existing_urls]

if not new_urls:
    print("No new URLs to scrape. All URLs have already been processed.")
else:
    print(f"Starting to scrape {len(new_urls)} new URLs...")

    try:
        # Scrape data from each new URL and append to the list
        for url in new_urls:
            store_data = scrape_store_data(url)
            if store_data:
                scraped_data.append({key: store_data.get(key, "") for key in record_fields})
    finally:
        # Save in `finally` so results gathered so far survive an interrupt
        # or an unexpected error part-way through the loop.
        if existing_file_unusable:
            backup_path = final_output_path + ".bak"
            os.replace(final_output_path, backup_path)
            print(f"Previous unreadable output kept as '{backup_path}'")

        # Serialize before opening the file: if serialization fails, the
        # existing file is not truncated.
        payload = json.dumps(scraped_data, indent=4)
        with open(final_output_path, "w", encoding="utf-8") as f:
            f.write(payload)

    print(f"Scraping completed and updated data appended to '{final_output_path}'")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Starting to scrape 18 new URLs...
Scraping completed and updated data appended to '/content/drive/My Drive/final_scrape_21.json'
