# Page Analyst 

In [56]:
from urllib.parse import urlparse
from firecrawl import FirecrawlApp
import os
import re
import requests
import base64
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage
from dotenv import load_dotenv
from pathlib import Path

# Secrets live in a .env.local two directories above the notebook; load them
# first so both the Firecrawl key lookup and the LLM client below can see them.
env_path = Path('../..', '.env.local')
load_dotenv(dotenv_path=env_path)

# Fails fast with KeyError if the Firecrawl key is not configured.
api_key = os.environ["FIRECRAWL_API_KEY"]

# Shared chat model used for every markdown / image question in this notebook.
llm = ChatOpenAI(model_name="gpt-4o-mini")

# Matches http(s) links ending in a common image extension, with an optional
# simple query string (word characters, '=' and '&' only).
image_link_regex = r"https?://\S+\.(?:jpg|jpeg|png|gif|svg)(?:\?[\w=&]*)?"

# Prompt asking the LLM for a short, brand-free product category.
category_question = "In less than 10 words and ignoring the brand, what category of product does this detail page sell?"

# Prompt asking the LLM to pick the main product's image URLs out of the markdown.
image_question = '''Extract all image urls from the following markdown, making sure to avoid small icons and logos, and only include the images that pertain to the main product on the page, rather than images of recommended or similar products in other components of the page. Prefer larger images (over 1kb), prefer to be stricter and only keep 1-2 images with white-only backgrounds if possible, rather than keeping too many or images with complex backgrounds. Return the result as a space-delimited list of image_urls and nothing else.'''

def trim_url(url):
    """Return *url* reduced to ``scheme://netloc/path``.

    Everything ``urlparse`` places outside those three components (path
    parameters, query string, fragment) is discarded.
    """
    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}{parts.path}"

class PageAnalyzer:
    """Scrape a product page and answer LLM questions about its text and images.

    Constructing an instance immediately scrapes the page and asks the LLM for
    the product image URLs and a product category. Image bytes are only
    downloaded when ``load_relevant_images`` is called.

    Attributes:
        url: The page URL after ``trim_url`` (no query string or fragment).
        markdown: Markdown for the page as returned by Firecrawl.
        image_urls: Image URLs extracted from the LLM's answer.
        category: Category answer, lower-cased with everything except ASCII
            letters and whitespace removed.
        images: Mapping of image URL to the downloaded response body.
    """

    url = None
    markdown = None
    category = None

    # URL substrings that indicate an icon / sprite / tracking pixel / logo.
    _ICON_MARKERS = ('icon', 'sprite', 'pixel', 'logo')
    # Downloads smaller than this many bytes are treated as icons and skipped.
    _MIN_IMAGE_BYTES = 1000
    # Upper bound on each image request so one stalled host cannot hang the loop.
    _DOWNLOAD_TIMEOUT_SECONDS = 10

    def __init__(self, url):
        """Normalize *url* and scrape it right away."""
        self.url = trim_url(url)
        # These containers must be created per instance: as class-level
        # ``[]`` / ``{}`` they were shared, so images downloaded for one page
        # leaked into every other analyzer's ``images``.
        self.image_urls = []
        self.images = {}
        self._do_scrape()
        # self._check_or_scrape()

    @staticmethod
    def _looks_like_icon(image_url):
        """Return True if *image_url* contains one of the icon-like markers."""
        lowered = image_url.lower()
        return any(marker in lowered for marker in PageAnalyzer._ICON_MARKERS)

    def _do_scrape(self):
        """Scrape the page, then ask the LLM for image URLs and a category."""
        # scrape the page
        self.markdown = FirecrawlApp(api_key=api_key).scrape_url(self.url, formats=['markdown']).markdown
        print(f"Scraped markdown for {self.url}")

        # extract the image urls; the regex keeps only well-formed image links
        # out of whatever text the LLM returned
        image_urls_response = llm.invoke(f"{image_question} \n\n {self.markdown}").content
        self.image_urls = re.findall(image_link_regex, image_urls_response)
        print(f"Extracted {len(self.image_urls)} image urls from {self.url}")

        # extract the category, stripping punctuation/digits and lower-casing
        self.category = llm.invoke(f"{category_question} \n\n {self.markdown}").content
        self.category = re.sub(r'[^a-zA-Z\s]', '', self.category).lower()
        print(f"Extracted category '{self.category}' from {self.url}")

    def query_markdown(self, question):
        """Ask the LLM *question* with the page markdown appended; return its text."""
        return llm.invoke(f"{question} \n\n {self.markdown}").content

    def query_images(self, question):
        """Ask the LLM *question* about the downloaded images; return lower-cased text.

        Only images already present in ``self.images`` are sent, so call
        ``load_relevant_images`` first. If ``self.images`` is empty the
        message contains the question alone.
        """
        base64_images = [base64.b64encode(image).decode('utf-8') for image in self.images.values()]
        # Every image is labelled image/jpeg in the data URL regardless of its
        # actual format.
        return llm.invoke([
            HumanMessage(content=[
                {"type": "text", "text": question},
                *[{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} for base64_image in base64_images]
            ])]).content.lower()

    def _check_or_scrape(self):
        """Load scrape results from the ``pagemetadata`` table, scraping on a miss.

        NOTE(review): ``utils`` is not imported in the visible part of this
        file, and the only call to this method is commented out in
        ``__init__`` — confirm where ``utils.pool`` comes from before
        enabling it.
        """
        with utils.pool.connection() as conn:
            # Create table if it doesn't exist
            conn.execute("""
                CREATE TABLE IF NOT EXISTS pagemetadata (
                    url TEXT PRIMARY KEY,
                    markdown TEXT,
                    image_urls TEXT[],
                    category TEXT
                )""")
            
            # Check if URL already exists
            with conn.cursor() as cur:
                cur.execute("SELECT url, markdown, image_urls, category FROM pagemetadata WHERE url = %s", (self.url,))
                result = cur.fetchone()
                if not result:
                    self._do_scrape()
                    # Store document content and metadata separately
                    cur.execute(
                        "INSERT INTO pagemetadata (url, markdown, image_urls, category) VALUES (%s, %s, %s, %s)",
                        (self.url, self.markdown, self.image_urls, self.category)
                    )
                else:
                    # Row columns follow the SELECT order above
                    self.markdown = result[1]
                    self.image_urls = result[2]
                    self.category = result[3]

    def load_relevant_images(self):
        """Download each URL in ``image_urls`` into ``self.images``.

        URLs that are already cached, look like icons, fail to download, do
        not return HTTP 200, or yield fewer than ``_MIN_IMAGE_BYTES`` bytes
        are skipped. Download errors are printed and do not abort the loop.
        """
        for image_url in self.image_urls:
            if image_url in self.images or self._looks_like_icon(image_url):
                continue

            try:
                response = requests.get(image_url, timeout=self._DOWNLOAD_TIMEOUT_SECONDS)
            except requests.RequestException as e:
                print(f"Error processing image {image_url}: {str(e)}")
                continue

            if response.status_code != 200:
                continue

            # Measure the body itself: the Content-Length header is absent on
            # chunked responses, and defaulting it to 0 discarded real images.
            if len(response.content) < self._MIN_IMAGE_BYTES:
                continue

            self.images[image_url] = response.content



In [57]:
# Build an analyzer for one product page. The constructor trims the URL via
# trim_url and scrapes immediately, so this cell performs network and LLM calls.
pa = PageAnalyzer('https://www.walmart.com/ip/Five-Star-1-Subject-7x5-Notebook-Green/5488604868?athcpid=5488604868&athpgid=AthenaItempage&athcgid=null&athznid=si&athieid=v0_eeMjEyLjg2LDY2NzAuNTYsMC4wMzE3MDkzNTMwMjk2OTg5NCwwLjVf&athstid=CS055~CS004&athguid=EmUu_FhMExkUy0XMpCa_SLyabAy10fZMsFd4&athancid=5349984957&athposb=0&athena=true&athbdg=L1600')
# Image URLs the LLM selected from the scraped markdown
print(pa.image_urls)

Scraped markdown for https://www.walmart.com/ip/Five-Star-1-Subject-7x5-Notebook-Green/5488604868
Extracted 2 image urls from https://www.walmart.com/ip/Five-Star-1-Subject-7x5-Notebook-Green/5488604868
Extracted category 'notebooks and writing supplies' from https://www.walmart.com/ip/Five-Star-1-Subject-7x5-Notebook-Green/5488604868
['https://i5.walmartimages.com/seo/Five-Star-1-Subject-7x5-Notebook-Green_ddf50e5a-77b3-4212-8417-c01c97a00052.26661a0501d10a466cc2229cc4d14d51.jpeg?odnHeight=640&odnWidth=640&odnBg=FFFFFF', 'https://i5.walmartimages.com/seo/Five-Star-1-Subject-7x5-Notebook-Green_ddf50e5a-77b3-4212-8417-c01c97a00052.26661a0501d10a466cc2229cc4d14d51.jpeg?odnHeight=372&odnWidth=372&odnBg=FFFFFF']


In [58]:
# Ask two free-form questions against the scraped markdown: manufacturing
# origin, then shipping origin. Each call is a separate LLM request.
print(pa.url)
print(pa.query_markdown("Where is this product is made. Be very very concise. Make sure you're not simply returning the brand of the product, but an actual location where it may have been manufactured. If that cannot be determined, return 'unknown'."))
print(pa.query_markdown("Where is this product is shipping from. Be very very concise. Make sure you're not simply returning the brand of the product, but an actual location where it may be shipping from. If that cannot be determined, return 'unknown'."))

https://www.walmart.com/ip/Five-Star-1-Subject-7x5-Notebook-Green/5488604868
unknown
Unknown


In [None]:
# Download the selected images, show which URLs were actually loaded, then ask
# the LLM about them.
# NOTE(review): the recorded output below shows an empty dict_keys([]), i.e. no
# image was loaded for that run, so the answer was produced from the question
# text alone.
print(pa.url)
pa.load_relevant_images()
print(pa.images.keys())
print(pa.query_images("Provide a very concise list of raw materials that appear to be inputs to this product including the percent of the total mass."))

https://www.walmart.com/ip/Five-Star-1-Subject-7x5-Notebook-Green/5488604868
dict_keys([])
here’s a concise list of raw materials likely used in the production of a college ruled notebook like the five star notebook, along with estimated percentages of total mass:

1. **paper** - 70-80%
2. **plastic (for cover)** - 10-15%
3. **wire (for binding)** - 5-10%
4. **ink (for printing)** - 1-5%

(note: actual percentages may vary depending on the specific product design and materials used.)
