Continues from 1_describe_product.ipynb

In [1]:
import json
import os
from pathlib import Path
from typing import *

import requests
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
from IPython.display import Markdown, HTML

### Config

In [None]:
# Product category under analysis. It selects the session/<PRODUCT>/ data
# directory and is appended to every datasheet search query.
PRODUCT = "earbuds"

In [3]:
# Credentials are read from environment variables so that they never end up in
# the notebook file or in version control. Export them before starting Jupyter.
# NOTE: any key that was previously committed in this cell must be treated as
# leaked and rotated.

# Ensure key has Custom Search API enabled in Google Cloud Console.
GOOGLE_SEARCH_KEY = os.environ.get("GOOGLE_SEARCH_KEY", "")
# Create a search engine: https://programmablesearchengine.google.com
GOOGLE_SEARCH_ENGINE_ID = os.environ.get("GOOGLE_SEARCH_ENGINE_ID", "")
# Get a free Jina AI API key here: https://jina.ai/reader/
# Optional: only the (currently skipped) Jina Reader cells use it.
JINA_API_KEY = os.environ.get("JINA_API_KEY", "")

# Warn right here, next to the configuration, instead of failing later with a
# less obvious HTTP error from the search API.
_missing = [
    name
    for name in ("GOOGLE_SEARCH_KEY", "GOOGLE_SEARCH_ENGINE_ID")
    if not os.environ.get(name)
]
if _missing:
    print(
        "WARNING: missing environment variable(s): "
        + ", ".join(_missing)
        + ". Google searches will fail until they are set."
    )

In [None]:
# Every artefact for this product lives under session/<PRODUCT>/.
DATA_DIR = Path("session", PRODUCT)

# Output of the previous notebook; this stage cannot run without it.
META_PATH = DATA_DIR.joinpath("stage_1.json")
assert META_PATH.exists(), "Run 1_describe_product.ipynb first!"

# Where this notebook stores the datasheet URLs and scraped content.
DATASHEET_PATH = DATA_DIR.joinpath("datasheets.json")

### Operations

#### Setup

In [None]:
# Google Custom Search API client used by the datasheet search below.
search = build(
    serviceName="customsearch",
    version="v1",
    developerKey=GOOGLE_SEARCH_KEY,
)

In [None]:
# Load the stage-1 output and pull out the names of the competing products.
with META_PATH.open() as meta_file:
    metadata = json.loads(meta_file.read())

competitors = [product["name"] for product in metadata["competition_products"]]
display(competitors)

['Sony WF-1000XM5', 'Bose QuietComfort Ultra Earbuds', 'OnePlus Buds 3']

In [7]:
if not DATASHEET_PATH.exists():
    # First run: start with one empty record per competitor.
    datasheets = {name: {} for name in competitors}
    print("No previous datasheet metadata found.")
else:
    # Resume from the records saved by an earlier run of this notebook.
    with DATASHEET_PATH.open() as cache_file:
        datasheets = json.loads(cache_file.read())

    prev_competitors = set(datasheets.keys())
    print("Previous datasheet metadata found.")
    print("Previous competitors:", prev_competitors)
    # Saved records are only reusable when they cover exactly the current
    # competitor list.
    assert prev_competitors == set(competitors), (
        "Competitors have changed! Please run 1_describe_product.ipynb again."
    )

No previous datasheet metadata found.


#### Search for Datasheet URLs
After a bit of qualitative testing, it seems that adding "official datasheet" to
the query tends to give the best results, regardless of product type. This is
probably because Google's algorithm strongly associates "datasheet" with technical
specifications, even more than the phrase "technical specs" itself, which sometimes
returns review websites instead.

In [8]:
def find_datasheet_url(competitor):
    """Return the URL of the top Google Custom Search hit for a datasheet.

    The query is "<competitor> <PRODUCT> official datasheet" and only the
    first result is requested.

    Args:
        competitor: Name of the competing product to search for.

    Returns:
        The "link" field of the first search result.

    Raises:
        LookupError: If the search returned no results.
    """
    query = f"{competitor} {PRODUCT} official datasheet"
    # TODO: We can find most of the time; But some are suboptimal like the technical
    # specifications is hidden in an accordion and the scraper can't find it.
    req = search.cse().list(
        q=query,
        cx=GOOGLE_SEARCH_ENGINE_ID,
        # hl="en",
        # NOTE: Especially for products that don't have real datasheets, this might
        # return the user manual instead, which often doesn't contain specifications.
        # fileType="pdf",
        num=1,
    )
    res = req.execute()
    # The API may omit "items" entirely when nothing matched, so do not index
    # into it blindly; fail with a message that names the query instead.
    items = res.get("items") or []
    if not items:
        raise LookupError(f"No search results for query: {query!r}")
    result = items[0]
    print(f"Found: {result['title']} ({result['link']})")
    return result["link"]

In [9]:
# Look up a datasheet URL for every competitor that does not have one yet.
for competitor in competitors:
    entry = datasheets[competitor]
    url = entry.get("url")
    if url is None:
        url = find_datasheet_url(competitor)
        entry["url"] = url
    else:
        # Already known from an earlier run; no need to search again.
        print(f"Found previous URL for {competitor}: {url}")

display(datasheets)

Found: WF-1000XM5 Specifications | Sony USA (https://www.sony.com/electronics/support/wireless-headphones-bluetooth-headphones/wf-1000xm5/specifications)
Found: QuietComfort Ultra Earbuds | Bose (https://www.bose.com/p/earbuds/bose-quietcomfort-ultra-earbuds/QCUE-HEADPHONEIN.html)
Found: OnePlus Buds 3 Specs (https://www.oneplus.com/us/oneplus-buds-3/specs)


{'Sony WF-1000XM5': {'url': 'https://www.sony.com/electronics/support/wireless-headphones-bluetooth-headphones/wf-1000xm5/specifications'},
 'Bose QuietComfort Ultra Earbuds': {'url': 'https://www.bose.com/p/earbuds/bose-quietcomfort-ultra-earbuds/QCUE-HEADPHONEIN.html'},
 'OnePlus Buds 3': {'url': 'https://www.oneplus.com/us/oneplus-buds-3/specs'}}

In [10]:
# Persist the discovered URLs so a later run can reuse them.
with DATASHEET_PATH.open("w") as datasheet_file:
    json.dump(datasheets, datasheet_file, indent=2)

#### Extract HTML from URLs

In [11]:
def get_url_html(url, timeout=30):
    """Fetch a page and return its body as prettified HTML without non-content tags.

    Scripts, styles, media, navigation and similar elements are removed so
    that mostly the textual content of the page remains.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The cleaned document as a prettified HTML string.

    Raises:
        RuntimeError: If the response status code is not 200.
    """
    # Resolve redirects. The timeout keeps an unresponsive host from hanging
    # the notebook indefinitely.
    res = requests.get(url, allow_redirects=True, timeout=timeout)
    if res.status_code != 200:
        raise RuntimeError(f"Failed to fetch {url}: {res.status_code}")
    soup = BeautifulSoup(res.text, "html.parser")
    # Not every response contains a <body>; fall back to the whole document
    # rather than failing on None.
    root = soup.find("body")
    if root is None:
        root = soup
    # Tags that carry no specification text.
    for tag in root.find_all(
        [
            "script",
            "style",
            "link",
            "svg",
            "img",
            "header",
            "footer",
            "nav",
            "video",
            "iframe",
            "noscript",
        ]
    ):
        tag.decompose()
    return root.prettify()

In [14]:
# Set to True to ignore HTML kept from an earlier run and download every page again.
FORCE_RESCRAPE = False

# Download the datasheet page of every competitor that has no HTML stored yet.
for competitor, data in datasheets.items():
    if not FORCE_RESCRAPE and data.get("html") is not None:
        print(f"Found previous HTML for {competitor}.")
        continue

    print(f"Scraping {competitor}...")
    # get_url_html raises on failure, so reaching the next line means success.
    data["html"] = get_url_html(data["url"])
    print(f"Scraped {competitor} successfully.")


Scraping Sony WF-1000XM5...
Scraped Sony WF-1000XM5 successfully.
Scraping Bose QuietComfort Ultra Earbuds...
Scraped Bose QuietComfort Ultra Earbuds successfully.
Scraping OnePlus Buds 3...
Scraped OnePlus Buds 3 successfully.


In [15]:
# Persist the records again, now including the scraped HTML.
with DATASHEET_PATH.open("w") as datasheet_file:
    json.dump(datasheets, datasheet_file, indent=2)

#### Preview Scraped Data

In [16]:
# Preview the HTML that was actually scraped and stored, instead of downloading
# a hard-coded URL a second time.
PREVIEW_COMPETITOR = "Bose QuietComfort Ultra Earbuds"
# Fall back to the first competitor when the preferred one is not part of this
# product's competitor list.
preview_name = (
    PREVIEW_COMPETITOR if PREVIEW_COMPETITOR in datasheets else next(iter(datasheets))
)
display(HTML(datasheets[preview_name]["html"]))

0,1
Headphone Fit,In Ear
Microphones,Built-in Microphone
Noise Cancelling,Yes
Noise Control Type,Adjustable Noise Cancelling
Water Resistant,IPX4
Case,Charging
Bud Single,"1.23"" H x 0.79"" W x 0.96"" D (0.017 lb)"
Case,"2.61"" H x 2.34"" W x 1.05"" D (0.132 lb)"
Product Material,"Plastic (PC-ABS), Silicone, Metal"
Product Case Material,Plastic (Hard)


#### Scrape Datasheet (Using Jina AI's HTML -> Markdown)
This performs poorly because the intermediate model cannot be instructed on what to
extract, so it extracts only the obvious content rather than everything relevant.
Arguably, that is good, because the converted content is well-formatted and all
unnecessary details are stripped out.

But in our case, that is bad. By the way, we never clarified this with the
professor, but are we perhaps obliged to use RAG? In my opinion, it is not
necessary given the large context window, and the raw HTML fits into the context
well enough for us to extract the details needed for the next stage. To be honest,
since we process structured data in stages, so that only the essential data is
passed along, RAG may not be necessary at all: there is nothing left to filter
when everything has already been reduced to the essentials.

In [15]:
%%script echo skipping

# TODO: clarify whether it is okay to use something as complete as the Jina AI
# Reader for scraping.
# Request headers sent with every call to the Jina Reader endpoint below.
headers = {
    # Bearer-token authentication with the key from the config cell.
    "Authorization": f"Bearer {JINA_API_KEY}",
    "X-Engine": "browser",
    "X-Md-Bullet-List-Marker": "-",
    # NOTE(review): presumably selects the ReaderLM-v2 model for the
    # HTML -> Markdown conversion; confirm against the Jina Reader docs.
    "X-Respond-With": "readerlm-v2",
    "X-Retain-Images": "none",
    "X-With-Links-Summary": "true",
}


def get_url_as_markdown(url, timeout=60):
    """Ask the Jina Reader endpoint to convert a page and return the response text.

    Args:
        url: Address of the product page to convert.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The response body on HTTP 200, otherwise None (the status code and
        response body are printed).
    """
    data = {
        "url": url,
        "instruction": "Extract the technical specifications from the following product website.",
    }
    # The timeout keeps a stalled request from blocking the notebook forever.
    response = requests.post(
        "https://r.jina.ai/", headers=headers, json=data, timeout=timeout
    )
    if response.status_code == 200:
        return response.text
    print(f"Error: {response.status_code}")
    print(response.text)
    return None

skipping


In [16]:
%%script echo skipping

# Upper bound on attempts per page, so that a persistent error (bad API key,
# blocked page, ...) cannot turn into an endless retry loop.
MAX_MARKDOWN_ATTEMPTS = 3

# Convert the datasheet page of every competitor that has no Markdown stored yet.
for competitor, data in datasheets.items():
    if data.get("markdown") is not None:
        continue

    for attempt in range(1, MAX_MARKDOWN_ATTEMPTS + 1):
        print(f"Scraping {competitor}...")
        markdown = get_url_as_markdown(data["url"])
        if markdown is not None:
            # Only store the result and report success when the call worked.
            data["markdown"] = markdown
            print(f"Scraped {competitor} successfully.")
            break
    else:
        raise RuntimeError(
            f"Failed to scrape {competitor} after {MAX_MARKDOWN_ATTEMPTS} attempts."
        )

skipping


In [17]:
%%script echo skipping

# Persist the records again, now including the Markdown conversions.
with DATASHEET_PATH.open("w") as datasheet_file:
    json.dump(datasheets, datasheet_file, indent=2)

skipping


#### Preview Scraped Data

In [18]:
%%script echo skipping

# Render the Markdown stored for the first competitor.
first_entry = list(datasheets.values())[0]
display(Markdown(first_entry["markdown"]))

skipping
