In [1]:
from googleapiclient.discovery import build
from pathlib import Path
from typing import *
from IPython.display import Markdown
import requests
import json

### Config

In [2]:
PRODUCT = "earbuds"

In [3]:
# Ensure key has Custom Search API enabled in Google Cloud Console.
GOOGLE_SEARCH_KEY = "AIzaSyC17AZKYJrAZL010FyenUPhbjpt2MG7hDo"
# Create a search engine: https://programmablesearchengine.google.com
GOOGLE_SEARCH_ENGINE_ID = "b0ead037efd694ba8"
# Get a free Jina AI API key here: https://jina.ai/reader/
JINA_API_KEY = "jina_cc9194461be645f4aa5aae18a2c2e784Lo1aqN5TMYBXrB6NYs6QLygnA6EK"

In [4]:
DATA_DIR = Path("session") / PRODUCT
META_PATH = DATA_DIR / "stage_1.json"
assert META_PATH.exists(), "Run 1_describe_product.ipynb first!"

DATASHEET_PATH = DATA_DIR / "datasheets.json"

### Operations

#### Setup

In [5]:
search = build("customsearch", "v1", developerKey=GOOGLE_SEARCH_KEY)

In [6]:
with open(META_PATH, "r") as f:
    metadata = json.load(f)

competitors = [o["name"] for o in metadata["competition_products"]]
display(competitors)

['Sony WF-1000XM5', 'Bose QuietComfort Ultra Earbuds', 'OnePlus Buds 3']

In [7]:
if DATASHEET_PATH.exists():
    with open(DATASHEET_PATH, "r") as f:
        datasheets = json.load(f)

    prev_competitors = set(datasheets.keys())
    print("Previous datasheet metadata found.")
    print("Previous competitors:", prev_competitors)
    assert prev_competitors == set(competitors), (
        "Competitors have changed! Please run 1_describe_product.ipynb again."
    )
else:
    datasheets = {competitor: {} for competitor in competitors}
    print("No previous datasheet metadata found.")

No previous datasheet metadata found.


#### Search for Datasheet URLs
After a bit of qualitative testing, it seems "official datasheet" tends to give
the best results, regardless of product type. This is cause Google's algo probably
strongly related datasheet to technical specs, even more than "technical specs",
which sometimes returns review websites instead.

In [None]:
def find_datasheet_url(competitor):
    query = f"{competitor} {PRODUCT} official datasheet"
    # TODO: We can find most of the time; But some are suboptimal like the technical
    # specifications is hidden in an accordian and the scraper can't find it.
    req = search.cse().list(
        q=query,
        cx=GOOGLE_SEARCH_ENGINE_ID,
        # hl="en",
        # NOTE: Especially for products that don't have real datasheets, this might
        # return the user manual instead, which often doesn't contain specifications.
        # fileType="pdf",
        num=1,
    )
    res = req.execute()
    result = res["items"][0]
    print(f"Found: {result['title']} ({result['link']})")
    return result["link"]

In [9]:
for competitor in competitors:
    url = datasheets[competitor].get("url", None)
    if url is not None:
        print(f"Found previous URL for {competitor}: {url}")
        continue

    url = find_datasheet_url(competitor)
    datasheets[competitor]["url"] = url

display(datasheets)

Found: WF-1000XM5 Specifications | Sony USA (https://www.sony.com/electronics/support/wireless-headphones-bluetooth-headphones/wf-1000xm5/specifications)
Found: QuietComfort Ultra Earbuds | Bose (https://www.bose.com/p/earbuds/bose-quietcomfort-ultra-earbuds/QCUE-HEADPHONEIN.html)
Found: OnePlus Buds 3 Specs (https://www.oneplus.com/us/oneplus-buds-3/specs)


{'Sony WF-1000XM5': {'url': 'https://www.sony.com/electronics/support/wireless-headphones-bluetooth-headphones/wf-1000xm5/specifications'},
 'Bose QuietComfort Ultra Earbuds': {'url': 'https://www.bose.com/p/earbuds/bose-quietcomfort-ultra-earbuds/QCUE-HEADPHONEIN.html'},
 'OnePlus Buds 3': {'url': 'https://www.oneplus.com/us/oneplus-buds-3/specs'}}

In [10]:
with open(DATASHEET_PATH, "w") as f:
    json.dump(datasheets, f, indent=2)

#### Scrape Datasheet

In [15]:
# Need to clarify if its okay to use smth so complete for scraping like JinaAI reader.
headers = {
    "Authorization": f"Bearer {JINA_API_KEY}",
    "X-Engine": "browser",
    "X-Md-Bullet-List-Marker": "-",
    "X-Respond-With": "readerlm-v2",
    "X-Retain-Images": "none",
    "X-With-Links-Summary": "true",
}

def get_url_as_markdown(url):
    data = {
        "url": url,
        "instruction": "Extract the technical specifications from the following product website.",
    }
    response = requests.post("https://r.jina.ai/", headers=headers, json=data)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

In [16]:
for competitor, data in datasheets.items():
    markdown = datasheets[competitor].get("markdown", None)
    url = datasheets[competitor]["url"]
    while markdown is None:
        print(f"Scraping {competitor}...")
        markdown = get_url_as_markdown(url)
        datasheets[competitor]["markdown"] = markdown
        print(f"Scraped {competitor} successfully.")

Scraping Bose QuietComfort Ultra Earbuds...
Error: 524
<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>


<title>r.jina.ai | 524: A timeout occurred</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/main.css" />


</head>
<body>
<div id="cf-wrapper">
    <div id="cf-error-details" class="p-0">
        <header class="mx-auto pt-10 lg:pt-6 lg:px-8 w-240 lg:w-full mb-8">
            <h1 class="inline-block sm:block sm:mb-2 font-ligh

KeyboardInterrupt: 

In [None]:
with open(DATASHEET_PATH, "w") as f:
    json.dump(datasheets, f, indent=2)

#### Preview Scraped Data

In [17]:
display(Markdown(list(datasheets.values())[0]["markdown"]))

```json
{
  "size_and_weight": {
    "weight": "Approx. 0.21\" x2 (including earbud tips (M))"
  },
  "general_features": {
    "volume_control": true,
    " headphone_type": "Closed, dynamic",
    "dsee_extreme": true,
    "frequency_response_bluetooth_communication": [
      "20–20,000 Hz (44.1 kHz sampling)",
      "(LDAC 96 kHz sampling 990 kbps)"
    ],
    "ambient_sound_mode": true,
    "multi_point_connection": true,
    "driver_unit": "0.33\"",
    "waterproof": true
  },
  "battery": {
    "charge_time_usb_charger_wireless_charger_with_case": [
      "Approx. 1.5 hrs"
    ],
    "battery_charge_method_usb_charger_wireless_charger_with_case": [
      "(USB charger | Wireless charger (with case))"
    ]
  },
  "bluetooth_specification": {
    "supported_audio_format_sbc_aac_ldac_lc3": [
      "SBC",
      "AAC",
      "LDAC",
      "LC3"
    ],
    "frequency_range_24_20_40_hertz_band_24_28_mhz_frequency_band", [
      "[2.4 GHz band (2.4000–2.4835 GHz)]"
    ],
    bluetooth_version_5_3: true,
    supported_content_protection_scms_t: true,
    effective_range_32_ft: true,
    profile_a2dp_avrpc_hfp_hsp_tmap_csip_mcp_vcp_ccp: [
      "[A2DP, AVRCP, HFP, HSP, TMAP, CSIP, MCP, VCP, CCP]"
    ]
  },
  noise_canceling: {
    quick_attention_yes: true,
    ambient_sound_yes: true
  },
  charging_case: {
    weight_approximate_oz: approx._1.38_oz,
    battery_charge_time_usb_charger充电时间usb充电时间usb充电时间usb充电时间usb充电时间usb充电时间usb充电时间usb充电时间usb充电时间usb充电时间usb充电时间usb充电时间usb充电timeusb充电timeusb充电timeusb充电timeusb充电timeusb充电timeusb充电timeusb充电timeusbchargingcasechargerchargetimeapprox._2_hr_usb_chargingapprox._2_hr_USB_CHARGINGapprox._1_hr_USB_CHARGINGapprox._1_hr_USB_CHARGINGapprox._1_hr_USB_CHARGINGapprox._1_hr_USB_CHARGINGapprox._1_hr_USB_CHARGINGapprox._1_hr_USB_CHARGINGapprox._1_hr_USB_CHARGINGapprox._1_hr_USB_CHARGINGapprox._1_hr_USB_CHARGINGapprox._1_hr_USB_CHARGINGapprox._1_hr_USB_CHARGINGapprox._1_hr_USB_CHARGINGapprox._1_hr_USB_CHARGINGapprox._1_hr_USB_CHARGINGapprox._1_hr_USB_CHARGING Approx._2_HRS USB Charging time USB Charger time USB Charger time USB Charger time USB Charger time USB Charger time USB Charger time USB Charger time USB Charger time USB Charger time USB Charger time USB Charger time USB Charger time USB Charger time USB Charger time USB Charger time USB Charger time usbchargingcasechargerchargetimeapproximately_2_hours_usb_chargingapproximately_2_hours_usb_chargingapproximately_1_hour_usb_chargingapproximately_1_hour_usb_chargingapproximately_1_hour_usb_chargingapproximately_1_hour_usb_chargingapproximately_1_hour_usb_chargingapproximately_1_hour_usb_chargingapproximately_1_hour_usb_chargingapproximately_1_hour_usb_charging Approximately _two hours usb charging approximately two hours usb charging approximately one hour usb charging approximately one hour usb charging approximately one hour usb charging approximately one hour usb charging approximately one hour usb charging approximately one hour usb charging Approximately two hours usb charging approximate two hours usb charging approximate one hour usb charging approximate one hour usb charging approximate one hour usb charging approximate one hour usb charging approximate one hour usb charging Approximately two hours usb charging approximate two hours usb charging approximate one hour usb charging approximate one hour usb charging approximate one hour usb charging Approximately two hours usb charging approximate two hours usb charging approximate one hour usb charging approximate one hour
  }
}
```
