<a href="https://colab.research.google.com/github/ivyisaplantt/Career-Launch/blob/main/shopifyWebscraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Code from https://scrapfly.io/blog/crawling-with-python/

In [2]:
!pip install httpx parsel w3lib tldextract loguru

Collecting parsel
  Downloading parsel-1.9.1-py2.py3-none-any.whl.metadata (11 kB)
Collecting w3lib
  Downloading w3lib-2.2.1-py3-none-any.whl.metadata (2.1 kB)
Collecting tldextract
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting loguru
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting cssselect>=1.2.0 (from parsel)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting jmespath (from parsel)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading parsel-1.9.1-py2.py3-none-any.whl (17 kB)
Downloading w3lib-2.2.1-py3-none-any.whl (21 kB)
Downloading tldextract-5.1.3-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.9/104.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading loguru-0.7.3-py3-none-any.whl (61 kB)
[2K   [

In [3]:
# for extractor function
from typing import List
from urllib.parse import urljoin
from parsel import Selector
import httpx

In [4]:
# for filter class
from typing import Pattern
import posixpath
from urllib.parse import urlparse
from tldextract import tldextract
from w3lib.url import canonicalize_url
from loguru import logger as log

In [5]:
# for Crawler class
import asyncio
from typing import Callable, Dict, Optional, Tuple
# from Filter import UrlFilter

In [6]:
# for nytimes implementation
import re
import json

In [7]:
# url extractor function
def extract_urls(response: httpx.Response) -> List[str]:
  """Return every <a href> link of an HTML response as an absolute URL.

  Args:
    response: the fetched page; its body is parsed as HTML and its url is used
      as the base for resolving relative links.

  Returns:
    The href values in the order the selector returns them, stripped of
    surrounding whitespace and joined onto ``response.url``. Duplicates are kept.
  """
  tree = Selector(text=response.text)
  # one query is enough; the XPath equivalent would be tree.xpath('//a/@href')
  hrefs = tree.css('a::attr(href)').getall()
  # turn relative urls (/foo.html) to absolute (https://domain.com/foo.html)
  return [urljoin(str(response.url), href.strip()) for href in hrefs]

In [8]:
# testing the url extractor function:
# fetch one httpbin page and print every link extract_urls returns for it, one per line
response = httpx.get("http://httpbin.org/links/10/1")
for url in extract_urls(response):
  print(url)

http://httpbin.org/links/10/0
http://httpbin.org/links/10/2
http://httpbin.org/links/10/3
http://httpbin.org/links/10/4
http://httpbin.org/links/10/5
http://httpbin.org/links/10/6
http://httpbin.org/links/10/7
http://httpbin.org/links/10/8
http://httpbin.org/links/10/9


In [9]:
# url filter class
class UrlFilter:
  """Decides which extracted urls are worth following.

  ``filter()`` keeps a url only if it passes, in this order: the http/https
  scheme check, the domain/subdomain check, the file-extension check, the
  optional path rules and the "not returned before" check. Every url that is
  kept is recorded in ``self.seen`` in canonical form, so the same filter
  instance never returns it twice.
  """

  # extensions (without the leading dot) of documents that are not crawlable pages
  IGNORED_EXTENSIONS = [
    # archives
    '7z', '7zip', 'bz2', 'rar', 'tar', 'tar.gz', 'xz', 'zip',
    # images
    'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif', 'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg', 'cdr', 'ico',
    # audio
    'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',
    # video
    '3gp', 'asf', 'asx', 'avi', 'mov', 'mp4', 'mpg', 'qt', 'rm', 'swf', 'wmv', 'm4a', 'm4v', 'flv', 'webm',
    # office suites
    'xls', 'xlsx', 'ppt', 'pptx', 'pps', 'doc', 'docx', 'odt', 'ods', 'odg', 'odp',
    # other
    'css', 'pdf', 'exe', 'bin', 'rss', 'dmg', 'iso', 'apk',
  ]

  def __init__(self, domain: Optional[str] = None, subdomain: Optional[str] = None, follow: Optional[List[Pattern]] = None) -> None:
    """Create a filter.

    Args:
      domain: registered domain a url must have, e.g. "nytimes.com".
      subdomain: subdomain a url must have, e.g. "store".
      follow: compiled patterns; when given, a url is kept only if at least
        one of them matches the start of its path.

    NOTE(review): domain and subdomain default to "" and is_valid_domain
    compares with ==, so a filter built with the defaults only accepts urls
    whose registered domain and subdomain are both empty - confirm whether
    "no restriction" was the intent.
    """
    # restrict filtering to specific TLD
    self.domain = domain or ""
    # restrict filtering to specific subdomain
    self.subdomain = subdomain or ""
    self.follow = follow or []
    log.info(f"filter created for domain {self.subdomain}.{self.domain} with follow rules {follow}")
    # canonical form of every url filter() has returned so far
    self.seen = set()

  def is_valid_ext(self, url):
    """Return False when the url path points at a file with an ignored extension."""
    # IGNORED_EXTENSIONS has no leading dots and contains the compound 'tar.gz',
    # while posixpath.splitext() yields '.jpg' / '.gz'; comparing those directly never
    # matched, so check dotted suffixes against the file name instead
    filename = posixpath.basename(urlparse(url).path).lower()
    ignored_suffixes = tuple(f".{ext}" for ext in self.IGNORED_EXTENSIONS)
    return not filename.endswith(ignored_suffixes)

  def is_valid_scheme(self, url):
    """Return True only for http and https urls (drops mailto:, javascript:, ...)."""
    return urlparse(url).scheme in ["http", "https"]

  def is_valid_domain(self, url):
    """Return True when both the registered domain and the subdomain equal the configured ones."""
    # ignore offsite urls (only keep urls with same domain and subdomain)
    parsed = tldextract.extract(url)
    return parsed.registered_domain == self.domain and parsed.subdomain == self.subdomain

  def is_valid_path(self, url):
    """Return True when no follow rules are set or any rule matches the url path."""
    if not self.follow:
      return True
    path = urlparse(url).path
    # pattern.match anchors at the start of the path
    return any(pattern.match(path) for pattern in self.follow)

  def is_new(self, url):
    """Return True when the canonical form of the url has not been returned by filter() yet."""
    return canonicalize_url(url) not in self.seen

  def filter(self, urls: List[str]) -> List[str]:
    """Return the urls that pass every check, in input order, and mark them as seen.

    Each dropped url is logged at debug level together with the first check it failed.
    """
    found = []
    for url in urls:
      if not self.is_valid_scheme(url):
        log.debug(f"drop ignored scheme {url}")
        continue
      if not self.is_valid_domain(url):
        log.debug(f"drop domain mismatch {url}")
        continue
      if not self.is_valid_ext(url):
        log.debug(f"drop ignored extension {url}")
        continue
      if not self.is_valid_path(url):
        log.debug(f"drop ignored path {url}")
        continue
      if not self.is_new(url):
        log.debug(f"drop duplicate {url}")
        continue
      # remember the canonical form so later spellings of the same url count as duplicates
      self.seen.add(canonicalize_url(url))
      found.append(url)
    return found

In [None]:
# testing the url filter function
# build a filter for store.nytimes.com and run the links of the store front page through it
nytimes_filter = UrlFilter("nytimes.com", "store")
response = httpx.get("https://store.nytimes.com")
urls = extract_urls(response)
filtered = nytimes_filter.filter(urls)
# same list, same filter: every url accepted above is now in nytimes_filter.seen,
# so this second pass shows the duplicate check at work and returns an empty list
filtered_2nd_page = nytimes_filter.filter(urls)
print(filtered)
print(filtered_2nd_page)

In [10]:
# Crawler class
class Crawler:
  """Asynchronous crawler that scrapes level by level with an httpx.AsyncClient.

  The HTTP session only exists inside the async context manager, so use it as::

      async with Crawler(url_filter, callbacks=callbacks) as crawler:
          await crawler.run(["https://example.com/"])
  """

  def __init__(self, filter: UrlFilter, callbacks: Optional[Dict[Pattern, Callable]] = None) -> None:
    """Store the url filter and the callbacks.

    Args:
      filter: UrlFilter deciding which discovered urls are scraped next.
      callbacks: mapping of compiled pattern -> function. The function is called
        as ``fn(response=...)`` for every scraped response whose url the pattern matches.
    """
    self.url_filter = filter
    self.callbacks = callbacks or {}

  async def __aenter__(self):
    """Open the shared HTTP session and return the crawler itself."""
    self.session = await httpx.AsyncClient(
        timeout=httpx.Timeout(60.0),
        limits=httpx.Limits(max_connections=5),
        headers={
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            # language ranges are comma-separated; "en-US;en;q=0.9" was not a valid header value
            "accept-language": "en-US,en;q=0.9",
            "accept-encoding": "gzip, deflate, br",
        },
    ).__aenter__()
    return self

  async def __aexit__(self, *args, **kwargs):
    """Close the HTTP session opened in __aenter__."""
    await self.session.__aexit__(*args, **kwargs)

  def parse(self, responses: List[httpx.Response]) -> List[str]:
    """Collect the <a href> links of all responses and return those the url filter accepts.

    Args:
      responses: scraped pages to extract links from.

    Returns:
      The urls to scrape next, as returned by ``self.url_filter.filter``.
    """
    # the parameter used to be called `response` while the loop iterated the undefined
    # name `responses`, which raised NameError on the first call
    all_unique_urls = set()
    for response in responses:
      sel = Selector(text=response.text, base_url=str(response.url))
      # resolve relative links against the url of the page they were found on
      all_unique_urls.update(
          urljoin(str(response.url), url.strip())
          for url in sel.xpath("//a/@href").getall()
      )
    urls_to_follow = self.url_filter.filter(all_unique_urls)
    log.info(f"found {len(urls_to_follow)} urls to follow (from total {len(all_unique_urls)})")
    return urls_to_follow

  async def scrape_url(self, url):
    """GET a single url with the shared session, following redirects."""
    return await self.session.get(url, follow_redirects=True)

  async def scrape(self, urls: List[str]) -> Tuple[List[httpx.Response], List[Exception]]:
    """Scrape all urls concurrently.

    Returns:
      ``(responses, failures)``: the httpx.Response objects that came back, and
      everything else asyncio.gather returned (the raised exceptions).
    """
    responses = []
    failures = []
    log.info(f"scraping {len(urls)} urls")

    tasks = [self.scrape_url(url) for url in urls]
    # return_exceptions=True: one failing request must not cancel the whole batch
    for result in await asyncio.gather(*tasks, return_exceptions=True):
      if isinstance(result, httpx.Response):
        responses.append(result)
      else:
        failures.append(result)
    return responses, failures

  async def run(self, starts_urls: List[str], max_depth=5) -> None:
    """Crawl from the start urls until no new urls are found or the depth limit is passed.

    The start urls are depth 0 and the loop runs while ``depth <= max_depth``,
    so at most ``max_depth + 1`` batches are scraped.
    """
    url_pool = starts_urls
    depth = 0
    while url_pool and depth <= max_depth:
      responses, failures = await self.scrape(url_pool)
      log.info(f"depth {depth}: scraped {len(responses)} pages and failed {len(failures)}")
      url_pool = self.parse(responses)
      await self.callback(responses)
      depth += 1

  async def callback(self, responses):
    """Call every registered callback whose pattern matches a response's url."""
    for response in responses:
      for pattern, fn in self.callbacks.items():
        if pattern.match(str(response.url)):
          log.debug(f'found matching callback for {response.url}')
          fn(response=response)


In [11]:
def extract_json_objects(text: str, decoder=json.JSONDecoder()):
    """Find JSON objects embedded in arbitrary text and yield each decoded object.

    The text is scanned left to right for "{". Wherever a valid JSON object
    starts, it is decoded and yielded, and scanning resumes right after it (so
    objects nested inside a yielded object are not yielded again). A "{" that
    does not start valid JSON is skipped.

    Args:
        text: the text to scan, e.g. the body of a <script> tag.
        decoder: the json.JSONDecoder used for decoding.

    Yields:
        The decoded objects, in order of appearance.
    """
    pos = 0
    while True:
        start = text.find('{', pos)
        if start == -1:
            break
        try:
            # decode in place from `start`; slicing text[start:] copied the rest of
            # the text for every "{" and made long inputs quadratic
            result, end = decoder.raw_decode(text, start)
        except ValueError:
            # not valid JSON here, retry from the next character
            pos = start + 1
        else:
            yield result
            pos = end

def find_json_in_script(response: httpx.Response, keys):
    """Return the JSON objects found in the page's <script> tags that mention all given keys.

    A script is only parsed when it contains every key as a double-quoted
    string; a decoded object is only kept when every key occurs in its
    ``str()`` representation.
    """
    script_bodies = Selector(text=response.text).xpath('//script/text()').getall()
    candidates = []
    for body in script_bodies:
        # cheap textual pre-check before attempting to decode anything
        if all(f'"{key}"' in body for key in keys):
            candidates.extend(extract_json_objects(body))
    return [
        candidate
        for candidate in candidates
        if all(key in str(candidate) for key in keys)
    ]

In [None]:
# testing above code
# fetch one product page and collect the script-embedded JSON objects that mention
# both "published_at" and "price"
url = "https://store.nytimes.com/collections/apparel/products/a1-stacked-logo-shirt"
response = httpx.get(url)
products = find_json_in_script(response, ["published_at", "price"])
# pretty-print, keeping only the first 500 characters so the output stays short
print(json.dumps(products, indent=2, ensure_ascii=False)[:500])

[
  {
    "id": 6984160215110,
    "title": "Stacked Logo Shirt",
    "handle": "a1-stacked-logo-shirt",
    "description": "<p>Wear The Times proudly with this simple yet expressive T-shirt, featuring the world-renowned New York Times logo, stacked on three lines. This 100% cotton short-sleeve shirt has a comfortable unisex fit and is available in black and gray, in sizes ranging from XS to 4XL. It also comes in kids sizes for budding young journalists.</p>\n<!-- split -->\n<p>The Times logo ha


In [19]:
# product objects collected by parse_product over the whole crawl
results = []


def parse_product(response):
    """Crawler callback: add the product JSON of a product page to ``results``.

    Logs a warning when the page holds no object mentioning both
    "published_at" and "price".
    """
    found = find_json_in_script(response, ["published_at", "price"])
    if found:
        results.extend(found)
    else:
        log.warning(f"could not find product data in {response.url}")



async def run():
    """Crawl store.nytimes.com from its front page, then print the collected products.

    Every scraped page whose url contains "/products/" is passed to parse_product.
    """
    # any url that contains "/products/" is a product page
    product_page = re.compile(".+/products/.+")
    store_filter = UrlFilter(domain="nytimes.com", subdomain="store")
    async with Crawler(store_filter, callbacks={product_page: parse_product}) as crawler:
        await crawler.run(["https://store.nytimes.com/"])
    print(results)



In [22]:
# In a notebook (Jupyter/Colab) the kernel already runs an asyncio event loop on this
# thread, and asyncio.run() refuses to start while a loop is running. Running the crawl
# through asyncio.run() in a worker thread gives it a loop of its own, so this cell works
# both in a notebook and as a plain script. (Notebook-only alternative: `await run()`.)
# NOTE(review): the earlier "run() missing 1 required positional argument: 'main'" error
# looks like the name `run` had been rebound to asyncio.run by another cell - confirm
# with Restart & Run All.
if __name__ == "__main__":
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as executor:
        # .result() blocks until the crawl has finished and re-raises any exception from it
        executor.submit(lambda: asyncio.run(run())).result()

TypeError: run() missing 1 required positional argument: 'main'