In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from typing import Iterable, List, Dict, Union
import json
import newspaper

from urllib.parse import urlparse
from data_sources.news.scrape import remove_img_tags, article_to_markdown

def response_to_article(
    response: scrapy.http.Response,
) -> newspaper.Article:
    """Build a parsed newspaper Article from an already-fetched scrapy response.

    The response body is passed through ``remove_img_tags`` and handed to
    newspaper as ``input_html`` together with the response URL; image
    fetching is switched off.
    """
    page_url = response.url
    # Remove images before handing the HTML over, to prevent downloading
    # them, which crashes.
    html_without_images = remove_img_tags(response.text)
    article_options = {
        "language": "en",
        "input_html": html_without_images,
        "fetch_images": False,
    }
    parsed = newspaper.article(page_url, **article_options)
    parsed.parse()
    return parsed


class CompanySpider(scrapy.Spider):
    """Spider that crawls a single company website and emits one item per page.

    The crawl starts at ``http://{domain}`` and follows anchor links that stay
    on ``domain`` or one of its subdomains. Every yielded item is a dict with
    the keys ``url``, ``html`` and ``markdown``.
    """

    name = "company"
    allowed_domains: List[str]
    start_urls: List[str]

    def __init__(self, domain: str, *args, **kwargs):
        """Restrict the crawl to ``domain`` and seed it with the domain root.

        Args:
            domain: Bare host name to crawl (no scheme), e.g. ``"example.com"``.
            *args: Forwarded to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded to ``scrapy.Spider.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.allowed_domains = [domain]
        self.start_urls = [f"http://{domain}"]

    def _is_same_site(self, url: str) -> bool:
        """Return True when ``url`` points at the crawled domain or a subdomain."""
        # ``hostname`` (unlike ``netloc``) is lower-cased and excludes any
        # port or userinfo, so "Example.com:443" still matches "example.com".
        host = urlparse(url).hostname
        if not host:
            # mailto:, javascript:, tel: and similar links carry no host.
            return False
        domain = self.allowed_domains[0].lower()
        # Require a label boundary so "evil-example.com" does not match
        # "example.com" the way a bare suffix test would.
        return host == domain or host.endswith("." + domain)

    def parse(self, response: scrapy.http.Response) -> Iterable[Union[Dict, scrapy.http.Request]]:
        """Emit the page as an item, then schedule its same-site links.

        Args:
            response: The downloaded page.

        Yields:
            A dict with ``url``, ``html`` and ``markdown`` for the page,
            followed by one ``scrapy.Request`` per same-site link found in it.
        """
        # Binary downloads (PDFs, images, ...) expose neither ``.text`` nor
        # ``.css``; skip them instead of raising inside the callback.
        if not isinstance(response, scrapy.http.TextResponse):
            return

        parsed_article = response_to_article(response)
        markdown = article_to_markdown(parsed_article)
        yield {
            "url": response.url,
            "html": response.text,
            "markdown": markdown,
        }

        # Follow links to other pages within the same domain
        for href in response.css("a::attr(href)").getall():
            try:
                # Resolve every href against the page URL so document-relative
                # links ("about.html", "../team") are followed too, not only
                # those beginning with "/".
                absolute_url = response.urljoin(href.strip())
                same_site = self._is_same_site(absolute_url)
            except ValueError:
                # A malformed href must not stop the remaining links on the
                # page from being followed.
                continue
            if same_site:
                yield scrapy.Request(absolute_url, callback=self.parse)



In [None]:
def crawl_company_webpage(
    domain: str,
    depth_limit: int = 1,
    *,
    output_dir: str = "../output/data/scrapy",
    download_delay: float = 1,
) -> str:
    """Crawl ``domain`` with ``CompanySpider`` and export the pages as JSON.

    Args:
        domain: Bare host name to crawl (no scheme), e.g. ``"example.com"``.
        depth_limit: Value passed to Scrapy's ``DEPTH_LIMIT`` setting.
        output_dir: Directory the ``{domain}.json`` feed is written into.
        download_delay: Value passed to Scrapy's ``DOWNLOAD_DELAY`` setting.

    Returns:
        Path of the JSON feed file; an existing file at that path is
        overwritten.
    """
    filename = f"{output_dir}/{domain}.json"
    process = CrawlerProcess(settings={
        "FEEDS": {
            filename: {
                "format": "json",
                "overwrite": True,
            },
        },
        "DOWNLOAD_DELAY": download_delay,
        # Custom request headers are currently disabled:
        # "DEFAULT_REQUEST_HEADERS": {
        #     "User-Agent": "Mozilla/5.0 (compatible; CompanyBot/1.0; +http://example.com/bot)",
        #     "Accept-Language": "en",
        # },
        "DEPTH_LIMIT": depth_limit,
    })

    process.crawl(CompanySpider, domain=domain)

    # NOTE: This can only be run once per process, so in a notebook we need to restart the kernel
    process.start()

    return filename


# Crawl the target site two link-levels deep and keep the path of the JSON feed.
json_file = crawl_company_webpage(domain="98point6.com", depth_limit=2)

In [None]:
# Preview the crawl output: print up to MAX_PREVIEW_CHARS of each page's
# markdown, ordered by URL, then the total number of characters printed.
MAX_PREVIEW_CHARS = 4000

# Read with an explicit encoding so the result does not depend on the locale.
with open(json_file, "r", encoding="utf-8") as file:
    data = json.load(file)

total_chars = 0
for page in sorted(data, key=lambda p: p["url"]):
    markdown = page["markdown"]
    truncated_page = markdown[:MAX_PREVIEW_CHARS]
    # Only show an ellipsis when content was actually cut off.
    suffix = "..." if len(markdown) > MAX_PREVIEW_CHARS else ""
    print(f"""
# {page["url"]}
{len(markdown):,} chars in markdown

{truncated_page}{suffix}
""")
    # Counts the previewed (truncated) text, not the full page length.
    total_chars += len(truncated_page)

print(f"Total chars: {total_chars:,}")