In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from typing import Iterable, List, Dict, Union
from dataclasses import dataclass
import json
import newspaper

from urllib.parse import urlparse
from data_sources.news.scrape import remove_img_tags

def response_to_article(
    response: scrapy.http.Response,
) -> newspaper.Article:
    """Turn a Scrapy response into a parsed newspaper Article.

    The response text is passed through ``remove_img_tags`` and handed to
    newspaper as ``input_html``, together with the response URL and with
    image fetching switched off.
    """
    page_url = response.url
    # Images are removed up front so they are never downloaded, which crashes
    html_without_images = remove_img_tags(response.text)

    parsed = newspaper.article(
        page_url,
        language="en",
        input_html=html_without_images,
        fetch_images=False,
    )
    parsed.parse()
    return parsed


def article_to_markdown(article: newspaper.Article, max_chars=None) -> str:
    """Render a parsed article as Markdown for summarization.

    The result is a level-one heading that links to ``article.url`` and whose
    link text is the title, followed by " by <authors>" when authors are
    present and " on <YYYY-MM-DD>" when a publish date is present. The article
    text follows on the next line, cut to ``max_chars`` characters when
    ``max_chars`` is truthy.
    """
    heading_parts = [article.title]
    if article.authors:
        heading_parts.append(" by " + ", ".join(article.authors))
    if article.publish_date:
        heading_parts.append(" on " + article.publish_date.strftime("%Y-%m-%d"))
    link_text = "".join(heading_parts)

    # A falsy max_chars (None or 0) means the text is left untruncated
    body = article.text[:max_chars] if max_chars else article.text

    return f"# [{link_text}]({article.url})\n{body}"


class CompanySpider(scrapy.Spider):
    """Spider that scrapes the pages of a single company domain.

    Each downloaded page is emitted as an item holding its URL, raw HTML and
    a Markdown rendering; links that stay on the configured domain (or one of
    its subdomains) are followed with the same callback.
    """

    name = "company"
    allowed_domains: List[str]
    start_urls: List[str]

    def __init__(self, domain: str, *args, **kwargs):
        """Limit the crawl to ``domain`` and start from ``http://<domain>``."""
        super().__init__(*args, **kwargs)
        self.allowed_domains = [domain]
        self.start_urls = [f"http://{domain}"]

    def _is_allowed_url(self, url: str) -> bool:
        """Return True when ``url`` points at the crawl domain or a subdomain.

        The comparison uses the parsed hostname (no port or credentials) and
        requires a label boundary, so ``notexample.com`` does not match
        ``example.com`` while ``www.example.com`` does. URLs without a host
        (``mailto:``, ``javascript:`` ...) and unparsable URLs are rejected.
        """
        try:
            host = urlparse(url).hostname or ""
        except ValueError:
            return False
        domain = self.allowed_domains[0].lower()
        host = host.lower()
        return host == domain or host.endswith("." + domain)

    def parse(self, response: scrapy.http.Response) -> Iterable[Union[Dict, scrapy.http.Request]]:
        """Emit the current page as an item, then follow same-domain links.

        Yields:
            A dict with keys ``url``, ``html`` and ``markdown`` for the page,
            followed by one ``scrapy.Request`` per link that stays on the
            crawl domain.
        """
        # TODO: Replace this with extraction code
        parsed_article = response_to_article(response)
        markdown = article_to_markdown(parsed_article)
        yield {
            "url": response.url,
            "html": response.text,
            "markdown": markdown,
        }

        # Follow links to other pages within the same domain
        for href in response.css("a::attr(href)").getall():
            try:
                # Resolve every href against the page URL so that relative
                # links such as "about.html" or "../team" are not dropped.
                url = response.urljoin(href.strip())
            except ValueError:
                # A malformed href must not abort the remaining links
                self.logger.debug("Skipping malformed link %r on %s", href, response.url)
                continue
            if self._is_allowed_url(url):
                yield scrapy.Request(url, callback=self.parse)



In [None]:


def crawl_company_webpage(
    domain: str,
    output_path: str = "output.json",
    depth_limit: int = 1,
    download_delay: float = 1,
) -> None:
    """Crawl ``domain`` with CompanySpider and write the items to a JSON feed.

    Args:
        domain: Domain to crawl, passed to the spider as ``domain``.
        output_path: Feed file to write; it is registered with
            ``overwrite`` enabled.
        depth_limit: Value for Scrapy's ``DEPTH_LIMIT`` setting.
            NOTE: Setting this to 0 will crawl everything.
        download_delay: Value for Scrapy's ``DOWNLOAD_DELAY`` setting.

    Returns:
        None. The scraped items are only available through ``output_path``.
    """
    process = CrawlerProcess(settings={
        "FEEDS": {
            output_path: {
                "format": "json",
                "overwrite": True,
            },
        },
        "DOWNLOAD_DELAY": download_delay,
        # "DEFAULT_REQUEST_HEADERS": {
        #     "User-Agent": "Mozilla/5.0 (compatible; CompanyBot/1.0; +http://example.com/bot)",
        #     "Accept-Language": "en",
        # },
        "DEPTH_LIMIT": depth_limit,
    })

    process.crawl(CompanySpider, domain=domain)

    # NOTE: This can only be run once per process, so in a notebook we need to restart the kernel
    process.start()


# crawl_company_webpage has no return value; the scraped pages are written to
# output.json, so the call is not bound to a (misleading) result variable.
crawl_company_webpage("synthesize.bio")

In [None]:
# Read back the items that the crawl wrote to the JSON feed file
with open("output.json", "r") as file:
    data = json.loads(file.read())

# Print a short preview of every page, ordered by URL
for page in sorted(data, key=lambda p: p["url"]):
    print(
        f"\n# {page['url']}\n"
        f"{len(page['markdown']):,} chars in markdown\n"
        f"\n"
        f"{page['markdown'][:300]}...\n"
    )

# Overall amount of Markdown text across all crawled pages
total_chars = sum(map(len, (p["markdown"] for p in data)))
print(f"Total chars: {total_chars:,}")