# Summarize HN

Jon Chun
30 Sep 2024

# Installation

In [None]:
!pip install requests beautifulsoup4 openai python-dotenv aiohttp spacy
!python -m spacy download en_core_web_sm


Collecting openai
  Downloading openai-1.51.0-py3-none-any.whl.metadata (24 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.51.0-py3-none-any.whl (383 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 383.5/383.5 kB 6.5 MB/s eta 0:00:00
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Downloading httpx-0.27.2-py3-none-any.whl (76 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# **[RESTART RUNTIME]**

# Setup

In [None]:
import os
import getpass
import re
import requests
from bs4 import BeautifulSoup
import openai
from dotenv import load_dotenv
import logging
from typing import List, Dict, Tuple
import asyncio
import aiohttp
import sys
import spacy
from collections import Counter
from google.colab import files

# Log INFO and above, prefixing each record with a timestamp and its level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read key=value pairs from a local .env file, if one exists, into the environment.
load_dotenv()

# Hacker News item page whose comments are processed by main().
URL_TARGET = 'https://news.ycombinator.com/item?id=40515465'



In [None]:
# Load spaCy's small English pipeline (the model downloaded in the installation cell).
nlp = spacy.load("en_core_web_sm")

# Select the HN URL

In [None]:
# Hacker News item page to process (same value as assigned in the setup cell).
URL_TARGET = 'https://news.ycombinator.com/item?id=40515465'

# OpenAI Key

In [None]:
import getpass

In [None]:
# Prompt for the key interactively; getpass does not echo the typed characters.
OPENAI_API_KEY = getpass.getpass("Enter OpenAI API Key: ")
# Also hand the key to the openai package. The request helpers in this notebook
# send OPENAI_API_KEY themselves in an Authorization header.
openai.api_key = OPENAI_API_KEY

Enter OpenAI API Key: ··········


# Functions

In [None]:
async def fetch_web_page(url: str, session: aiohttp.ClientSession) -> str:
    """Download ``url`` through ``session`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        session: Open aiohttp session used to issue the GET request.

    Returns:
        The decoded body of the response.

    Raises:
        aiohttp.ClientError: Logged and re-raised when the request fails or the
            server answers with an error status.
    """
    try:
        async with session.get(url) as resp:
            # Turn 4xx/5xx answers into exceptions before reading the body.
            resp.raise_for_status()
            body = await resp.text()
            return body
    except aiohttp.ClientError as e:
        logging.error(f"Error fetching {url}: {e}")
        raise

def _comment_indent(comment) -> int:
    """Return the nesting depth of a comment row, or 0 when it cannot be read.

    The depth is the width of the spacer image inside the row's ``td.ind``
    cell divided by 40. A row without that cell or image, or with a
    non-numeric width, is treated as a top-level comment instead of
    aborting the whole parse.
    """
    indent_cell = comment.find('td', class_='ind')
    spacer = indent_cell.find('img') if indent_cell is not None else None
    if spacer is None:
        return 0
    try:
        return int(spacer.get('width', 0)) // 40
    except (TypeError, ValueError):
        return 0


def parse_threads(html_content: str) -> List[Dict]:
    """Extract the comments of a Hacker News item page as a flat list.

    Args:
        html_content: HTML of the item page.

    Returns:
        One dict per comment row that has a ``div.comment`` body, in document
        order, with the keys ``id`` (the row's ``id`` attribute), ``indent``
        (nesting depth), ``content`` (comment text) and ``replies`` (an empty
        list to be filled in by a later step).

    Raises:
        AttributeError: Logged and re-raised if the markup cannot be walked.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    threads = []

    try:
        for comment in soup.find_all('tr', class_='athing comtr'):
            content = comment.find('div', class_='comment')
            if not content:
                continue

            threads.append({
                'id': comment.get('id'),
                'indent': _comment_indent(comment),
                # Join text fragments with a space: without a separator,
                # adjacent paragraphs and link texts are glued together
                # ("...example.comNext sentence"), which corrupts URL and
                # entity extraction downstream.
                'content': content.get_text(separator=' ', strip=True),
                'replies': []
            })
    except AttributeError as e:
        logging.error(f"Error parsing HTML: {e}")
        raise

    return threads

def build_thread_tree(threads: List[Dict]) -> List[Dict]:
    """Nest a flat, document-ordered list of comments into a reply tree.

    Each comment is attached to the ``replies`` list of the closest preceding
    comment whose ``indent`` is strictly smaller; comments without such a
    predecessor become roots. The input dicts are linked in place.

    Args:
        threads: Comment dicts carrying at least ``indent`` and ``replies``.

    Returns:
        The list of root comments.
    """
    roots = []
    ancestors = []  # chain of still-open comments, shallowest first

    for node in threads:
        depth = node['indent']

        # Close every open comment that is at the same depth or deeper.
        while ancestors and depth <= ancestors[-1]['indent']:
            ancestors.pop()

        siblings = ancestors[-1]['replies'] if ancestors else roots
        siblings.append(node)
        ancestors.append(node)

    return roots

# Lower-case words that should not be reported as resources. Built once at
# definition time instead of on every call.
_COMMON_WORDS = frozenset({'more', 'as', 'where', 'there', 'again', 'many', 'internet', 'edit', 'to'})


def is_common_word(word: str) -> bool:
    """Return True if ``word``, compared case-insensitively, is in the stop list.

    The whole string is compared, so a multi-word phrase only matches if the
    complete phrase is listed.
    """
    return word.lower() in _COMMON_WORDS

# Compiled once; the pattern text is unchanged from the original inline version.
_URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

# spaCy entity labels that are reported as resources.
_RESOURCE_ENTITY_LABELS = frozenset({"PERSON", "ORG", "PRODUCT", "WORK_OF_ART"})


def extract_resources(text: str) -> List[Tuple[str, str, str]]:
    """Collect the URLs and named entities mentioned in ``text``.

    Relies on the module-level spaCy pipeline ``nlp`` loaded earlier in the
    notebook and on ``is_common_word`` for filtering.

    Args:
        text: Plain text of a comment or thread.

    Returns:
        A list of ``(type, name, url)`` tuples in order of appearance: first
        one ``("URL", url, url)`` per URL match, then one
        ``(label, entity_text, "")`` per recognised entity whose label is
        PERSON, ORG, PRODUCT or WORK_OF_ART. Repeated mentions are kept so
        that callers can count them.
    """
    resources = []

    for match in _URL_PATTERN.findall(text):
        # The pattern accepts '.', ',', ';' etc., so a URL that ends a sentence
        # would otherwise carry the sentence punctuation with it.
        url = match.rstrip('.,;:!?')
        resources.append(("URL", url, url))

    # Use the NER model for the entity type. The previous heuristic ran the same
    # capitalised-word regex once per label and therefore reported every match
    # four times, once under each of the four types.
    for ent in nlp(text).ents:
        name = ent.text.strip()
        if ent.label_ in _RESOURCE_ENTITY_LABELS and name and not is_common_word(name):
            resources.append((ent.label_, name, ""))

    return resources

async def format_resources(resources: Dict[int, Tuple[str, str, str]]) -> str:
    """Render the collected resources as a frequency-ranked text report.

    Resources are grouped by ``(type, name)``, counted, and listed from most
    to least frequent (ties keep first-seen order). Entries of type ``"URL"``
    additionally get a description obtained from ``get_url_description``, one
    request at a time.

    Args:
        resources: Mapping of an arbitrary key to a ``(type, name, url)`` tuple.

    Returns:
        The report text, starting with a heading line.
    """
    tally = Counter((kind, name) for kind, name, _url in resources.values())
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)

    chunks = ["# Combined and Deduplicated Resources List\n\n"]

    async with aiohttp.ClientSession() as session:
        for (res_type, res_name), count in ranked:
            chunks.append(f"(count {count}) Type: {res_type}, Name: {res_name}")
            if res_type == "URL":
                description = await get_url_description(res_name, session)
                chunks.append(f"\n   URL: {res_name}\n   {description}")
            chunks.append("\n\n")

    return "".join(chunks)

# Summarize Threads

In [None]:
# Budget, in characters, for the reply text added to one prompt. A conservative
# cap so that a very large thread does not exceed the model's context window.
_MAX_THREAD_CHARS = 12000


def _thread_text(thread: Dict, max_chars: int = _MAX_THREAD_CHARS) -> str:
    """Render a comment and its nested replies as one text, in reading order.

    Replies are indented by two spaces per nesting level. The root comment is
    always included in full; replies are appended until adding the next one
    would exceed ``max_chars``.
    """
    pieces = []
    used = 0
    pending = [(thread, 0)]  # explicit stack: no recursion limit on deep threads
    while pending:
        node, depth = pending.pop()
        piece = f"{'  ' * depth}{node['content']}"
        if pieces and used + len(piece) > max_chars:
            break
        pieces.append(piece)
        used += len(piece) + 2
        # Push in reverse so the first reply is popped (and rendered) first.
        pending.extend((reply, depth + 1) for reply in reversed(node.get('replies', [])))
    return "\n\n".join(pieces)


async def summarize_thread(thread: Dict, session: aiohttp.ClientSession) -> Tuple[str, List[Tuple[str, str, str]]]:
    """Summarize one discussion thread with the OpenAI chat completions API.

    The prompt covers the root comment together with its nested ``replies``
    (previously only the root comment's own text was sent, so replies never
    reached the summary or the resource list).

    Args:
        thread: Comment dict with ``content`` and, optionally, ``replies``.
        session: Open aiohttp session used for the API request.

    Returns:
        ``(summary, resources)`` where ``resources`` is the output of
        ``extract_resources`` for the same text that was summarized.

    Raises:
        aiohttp.ClientError: Logged and re-raised when the API call fails.
    """
    thread_text = _thread_text(thread)
    prompt = f"Summarize the following discussion thread concisely:\n\n{thread_text}"

    try:
        async with session.post(
            "https://api.openai.com/v1/chat/completions",
            headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
            json={
                "model": "gpt-3.5-turbo",
                "messages": [{"role": "user", "content": prompt}],
                "max_tokens": 150
            }
        ) as response:
            response.raise_for_status()
            result = await response.json()
            summary = result['choices'][0]['message']['content'].strip()
            resources = extract_resources(thread_text)
            return summary, resources
    except aiohttp.ClientError as e:
        logging.error(f"Error calling OpenAI API: {e}")
        raise

async def summarize_threads(threads: List[Dict]) -> Tuple[List[Dict], Dict[int, Tuple[str, str, str]]]:
    """Summarize all given threads concurrently and pool their resources.

    Every thread dict is updated in place with a ``summary`` and a
    ``resources`` entry. A thread whose summarization raised gets the
    placeholder summary and an empty resource list; the error is logged.

    Args:
        threads: Thread dicts to summarize.

    Returns:
        ``(threads, all_resources)`` where ``all_resources`` maps a running
        number starting at 1 to each resource tuple of the successful threads.
    """
    async with aiohttp.ClientSession() as session:
        outcomes = await asyncio.gather(
            *(summarize_thread(thread, session) for thread in threads),
            return_exceptions=True,
        )

    pooled = []
    for thread, outcome in zip(threads, outcomes):
        if isinstance(outcome, Exception):
            logging.error(f"Error summarizing thread {thread['id']}: {outcome}")
            thread['summary'], thread['resources'] = "Error in summarization", []
            continue

        thread['summary'], thread['resources'] = outcome
        pooled.extend(thread['resources'])

    return threads, dict(enumerate(pooled, start=1))

def combine_summaries(threads: List[Dict]) -> str:
    """Concatenate the per-thread summaries into one text section.

    Args:
        threads: Thread dicts carrying ``id`` and ``summary``.

    Returns:
        A heading followed by one "Thread ID / Summary" entry per thread.
    """
    header = "Individual Thread Summaries:\n\n"
    entries = (
        f"Thread ID: {thread['id']}\nSummary: {thread['summary']}\n\n"
        for thread in threads
    )
    return header + "".join(entries)

# Outline Synthesis and Formatting

In [None]:
async def create_outline_synthesis(threads: List[Dict], session: aiohttp.ClientSession) -> str:
    """Ask the OpenAI chat completions API for an outline over all summaries.

    Args:
        threads: Thread dicts that already carry a ``summary`` entry.
        session: Open aiohttp session used for the API request.

    Returns:
        The stripped text of the first completion choice.

    Raises:
        aiohttp.ClientError: Logged and re-raised when the API call fails.
    """
    # One summary per line, in thread order.
    all_summaries = "\n".join(thread['summary'] for thread in threads)
    prompt = f"""Create an outline synthesizing all the topics from the following summaries.
    Include supporting details like:
    a. Concise description
    b. Pros
    c. Cons
    d. Related resources (if any)

    Summaries:
    {all_summaries}
    """

    endpoint = "https://api.openai.com/v1/chat/completions"
    auth_headers = {"Authorization": f"Bearer {OPENAI_API_KEY}"}
    request_body = {
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 1000,
    }

    try:
        async with session.post(endpoint, headers=auth_headers, json=request_body) as resp:
            resp.raise_for_status()
            payload = await resp.json()
            return payload['choices'][0]['message']['content'].strip()
    except aiohttp.ClientError as e:
        logging.error(f"Error calling OpenAI API for synthesis: {e}")
        raise


async def get_url_description(url: str, session: aiohttp.ClientSession) -> str:
    """Ask the OpenAI chat completions API for a short title and description of a URL.

    Only the URL string itself is sent; the page behind it is not fetched here.

    Args:
        url: The URL to describe.
        session: Open aiohttp session used for the API request.

    Returns:
        The stripped text of the first completion choice, or a fixed
        "Title: Unknown" placeholder when the request fails (the failure is
        logged, not raised).
    """
    endpoint = "https://api.openai.com/v1/chat/completions"
    auth_headers = {"Authorization": f"Bearer {OPENAI_API_KEY}"}
    question = f"Provide a concise 1-3 word title and a brief 1 sentence description for this URL: {url}"
    request_body = {
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": question}],
        "max_tokens": 50,
    }

    try:
        async with session.post(endpoint, headers=auth_headers, json=request_body) as resp:
            resp.raise_for_status()
            payload = await resp.json()
            return payload['choices'][0]['message']['content'].strip()
    except aiohttp.ClientError as e:
        # Best effort: a missing description must not abort the whole report.
        logging.error(f"Error calling OpenAI API for URL description: {e}")
        return "Title: Unknown\nDescription: Unable to fetch description."

# NOTE: extract_resources is not redefined here; the definition in the Functions section above is used unchanged.

def get_hn_story_title(html_content: str) -> str:
    """Return a file-name-safe slug of the story title found in the page.

    The title is read from the ``span.titleline`` inside the first
    ``tr.athing`` row, lower-cased, and every character other than ASCII
    letters, digits and underscore is replaced by ``_``.

    Args:
        html_content: HTML of a Hacker News item page.

    Returns:
        The slug, or ``'untitled'`` when the page has no title line. The value
        is only used to build a file name, so a missing title is logged as a
        warning instead of aborting the run with an opaque AttributeError.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    story_row = soup.find('tr', class_='athing')
    title_span = story_row.find('span', class_='titleline') if story_row is not None else None
    if title_span is None:
        logging.warning("Story title not found in page; using 'untitled' in the file name")
        return 'untitled'

    title = title_span.text.strip()
    return re.sub(r'[^a-zA-Z0-9_]', '_', title.lower())

# File Saving and Main

In [None]:
import nest_asyncio
# Patch asyncio so that an already running event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel runs its own loop — confirm.
nest_asyncio.apply()

async def main():
    """Draft entry point: fetch the target page and build the resource report only.

    NOTE(review): this body is incomplete (see the placeholder comment near the
    end), and a later cell defines ``main`` again, so this version is replaced
    once that cell has run. ``output_file`` and ``formatted_resources`` are
    computed but not used here. Any exception is logged and swallowed; nothing
    is returned.
    """
    try:
        # The session is only needed for the page download.
        async with aiohttp.ClientSession() as session:
            html_content = await fetch_web_page(URL_TARGET, session)

        story_title = get_hn_story_title(html_content)
        # NOTE(review): the item id in the file name is hard-coded, not taken from URL_TARGET.
        output_file = f"ycombinatornews_{story_title}_40515465.txt"

        threads = parse_threads(html_content)
        thread_tree = build_thread_tree(threads)

        # Number the resources found in the top-level comments, starting at 1.
        # Only each root's own 'content' is scanned; nested replies are not visited.
        all_resources = {}
        resource_counter = 1
        for thread in thread_tree:
            resources = extract_resources(thread['content'])
            for resource in resources:
                all_resources[resource_counter] = resource
                resource_counter += 1

        formatted_resources = await format_resources(all_resources)

        # ... rest of the main function ...

    except Exception as e:
        logging.error(f"An error occurred: {e}")

In [None]:
  async def save_summary_to_file(thread_summaries: str, outline_synthesis: str, resources: str, file_path: str):
    """Write the three report sections to ``file_path`` and trigger a download.

    The file is written as UTF-8 in the order: thread summaries, collected
    resources, outline synthesis, each preceded by its section banner.

    Raises:
        IOError: Logged and re-raised when writing the file fails.
    """
    sections = (
        "--- Thread Summaries ---\n\n",
        thread_summaries,
        "\n--- Collected Resources ---\n\n",
        resources,
        "\n--- Overall Outline Synthesis ---\n\n",
        outline_synthesis,
    )
    try:
        with open(file_path, 'w', encoding='utf-8') as out:
            for section in sections:
                out.write(section)
        logging.info(f"Summary saved to {file_path}")
        # Colab helper: prompts the browser to download the file.
        files.download(file_path)
    except IOError as e:
        logging.error(f"Error saving summary to file: {e}")
        raise

In [None]:
async def main():
    """Run the whole pipeline for URL_TARGET and save the report to a text file.

    Steps: download the item page, derive the output file name, parse the
    comments into a reply tree, summarize each top-level thread, format the
    collected resources, synthesize an outline, then write and download the
    report.

    Any exception is logged with its traceback and swallowed, so the
    coroutine always returns None.
    """
    # Function-scope import keeps this cell self-contained.
    from urllib.parse import parse_qs, urlparse

    try:
        # Take the item id from the URL itself so the file name follows
        # URL_TARGET instead of a hard-coded id.
        item_id = parse_qs(urlparse(URL_TARGET).query).get('id', ['unknown'])[0]

        async with aiohttp.ClientSession() as session:
            html_content = await fetch_web_page(URL_TARGET, session)

        story_title = get_hn_story_title(html_content)
        output_file = f"ycombinatornews_{story_title}_{item_id}.txt"

        threads = parse_threads(html_content)
        thread_tree = build_thread_tree(threads)

        summarized_threads, all_resources = await summarize_threads(thread_tree)
        thread_summaries = combine_summaries(summarized_threads)

        formatted_resources = await format_resources(all_resources)

        async with aiohttp.ClientSession() as session:
            outline_synthesis = await create_outline_synthesis(summarized_threads, session)

        await save_summary_to_file(thread_summaries, outline_synthesis, formatted_resources, output_file)

        print(f"Output has been saved to {output_file}")

    except Exception as e:
        # Top-level boundary: keep the notebook alive, but record the traceback.
        logging.exception(f"An error occurred: {e}")

# Run the main function.
# A bare top-level ``await`` needs a host that supports it (such as a notebook
# cell); in a plain script, asyncio.run(main()) would be used instead.
await main()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Output has been saved to ycombinatornews_california_senate_passes_sb_1047__hyperdimensional_co__40515465.txt
