In [1]:
import os
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import  Service
from selenium.webdriver.chrome.options import Options
from pathlib import Path
from IPython.display import Markdown, display
from openai import OpenAI
from dotenv import load_dotenv

In [3]:
class WebsiteSummarizer:
    """Fetch a web page and ask a chat model for a short markdown summary.

    The page is fetched with ``requests`` first; if the response looks like a
    Cloudflare anti-bot challenge, it is fetched again through Selenium/Chrome.
    The visible text is then sent to a chat-completions endpoint: either the
    OpenAI API (``is_openai_api=True``) or a local Ollama server exposed through
    the OpenAI-compatible API (default).

    Calling an instance is shorthand for :meth:`summarize`.
    """

    # Browser-like headers sent with the plain `requests` fetch.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
        )
    }
    # Default system prompt; may be overridden per instance by assigning
    # `instance.system_prompt`.
    system_prompt = (
        "You are an assistant that analyzes the contents of a website "
        "and provides a short summary, ignoring text that might be navigation related. "
        "Respond in markdown."
    )
    # Seconds to wait for the plain HTTP fetch before giving up.
    request_timeout = 30
    # OpenAI-compatible endpoint of the local Ollama server.
    ollama_base_url = 'http://localhost:11434/v1'

    def __init__(self, url, model, api_key, is_openai_api=False, user_prompt=None):
        """Store the configuration; no network access happens here.

        Args:
            url: Address of the page to summarize.
            model: Model name passed to the chat-completions call
                (for example "gpt-4o-mini").
            api_key: Key passed to the client on the Ollama branch. It is not
                used on the OpenAI branch, where ``OpenAI()`` is created
                without arguments.
            is_openai_api: When true, use the OpenAI API; otherwise use the
                local Ollama endpoint.
            user_prompt: Optional instruction that replaces the default
                "please provide a short summary" instruction.
        """
        self.url = url
        self.model = model
        self.api_key = api_key
        self.is_openai_api = is_openai_api
        self.user_prompt = user_prompt

    def use_requests(self):
        """Fetch ``self.url`` with ``requests`` and return the parsed soup.

        The HTTP status is deliberately not checked: the caller inspects the
        returned markup to detect anti-bot challenge pages.
        """
        response = requests.get(self.url, headers=self.headers, timeout=self.request_timeout)
        return BeautifulSoup(response.text, 'lxml')

    def use_selenium(self):
        """Fetch ``self.url`` with a Chrome browser and return the parsed soup.

        Expects a chromedriver binary at
        ``<cwd>/chromedriver-mac-arm64/chromedriver``.
        """
        # To run without a visible window, build an Options() object with
        # "--headless" / "--no-sandbox" and pass it as `options=` below.
        driver_path = Path.cwd().resolve() / "chromedriver-mac-arm64" / "chromedriver"
        service = Service(str(driver_path))
        driver = webdriver.Chrome(service=service)
        try:
            driver.get(self.url)
            return BeautifulSoup(driver.page_source, "lxml")
        finally:
            # Always close the browser, even if loading or parsing failed.
            driver.quit()

    def get_html(self):
        """Return ``(title, text)`` for the page.

        ``title`` is the document title or "No title found"; ``text`` is the
        page text with script/style/img/input elements removed, one fragment
        per line.
        """
        soup = self.use_requests()
        # The marker has to be searched in the serialized markup: `x in soup`
        # only tests membership in the soup's direct children.
        if soup.select_one("noscript #challenge-error-text") or "cdn-cgi/challenge-platform" in str(soup):
            print("Requests: Detected Cloudflare anti-bot challenge so using Selenium")
            soup = self.use_selenium()

        title_tag = soup.title
        if title_tag is not None and title_tag.string:
            title = title_tag.string
        else:
            title = "No title found"

        # Documents without a <body> fall back to the whole parse tree.
        root = soup.body if soup.body is not None else soup
        for irrelevant in root(["script", "style", "img", "input"]):
            irrelevant.decompose()

        text = root.get_text(separator="\n", strip=True)

        return title, text

    def create_user_prompt(self):
        """Build the user message: title line, instruction, then the page text."""
        title, text = self.get_html()
        user_prompt = f"You are looking at a website titled {title}"

        if self.user_prompt:
            # Keep the custom instruction separated from the title and the text.
            user_prompt += f"\n{self.user_prompt}\n\n"
        else:
            user_prompt += (
                "\nThe contents of this website is as follows; "
                "please provide a short summary of this website in markdown. "
                "If it includes news or announcements, then summarize these too.\n\n"
            )
        user_prompt += text
        return user_prompt

    def message_for_llm(self):
        """Return the chat messages (system + user) for the completion call."""
        return [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": self.create_user_prompt()}
        ]

    def initialize_openai_api(self):
        """Load ``.env`` and print a diagnostic about ``OPENAI_API_KEY``.

        Only reports problems; it never raises and returns nothing.
        """
        load_dotenv(override=True)
        api_key = os.getenv("OPENAI_API_KEY")

        # Check the key. Whitespace is tested before the prefix so that a key
        # with leading blanks gets the whitespace hint, not the prefix hint.
        if not api_key:
            print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
        elif api_key.strip() != api_key:
            print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
        elif not api_key.startswith("sk-proj-"):
            print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
        else:
            print("API key found and looks good so far!")

    def summarize(self):
        """Fetch the page, query the model and return the summary text."""
        if self.is_openai_api:
            # Summarize by using the OpenAI API
            self.initialize_openai_api()
            client = OpenAI()
        else:
            # Ollama via the OpenAI-compatible API. The client insists on a
            # key, so use a placeholder when none was supplied.
            client = OpenAI(base_url=self.ollama_base_url, api_key=self.api_key or "ollama")

        response = client.chat.completions.create(
            model=self.model,   # Example "gpt-4o-mini"
            messages=self.message_for_llm(),
        )
        return response.choices[0].message.content

    def __call__(self):
        """Shorthand for :meth:`summarize`."""
        return self.summarize()

In [4]:
# `model` and `api_key` are required constructor arguments. The OpenAI branch
# reads OPENAI_API_KEY from the environment, so no key is passed here.
summarized = WebsiteSummarizer(
    "https://edwarddonner.com",
    model="gpt-4o-mini",
    api_key=None,
    is_openai_api=True,
)()

API key found and looks good so far!


In [5]:
# Print the summary as plain text, so the markdown returned by the model is shown unrendered.
print(summarized)

# Summary of Edward Donner's Website

Edward Donner is a coder and LLM (Large Language Model) enthusiast, sharing his interests in technology and music. He is the co-founder and CTO of Nebula.io, where the focus is on utilizing AI to help individuals maximize their potential, specifically in talent recruitment. Previously, he was the founder and CEO of an AI startup called untapt, which was acquired in 2021.

## Main Features
- **Connect Four**: A feature exploring competitive interactions between LLMs.
- **Outsmart**: An arena that challenges LLMs in a mix of diplomacy and strategic gameplay.

## News and Announcements
- **May 18, 2025**: Upcoming "AI Executive Briefing"
- **April 21, 2025**: Launch of "The Complete Agentic AI Engineering Course"
- **January 23, 2025**: Hosting an LLM hands-on workshop with resources.
- **December 21, 2024**: Welcoming a group referred to as "SuperDataScientists".

Overall, the website communicates Ed's passion for AI, coding, and music, while providi

In [6]:
# A function to display this nicely in the Jupyter output, using markdown
def display_summary(url, model, api_key, is_openai_model=False, userprompt=None, systemprompt=None):
    """Summarize ``url`` and render the result as markdown in the cell output.

    Args:
        url: Address of the page to summarize.
        model: Model name forwarded to ``WebsiteSummarizer``.
        api_key: API key forwarded to ``WebsiteSummarizer``.
        is_openai_model: Forwarded as ``is_openai_api``.
        userprompt: Optional instruction replacing the summarizer's default
            user instruction; ignored when empty.
        systemprompt: Optional replacement for the summarizer's system prompt;
            ignored when empty.
    """
    summarizer = WebsiteSummarizer(
        url=url,
        model=model,
        api_key=api_key,
        is_openai_api=is_openai_model,
        # The class reads `user_prompt`; pass it through the constructor so
        # the custom instruction is actually used.
        user_prompt=userprompt,
    )

    if systemprompt:
        # Instance attribute shadows the class-level default prompt.
        summarizer.system_prompt = systemprompt

    display(Markdown(summarizer()))

In [9]:
# `model` and `api_key` are required; the OpenAI branch takes its key from the environment.
display_summary("https://cnn.com", model="gpt-4o-mini", api_key=None, is_openai_model=True)

API key found and looks good so far!


# Summary of CNN Website

CNN's website provides up-to-date news coverage across various categories including U.S. news, world events, politics, business, health, entertainment, sports, science, and climate issues. The platform includes in-depth articles, live updates, and video segments highlighting significant stories of the day. 

## Notable Recent News:
- **Ukraine-Russia War:** An analysis of the ongoing conflict, mentioning unresolved issues and actions by both sides.
- **Sean “Diddy” Combs Trial:** Focusing on the ongoing legal proceedings against Combs, with detailed testimonies from various witnesses about allegations of violence and control in his relationship with Cassie Ventura.
- **Natural Disasters:** Reports on strong tornadoes affecting parts of the central U.S. and coverage of a tragic incident where a train hit a family, resulting in fatalities.
- **Political Updates:** Developments regarding U.S. immigration policies and other significant political moves, including actions from former President Trump.

In addition to news articles, CNN also features multimedia content such as videos and podcasts, making it a comprehensive source for current events and in-depth analysis. The site emphasizes live updates and trending stories to keep viewers informed in real-time.

In [8]:
# `model` and `api_key` are required; the OpenAI branch takes its key from the environment.
display_summary("https://openai.com", model="gpt-4o-mini", api_key=None, is_openai_model=True)

API key found and looks good so far!
Requests: Detected Cloudflare anti-bot challenge so using Selenium


# OpenAI Website Summary

OpenAI's website serves as a comprehensive platform showcasing its various offerings, including the ChatGPT tool, its API for developers, and research advancements. Below are the key sections and recent news highlighted on the site:

## Key Features:
- **ChatGPT**: An interactive tool for users to engage in conversations, plan, and retrieve information across various topics.
- **Sora**: A platform with features aimed at enhancing user experience in specific applications.
- **API Platform**: Provides documentation and tools for businesses and developers to integrate OpenAI's AI capabilities into their applications.

## Research and Advancements:
- **Research Index**: An overview of OpenAI's current research initiatives and publications.
- **Latest Models**: Introduction of advanced models such as GPT-4.5 and OpenAI o4-mini, focusing on improvements in capabilities and applications.

## Recent News:
1. **Evolving OpenAI’s Structure** - Announced on May 5, 2025, detailing organizational changes.
2. **New Image Generation Model** - Released on April 23, 2025, enhancing OpenAI's visual content capabilities.
3. **Nonprofit Commission Advisors** - Introduced on April 15, 2025, aimed at guiding OpenAI's ethical framework and initiatives.
4. **Updated Preparedness Framework** - Published on April 15, 2025, outlining safety and preparedness measures.

The site serves as a central hub for accessing AI tools, capabilities, and the latest news regarding OpenAI's ongoing projects and advancements in the field of artificial intelligence.