In [6]:
!pip install selenium



In [12]:
!pip install webdriver-manager


Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver-manager
Successfully installed webdriver-manager-4.0.2


In [18]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

# HTTP request headers passed to requests.get when fetching static pages.
# The User-Agent string imitates a desktop Chrome browser so the request mimics a real browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

# A class to represent a Webpage
class Website:
    """
    A web page fetched from a URL and reduced to its title and visible text.

    Attributes:
        url: The URL the object was created from.
        soup: The BeautifulSoup parse tree of the fetched page (set by the
            get_*_content methods, with irrelevant tags already removed).
        text: The text of the page body, one fragment per line.
        title: The page title, or "No title found" when there is none.
    """

    # Tags whose content is noise for a text summary
    IRRELEVANT_TAGS = ["script", "style", "img", "input"]
    # Seconds to wait for the HTTP response; without a timeout requests.get can block forever
    REQUEST_TIMEOUT = 30
    # Seconds to let the page's JavaScript run before reading the rendered source
    RENDER_WAIT = 3

    def __init__(self, url):
        """
        This class creates a website object from the given URL using BeautifulSoup or Selenium depending on the content type
        """
        self.url = url
        # Check if the page is dynamic or static
        if self.is_dynamic_page(url):
            self.text = self.get_dynamic_content(url)
        else:
            self.text = self.get_static_content(url)
        # get_title reads self.soup, which the content fetch above has just set
        self.title = self.get_title()

    def get_title(self):
        """
        Fetches the title of the page

        Returns "No title found" when the page has no <title> tag or when the
        tag has no single string content (its .string is None or empty).
        """
        title_tag = self.soup.title
        if title_tag and title_tag.string:
            return title_tag.string
        return "No title found"

    def _extract_text(self):
        """
        Strips irrelevant tags from self.soup's body and returns the remaining text

        Returns an empty string when the document has no <body>.
        """
        body = self.soup.body
        if body is None:
            return ""
        # Calling a tag with a list of names finds all matching descendants
        for irrelevant in body(self.IRRELEVANT_TAGS):
            irrelevant.decompose()
        return body.get_text(separator="\n", strip=True)

    def get_static_content(self, url):
        """
        Fetches static content from the page using requests and BeautifulSoup

        Sets self.soup as a side effect and returns the cleaned body text.
        Raises a requests exception if the server does not answer within
        REQUEST_TIMEOUT seconds.
        """
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        self.soup = BeautifulSoup(response.content, 'html.parser')
        return self._extract_text()

    def get_dynamic_content(self, url):
        """
        Fetches dynamic content from the page using Selenium

        Sets self.soup as a side effect and returns the cleaned body text.
        """
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        try:
            driver.get(url)
            time.sleep(self.RENDER_WAIT)  # Wait for JavaScript to load the content
            # Get the page source after the JavaScript has been executed
            page_source = driver.page_source
        finally:
            # Always close the browser, even if loading the page failed
            driver.quit()
        self.soup = BeautifulSoup(page_source, 'html.parser')
        return self._extract_text()

    def is_dynamic_page(self, url):
        """
        A helper function to determine if the page is dynamic or static (based on URL pattern or other logic)
        """
        # For simplicity, we're assuming pages with 'javascript' or 'dynamic' in the URL are dynamic (this can be improved)
        return "javascript" in url or "dynamic" in url




In [19]:
# Fetch and parse the BBC article; later cells reuse `reviewed_website`
reviewed_website = Website("https://www.bbc.com/news/articles/czd31157m31o")

# Show the page title first, then the extracted body text
for page_part in (reviewed_website.title, reviewed_website.text):
    print(page_part)

Russia still 'working with US' after Trump says he is 'angry' with Putin
Skip to content
British Broadcasting Corporation
Home
News
Sport
Business
Innovation
Culture
Arts
Travel
Earth
Audio
Video
Live
Home
News
Israel-Gaza War
War in Ukraine
US & Canada
UK
UK Politics
England
N. Ireland
N. Ireland Politics
Scotland
Scotland Politics
Wales
Wales Politics
Africa
Asia
China
India
Australia
Europe
Latin America
Middle East
In Pictures
BBC InDepth
BBC Verify
Sport
Business
Executive Lounge
Technology of Business
Future of Business
Innovation
Technology
Science & Health
Artificial Intelligence
AI v the Mind
Culture
Film & TV
Music
Art & Design
Style
Books
Entertainment News
Arts
Arts in Motion
Travel
Destinations
Africa
Antarctica
Asia
Australia and Pacific
Caribbean & Bermuda
Central America
Europe
Middle East
North America
South America
World’s Table
Culture & Experiences
Adventures
The SpeciaList
Earth
Natural Wonders
Weather & Science
Climate Solutions
Sustainable Business
Green Living
A

In [20]:
import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

In [21]:
# Load environment variables in a file called .env

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key
# The whitespace check runs before the prefix check: a key with a leading space or tab
# would otherwise fail startswith() and be reported as having the wrong prefix.

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")

API key found and looks good so far!


In [22]:
# Create the client object that `summarize` uses for chat completion calls.
# NOTE(review): no key is passed explicitly; presumably the client reads OPENAI_API_KEY
# from the environment loaded earlier - confirm.
openai = OpenAI()

In [23]:
# System message sent with every summary request: sets the assistant's role and output format
system_prompt = (
    "You are an assistant that analyzes the contents of a website "
    "and provides a short summary, ignoring text that might be navigation related. "
    "Respond in markdown."
)


In [24]:
def user_prompt_for(website):
    """
    Build the user message asking for a summary of the given page.

    Args:
        website: An object exposing `title` and `text` attributes.

    Returns:
        A string made of a line naming the page title, the summarisation
        instructions, a blank line, and then the page text.
    """
    title_line = f"You are looking at a website titled {website.title}"
    instructions = (
        "\nThe contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    return title_line + instructions + website.text


In [25]:
# Preview the complete user prompt built for the page fetched earlier
print(user_prompt_for(reviewed_website))


You are looking at a website titled Russia still 'working with US' after Trump says he is 'angry' with Putin
The contents of this website is as follows; please provide a short summary of this website in markdown. If it includes news or announcements, then summarize these too.

Skip to content
British Broadcasting Corporation
Home
News
Sport
Business
Innovation
Culture
Arts
Travel
Earth
Audio
Video
Live
Home
News
Israel-Gaza War
War in Ukraine
US & Canada
UK
UK Politics
England
N. Ireland
N. Ireland Politics
Scotland
Scotland Politics
Wales
Wales Politics
Africa
Asia
China
India
Australia
Europe
Latin America
Middle East
In Pictures
BBC InDepth
BBC Verify
Sport
Business
Executive Lounge
Technology of Business
Future of Business
Innovation
Technology
Science & Health
Artificial Intelligence
AI v the Mind
Culture
Film & TV
Music
Art & Design
Style
Books
Entertainment News
Arts
Arts in Motion
Travel
Destinations
Africa
Antarctica
Asia
Australia and Pacific
Caribbean & Bermuda
Central Ameri

In [26]:
# See how this function creates exactly the format above

def messages_for(website):
    """
    Build the chat message list for a page: the system prompt followed by
    the user prompt generated from `website`.

    Returns:
        A two-element list of dicts, each with "role" and "content" keys.
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(website)}
    return [system_message, user_message]

In [27]:
# Try this out, and then try for a few more websites.
# As the cell's last expression, the returned list of message dicts is displayed as output.

messages_for(reviewed_website)

[{'role': 'system',
  'content': 'You are an assistant that analyzes the contents of a website and provides a short summary, ignoring text that might be navigation related. Respond in markdown.'},
 {'role': 'user',
  'content': 'You are looking at a website titled Russia still \'working with US\' after Trump says he is \'angry\' with Putin\nThe contents of this website is as follows; please provide a short summary of this website in markdown. If it includes news or announcements, then summarize these too.\n\nSkip to content\nBritish Broadcasting Corporation\nHome\nNews\nSport\nBusiness\nInnovation\nCulture\nArts\nTravel\nEarth\nAudio\nVideo\nLive\nHome\nNews\nIsrael-Gaza War\nWar in Ukraine\nUS & Canada\nUK\nUK Politics\nEngland\nN. Ireland\nN. Ireland Politics\nScotland\nScotland Politics\nWales\nWales Politics\nAfrica\nAsia\nChina\nIndia\nAustralia\nEurope\nLatin America\nMiddle East\nIn Pictures\nBBC InDepth\nBBC Verify\nSport\nBusiness\nExecutive Lounge\nTechnology of Business\nFut

In [28]:
# And now: call the OpenAI API. You will get very familiar with this!

def summarize(url):
    """
    Fetch the page at `url` and ask the chat model for a summary.

    Returns:
        The message content of the first choice in the completion response.
    """
    page = Website(url)
    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages_for(page),
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [29]:
# And now: call the OpenAI API. You will get very familiar with this!

# NOTE(review): the previous cell also defines `summarize`; this later definition is the
# one in effect once both cells have run. One of the two cells can be deleted.
def summarize(url, model="gpt-4o-mini"):
    """
    Fetch the page at `url` and ask the chat model for a summary.

    Args:
        url: Address of the page to summarize.
        model: Name of the chat model to call. Defaults to "gpt-4o-mini",
            so existing `summarize(url)` calls behave as before.

    Returns:
        The message content of the first choice in the completion response.
    """
    website = Website(url)
    response = openai.chat.completions.create(
        model=model,
        messages=messages_for(website)
    )
    return response.choices[0].message.content

In [30]:
# Summarize the BBC article; as the cell's last expression, the returned string is shown as a raw repr
summarize("https://www.bbc.com/news/articles/czd31157m31o")

'# Summary of Article: "Russia still \'working with US\' after Trump says he is \'angry\' with Putin"\n\nIn response to President Trump\'s recent criticism of Vladimir Putin, Russia emphasized that it continues to "work with the US" on improving bilateral relations. Kremlin spokesperson Dmitry Peskov downplayed tensions, noting that while there are no current plans for a call between Trump and Putin, the Russian president remains open to discussions if necessary.\n\nTrump expressed his anger over Putin\'s remarks regarding Ukrainian President Zelensky, which he described as damaging. He threatened to impose a 50% tariff on Russian oil unless a ceasefire was agreed upon. This marks a notable shift in Trump’s previous tone toward Putin.\n\nMoreover, parts of Trump\'s statements found reflection in Russian media, with pro-Kremlin outlets criticizing Trump for not fulfilling his obligations regarding Ukraine\'s actions against Russian energy infrastructure. Peskov also suggested that some 

In [31]:
# A function to display this nicely in the Jupyter output, using markdown

def display_summary(url):
    """Summarize the page at `url` and render the result as Markdown in the cell output."""
    display(Markdown(summarize(url)))

In [32]:
# Same article as above, this time rendered as Markdown through display_summary
display_summary("https://www.bbc.com/news/articles/czd31157m31o")

## Summary of the Website Content

The article discusses statements from Russia regarding its relationship with the United States following comments made by former President Donald Trump. After expressing anger towards Russian President Vladimir Putin for undermining Ukrainian President Volodymyr Zelensky's credibility, Trump threatened to impose a 50% tariff on countries purchasing Russian oil unless a ceasefire is agreed upon.

Kremlin spokesman Dmitry Peskov indicated that Russia remains committed to engaging with the US, despite the rising tensions and clarified that there are currently no scheduled discussions between Putin and Trump, although Putin is open to dialogue. This interaction follows weeks of negotiations between US and Russian officials aimed at resolving the conflict in Ukraine.

The article highlights a shift in Trump's tone toward Putin and includes commentary from Russian media, reflecting both criticism of Trump and a willingness from Moscow to negotiate.

### Key Points:
- Trump expresses anger at Putin’s remarks regarding Zelensky.
- Kremlin downplays tensions, reaffirming commitment to US talks.
- No calls planned between Trump and Putin, but Putin is open to discussing if needed.
- Pro-Kremlin media criticizes Trump for not preventing Ukraine from targeting Russian assets.