# Load OpenAI API key

In [1]:
import os
from dotenv import load_dotenv

# Let python-dotenv do its loading first, then read the key from the
# process environment (os.getenv yields None when the variable is unset).
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# Confirm the outcome while echoing only the first five characters of the key.
print(
    f"API key loaded: {api_key[:5]}... (truncated)"
    if api_key
    else "API key not loaded"
)

API key loaded: sk-pr... (truncated)


# Get title and body of the website

**Note**: Make sure [ChromeDriver](https://googlechromelabs.github.io/chrome-for-testing/#stable) is installed and available on your `PATH`.

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

class WebSite:
    """Title and visible body text of a web page rendered in headless Chrome.

    Attributes:
        url: The URL that was requested.
        title: Stripped text of the page's <title>, or None when the page has
            no usable title or retrieval failed.
        body: Text of the page's <body> with img/input/audio/video/script/style
            tags removed, or None when there is no body or retrieval failed.
    """

    def __init__(self, url, wait_time=3):
        """Load ``url`` in headless Chrome and extract its title and body text.

        Failures are reported with a printed message rather than raised, so
        callers must be prepared for ``title`` and ``body`` to be None.

        Args:
            url: Address of the page to load.
            wait_time: Seconds to sleep after navigation so that JavaScript
                has a chance to render before the page source is read.
        """
        self.url = url
        self.title = None
        self.body = None

        # Setup headless Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")

        try:
            # Initialize WebDriver
            driver = webdriver.Chrome(options=chrome_options)
            try:
                driver.get(url)

                # Wait for JavaScript to render (can adjust this)
                time.sleep(wait_time)

                # Get rendered page source
                html = driver.page_source
            finally:
                # Always shut the browser down, even when navigation or the
                # page-source read fails; otherwise the Chrome process leaks.
                driver.quit()

            # Parse with BeautifulSoup
            soup = BeautifulSoup(html, 'html.parser')

            # Extract title
            self.title = soup.title.string.strip() if soup.title and soup.title.string else None

            # Remove unwanted tags
            for tag in soup(['img', 'input', 'audio', 'video', 'script', 'style']):
                tag.decompose()

            # Extract and clean body text
            body_tag = soup.body
            self.body = body_tag.get_text(separator=' ', strip=True) if body_tag else None

        except Exception as e:
            # Best-effort by design: report the problem and leave the
            # attributes that were not populated as None.
            print(f"Failed to retrieve or parse the website: {e}")


In [4]:
# Scrape the CNN home page and preview the result.
cnn = WebSite("https://www.cnn.com")
print(f"Title: {cnn.title}")
# WebSite leaves body as None when retrieval or parsing fails; fall back to an
# empty string so the preview does not raise TypeError by slicing None.
print(f"Body: {(cnn.body or '')[:200]}...")  # Print first 200 characters of body text

Title: Breaking News, Latest News and Videos | CNN
Body: CNN values your feedback 1. How relevant is this ad to you? 2. Did you encounter any technical issues? Video player was slow to load content Video content never loaded Ad froze or did not finish loadi...


In [7]:
# Scrape the OpenAI home page and preview the result.
# NOTE(review): the variable name `openai` would shadow the `openai` package if
# that package is imported elsewhere in this notebook — confirm before relying on it.
openai = WebSite("https://www.openai.com")
print(f"Title: {openai.title}")
# WebSite leaves body as None when retrieval or parsing fails; fall back to an
# empty string so the preview does not raise TypeError by slicing None.
print(f"Body: {(openai.body or '')[:500]}...")  # Print first 500 characters of body text

Title: Just a moment...
Body: Waiting for www.openai.com to respond... Enable JavaScript and cookies to continue...
