# Web Scraper with JavaScript Support
A simplified version of the solution in `day1-webscraping-selenium-for-javascript.ipynb`, streamlined so that it is easy to run.

## Install dependencies
Uncomment the lines you still need and run this cell once.

In [2]:
# !pip install selenium
!pip install undetected-chromedriver
# !ollama pull llama3.2



## Import required dependencies

In [3]:
# imports

import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup

# If you get an error running this cell, then please head over to the troubleshooting notebook!

## Run setup

In [4]:
# Location of the Chrome binary handed to the crawler.
# Choose the default for the current operating system instead of always
# overwriting the Windows path with the Linux one; edit the matching branch
# if Chrome is installed somewhere else.
if os.name == "nt":
    chrome_path = "C:/Program Files/Google/Chrome/Application/chrome.exe"
else:
    chrome_path = "/usr/bin/google-chrome"

In [5]:
# Load environment variables in a file called .env

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

# Check the key.
# The whitespace test runs before the prefix test: a key with a leading space
# or tab would otherwise fail startswith() first and be reported as having the
# wrong prefix, hiding the real problem.

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


API key found and looks good so far!


In [6]:
# Create the OpenAI client that new_summary() uses for chat completions.
# NOTE(review): no key is passed here, so the client presumably picks up
# OPENAI_API_KEY from the environment loaded above - confirm.
openai = OpenAI()

# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.
# If it STILL doesn't work (horrors!) then please see the troubleshooting notebook, or try the below line instead:
# openai = OpenAI(api_key="your-key-here-starting-sk-proj-")

## Create Prompts

In [8]:
# System prompt sent with every request. To experiment later, change the last
# sentence to, for example, 'Respond in markdown in Spanish.'

system_prompt = (
    "You are an assistant that analyzes the contents of a website "
    "and provides a short summary, ignoring text that might be navigation related. "
    "Respond in markdown."
)

# Builds the User Prompt that asks for a summary of a scraped website:

def user_prompt_for(website):
    """Return the user prompt for ``website``.

    The prompt is the page title line, followed by the summarization request,
    followed by the page text taken from ``website.text``.
    """
    heading = f"You are looking at a website titled {website.title}"
    request = (
        "\nThe contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    return heading + request + website.text


## Create Functions

In [9]:
# Chat messages in the role/content structure that the OpenAI chat API receives

def messages_for(website):
    """Return the two-message conversation (system, then user) for ``website``."""
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(website)}
    return [system_message, user_message]

# Use Selenium and chrome to scrape website
class WebsiteCrawler:
    """Scrape the text of a page's ``<main>`` element with undetected-chromedriver.

    All work happens in the constructor: a Chrome session is started, the page
    is loaded, the text of its ``<main>`` element is extracted, and the browser
    is shut down again.

    Attributes:
        url: The address that was requested.
        wait_time: Maximum number of seconds to wait for ``<main>`` to appear.
        settle_time: Seconds slept after navigation before waiting for ``<main>``.
        title: The page title, "No title found" when the page has none, or
            "Error occurred" when scraping failed.
        text: Newline-separated text of ``<main>``, or "" when scraping failed.
        driver: The Chrome driver used for the scrape; it has already been quit
            by the time the constructor returns.
    """

    def __init__(self, url, wait_time=20, chrome_binary_path=None, settle_time=10):
        """
        Initialize the WebsiteCrawler using Selenium to scrape JavaScript-rendered content.

        Args:
            url: Page to load.
            wait_time: Seconds to wait for a ``<main>`` element before giving up.
            chrome_binary_path: Optional path of the Chrome executable to use.
            settle_time: Seconds to pause after loading the page, before
                looking for ``<main>`` (defaults to the previous fixed 10).
        """
        self.url = url
        self.wait_time = wait_time
        self.settle_time = settle_time

        self.driver = uc.Chrome(options=self._build_options(chrome_binary_path))

        try:
            # Load the URL
            self.driver.get(url)

            # Wait for Cloudflare or similar checks
            time.sleep(settle_time)

            # until() hands back the element located by the condition, so the
            # page does not need to be queried a second time.
            main_element = WebDriverWait(self.driver, self.wait_time).until(
                EC.presence_of_element_located((By.TAG_NAME, "main"))
            )
            main_content = main_element.get_attribute("outerHTML")

            # Parse with BeautifulSoup
            soup = BeautifulSoup(main_content, "html.parser")
            self.title = self.driver.title or "No title found"
            self.text = soup.get_text(separator="\n", strip=True)

        except Exception as e:
            # Best effort: report the problem and leave placeholder values so
            # callers can still read .title and .text.
            print(f"Error occurred: {e}")
            self.title = "Error occurred"
            self.text = ""

        finally:
            # Always close the browser, whether or not scraping succeeded.
            self.driver.quit()

    @staticmethod
    def _build_options(chrome_binary_path):
        """Return the ChromeOptions used for every scraping session."""
        options = uc.ChromeOptions()
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("start-maximized")
        options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
        )
        if chrome_binary_path:
            options.binary_location = chrome_binary_path
        return options

def new_summary(url, chrome_path):
    """Scrape ``url`` and display a markdown summary of it in the notebook.

    Args:
        url: Address of the page to scrape and summarize.
        chrome_path: Path of the Chrome binary, forwarded to WebsiteCrawler.

    Returns:
        The result of ``display()`` for the rendered markdown.
    """
    # Allow up to 30 seconds for the page's <main> element to appear.
    web = WebsiteCrawler(url, 30, chrome_path)

    # Empty text means the scrape failed or <main> held no text; skip the API
    # call rather than ask the model to summarize nothing.
    if not web.text:
        return display(Markdown(f"**No content could be scraped from {url}, so no summary was generated.**"))

    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages_for(web),
    )

    web_summary = response.choices[0].message.content

    return display(Markdown(web_summary))

In [10]:
# NOTE(review): this cell repeats the prompt, helper, crawler and summary
# definitions from the two cells above; when the notebook is run top to bottom
# the definitions in this cell are the ones left in effect.

# System prompt sent with every request. To experiment later, change the last
# sentence to, for example, 'Respond in markdown in Spanish.'

system_prompt = (
    "You are an assistant that analyzes the contents of a website "
    "and provides a short summary, ignoring text that might be navigation related. "
    "Respond in markdown."
)

# Builds the User Prompt that asks for a summary of a scraped website:

def user_prompt_for(website):
    """Return the user prompt for ``website``.

    The prompt is the page title line, followed by the summarization request,
    followed by the page text taken from ``website.text``.
    """
    heading = f"You are looking at a website titled {website.title}"
    request = (
        "\nThe contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    return heading + request + website.text

# Chat messages in the role/content structure that the OpenAI chat API receives

def messages_for(website):
    """Return the two-message conversation (system, then user) for ``website``."""
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(website)}
    return [system_message, user_message]

# Use Selenium and chrome to scrape website
class WebsiteCrawler:
    """Scrape the text of a page's ``<main>`` element with undetected-chromedriver.

    All work happens in the constructor: a Chrome session is started, the page
    is loaded, the text of its ``<main>`` element is extracted, and the browser
    is shut down again.

    Attributes:
        url: The address that was requested.
        wait_time: Maximum number of seconds to wait for ``<main>`` to appear.
        settle_time: Seconds slept after navigation before waiting for ``<main>``.
        title: The page title, "No title found" when the page has none, or
            "Error occurred" when scraping failed.
        text: Newline-separated text of ``<main>``, or "" when scraping failed.
        driver: The Chrome driver used for the scrape; it has already been quit
            by the time the constructor returns.
    """

    def __init__(self, url, wait_time=20, chrome_binary_path=None, settle_time=10):
        """
        Initialize the WebsiteCrawler using Selenium to scrape JavaScript-rendered content.

        Args:
            url: Page to load.
            wait_time: Seconds to wait for a ``<main>`` element before giving up.
            chrome_binary_path: Optional path of the Chrome executable to use.
            settle_time: Seconds to pause after loading the page, before
                looking for ``<main>`` (defaults to the previous fixed 10).
        """
        self.url = url
        self.wait_time = wait_time
        self.settle_time = settle_time

        self.driver = uc.Chrome(options=self._build_options(chrome_binary_path))

        try:
            # Load the URL
            self.driver.get(url)

            # Wait for Cloudflare or similar checks
            time.sleep(settle_time)

            # until() hands back the element located by the condition, so the
            # page does not need to be queried a second time.
            main_element = WebDriverWait(self.driver, self.wait_time).until(
                EC.presence_of_element_located((By.TAG_NAME, "main"))
            )
            main_content = main_element.get_attribute("outerHTML")

            # Parse with BeautifulSoup
            soup = BeautifulSoup(main_content, "html.parser")
            self.title = self.driver.title or "No title found"
            self.text = soup.get_text(separator="\n", strip=True)

        except Exception as e:
            # Best effort: report the problem and leave placeholder values so
            # callers can still read .title and .text.
            print(f"Error occurred: {e}")
            self.title = "Error occurred"
            self.text = ""

        finally:
            # Always close the browser, whether or not scraping succeeded.
            self.driver.quit()

    @staticmethod
    def _build_options(chrome_binary_path):
        """Return the ChromeOptions used for every scraping session."""
        options = uc.ChromeOptions()
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("start-maximized")
        options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
        )
        if chrome_binary_path:
            options.binary_location = chrome_binary_path
        return options

def new_summary(url, chrome_path):
    """Scrape ``url`` and display a markdown summary of it in the notebook.

    Args:
        url: Address of the page to scrape and summarize.
        chrome_path: Path of the Chrome binary, forwarded to WebsiteCrawler.

    Returns:
        The result of ``display()`` for the rendered markdown.
    """
    # Allow up to 30 seconds for the page's <main> element to appear.
    web = WebsiteCrawler(url, 30, chrome_path)

    # Empty text means the scrape failed or <main> held no text; skip the API
    # call rather than ask the model to summarize nothing.
    if not web.text:
        return display(Markdown(f"**No content could be scraped from {url}, so no summary was generated.**"))

    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages_for(web),
    )

    web_summary = response.choices[0].message.content

    return display(Markdown(web_summary))

## Scrape and Summarize Web Page

In [11]:
# Scrape the Canva home page and display its generated markdown summary.
url = "https://www.canva.com/"
new_summary(url, chrome_path)

# Canva: Visual Suite for Everyone

Canva is a versatile design platform that enables users to create a wide range of projects, including posters, resumes, logos, presentations, social media content, and more. It provides options for individuals, teams, and organizations, offering both free and premium plans tailored for different needs.

## Key Features
- **Design Capabilities**: Users can utilize a variety of templates for personal and professional use, alongside AI-powered tools such as Magic Write for generating text and Magic Edit for photo transformations.
- **Collaboration**: Canva allows real-time collaboration for teams, enabling shared design efforts on various projects.
- **Printing Services**: Users can print their designs and have them delivered directly, with a focus on sustainability, including tree planting with each print order.

## Plans
- **Canva Free**: Basic design tools for individuals.
- **Canva Pro**: Advanced features for personal branding and projects.
- **Canva Teams**: Collaborative features for groups.
- **Canva Enterprise**: Solutions for organizations to manage teams and brand consistency.

## Additional Highlights
- **Educational and Nonprofit Support**: Free premium features for educational institutions and nonprofits to support their missions.
- **Sustainability Commitment**: Canva prioritizes eco-friendly practices, aiming for carbon neutrality and promoting regeneration.
- **User Testimonials**: Feedback from business users underscores Canva's effectiveness in enhancing productivity and teamwork.

For those looking to start designing, Canva offers an easy entry point with numerous templates and an intuitive interface suitable for various experience levels.

In [12]:
# Scrape the BBC home page and display its generated markdown summary.
url = "https://www.bbc.co.uk/"
new_summary(url, chrome_path)

# BBC Home Summary

The BBC Home website provides a wide range of breaking news and updates across various categories, including world news, US news, sports, business, innovation, climate, culture, and travel.

## Key News Highlights

- **Iran Explosion**: A massive explosion at an Iranian port has resulted in the death of 40 people, leading to a blame game regarding improperly stored solid fuel intended for ballistic missiles. (2 hours ago)

- **Pope Francis's Tomb**: Images have been released showing the late Pope Francis's tomb following a private burial ceremony after his public funeral at the Vatican. (7 hours ago)

- **Vancouver Car Ramming Incident**: The death toll in the car ramming incident at a festival in Vancouver has risen to 11. The acting police chief described it as the "darkest day in the city's history" but stated it is not believed to be an act of terrorism. (1 hour ago)

- **South China Sea Seizure**: Beijing has seized a small sandbank in the South China Sea, as state media releases images of Chinese coastguard officials raising a flag on the strategic reef. (3 hours ago)

- **Political Climate in the US**: As Democrats reflect on their strategies post-Trump's second term, there are varied opinions on the party's future directions. (16 hours ago)

- **Legislation in Canada**: A significant change is observed in Canada’s political landscape, with the Liberals trying to retain power following a campaign influenced by Donald Trump's threats. (16 hours ago)

## Cultural Insights

- The BBC also features articles on current cultural trends, such as the gardening practice of "meadowscaping," which emphasizes transforming lawns into meadows for environmental benefits, illustrating a growing interest in sustainable living.

- In a look at popular cooking, the site offers insights into eight essential Asian sauces that can enhance dishes, marking a culinary exploration.

Overall, the BBC Home serves as a comprehensive news source, blending immediate updates with cultural and lifestyle features.