# This exercise uses Selenium to render websites and read their page source, then passes that source to OpenAI. The model is then asked to identify potential vulnerabilities and security gaps in that source.

In [0]:
# imports

import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

# If you get an error running this cell, then please head over to the troubleshooting notebook!

In [0]:
# Load environment variables from a file called .env

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key.
# The whitespace check runs before the prefix check so that a key with a stray
# leading space or tab is reported as a whitespace problem, not as a wrong prefix.

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


In [0]:
# Create the OpenAI client that the analysis functions further down call.
# NOTE(review): no key is passed here; presumably the client reads OPENAI_API_KEY
# from the environment loaded in the previous cell - confirm.
openai = OpenAI()

# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.
# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions

In [0]:
pip install selenium

In [0]:
pip install webdriver-manager

In [0]:
# Step 1: Download the .deb package as a normal user
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb

# Step 2: Install it with sudo
!sudo apt install ./google-chrome-stable_current_amd64.deb


In [0]:
# Confirm Chrome is installed: print where the binary lives and which version it is.
# The path printed by `which` is the value to use for options.binary_location below.
!which google-chrome
!google-chrome --version

In [0]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# options = Options()
# options.binary_location = "/usr/bin/google-chrome"  # Or wherever `which google-chrome` points
# options.add_argument("--headless")
# options.add_argument("--no-sandbox")
# options.add_argument("--disable-dev-shm-usage")

# service = Service(ChromeDriverManager().install())
# driver = webdriver.Chrome(service=service, options=options)

In [0]:
# # Get page source
# url = "https://nohello.net"
# driver.get(url)
# page_source = driver.page_source
# driver.quit()

# Selenium setup done. Defining the Website class and other objects below.

In [0]:
# Define our system prompt - you can experiment with this later, changing the last sentence to "Respond in markdown in Spanish."
# NOTE(review): "potentila" in the prompt text below is a typo for "potential"; it is left unchanged here because it is runtime text sent to the model.

system_prompt = "You are an assistant that analyzes the page source of a website and identifies potentila vulnerabilities and security gaps in the page source code and gives a short one liner on what should be done about it. Respond in markdown"

In [0]:
class Website:
    """
    A web page loaded in headless Chrome through Selenium.

    After construction the object exposes:
      url         - the address that was requested
      page_title  - the title reported by the browser once the page was loaded
      page_source - the page source reported by the browser once the page was loaded
    """

    def __init__(self, url):
        """
        Create this Website object from the given url using the Selenium library

        A new Chrome driver is started for every instance and is always shut down
        before this method returns or raises, so a failed page load does not leave
        a browser process running.
        """

        options = Options()
        options.binary_location = "/usr/bin/google-chrome"  # Or wherever `which google-chrome` points
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)

        self.url = url
        try:
            driver.get(url)
            self.page_title = driver.title
            self.page_source = driver.page_source
        finally:
            # Release the browser even when loading or reading the page fails.
            driver.quit()

In [0]:
# Let's try one out. Change the website and uncomment the print statements to follow along.
# Constructing a Website starts headless Chrome and loads the page.

testweb = Website("https://nohello.net")
# print(testweb.page_title)
# print(testweb.page_source)

In [0]:
# Build the user prompt that asks the model to audit a website's page source

def user_prompt_for(website):
    """
    Return the user prompt for `website`: a line naming its page_title, the
    analysis request, and then its page_source appended verbatim.
    """
    title_line = f"You are looking at a website titled {website.page_title}"
    request = "\nThe contents of this website is as follows; please analyze the page source on this website in detail and identify potential vulnerabilites and security gaps that can be fixed.\n\n"
    return "".join([title_line, request, website.page_source])

In [0]:
# print(user_prompt_for(testweb))

In [0]:
# Assemble the chat messages (system + user) that are sent to the model for one website

def messages_for(website):
    """
    Return the two-message list for a chat completion: the shared system_prompt
    followed by the user prompt built from `website`.
    """
    system_message = dict(role="system", content=system_prompt)
    user_message = dict(role="user", content=user_prompt_for(website))
    return [system_message, user_message]

In [0]:
# Try this out, and then try for a few more websites.
# The cell output is the full message list, including the entire page source.

messages_for(testweb)

In [0]:
# And now: call the OpenAI API. You will get very familiar with this!

def analyze_code(url, model="gpt-4o-mini"):
    """
    Load the page at `url`, send its source to the OpenAI chat completions API
    and return the content of the first choice's message.

    url:   address of the page to analyze
    model: name of the chat model to call; defaults to "gpt-4o-mini"
    """
    website = Website(url)
    response = openai.chat.completions.create(
        model=model,
        messages=messages_for(website)
    )
    return response.choices[0].message.content

In [0]:
# Run the analysis end to end; the returned text is shown unrendered as the cell output
analyze_code("https://nohello.net")

In [0]:
# Show the analysis in the Jupyter output, rendered as markdown

def display_results(url):
    """
    Analyze the page at `url` and render the resulting text as Markdown in the cell output.
    """
    display(Markdown(analyze_code(url)))

In [0]:
# Same analysis as above, this time rendered as Markdown; this loads the page and calls the API again
display_results("https://nohello.net")