In [22]:
import json
import os
from typing import List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display
from openai import OpenAI

In [6]:
# Pull variables from a local .env file into the process environment, then
# report whether the OpenAI key is visible (the key itself is never printed).
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
print("API_KEY set" if api_key else "API_KEY not set")

# Model name used for every chat completion, and the shared API client.
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API_KEY set


In [8]:
class Website:
    """A fetched web page reduced to its title, visible text and outgoing links.

    Everything is downloaded and parsed once, in ``__init__``.
    """

    url: str          # address that was requested
    title: str        # text of the <title> tag, or "No title found"
    body: bytes       # raw response payload (requests' ``response.content``)
    link: List[str]   # non-empty href values of the page's <a> tags, as written
    text: str         # visible text of <body>, fragments separated by newlines

    def __init__(self, url, timeout=30):
        """Download ``url`` and extract its title, text and links.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before requests raises a
                timeout error. requests applies no timeout by default, so
                without this a stalled server would block the kernel forever.
        """
        self.url = url
        response = requests.get(url, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = self._extract_title(soup)
        if soup.body:
            # Remove elements that carry no readable prose before taking the text.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        # Anchors without an href (or with an empty one) are dropped.
        self.link = [link for link in links if link]

    @staticmethod
    def _extract_title(soup):
        """Return the page title, or "No title found" when there is none.

        ``soup.title.string`` is None for an empty <title> or one that wraps
        several child nodes, so the tag's presence alone is not enough.
        """
        title_tag = soup.title
        if title_tag is not None and title_tag.string:
            return title_tag.string
        return "No title found"

    def get_content(self):
        """Return the title and visible text as one labelled block of text."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [16]:
# Smoke-test the scraper against a live page and show the hrefs it collected.
# `ed` is reused by a later cell to preview the user prompt.
ed = Website("https://www.github.com")
ed.link

['#start-of-content',
 'https://github.com/features/copilot/?utm_source=github&utm_medium=banner&utm_campaign=copilotfree-bannerheader',
 '/',
 '/login',
 'https://github.com/features/copilot',
 'https://github.com/features/security',
 'https://github.com/features/actions',
 'https://github.com/features/codespaces',
 'https://github.com/features/issues',
 'https://github.com/features/code-review',
 'https://github.com/features/discussions',
 'https://github.com/features/code-search',
 'https://github.com/features',
 'https://docs.github.com',
 'https://skills.github.com',
 'https://github.blog',
 'https://github.com/enterprise',
 'https://github.com/team',
 'https://github.com/enterprise/startups',
 '/solutions/industry/nonprofits',
 '/solutions/use-case/devsecops',
 '/solutions/use-case/devops',
 '/solutions/use-case/ci-cd',
 '/solutions/use-case',
 '/solutions/industry/healthcare',
 '/solutions/industry/financial-services',
 '/solutions/industry/manufacturing',
 '/solutions/industry/

In [18]:
# System prompt for the link-selection request. It is assembled from adjacent
# string literals: a backslash continuation inside a single literal would embed
# the next source line's indentation in the prompt text.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Career/Jobs pages.\n"
)
link_system_prompt += "You should respond in json as in this example:"
# The example is kept strictly valid JSON (no trailing comma) so the model is
# shown exactly the shape that json.loads will later have to parse.
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://www.full.url/goes/here/about"},
        {"type": "careers page", "url": "https://www.full.url/careers/here/about"}
    ]
}
"""

In [19]:
def get_links_user_prompt(website):
    """Build the user message asking the model to pick brochure-worthy links.

    Args:
        website: Object exposing ``url`` (str) and ``link`` (iterable of href
            strings); the hrefs are listed one per line, exactly as stored.

    Returns:
        The instruction text followed by the newline-separated links.
    """
    instructions = (
        f"here is the list of links found on {website.url} - "
        "please decide which of these are relevent web links for a brochure about the company, respond wuth full https URL. Do not include Terms of Service , Privacy , email links. \n"
        "Links (some might be relative links): \n"
    )
    return instructions + "\n".join(website.link)

In [20]:
# Preview the exact user prompt that would be sent for the page held in `ed`.
print(get_links_user_prompt(ed))

here is the list of links found on https://www.github.com - please decide which of these are relevent web links for a brochure about the company, respond wuth full https URL. Do not include Terms of Service , Privacy , email links. 
Links (some might be relative links): 
#start-of-content
https://github.com/features/copilot/?utm_source=github&utm_medium=banner&utm_campaign=copilotfree-bannerheader
/
/login
https://github.com/features/copilot
https://github.com/features/security
https://github.com/features/actions
https://github.com/features/codespaces
https://github.com/features/issues
https://github.com/features/code-review
https://github.com/features/discussions
https://github.com/features/code-search
https://github.com/features
https://docs.github.com
https://skills.github.com
https://github.blog
https://github.com/enterprise
https://github.com/team
https://github.com/enterprise/startups
/solutions/industry/nonprofits
/solutions/use-case/devsecops
/solutions/use-case/devops
/solutio

In [26]:
def get_links(url):
    """Ask the model which links on ``url`` belong in a company brochure.

    Fetches the page, sends its links to the chat model in JSON mode, and
    decodes the reply.

    Args:
        url: Address of the page whose links should be classified.

    Returns:
        The parsed JSON reply. The system prompt asks for the shape
        ``{"links": [{"type": ..., "url": ...}, ...]}``, but nothing here
        validates that the model complied.
    """
    page = Website(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=conversation,
        response_format={"type": "json_object"},
    )
    # message.content holds the reply as JSON text; decode it into Python objects.
    return json.loads(completion.choices[0].message.content)
# Live call: fetches the page and queries the model, so the result can differ between runs.
get_links("https://www.github.com")

{'links': [{'type': 'about page', 'url': 'https://github.com/about'},
  {'type': 'careers page', 'url': 'https://github.careers'},
  {'type': 'blog', 'url': 'https://github.blog'},
  {'type': 'customer stories page',
   'url': 'https://github.com/customer-stories'},
  {'type': 'solutions page', 'url': 'https://github.com/solutions'},
  {'type': 'features page', 'url': 'https://github.com/features'},
  {'type': 'team page', 'url': 'https://github.com/team'}]}

In [28]:
def get_all_details(url):
    """Collect the text of a landing page and of every link the model selected.

    Args:
        url: Address of the landing page.

    Returns:
        One string: the landing page's content, then for each selected link
        its type label followed by that page's content.

    Note:
        The landing page is downloaded here and again inside ``get_links``.
    """
    result = "Landing pages:\n"
    result += Website(url).get_content()
    links = get_links(url)
    print("Found links:", links)
    # The model's reply is not schema-validated: tolerate a missing "links"
    # key and entries that carry no URL instead of raising KeyError.
    for link in links.get("links", []):
        link_url = link.get("url")
        if not link_url:
            continue
        result += f"\n\n{link.get('type', 'page')}\n"
        # The scraped hrefs may be relative and the model can echo them back
        # unchanged; resolve against the landing page. Absolute URLs pass
        # through urljoin untouched.
        result += Website(urljoin(url, link_url)).get_content()
    return result
# Print the combined text of the landing page and the model-selected pages.
# NOTE(review): presumably the raw material for a later brochure-generation step — confirm.
print(get_all_details("https://www.github.com"))

Found links: {'links': [{'type': 'about page', 'url': 'https://github.com/about'}, {'type': 'careers page', 'url': 'https://github.careers'}, {'type': 'team page', 'url': 'https://github.com/team'}, {'type': 'customer stories page', 'url': 'https://github.com/customer-stories'}, {'type': 'blog page', 'url': 'https://github.blog'}, {'type': 'diversity page', 'url': 'https://github.com/about/diversity'}]}
Landing pages:
Webpage Title:
GitHub · Build and ship software on a single, collaborative platform · GitHub
Webpage Contents:
Skip to content
GitHub Copilot is now available for free.
Learn more
Navigation Menu
Toggle navigation
Sign in
Product
GitHub Copilot
Write better code with AI
Security
Find and fix vulnerabilities
Actions
Automate any workflow
Codespaces
Instant dev environments
Issues
Plan and track work
Code Review
Manage code changes
Discussions
Collaborate outside of code
Code Search
Find more, search less
Explore
All features
Documentation
GitHub Skills
Blog
Solutions
By co