In [1]:
# imports

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
# Initialize and constants

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key")
    
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [3]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""
link_system_prompt += "For any external links in website, add another key called external with value set to true as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "careers page": "url": "https://another.full.url/careers" : "external" : "true"}
    ]
}
"""

In [5]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
For any external links in website, add another key called external with value set to true as in this example:
{
    "links": [
        {"type": "careers page": "url": "https://another.full.url/careers" : "external" : "true"}
    ]
}



In [6]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [7]:
def get_links(url):
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
      ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [8]:
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 'inference/get-started',
 '/spaces',
 '/models',
 '/zai-org/GLM-4.5',
 '/Qwen/Qwen3-Coder-480B-A35B-Instruct',
 '/tencent/HunyuanWorld-1',
 '/bosonai/higgs-audio-v2-generation-3B-base',
 '/Qwen/Qwen3-30B-A3B-Instruct-2507',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/zumjoy/Multi-Style_Video-to-Anime_Generator',
 '/spaces/Qwen/Qwen3-Coder-WebDev',
 '/spaces/smola/higgs_audio_v2',
 '/spaces/black-forest-labs/FLUX.1-Kontext-Dev',
 '/spaces',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/NousResearch/Hermes-3-Dataset',
 '/datasets/MegaScience/MegaScience',
 '/datasets/interstellarninja/hermes_reasoning_tool_use',
 '/datasets/microsoft/rStar-Coder',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/ama

In [9]:
get_links("https://huggingface.co")

{'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'careers page',
   'url': 'https://apply.workable.com/huggingface/',
   'external': True},
  {'type': 'blog page', 'url': 'https://huggingface.co/blog'},
  {'type': 'community discussion',
   'url': 'https://discuss.huggingface.co',
   'external': True},
  {'type': 'GitHub page',
   'url': 'https://github.com/huggingface',
   'external': True},
  {'type': 'Twitter page',
   'url': 'https://twitter.com/huggingface',
   'external': True},
  {'type': 'LinkedIn page',
   'url': 'https://www.linkedin.com/company/huggingface/',
   'external': True}]}

In [10]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [11]:
print(get_all_details("https://huggingface.co"))

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/', 'external': True}, {'type': 'blog', 'url': 'https://huggingface.co/blog'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface', 'external': True}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface', 'external': True}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/', 'external': True}]}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing
Log In
Sign Up
NEW
Get started with Inference in seconds 🚀
Reachy Mini: The Open Robot for AI Builders
Welcome Cohere on the Hub 🔥
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trendin

In [12]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information. The brochure should be in Spanish language."

In [13]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [14]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/', 'external': True}, {'type': 'blog', 'url': 'https://huggingface.co/blog'}, {'type': 'status page', 'url': 'https://status.huggingface.co/', 'external': True}, {'type': 'GitHub', 'url': 'https://github.com/huggingface', 'external': True}, {'type': 'Twitter', 'url': 'https://twitter.com/huggingface', 'external': True}, {'type': 'LinkedIn', 'url': 'https://www.linkedin.com/company/huggingface/', 'external': True}]}


'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nCommunity\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nNEW\nGet started with Inference in seconds 🚀\nReachy Mini: The Open Robot for AI Builders\nWelcome Cohere on the Hub 🔥\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nzai-org/GLM-4.5\nUpdated\n3 days ago\n•\n3.93k\n•\n735\nQwen/Qwen3-Coder-480B-A35B-Instruct\nUpdated\n7 days ago\n•\n17.4k\n•\n923\ntencent/HunyuanWorld-1\nUpdated\n2 days ago\n•\n6.83k\n•\n451\nbosonai/higgs-audio-v2-generation-3B-base\nUpdated\n2 days ago\n•\n111k\n•\n4

In [15]:
def create_brochure(company_name, url):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [16]:
create_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/', 'external': True}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'discussion page', 'url': 'https://discuss.huggingface.co', 'external': True}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface', 'external': True}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface', 'external': True}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/', 'external': True}]}


```markdown
# Brochure de Hugging Face

## La comunidad de IA que construye el futuro

Hugging Face es una plataforma innovadora que conecta a la comunidad de aprendizaje automático, permitiendo la colaboración en modelos, conjuntos de datos y aplicaciones. Nos esforzamos por ser el hogar del aprendizaje automático, proporcionando las herramientas necesarias para crear, descubrir y colaborar de manera efectiva.

---

### ¿Quiénes somos?

Hugging Face es más que una empresa; somos una comunidad dedicada a impulsar el futuro de la inteligencia artificial. Con más de 50,000 organizaciones utilizando nuestra plataforma, lideramos el camino en herramientas de IA de código abierto, ofreciendo soluciones avanzadas para empresas y desarrolladores por igual.

---

### Clientes destacados

Más de 50,000 organizaciones confían en Hugging Face, incluyendo gigantes de la industria como:

- Meta
- Amazon
- Google
- Microsoft
- Grammarly

Estos clientes utilizan nuestros modelos de IA y soluciones especializadas para mejorar sus propios productos y servicios.

---

### Cultura empresarial

En Hugging Face, fomentamos un entorno de trabajo colaborativo e inclusivo. Valoramos la innovación, la experimentación y el aprendizaje continuo. Nuestros empleados tienen la libertad de explorar nuevas ideas y contribuir al crecimiento de la comunidad de inteligencia artificial.

### Oportunidades de carrera

Estamos siempre en busca de personas entusiastas y talentosas que deseen unirse a nuestro equipo. Si te apasiona la IA y deseas contribuir a proyectos de alto impacto, revisa nuestras ofertas laborales. En Hugging Face, tendrás la oportunidad de trabajar con las últimas tecnologías en un ambiente que promueve el desarrollo profesional.

---

### ¿Listo para comenzar?

Invitamos a desarrolladores, investigadores y empresas a unirse a nosotros en esta emocionante aventura en el mundo de la inteligencia artificial. Ya sea que desees explorar nuestra vasta colección de modelos y conjuntos de datos o implementar nuestras soluciones empresariales, Hugging Face es el lugar ideal para ti.

Visítanos en [Hugging Face](https://huggingface.co) para obtener más información, registrarte y comenzar tu viaje en la IA.
```
