In [2]:
# Library Imports
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [3]:
# Initialize and constants

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key")
    
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [4]:
# A class to represent a Webpage
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website to  be scraped with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"


In [5]:
huggingface = Website("https://huggingface.co")

In [6]:
# System Prompt for getting relevant links
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company,\
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [7]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company,such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}



In [8]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
    Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [9]:
print(get_links_user_prompt(huggingface))

Here is the list of links on the website of https://huggingface.co - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format.     Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
/
/models
/datasets
/spaces
/docs
/enterprise
/pricing
/login
/join
/spaces
/models
/tencent/HunyuanImage-3.0
/deepseek-ai/DeepSeek-V3.2-Exp
/tencent/Hunyuan3D-Part
/zai-org/GLM-4.6
/inclusionAI/Ring-1T-preview
/models
/spaces/Wan-AI/Wan2.2-Animate
/spaces/enzostvs/deepsite
/spaces/zerogpu-aoti/wan2-2-fp8da-aoti-faster
/spaces/Qwen/Qwen-Image-Edit-2509
/spaces/multimodalart/ai-toolkit
/spaces
/datasets/openai/gdpval
/datasets/fka/awesome-chatgpt-prompts
/datasets/nvidia/Nemotron-Personas-Japan
/datasets/t-tech/T-ECD
/datasets/zai-org/CC-Bench-trajectories
/datasets
/join
/pricing#endpoints
/pricing#spaces
/pricing
/enterprise
/enterprise
/enterprise
/enterprise
/enterprise
/enterprise
/enter

In [12]:
def get_links(website):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
      ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [13]:
response = get_links(huggingface)
print(response)

{'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'documentation page', 'url': 'https://huggingface.co/docs'}, {'type': 'company page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


In [16]:
def get_all_details(website):
    result = "Landing page:\n"
    result += website.get_contents()
    links = get_links(website)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [17]:
print(get_all_details(huggingface))

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'company profile on LinkedIn', 'url': 'https://www.linkedin.com/company/huggingface/'}]}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
tencent/HunyuanImage-3.0
Updated
about 7 hours ago
•
825
•
710
deepseek-ai/DeepSeek-V3.2-Exp
Updated
3 days ago
•
10.7k
•
464
tencent/Hunyuan3D-Part
Updated
4 days ago
•
2.57k
•
461
zai-org/GLM-4.6
Updated
2 days ago
•


In [18]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

In [19]:
def get_brochure_user_prompt(company_name, website):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(website)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [20]:
get_brochure_user_prompt("HuggingFace", huggingface)

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community page', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


"You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nCommunity\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\ntencent/HunyuanImage-3.0\nUpdated\nabout 7 hours ago\n•\n825\n•\n710\ndeepseek-ai/DeepSeek-V3.2-Exp\nUpdated\n3 days ago\n•\n10.7k\n•\n464\ntencent/Hunyuan3D-Part\nUpdated\n4 days ago\n•\n2.57k\n•\n461\nzai-org/GLM-4.6\nUpdated\n2 days ago\n•\n9.72k\n•\n284\ninclusionAI/Ring-1T-preview\nUpdated\n1 day ago\n•\n818\n•\n204\nBrowse 1M+ models\nSpaces\nRunning\n1.28k\n1.28k\nWan2.2 Animate\n👁\

In [21]:
# def create_brochure(company_name, url):
#     response = openai.chat.completions.create(
#         model=MODEL,
#         messages=[
#             {"role": "system", "content": system_prompt},
#             {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
#           ],
#     )
#     result = response.choices[0].message.content
#     display(Markdown(result))

In [22]:
def stream_brochure(company_name, website):
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, website)}
          ],
        stream=True
    )
    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

In [23]:
stream_brochure("Hugging Face", huggingface)

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'models page', 'url': 'https://huggingface.co/models'}, {'type': 'datasets page', 'url': 'https://huggingface.co/datasets'}, {'type': 'spaces page', 'url': 'https://huggingface.co/spaces'}, {'type': 'documentation page', 'url': 'https://huggingface.co/docs'}]}


# Hugging Face Brochure

## Welcome to Hugging Face
**The AI community building the future.**  
At Hugging Face, we provide a collaborative platform for the machine learning community to create, discover, and share AI models, datasets, and applications. With over 1 million models and 250,000 datasets, we are transforming how work gets done in AI and ML.

---

## What We Do
- **Models**: Host and collaborate on unlimited public models. Our platform features trending models like HunyuanImage-3.0 and DeepSeek-V3.2.
- **Datasets**: Access an extensive library of datasets, including specialized collections like GPT-3 related prompts and personas for various uses.
- **Spaces**: Build and run AI applications swiftly with tools designed for smooth collaboration and innovation.
- **Community**: Join thousands of developers and researchers working together to push the boundaries of machine learning.

---

## Company Culture
At Hugging Face, we embrace a culture of **openness and collaboration**. Everyone is encouraged to contribute and engage in meaningful discussions. Our community-driven approach allows us to stay at the forefront of AI advancements while actively supporting knowledge sharing and learning.

- **Open Source**: We believe in building the foundation of ML tooling with the community. Tools like Transformers and Diffusers exemplify our commitment to open-source development.
- **Diverse Team**: We are proud of our diverse and inclusive team that brings together different perspectives from all walks of life.
- **Continuous Learning**: We foster an environment that promotes personal growth, skill development, and innovation.

---

## Customer Base
More than **50,000 organizations** leverage Hugging Face's capabilities to advance their AI initiatives, including:
- **Major Enterprises**: Amazon, Microsoft, Google, and Intel are just a few companies that utilize our services to innovate within their fields.
- **Educational Institutions**: Non-profits and academic research groups collaborate with us to develop cutting-edge applications and technologies.

---

## Careers at Hugging Face
We are always on the lookout for talented individuals who are passionate about AI and machine learning. From software engineers to data scientists, we offer a wide range of opportunities to join our team.

- **Why Join Us?**: Be part of a community-driven company that values your contributions. You'll collaborate with industry leaders on impactful projects that shape the future of technology.
- **Benefits**: Competitive salaries, flexible working hours, and opportunities for professional development make Hugging Face an attractive place to work.

---

## Get Involved Today!
Explore our platform, join our community, and contribute to building the future of AI.  
[**Visit our website**](https://huggingface.co) to learn more about our offerings and how you can get involved!

---

_Hugging Face - The AI Community Building the Future._  