In [1]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [3]:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [8]:
website_sample = Website("https://github.com")
website_sample.links

['#start-of-content',
 '/',
 '/login',
 'https://github.com/features/copilot',
 'https://github.com/security/advanced-security',
 'https://github.com/features/actions',
 'https://github.com/features/codespaces',
 'https://github.com/features/issues',
 'https://github.com/features/code-review',
 'https://github.com/features/discussions',
 'https://github.com/features/code-search',
 'https://github.com/why-github',
 'https://github.com/features',
 'https://docs.github.com',
 'https://skills.github.com',
 'https://github.blog',
 'https://github.com/enterprise',
 'https://github.com/team',
 'https://github.com/enterprise/startups',
 '/solutions/industry/nonprofits',
 '/solutions/use-case/devsecops',
 '/solutions/use-case/devops',
 '/solutions/use-case/ci-cd',
 '/solutions/use-case',
 '/solutions/industry/healthcare',
 '/solutions/industry/financial-services',
 '/solutions/industry/manufacturing',
 '/solutions/industry/government',
 '/solutions/industry',
 '/solutions',
 '/resources/article

In [9]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

In [10]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [11]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [12]:
print(get_links_user_prompt(website=website_sample))

Here is the list of links on the website of https://github.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
#start-of-content
/
/login
https://github.com/features/copilot
https://github.com/security/advanced-security
https://github.com/features/actions
https://github.com/features/codespaces
https://github.com/features/issues
https://github.com/features/code-review
https://github.com/features/discussions
https://github.com/features/code-search
https://github.com/why-github
https://github.com/features
https://docs.github.com
https://skills.github.com
https://github.blog
https://github.com/enterprise
https://github.com/team
https://github.com/enterprise/startups
/solutions/industry/nonprofits
/solutions/use-case/devsecops
/solutions/use-case/devops
/solutions/use-case/ci-cd
/solutions/use-case
/solutions/ind

In [13]:
def get_links(url):
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
      ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [15]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [16]:
print(get_all_details("https://huggingface.co"))

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community page', 'url': 'https://discuss.huggingface.co'}, {'type': 'company page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}
Landing page:
Webpage Title:
Hugging Face ‚Äì The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Posts
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
HiDream-ai/HiDream-I1-Full
Updated
1 day ago
‚Ä¢
20.3k
‚Ä¢
545
microsoft/bitnet-b1.58-2B-4T
Updated
about 8 hours ago
‚Ä¢
1.63k
‚Ä¢
287
agentica-org/DeepCoder-14B-Preview
Updated
8 days ago
‚Ä¢
16.8k
‚Ä¢
558
moonshotai/Kimi-VL-A3

In [17]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

In [18]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [19]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/about'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'company page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community page', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


"You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face ‚Äì The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nHiDream-ai/HiDream-I1-Full\nUpdated\n1 day ago\n‚Ä¢\n20.3k\n‚Ä¢\n545\nmicrosoft/bitnet-b1.58-2B-4T\nUpdated\nabout 8 hours ago\n‚Ä¢\n1.63k\n‚Ä¢\n287\nagentica-org/DeepCoder-14B-Preview\nUpdated\n8 days ago\n‚Ä¢\n16.8k\n‚Ä¢\n558\nmoonshotai/Kimi-VL-A3B-Thinking\nUpdated\n3 days ago\n‚Ä¢\n16.7k\n‚Ä¢\n346\ndeepseek-ai/DeepSeek-V3-0324\nUpdated\n22 days ago\n‚Ä¢\n231k\n‚Ä¢\n2.65k\nBrowse 1M+ mod

In [20]:
def create_brochure(company_name, url):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [21]:
create_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/about'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'company page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community page', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


```markdown
# Hugging Face ‚Äì Building the Future of AI

## About Us
At Hugging Face, we are the AI community dedicated to transforming the future of machine learning and artificial intelligence. Our platform serves as a collaborative space where developers, researchers, and businesses can come together to create, discover, and share state-of-the-art models, datasets, and AI applications. With over 1 million models and 250,000 datasets, Hugging Face is the home of machine learning innovation.

## Our Offerings
- **Models**: Explore diverse AI models including cutting-edge transformers for text, image, video, audio, and even 3D applications.
- **Datasets**: Browse a vast collection of datasets tailored for various machine learning tasks, enabling enhanced research and development.
- **Spaces**: Collaborate on applications effortlessly with others in the community.

### Enterprise Solutions
We provide enterprise-grade solutions that ensure security and enhanced collaboration. With our optimized inference endpoints, dedicated support, and extensive tools, organizations can leverage AI capabilities efficiently. Pricing starts at **$20/user/month**.

## Our Customers
More than **50,000 organizations** trust Hugging Face, including industry leaders like:
- Google
- Microsoft
- Amazon
- Grammarly
- Meta AI

## Our Culture
Hugging Face embraces an open-source philosophy, emphasizing community collaboration. We encourage everyone to contribute and build their portfolios on our platform, fostering an inclusive and innovative environment. Our mission is driven by the belief that machine learning should be accessible and collaborative for all.

## Careers at Hugging Face
Join our dynamic team and be part of shaping the future of AI! We are constantly looking for talented individuals across various disciplines. Check out our [careers page](#) for current job openings and become an agent of change in the AI community.

## Connect With Us
For more information, resources, or inquiries, please visit our website or connect with us on social media platforms. Together, let's build a future powered by AI!

---

*Explore our platform today and join the AI community at [Hugging Face](https://huggingface.co)!*
```

In [22]:
def stream_brochure(company_name, url):
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )
    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

In [23]:
stream_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community page', 'url': 'https://discuss.huggingface.co'}, {'type': 'company page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


# Hugging Face Brochure

## About Us
Hugging Face is at the forefront of the AI community dedicated to building the future of machine learning (ML) and artificial intelligence (AI). Our mission is to democratize access to high-quality machine learning tools and resources, allowing everyone‚Äîfrom researchers to developers‚Äîto innovate and collaborate effectively.

## Our Platform
Hugging Face hosts a dynamic collaboration platform where the machine learning community can distribute, discover, and enhance a growing collection of models, datasets, and applications. Users can explore over **1M+ models** and **250k+ datasets**, making it an indispensable resource for AI enthusiasts and professionals.

### Key Features
- **Models & Datasets**: Browse and utilize a vast array of AI models and datasets that cater to various applications in text, image, video, audio, and more.
- **Spaces**: Create and run applications seamlessly, enhancing productivity and collaboration within the community.
- **Compute Solutions**: Explore paid compute and enterprise solutions to accelerate your machine learning projects with optimized endpoints and dedicated support.

## Our Community
Join over **50,000 organizations** including prominent names like Google, Microsoft, and Amazon who leverage Hugging Face for their AI endeavors. Collaborate, share your work, and build your portfolio as part of a vibrant community committed to advancing machine learning standards.

## Company Culture
At Hugging Face, we prioritize open-source principles and an inclusive community. Our culture promotes transparency, collaboration, and continuous learning. We believe in the power of knowledge sharing and welcome individuals from diverse backgrounds to contribute to our mission.

## Careers at Hugging Face
We are always looking for passionate and talented individuals to join our team! Whether you are an expert in machine learning, software engineering, product management, or marketing, we encourage you to check our [Careers Page](https://huggingface.co/join) for current opportunities and to help us shape the future of AI.

## Connect with Us
Follow us on our social channels and join the conversation:
- [GitHub](https://github.com/huggingface)
- [Twitter](https://twitter.com/huggingface)
- [LinkedIn](https://linkedin.com/company/huggingface)
- [Discord](https://discord.gg/huggingface)

### Join the AI Revolution
Discover the endless possibilities of AI and machine learning with Hugging Face. Sign up today and be part of a growing community dedicated to shaping tomorrow's technology! 

--- 

*For more information, visit us at [Hugging Face](https://huggingface.co)*.