In [None]:
import json
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display
from langchain.chat_models import init_chat_model
from langchain_core.prompts import ChatPromptTemplate

In [2]:
# Switch LangSmith tracing on through its environment flag.
os.environ["LANGSMITH_TRACING"] = "true"

# Read the .env file one directory up, then look up both API keys
# (each lookup yields None when the variable is not set).
load_dotenv("../.env")
langsmith_key = os.environ.get("LANGSMITH_API_KEY")
openai_key = os.environ.get("OPENAI_API_KEY")

In [3]:
# Browser-like User-Agent sent with every request. It tells the server the
# operating system, vendor, and version of the requesting client, which helps
# to keep the scraper from being blocked.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/132.0.0.0 Safari/537.36"
    )
}

class Website:
    """
    Utility class to represent a website that we have scraped, now with links.

    Attributes:
        url: The URL that was requested.
        title: Text of the page's <title> tag, or "No title found".
        text: Text of the <body> with script, style, img and input elements
            removed, or "No body found" when the page has no body.
        links: Non-empty href values of the page's <a> tags, in document order.
    """
    def __init__(self, url, timeout=30):
        """
        Create a Website object from the given url using BeautifulSoup.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before requests gives up.
                Without a timeout, requests.get can block indefinitely.

        Raises:
            requests.RequestException: If the page cannot be downloaded.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.content, "html.parser")

        # soup.title.string is None for an empty <title> or one with nested
        # tags; fall back to the placeholder instead of storing None.
        if soup.title and soup.title.string:
            self.title = soup.title.string
        else:
            self.title = "No title found"

        if soup.body:
            # Strip elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = "No body found"

        # Anchors without an href yield None; drop them (and empty hrefs) so
        # the list contains only strings and can be joined safely later.
        hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the page title and body text formatted for use in a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
# Scrape the Hugging Face landing page and inspect the hrefs that were collected.
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 'blog/inference-providers-nebius-novita-hyperbolic',
 '/spaces',
 '/models',
 '/perplexity-ai/r1-1776',
 '/deepseek-ai/DeepSeek-R1',
 '/microsoft/OmniParser-v2.0',
 '/stepfun-ai/Step-Audio-Chat',
 '/stepfun-ai/stepvideo-t2v',
 '/models',
 '/spaces/nanotron/ultrascale-playbook',
 '/spaces/lllyasviel/LuminaBrush',
 '/spaces/microsoft/OmniParser-v2',
 '/spaces/black-forest-labs/FLUX.1-dev',
 '/spaces/m-ric/open_Deep-Research',
 '/spaces',
 '/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k',
 '/datasets/facebook/natural_reasoning',
 '/datasets/open-r1/OpenR1-Math-220k',
 '/datasets/open-thoughts/OpenThoughts-114k',
 '/datasets/SakanaAI/AI-CUDA-Engineer-Archive',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/fa

In [5]:
# System prompt for the link-selection step.
# The braces of the JSON example are doubled because this string is handed to
# ChatPromptTemplate below; doubled braces are meant to render as literal
# braces rather than as template variables.
# The example must itself be valid JSON, since the model copies its format.
links_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "You should respond in JSON without format indicators as in this example:"
    """
{{
    "links": [
        {{"type": "about page", "url": "https://full.url/goes/here/about"}},
        {{"type": "careers page", "url": "https://another.full.url/careers"}}
    ]
}}
"""
)

In [6]:
# Print the assembled system prompt to check its wording and the JSON example.
print(links_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON without format indicators as in this example:
{{
    "links": [
        {{"type": "about page", "url": "https://full.url/goes/here/about"}},
        {{"type": "careers page": "url": "https://another.full.url/careers"}}
    ]
}}



In [7]:
def get_links_user_prompt(website):
    """
    Build the user prompt that asks the model to pick brochure-relevant links.

    Args:
        website: Object exposing `url` and `links`. `links` is an iterable of
            href values; None and empty entries are ignored.

    Returns:
        str: Instructions followed by the usable links, one per line.
    """
    # An anchor without an href produces None, and str.join raises TypeError
    # on non-string items, so keep only non-empty hrefs.
    usable_links = [link for link in website.links if link]
    return (
        f"Here is the list of links on the website of {website.url} - "
        "decide which of these are relevant web links for a brochure "
        "about the company, respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
        "Links (some might be relative links):\n"
        + "\n".join(usable_links)
    )

In [8]:
# Preview the user prompt generated for the Hugging Face page scraped above.
print(get_links_user_prompt(huggingface))

Here is the list of links on the website of https://huggingface.co - decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
/
/models
/datasets
/spaces
/posts
/docs
/enterprise
/pricing
/login
/join
blog/inference-providers-nebius-novita-hyperbolic
/spaces
/models
/perplexity-ai/r1-1776
/deepseek-ai/DeepSeek-R1
/microsoft/OmniParser-v2.0
/stepfun-ai/Step-Audio-Chat
/stepfun-ai/stepvideo-t2v
/models
/spaces/nanotron/ultrascale-playbook
/spaces/lllyasviel/LuminaBrush
/spaces/microsoft/OmniParser-v2
/spaces/black-forest-labs/FLUX.1-dev
/spaces/m-ric/open_Deep-Research
/spaces
/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k
/datasets/facebook/natural_reasoning
/datasets/open-r1/OpenR1-Math-220k
/datasets/open-thoughts/OpenThoughts-114k
/datasets/SakanaAI/AI-CUDA-Engineer-Archive
/datasets
/join
/pricing#endpoints
/prici

In [9]:
# Chat model shared by the link-selection and brochure-writing calls below.
model = init_chat_model("gpt-4o-mini", model_provider="openai")

In [10]:
# Dry run of the link-selection step on the Hugging Face page scraped above:
# system prompt plus a single human message filled in at invoke time.
template = ChatPromptTemplate(
    [("system", links_system_prompt), ("human", "{user_prompt}")]
)

# Render the messages, send them to the model, and show the raw reply.
prompt = template.invoke(dict(user_prompt=get_links_user_prompt(huggingface)))
response = model.invoke(prompt)
print(response.content)

{
    "links": [
        {"type": "about page", "url": "https://huggingface.co"},
        {"type": "enterprise page", "url": "https://huggingface.co/enterprise"},
        {"type": "pricing page", "url": "https://huggingface.co/pricing"},
        {"type": "careers page", "url": "https://apply.workable.com/huggingface/"},
        {"type": "blog page", "url": "https://huggingface.co/blog"},
        {"type": "GitHub page", "url": "https://github.com/huggingface"},
        {"type": "LinkedIn page", "url": "https://www.linkedin.com/company/huggingface/"},
        {"type": "Twitter page", "url": "https://twitter.com/huggingface"}
    ]
}


In [11]:
def get_links(url):
    """
    Ask the model which links found on `url` belong in a company brochure.

    Args:
        url: Address of the page whose links should be classified.

    Returns:
        The parsed JSON reply, in the shape requested by links_system_prompt.
        When the reply is a dict with a "links" list, each entry's "url" is
        resolved against `url`, so relative links become absolute; URLs that
        are already absolute are left unchanged.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON even after a
            surrounding Markdown code fence has been removed.
    """
    website = Website(url)
    template = ChatPromptTemplate([
        ("system", links_system_prompt),
        ("human", "{user_prompt}")
    ])
    prompt = template.invoke({"user_prompt": get_links_user_prompt(website)})
    response = model.invoke(prompt)

    raw = response.content.strip()
    if raw.startswith("```"):
        # The model may wrap its answer in a fence despite the instruction not
        # to; drop the opening fence line (``` or ```json) and the closing fence.
        raw = raw.split("\n", 1)[1] if "\n" in raw else ""
        raw = raw.rsplit("```", 1)[0]
    result = json.loads(raw)

    # The page's hrefs may be relative and the model may echo them back as-is;
    # make them absolute so callers can fetch them directly.
    if isinstance(result, dict):
        for link in result.get("links", []):
            if isinstance(link, dict) and link.get("url"):
                link["url"] = urljoin(url, link["url"])
    return result

In [12]:
# Run the link-selection step end to end on the Hugging Face landing page.
get_links("https://huggingface.co")

{'links': [{'type': 'about page', 'url': 'https://huggingface.co'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'company page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'blog page', 'url': 'https://blog.huggingface.co'},
  {'type': 'contact page',
   'url': 'https://www.linkedin.com/company/huggingface/'}]}

In [13]:
def get_all_details(url):
    """
    Gather the text of the landing page and of every link the model selected.

    Args:
        url: Address of the company's landing page.

    Returns:
        str: "Landing page:" followed by the landing page contents, then one
        section per selected link, each headed by the link's "type". Links
        whose page cannot be downloaded are skipped.
    """
    sections = ["Landing page:\n", Website(url).get_contents()]
    links = get_links(url)
    # A reply without a "links" key is treated as "no extra pages".
    for link in links.get("links", []):
        try:
            page = Website(link["url"])
        except requests.RequestException as error:
            # The URLs come from the model and may not exist or may refuse the
            # request; skip that page rather than abort the whole brochure.
            print(f"Skipping {link['url']}: {error}")
            continue
        sections.append(f"\n\n{link['type']}\n")
        sections.append(page.get_contents())
    return "".join(sections)

In [14]:
# Show the combined text of the landing page and the selected pages.
print(get_all_details("https://huggingface.co"))

Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Posts
Docs
Enterprise
Pricing
Log In
Sign Up
NEW
Welcome Hyperbolic, Nebius AI Studio, and Novita on the Hub 🔥
Welcome Fireworks.ai on the Hub 🎆
Welcome to Inference Providers on the Hub 🔥
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
perplexity-ai/r1-1776
Updated
2 days ago
•
6.44k
•
1.33k
deepseek-ai/DeepSeek-R1
Updated
13 days ago
•
4.35M
•
9.86k
microsoft/OmniParser-v2.0
Updated
4 days ago
•
3.58k
•
809
stepfun-ai/Step-Audio-Chat
Updated
4 days ago
•
544
•
331
stepfun-ai/stepvideo-t2v
Updated
3 days ago
•
800
•
275
Browse 1M+ models
Spaces
Running
1.18k
1.18k
The Ultra-Scale Playbook
🌌
The ultimate guide to training LLM on large GPU Clusters
Running
on
Zero
632
632
LuminaBrush
📈
Execute envi

In [15]:
# System prompt for the brochure-writing step.
# Written as adjacent literals so every sentence boundary keeps its space.
system_prompt = (
    "You are an assistant that analyzes the contents of several "
    "relevant pages from a company website and creates a short brochure about the "
    "company for prospective customers, investors and recruits. Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

In [None]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """
    Build the user prompt for the brochure-writing step.

    Args:
        company_name: Name of the company, inserted into the prompt.
        url: Landing page address passed to get_all_details.
        max_chars: Maximum length of the returned prompt in characters; the
            text is cut off at this point. Pass None to disable truncation.

    Returns:
        str: Instructions followed by the scraped page contents, truncated to
        `max_chars` characters.
    """
    # Adjacent literals instead of a backslash continuation, so the source
    # indentation does not end up inside the prompt text.
    user_prompt = (
        f"You are looking at a company called: {company_name}\n"
        "Here are the contents of its landing page and other relevant pages; "
        "use this information to build a short brochure of the company in markdown.\n"
    )
    user_prompt += get_all_details(url)
    return user_prompt[:max_chars]

In [17]:
# Inspect the truncated user prompt that will be sent for Hugging Face.
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

"You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages;     use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nNEW\nWelcome Hyperbolic, Nebius AI Studio, and Novita on the Hub 🔥\nWelcome Fireworks.ai on the Hub 🎆\nWelcome to Inference Providers on the Hub 🔥\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nperplexity-ai/r1-1776\nUpdated\n2 days ago\n•\n6.44k\n•\n1.33k\ndeepseek-ai/DeepSeek-R1\nUpdated\n13 days ago\n•\n4.35M\n•\n9.86k\nmicrosoft/OmniParser-v2.0\nUpdated\n4 days ago\n•\n3.58k\n•\n809\nstepfun-ai/Step-Audio-Chat\nUpdated\n4 days ago

In [None]:
def create_brochure(company_name, url):
    """
    Generate a brochure for a company and render it as Markdown in the notebook.

    Args:
        company_name: Name of the company, passed to get_brochure_user_prompt.
        url: Landing page address, passed to get_brochure_user_prompt.

    Returns:
        None. The model's reply is shown with IPython's display().
    """
    messages = [("system", system_prompt), ("human", "{user_prompt}")]
    brochure_request = ChatPromptTemplate(messages).invoke(
        {"user_prompt": get_brochure_user_prompt(company_name, url)}
    )
    brochure = model.invoke(brochure_request)
    display(Markdown(brochure.content))

In [19]:
# Full pipeline on Hugging Face; the brochure is rendered as Markdown below.
create_brochure("HuggingFace", "https://huggingface.co")

# Hugging Face Company Brochure

## Company Overview
**Hugging Face** is a vibrant platform at the heart of the machine learning community, dedicated to building the future of AI. As a collaborative space, it enables users to create, discover, and share **1M+ models**, **250k+ datasets**, and numerous applications.

### Key Offerings
- **Models**: Explore a vast library of state-of-the-art machine learning models across various modalities—including text, image, video, audio, and even 3D.
- **Datasets**: Access and contribute to a diverse collection of datasets tailored for a myriad of ML tasks.
- **Spaces**: Collaborate and deploy applications, making it easier to run demo apps and share your work with the community.

## Our Culture
Hugging Face prides itself on fostering a **collaborative and innovative culture**. The company emphasizes:
- **Open Source**: We are committed to building an open-source foundation for machine learning tooling with contributions from the community, ensuring accessibility for all.
- **Diversity & Inclusion**: At Hugging Face, we believe in empowering all voices, promoting a culture where everyone can contribute to the AI revolution.
- **Community Engagement**: We nurture a strong community spirit, where members are encouraged to collaborate and support one another on their AI journeys.

## Customer Base
Hugging Face serves a diverse clientele, with **over 50,000 organizations**, including notable companies like:
- **Amazon Web Services (AWS)**
- **Google**
- **Meta**
- **Microsoft**
- **Grammarly**

Our platform is designed to meet the needs of enterprises, developers, researchers, and hobbyists alike.

## Career Opportunities
Hugging Face is always on the lookout for talented individuals who are passionate about AI and machine learning. We offer:
- **Innovative Projects**: Work on cutting-edge technology and contribute to groundbreaking projects in AI.
- **Flexible Work Environment**: Embrace a flexible work culture that supports a healthy work-life balance.
- **Growth & Development**: Join a company committed to your personal and professional growth through various learning opportunities.

### Current Openings
Check our [Jobs page](#) for the latest opportunities to join our team!

## Join Us to Build the Future
At Hugging Face, we're not just building software; we're nurturing a community of innovators who are enthusiastic about AI. Whether you're a prospective customer, investor, or recruit, we invite you to be a part of this exciting journey.

For more information, visit our website at [huggingface.co](https://huggingface.co) or connect with us on social media! 

**Twitter | LinkedIn | Discord** 

---

*This brochure is intended for prospective customers, investors, and recruits looking to understand the mission, culture, and opportunities at Hugging Face.*

In [20]:
# Same pipeline on a second company's site.
create_brochure("Anthropic", "https://anthropic.com")

# Anthropic Brochure

## Welcome to Anthropic

At Anthropic, we are committed to shaping the future of artificial intelligence (AI) by creating systems that prioritize safety, reliability, and interpretability. With our headquarters in San Francisco, our mission is to build AI that individuals and organizations can trust.

### Our Purpose

We believe that AI will profoundly impact the world. That’s why we’re dedicated to developing reliable systems and advancing research on the benefits and risks associated with AI technology.

### Our Product: Claude

Meet **Claude 3.5 Sonnet**, our latest and most intelligent AI model designed to power various applications across industries. Claude is adaptable, enabling builders to create AI-powered applications tailored to their needs. To learn more about using Claude, visit our [API](#) section.

### Safety First

At Anthropic, we treat AI safety as a systematic science:
- **Research:** We explore innovative safety techniques and share our findings with the broader community.
- **Application:** Our team effectively translates research insights into practical, reliable tools that benefit businesses and organizations globally.
- **Interdisciplinary Approach:** Our diverse team includes researchers, engineers, policy experts, and operational leaders from various domains, enhancing our collaboration and innovation.

### Our Values

We uphold a set of core principles that guide our operations:
1. **Act for the Global Good:** We focus on maximizing positive outcomes for humanity through our technology.
2. **Hold Light and Shade:** We acknowledge and address the potential risks of AI while striving to harness its benefits.

### Customers

We serve a wide range of clients, including businesses, nonprofits, and civil society organizations. Our partnerships are pivotal in deploying AI applications that have a tangible impact on their operations and communities.

### Join Our Team

At Anthropic, we believe that a diverse and inclusive culture drives our innovation. We are continuously seeking passionate individuals who are eager to contribute to the safe evolution of AI. If you’re interested in joining a team where collaboration thrives, check out our [Careers](#) page for current openings.

### Community Engagement

We recognize that AI companies are just one piece of the larger puzzle. That's why we actively collaborate with civil society, government entities, academia, and nonprofit organizations to promote safety and responsible AI practices industry-wide.

### Stay Connected

For updates on our work, research, and news, follow us on our social media platforms and join our community discussions. Let’s navigate the future of AI together!

- [Twitter](#)
- [LinkedIn](#)
- [YouTube](#)

---

**Contact Us**

For inquiries or further information about our products and opportunities:
- Website: [www.anthropic.com](#)
- Email: contact@anthropic.com

Join us at Anthropic as we lead the charge in responsible AI development!