In [None]:
--requirements
pip install -U langchain-ollama

In [1]:
import langchain
import requests
import json
from bs4 import BeautifulSoup
from langchain_ollama import ChatOllama
from langchain.prompts import ChatPromptTemplate
from typing import *

In [2]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them, so this
# browser-like User-Agent is sent with the request made in Website.__init__.
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A scraped web page: its title, visible body text, and outgoing links.

    Attributes:
        url: The address that was fetched.
        body: Raw bytes of the HTTP response.
        title: Text of the <title> element, or "No title found".
        text: Newline-separated text of <body> with script/style/img/input
            elements removed; "" when the document has no <body>.
        links: Every non-empty href found on an <a> element, in document order.
    """

    def __init__(self, url, timeout=10):
        """
        Download ``url`` and extract its title, text and links.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds passed to ``requests.get`` as its timeout, so a
                stalled server raises instead of blocking the notebook forever.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # .string is None when <title> is empty or wraps more than one node,
        # so fall back to the placeholder in that case as well.
        title_tag = soup.title
        title_text = title_tag.string if title_tag else None
        self.title = title_text if title_text else "No title found"

        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Keep only anchors that actually carry a non-empty href.
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and body text combined into one prompt-ready string."""
        return f"Webpage Title:{self.title} Webpage Contents:{self.text}"

In [5]:
def llm_client(model_name,format=None, temperature = 0.8):
    """
    Build a ChatOllama chat model.

    Args:
        model_name: Name of the Ollama model, forwarded as ``model``.
        format: Forwarded unchanged as ChatOllama's ``format`` argument.
        temperature: Sampling temperature forwarded to ChatOllama.

    Returns:
        The constructed ChatOllama instance.

    Raises:
        Exception: Wraps any error raised while constructing the client; the
            original error is kept as the second argument and as ``__cause__``.
    """
    try:
        # Forward the caller's temperature instead of a fixed value.
        return ChatOllama(model=model_name, format=format, temperature=temperature)
    except Exception as e:
        raise Exception("llm initiation  failed", e) from e
def create_prompt(system_prompt, user_prompt, variables: Optional[Dict] = None):
    """
    Render a system/human message pair into a single formatted prompt.

    Args:
        system_prompt: Template text for the "system" message.
        user_prompt: Template text for the "human" message.
        variables: Values substituted into the templates; treated as an
            empty mapping when omitted or falsy.

    Returns:
        Whatever ``ChatPromptTemplate.format`` produces for the two messages.

    Raises:
        Exception: Wraps any error raised while building or formatting.
    """
    try:
        template = ChatPromptTemplate.from_messages(
            [("system", system_prompt), ("human", user_prompt)]
        )
        substitutions = variables if variables else {}
        return template.format(**substitutions)
    except Exception as e:
        raise Exception ("error in creating prompt",e)

def get_links(url):
    """
    Ask the llama3.2 model which links on ``url`` are relevant for a brochure.

    The page is scraped with Website, its links are placed into the
    link-selection prompt, and the model's reply is parsed as JSON.

    Args:
        url: Address of the page whose links should be classified.

    Returns:
        The object obtained by JSON-decoding the model's reply content.
    """
    page = Website(url)
    model = llm_client('llama3.2', format='json')
    user_message = get_links_user_prompt(page)
    response = model.invoke(create_prompt(link_system_prompt, user_message))
    print(f"llm response is {response}")
    return json.loads(response.content)
    

def get_all_details(url):
    """
    Collect the landing page contents plus the contents of every relevant link.

    Args:
        url: Address of the company's landing page.

    Returns:
        The landing page contents followed, for each usable link returned by
        get_links, by a blank line, the link's type on its own line, and that
        page's contents.
    """
    # Local import keeps this definition usable without touching the import cell.
    from urllib.parse import urljoin

    result = Website(url).get_contents()
    print(f"in get all details function websiste contents is :: {result}")
    links = get_links(url)
    print("Found links:", links,type(links))

    # The model's JSON is not guaranteed to have the expected shape, so a
    # missing or malformed "links" entry is treated as "no links".
    entries = links.get("links") if isinstance(links, dict) else None
    for link in entries or []:
        if not isinstance(link, dict) or not link.get("url"):
            continue
        # Separate each page so the type label is not glued onto the text.
        result += f"\n\n{link.get('type', 'page')}\n"
        # The model may echo a relative href; resolve it against the landing page.
        result += Website(urljoin(url, link["url"])).get_contents()
    return result

def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """
    Build the user prompt asking the model to write a brochure for a company.

    Args:
        company_name: Name of the company, inserted into the prompt.
        url: Landing page whose contents (and relevant sub-pages) are appended.
        max_chars: Maximum length of the returned prompt; longer prompts are
            cut off at this many characters. Pass None to disable truncation.

    Returns:
        The prompt text, at most ``max_chars`` characters long.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    print("calling the get all details")
    user_prompt += get_all_details(url)
    # Truncate so scraped pages cannot grow the prompt without bound.
    if max_chars is not None:
        user_prompt = user_prompt[:max_chars]
    return user_prompt


In [7]:
# System prompt for the link-selection call. The braces of the JSON example are
# doubled ({{ }}) because create_prompt formats this text as a template, and
# doubled braces render as literal single braces.
link_system_prompt =r"""
You are provided with a list of links found on a webpage. 
Your task is to determine which links are relevant for a company brochure. 

Relevant links include:
- **About page** (e.g., a page describing the company)
- **Company page** (e.g., an official company overview)
- **Careers/Jobs page** (e.g., job opportunities)

⚠️ **Important:** Do NOT copy the example below. Instead, analyze the provided links and return only the ones that fit.

Your response must be in JSON format, like this:
```json
{{

    "links": [
        {{"type": "about page", "url": "https://example.com/about"}},
        {{"type": "careers page", "url": "https://example.com/jobs"}}
    ]
}}
```
"""
def get_links_user_prompt(website):
    """
    Build the user message that lists a page's links for the model to filter.

    Args:
        website: Object exposing ``url`` (the page address) and ``links``
            (an iterable of href strings).

    Returns:
        The instruction text followed by the links, one per line.
    """
    intro = f"Here is the list of links on the website of {website.url} - "
    instructions = (
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    listing_header = "Links (some might be relative links):\n"
    listing = "\n".join(website.links)
    return "".join([intro, instructions, listing_header, listing])

In [None]:
# Scrape the Hugging Face home page and display the hrefs collected from it.
# `li` is reused by the link-selection cell further down.
li = Website("https://huggingface.co")
li.links

# STEP 1: Calling LLAMA to get the complete links of the scraped webpage


In [None]:
# Manual walk-through of the link-selection call: build a llama3.2 client with
# format='json', render the prompt from the links scraped into `li`, and print
# both the prompt and the raw model response.
llm=llm_client('llama3.2',format='json')
prompt_result=create_prompt(link_system_prompt,get_links_user_prompt(li))
print(f"formatted prompt is {prompt_result}")
result=llm.invoke(prompt_result)
print(f"llm response is {result}")

# STEP 2: Calling LLAMA to get the final brochure

In [9]:
# System prompt for the brochure-writing call: fixes the section layout and
# asks for Markdown output.
system_prompt = """You are an AI assistant that creates company brochures in Markdown format.
Use the provided company website content to generate a well-structured brochure.

### Structure:
1. **Company Overview** - A short introduction about the company.
2. **Mission & Vision** - What the company stands for.
3. **Products & Services** - Key offerings and innovations.
4. **Customers & Clients** - Notable partners or users.
5. **Careers** - Job opportunities and work culture.
6. **Contact Information** - Website, social links, etc.

### Guidelines:
- Keep sections concise and engaging.
- Use bullet points where needed.
- Exclude unnecessary details like navigation menus.

Format your response in **valid Markdown**.
"""

# Build the user prompt for Google; this scrapes the site and calls the model
# to pick relevant links, so the cell performs network and LLM requests.
user_prompt=get_brochure_user_prompt("google", "https://www.google.com/")
print(f"the final user prompt received is ",user_prompt)

calling the get all details
in get all details function websiste contents is :: Webpage Title:Google Webpage Contents:Gmail
Images
Sign in
See more
Delete
Delete
Report inappropriate predictions
Google offered in:
हिन्दी
বাংলা
తెలుగు
मराठी
தமிழ்
ગુજરાતી
ಕನ್ನಡ
മലയാളം
ਪੰਜਾਬੀ
India
About
Advertising
Business
How Search works
Privacy
Terms
Settings
Search settings
Advanced search
Your data in Search
Search history
Search help
Send feedback
Dark theme: Off
Google apps
llm response is content='{\n\n    "links": [\n        {"type": "about page", "url": "https://www.google.com/intl/en_in/about"},\n        {"type": "company page", "url": "https://www.google.com/intl/en_in/"},\n        {"type": "careers page", "url": "https://careers.google.com/"}\n    ]\n}' additional_kwargs={} response_metadata={'model': 'llama3.2', 'created_at': '2025-02-26T10:59:32.3162147Z', 'done': True, 'done_reason': 'stop', 'total_duration': 223744649900, 'load_duration': 7434443900, 'prompt_eval_count': 1314, 'prompt_e

In [13]:
# Generate the brochure: a llama3.2 client without a format argument receives
# the brochure system prompt plus the user prompt built in the previous cell.
llm=llm_client('llama3.2')
prompt_result=create_prompt(system_prompt,user_prompt)
#print(f"formatted prompt is {prompt_result}")
result=llm.invoke(prompt_result)
print(f"llm response is {result.content}")

llm response is **Google Brochure**

### Company Overview

Google is a technology company that specializes in Internet-related services and products. Our mission is to organize the world's information and make it universally accessible and useful.

### Mission & Vision

* Organize the world's information
* Make it universally accessible and useful
* Committed to significantly improving the lives of as many people as possible

### Products & Services

* Gmail: A free, widely-used email service
* Google Search: A powerful search engine that indexes the web
* Google Cloud: A suite of cloud computing services for businesses and developers
* Android: A mobile operating system used by billions of people worldwide
* Google Drive: A cloud storage service that allows users to store and access files from anywhere

### Customers & Clients

* Individuals and families using Gmail, Google Search, and other Google products
* Businesses and organizations using Google Cloud, Google Drive, and other Goo