In [0]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI
import ollama

In [0]:
# Load variables from a local .env file; override=True lets them replace
# values that are already set in the process environment.
load_dotenv(override=True)
# Name of the Ollama model used for the chat calls in this notebook.
MODEL = "llama3.2"


In [0]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
# Browser-like request headers, shared by every page fetch in this notebook.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """
    A scraped web page: its title, visible body text and outgoing links.

    Attributes:
        url:   the URL that was requested.
        body:  raw response bytes.
        title: text of the <title> tag, or "No title found".
        text:  body text with script/style/img/input elements removed
               ("" when the document has no <body>).
        links: every non-empty href found on <a> tags, unmodified
               (so relative links stay relative).
    """

    def __init__(self, url, timeout=10):
        """
        Fetch `url` and parse it.

        Args:
            url: page to download.
            timeout: seconds to wait for the server before giving up.
                Without a timeout `requests` can block forever on a
                stalled connection.

        Raises:
            requests.RequestException: on connection errors or timeouts.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # `.string` is None when <title> is empty or has nested tags;
        # fall back to the placeholder instead of storing None.
        title = soup.title.string if soup.title else None
        self.title = title or "No title found"

        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and body text formatted for inclusion in a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [0]:
# Scrape an example site once and inspect the hrefs collected from its <a> tags.
ed = Website("https://edwarddonner.com")
ed.links

In [0]:
# System prompt for the link-selection call: tells the model what kind of
# links to keep and shows the JSON shape the reply must follow.
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
# The example has to be valid JSON itself, since the model imitates it and
# the reply is parsed with json.loads.
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [0]:
# Show the assembled system prompt, including the JSON example.
print(link_system_prompt)

In [0]:
def get_links_user_prompt(website):
    """
    Build the user message that asks the model to pick brochure-worthy links.

    Args:
        website: object exposing `url` (str) and `links` (list of str).

    Returns:
        The instruction text followed by one link per line.
    """
    intro = f"Here is the list of links on the website of {website.url} - "
    instructions = (
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    heading = "Links (some might be relative links):\n"
    link_lines = "\n".join(website.links)
    return "".join((intro, instructions, heading, link_lines))

In [0]:
# Preview the user prompt generated from the links scraped above.
print(get_links_user_prompt(ed))

In [0]:
import json
import logging
import pprint
#pprint.pprint(response)

import re

def extract_json_from_text(text):
    """
    Extract the first JSON object found in the text.

    Each '{' is tried in order as the start of an object and the first one
    that decodes as valid JSON is returned, so braces in the surrounding
    prose (before or after the object) no longer end up in the result.
    Note that if an outer object is malformed, a valid object nested inside
    it can be returned instead.

    If nothing decodes, the widest '{...}' span is returned as a fallback so
    the caller can still report a decoding error on it.

    Args:
        text: raw model reply.

    Returns:
        The JSON object as a string, or None when the text has no '{...}'.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value starting at `start` and
            # reports where it ended, ignoring whatever follows.
            _, end = decoder.raw_decode(text, start)
        except ValueError:
            start = text.find("{", start + 1)
        else:
            return text[start:end]

    # No valid object: keep the previous best-effort behaviour.
    match = re.search(r'\{.*\}', text, re.DOTALL)
    if match:
        return match.group(0)
    return None

def get_links(url):
    """
    Ask the model which links on `url` are relevant for a company brochure.

    The page is scraped, its links are sent to the model together with
    `link_system_prompt`, and the JSON object in the reply is parsed.

    Args:
        url: page whose links should be classified.

    Returns:
        A dict of the form {"links": [{"type": ..., "url": ...}, ...]},
        or None when the model reply is empty, contains no JSON, cannot be
        decoded, lacks a "links" list, or any other error occurs while
        talking to the model.

    Raises:
        requests.RequestException: if the page itself cannot be fetched
            (the scrape happens outside the error handler).
    """
    website = Website(url)

    try:
        response = ollama.chat(
            model=MODEL,
            messages=[
                {"role": "system", "content": link_system_prompt},
                {"role": "user", "content": get_links_user_prompt(website)}
            ]
        )
        result = response['message']['content']

        # Log the raw result for debugging
        logging.debug(f"Raw result: {result}")

        if not isinstance(result, str):
            logging.warning(f"Result is not a string: {type(result).__name__}")
            return None
        if not result.strip():
            logging.warning("Result string is empty.")
            return None

        json_text = extract_json_from_text(result)
        if not json_text:
            logging.warning("No JSON object found in the result string.")
            return None

        logging.debug(f"Extracted JSON string: {repr(json_text)}")

        try:
            parsed = json.loads(json_text)
        except json.JSONDecodeError as e:
            logging.error(f"JSON decoding error: {e}")
            logging.debug(f"Problematic JSON string: {repr(json_text)}")
            return None

        # Callers index parsed["links"], so reject replies of any other shape
        # here rather than letting them fail later with a KeyError/TypeError.
        if not isinstance(parsed, dict) or not isinstance(parsed.get("links"), list):
            logging.warning(f"JSON reply has no 'links' list: {repr(parsed)}")
            return None
        return parsed

    except Exception:
        # Deliberately broad: any failure in the model call degrades to None.
        logging.exception("An unexpected error occurred in get_links.")
        return None



In [0]:
# Try the link selection on a real site; the cell output is the parsed result (or None).
get_links("https://huggingface.co")

In [0]:
import requests

def is_url_reachable(url, timeout=5):
    """
    Check whether `url` answers with a non-error HTTP status.

    A HEAD request is sent with the same browser-like `headers` used for
    scraping, so sites that reject unknown clients are judged the same way
    the scraper will see them. Redirects are followed (requests does not do
    this for HEAD by default) so the status of the final page is what counts.
    Servers that do not implement HEAD (405/501) are retried with a streamed
    GET, which reads the status without downloading the body.

    Args:
        url: address to probe.
        timeout: seconds to wait for each request.

    Returns:
        True if the final status code is below 400, False otherwise or on
        any requests error (invalid URL, DNS failure, timeout, ...).
    """
    try:
        response = requests.head(
            url, headers=headers, timeout=timeout, allow_redirects=True
        )
        if response.status_code in (405, 501):
            with requests.get(
                url, headers=headers, timeout=timeout, stream=True
            ) as get_response:
                return get_response.status_code < 400
        return response.status_code < 400
    except requests.RequestException:
        return False

In [0]:
def get_all_details(url):
    """
    Collect the text of the landing page and of every model-selected link.

    Args:
        url: landing page of the company site.

    Returns:
        One string: the landing page contents followed by a section per
        relevant link. Returns "" when `url` is not reachable, so callers
        can slice or concatenate the result safely.

    Raises:
        requests.RequestException: if the landing page itself fails to load.
    """
    from urllib.parse import urljoin

    if not is_url_reachable(url, 5):
        return ""

    result = "Landing page:\n"
    result += Website(url).get_contents()

    links = get_links(url)
    print("Found links:", links)

    # get_links returns None on any failure; treat that as "no extra pages".
    link_entries = links.get("links") if isinstance(links, dict) else None
    if not isinstance(link_entries, list):
        link_entries = []

    for link in link_entries:
        if not isinstance(link, dict) or not link.get("url"):
            continue
        # The model is asked for absolute URLs but may echo a relative one;
        # urljoin leaves absolute URLs untouched.
        page_url = urljoin(url, link["url"])
        try:
            page_contents = Website(page_url).get_contents()
        except requests.RequestException:
            # One broken link should not discard everything gathered so far.
            print("Skipping unreachable link:", page_url)
            continue
        result += f"\n\n{link.get('type', 'page')}\n"
        result += page_contents
    return result

In [0]:
# Gather the landing page plus the model-selected pages and print the combined text.
print(get_all_details("https://huggingface.co"))

In [0]:
# System prompt for the brochure-writing call.
# Every continued line ends with a space before the backslash; without it the
# sentences are joined with no gap ("markdown.Include").
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. \
Include details of company culture, customers and careers/jobs if you have the information."

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown. \
# Include details of company culture, customers and careers/jobs if you have the information."

In [0]:
def get_brochure_user_prompt(company_name, url, max_chars=5000):
    """
    Build the user message for the brochure request.

    The prompt names the company, includes the scraped site contents
    (truncated to `max_chars` characters so the prompt stays small), and
    ends with a reminder of the company name. The reminder is placed after
    the contents, on its own paragraph, so it is not glued onto the page
    text and is never lost to truncation.

    Args:
        company_name: name to use in the brochure.
        url: landing page of the company site.
        max_chars: maximum number of characters of scraped content to include.

    Returns:
        The prompt string; None if `url` is not reachable; False if a
        requests error occurs while scraping.
    """
    try:
        if not is_url_reachable(url):
            return None

        # get_all_details may yield nothing; never slice a None.
        web_content = (get_all_details(url) or "")[:max_chars]

        return (
            f"You are looking at a company called: {company_name}\n"
            f"Use the name {company_name} clearly in the brochure.\n"
            "Here are the contents of its landing page and other relevant pages; "
            "use this information to build a short brochure of the company in markdown.\n"
            f"{web_content}"
            f"\n\nReminder: the company name is {company_name}."
        )
    except requests.RequestException:
        return False

In [0]:
# Preview the full user prompt that will be sent for the brochure request.
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

In [0]:
import requests

def is_url_reachable1(url, timeout=5):
    """
    Alias of `is_url_reachable`, kept so existing calls keep working.

    This used to be a copy of that function; delegating keeps the two from
    drifting apart.

    Args:
        url: address to probe.
        timeout: seconds to wait for the server.

    Returns:
        Whatever `is_url_reachable(url, timeout)` returns.
    """
    return is_url_reachable(url, timeout=timeout)

In [0]:
def create_brochure(company_name, url):
    """
    Generate a brochure for the company and render it as Markdown.

    Args:
        company_name: name to use in the brochure.
        url: landing page of the company site.

    Returns:
        None after the brochure has been displayed; False if the URL is not
        reachable, the prompt could not be built, or a requests error occurs.
    """
    try:
        # Bail out early: previously an unreachable URL fell through to
        # `response[...]` with `response` never assigned.
        if not is_url_reachable(url, 5):
            print("❌ URL not reachable")
            return False

        user_prompt = get_brochure_user_prompt(company_name, url)
        if not user_prompt:
            # The prompt builder signals failure with None/False.
            print("❌ Could not build the brochure prompt")
            return False

        response = ollama.chat(
            model=MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )

        result = response['message']['content']
        display(Markdown(result))
    except requests.RequestException:
        return False

In [0]:
# Generate and render the brochure in one shot (output appears once the model has finished).
create_brochure("HuggingFace", "https://huggingface.co")

In [0]:
def stream_brochure(company_name, url):
    """
    Generate a brochure and render it incrementally as the model streams.

    Args:
        company_name: name to use in the brochure.
        url: landing page of the company site.

    Returns:
        None after streaming completes or when the URL is not reachable;
        False if the prompt could not be built or a requests error occurs
        while scraping.
    """
    if not is_url_reachable(url):
        print("❌ URL not reachable")
        return

    try:
        user_prompt = get_brochure_user_prompt(company_name, url)
    except requests.RequestException:
        return False
    if not user_prompt:
        # The prompt builder signals failure with None/False.
        print("❌ Could not build the brochure prompt")
        return False

    stream = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        stream=True
    )

    response = ""
    display_handle = display(Markdown(""), display_id=True)

    for chunk in stream:
        content = chunk.get('message', {}).get('content', '')
        if content:
            response += content
            # Strip code fences from the accumulated text, not from each
            # chunk: a fence can arrive split across two chunks and would
            # otherwise survive.
            update_display(
                Markdown(response.replace("```", "")),
                display_id=display_handle.display_id,
            )


In [0]:
# Same brochure as above, but rendered progressively while the model streams its reply.
stream_brochure("HuggingFace", "https://huggingface.co")