In [1]:
# library imports
# If this fails, check if you are running
# from the active "llms" environment on the command line
import os
import re
import requests
import socket
from requests.exceptions import ConnectionError
from requests.exceptions import MissingSchema
from requests.exceptions import InvalidSchema
from urllib3.exceptions import MaxRetryError, NameResolutionError
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
import ollama

In [2]:
load_dotenv(override=True)
MODEL = 'llama3.2'

In [3]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [11]:
page = Website("https://ans-elblag.pl")
page.links

['polityka-cookies.html',
 'https://ans-elblag.pl',
 'terminarz-zajec.html',
 '#',
 'terminarz-zajec.html',
 '#',
 '#',
 '#',
 'https://www.facebook.com/anselblag',
 'https://www.instagram.com/anselblag/',
 'https://www.youtube.com/user/pwszelblag',
 'https://study.ans-elblag.pl/ua',
 'https://study.ans-elblag.pl',
 'https://study.ans-elblag.pl/ru',
 '#',
 'rekrutacja/',
 'kierunki-studiow/',
 'rekrutacja-on-line.html',
 'kandydaci-z-ukrainy.html',
 'wymagane-dokumenty.html',
 'studia-podyplomowe/',
 'biuro-informacji-rekrutacyjnej.html',
 'terminarz-rekrutacji.html',
 'kandydaci-z-zagranicy.html',
 'blog/',
 'o-uczelni.html',
 'struktura/',
 'wzory-dyplomow.html',
 'ksiega-jakosci-akademii-nauk-stosowanych-w-elblagu.html',
 'sukcesy.html',
 'dzial-zamowien-publicznych/',
 'oferty-pracy-konkursy.html',
 'wspolpraca/',
 'projekty/',
 'inspektor-ochrony-danych/',
 'biblioteka/',
 'wydawnictwo/',
 'kasa-zapomogowo-pozyczkowa/',
 'zasady-postepowania-zagrozenie-zdrowia-lub-zycia.html',
 'p

In [12]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in \
a brochure about the company, such as links to an About page, or a Company page, \
or Careers/Jobs pages.\n"
link_system_prompt += "You should respond only in JSON, without text, object as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [13]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond only in JSON, without text, object as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}



In [14]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for \
a brochure about the company, respond with the full https URL in clean JSON format \
wihout text json on the beginning of the response. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [15]:
print(get_links_user_prompt(page))

Here is the list of links on the website of https://ans-elblag.pl - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in clean JSON format wihout text json on the beginning of the response. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
polityka-cookies.html
https://ans-elblag.pl
terminarz-zajec.html
#
terminarz-zajec.html
#
#
#
https://www.facebook.com/anselblag
https://www.instagram.com/anselblag/
https://www.youtube.com/user/pwszelblag
https://study.ans-elblag.pl/ua
https://study.ans-elblag.pl
https://study.ans-elblag.pl/ru
#
rekrutacja/
kierunki-studiow/
rekrutacja-on-line.html
kandydaci-z-ukrainy.html
wymagane-dokumenty.html
studia-podyplomowe/
biuro-informacji-rekrutacyjnej.html
terminarz-rekrutacji.html
kandydaci-z-zagranicy.html
blog/
o-uczelni.html
struktura/
wzory-dyplomow.html
ksiega-jakosci-akademii-nauk-stosowanych-w-elblagu.html
sukcesy.html
dzial-zamowien-pub

In [16]:
def get_links(url):
    website = Website(url)
    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
        ],
        options={"format": "json"}
    )
    result = response['message']['content']
    
    result = re.sub(r'<think>.*?</think>', '', result, flags=re.DOTALL)
    result = re.sub(r'```json\n|\n```$', "", result.strip(), flags=re.MULTILINE).strip()
    
    print("--- Raw LLM output after cleaning ---")
    print(result)
    print("------------------------------------")

    try:
        content_json = json.loads(result)
        return content_json
    except json.JSONDecodeError:
        print("Odpowiedź nie jest poprawnym JSON. Zwracam pustą listę linków.")
        return {"links": []}

In [17]:
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/nanonets/Nanonets-OCR-s',
 '/google/magenta-realtime',
 '/mistralai/Mistral-Small-3.2-24B-Instruct-2506',
 '/MiniMaxAI/MiniMax-M1-80k',
 '/Menlo/Jan-nano',
 '/models',
 '/spaces/ilcve21/Sparc3D',
 '/spaces/enzostvs/deepsite',
 '/spaces/tencent/Hunyuan3D-2.1',
 '/spaces/OmniGen2/OmniGen2',
 '/spaces/multimodalart/self-forcing',
 '/spaces',
 '/datasets/EssentialAI/essential-web-v1.0',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/institutional/institutional-books-1.0',
 '/datasets/nvidia/AceReason-1.1-SFT',
 '/datasets/nvidia/OpenScience',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transformers'

In [18]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links_data = get_links(url)

    if links_data and "links" in links_data:
        for link_item in links_data["links"]:
            try:
                if isinstance(link_item, dict):
                    page_url = link_item.get("url")
                    page_type = link_item.get("type", "Link") 
                    if not page_url:
                        continue 
                    
                    result += f"\n\n--- {page_type} ---\n"
                    result += Website(page_url).get_contents()
                
                elif isinstance(link_item, str):
                    page_url = link_item
                    result += f"\n\n--- Link: {page_url} ---\n"
                    result += Website(page_url).get_contents()

            except socket.gaierror as e:
                print(f"DNS resolution failed for {page_url}: {e}")
            except NameResolutionError as e:
                print(f"Name resolution error for {page_url}: {e}")
            except MaxRetryError as e:
                print(f"Max retries exceeded for {page_url}: {e}")
            except ConnectionError as e:
                print(f"Connection error for {page_url}: {e}")
            except MissingSchema as e:
                print(f"Invalid URL schema for {page_url}: {e}")
            except InvalidSchema as e:
                print(f"Omitted unsupported URL for {page_url} (InvalidSchema): {e}")
            except Exception as e:
                print(f"An unexpected error occurred for {page_url}: {e}")

    return result

In [19]:
print(get_all_details("https://huggingface.co"))

--- Raw LLM output after cleaning ---
{
    "links": [
        "https://huggingface.co/models",
        "https://huggingface.co/datasets/EssentialAI/essential-web-v1.0",
        "https://huggingface.co/docs",
        "https://huggingface.co/spaces",
        "https://endpoints.huggingface.co",
        "https://discuss.huggingface.co",
        "https://status.huggingface.co/",
        "https://github.com/huggingface",
        "https://twitter.com/huggingface",
        "https://www.linkedin.com/company/huggingface/"
    ]
}
------------------------------------
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
nanonets/Nanonets-OCR-s
Updated
5 days ago
•


In [20]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. \
Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

In [21]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; \
use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000]
    return user_prompt

In [22]:
get_brochure_user_prompt("ANS", "https://ans-elblag.pl")

--- Raw LLM output after cleaning ---
{
  "links": [
    "https://www.facebook.com/anselblag",
    "https://www.instagram.com/anselblag/",
    "https://www.youtube.com/user/pwszelblag",
    "https://study.ans-elblag.pl",
    "http://iso.ans.elblag.pl/",
    "https://moodle.ans-elblag.pl",
    "https://ans-elblag.pl/kierunki-studiow/",
    "https://hackathon.ans-elblag.pl",
    "https://bip.ans-elblag.pl/",
    "https://twitter.com/anselblag"
  ]
}
------------------------------------
Connection error for http://iso.ans.elblag.pl/: HTTPConnectionPool(host='iso.ans.elblag.pl', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x000002E4A8101B90>: Failed to resolve 'iso.ans.elblag.pl' ([Errno 11001] getaddrinfo failed)"))
Connection error for https://hackathon.ans-elblag.pl: HTTPSConnectionPool(host='hackathon.ans-elblag.pl', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, "[S

'You are looking at a company called: ANS\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nStrona główna | Akademia Nauk Stosowanych w Elblągu\nWebpage Contents:\nStrona wykorzystuje pliki cookies. Korzystając ze strony wyrażasz zgodę na wykorzystywanie plików cookies.\nDowiedz się więcej\nZAMKNIJ\nTERMINARZ ZAJĘĆ\nKONTRAST\nTerminarz zajęć\n-A\nA\nA+\nSocial Media\nFacebook\nInstagram\nYouTube\nLanguage\nУкраїнська\nEnglish\nРусский\nszukaj\nSzukana fraza\nPrzycisk szukaj\nKandydaci\nKierunki studiów\nZapisz się on-line\nKandydaci z Ukrainy\nWymagane dokumenty\nStudia podyplomowe\nBiuro Rekrutacji\nTerminarz rekrutacji\nKandydaci z zagranicy\nStrefa kandydata\nUczelnia\nO Uczelni\nStruktura\nDyplomy ukończenia studiów\nKsięga jakości\nSukcesy\nZamówienia publiczne\nOferty pracy\nWspółpraca\nProjekty\nInspektor Ochrony Danych / RODO\nBiblioteka\nWydawnictwo\nKa

In [23]:
def create_brochure(company_name, url):
    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ],
    )
    result = response['message']['content']
    result = re.sub(r'<think>.*?</think>', '', result, flags=re.DOTALL)
    result = result.strip()
    display(Markdown(result))

In [24]:
create_brochure("ANS", "https://ans-elblag.pl")

--- Raw LLM output after cleaning ---
{
    "links": [
        "https://www.facebook.com/anselblag",
        "https://instagram.com/anselblag/",
        "https://www.youtube.com/user/pwszelblag",
        "https://study.ans-elblag.pl",
        "https://moodle.ans-elblag.pl",
        "http://iso.ans.elblag.pl",
        "https://hackathon.ans-elblag.pl",
        "https://bip.ans-elblag.pl",
        "https://twitter.com/anselblag",
        "https://www.instagram.com/anselblag/",
        "https://moodle.ans-elblag.pl",
        "https://ans-elblag.pl/kierunki-studiow/"
    ]
}
------------------------------------
Connection error for http://iso.ans.elblag.pl: HTTPConnectionPool(host='iso.ans.elblag.pl', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x000002E4A676C290>: Failed to resolve 'iso.ans.elblag.pl' ([Errno 11001] getaddrinfo failed)"))
Connection error for https://hackathon.ans-elblag.pl: HTTPSConnectionPool(ho

# Welcome to ANS - Academy of Applied Sciences in Elbląg

[Image: A group of students from diverse backgrounds, smiling and engaged in a discussion]

At ANS, we believe that learning is a journey, not a destination. Our academy is dedicated to providing high-quality education in the fields of economics, information technology, pedagogy, and politics.

## We're passionate about our work
Our team of experienced professors and researchers are committed to delivering innovative and engaging courses that prepare students for success in their chosen careers.

## Our community
We pride ourselves on creating a supportive and inclusive environment where students from all backgrounds can thrive. Whether you're from Elbląg or the world, we welcome you to join our vibrant community!

### Meet our departments

* **Economics**: Explore the intricacies of economic systems and policy-making.
* **Information Technology**: Develop your skills in IT, data analysis, and cybersecurity.
* **Pedagogy**: Discover the art of teaching and learning, and become a future educator.
* **Politics**: Delve into the world of politics, governance, and international relations.

## Careers at ANS
Join our team as a:

* Professor/Lecturer
* Researcher
* Administrative Staff

### Why work with us?

* Competitive salaries
* Opportunities for professional growth and development
* Collaborative and dynamic work environment

## Join the conversation!

Stay up-to-date with the latest news and events from ANS on our social media channels:

[Facebook logo]
[Instagram logo]
[Youtube logo]

## Contact us

For more information about our programs, admissions, or careers, please don't hesitate to reach out:

* Phone: +48 55 629 05 05
* Email: [info@ans.edu.pl](mailto:info@ans.edu.pl)
* Address: ul. Wojska Polskiego 1, 82-300 Elbląg

# Join the ANS community today!

[Call-to-action button: Apply Now / Get in touch]

In [25]:
def stream_brochure(company_name, url):
    stream = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ],
        stream=True
    )
    result = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        result += chunk['message']['content'] or ''
        result = re.sub(r'<think>.*?</think>', '', result, flags=re.DOTALL)
        result = result.strip()
        result = result.replace("```", "").replace("markdown", "")
        update_display(Markdown(result), display_id=display_handle.display_id)

In [26]:
stream_brochure("ANS", "https://ans-elblag.pl")

--- Raw LLM output after cleaning ---
{
  "links": [
    "https://www.facebook.com/anselblag/",
    "https://www.instagram.com/anselblag/",
    "https://www.youtube.com/user/pwszelblag",
    "https://study.ans-elblag.pl/ua",
    "https://study.ans-elblag.pl",
    "https://study.ans-elblag.pl/ru",
    "http://iso.ans.elblag.pl/",
    "https://bip.ans-elblag.pl/",
    "https://hackathon.ans-elblag.pl",
    "https://twitter.com/anselblag",
    "https://moodle.ans-elblag.pl",
    "https://ans-elblag.pl/kierunki-studiow/"
  ]
}
------------------------------------
Connection error for http://iso.ans.elblag.pl/: HTTPConnectionPool(host='iso.ans.elblag.pl', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x000002E4A8211D50>: Failed to resolve 'iso.ans.elblag.pl' ([Errno 11001] getaddrinfo failed)"))
Connection error for https://hackathon.ans-elblag.pl: HTTPSConnectionPool(host='hackathon.ans-elblag.pl', port=443): Max ret

**Welcome to ANS - Academy of Applied Sciences in Elbląg**[Image: A group of students from diverse backgrounds studying together]Are you looking for a place where knowledge meets passion? Where curiosity is encouraged and creativity is nurtured? Look no further than ANS!**Our Story**ANS is a Polish academy of applied sciences located in the charming city of Elbląg. Our institution was founded with the mission of providing high-quality education and research opportunities to students from all over the world.[Image: A photo of the academy's building, with a beautiful view of the surrounding landscape]**Our Culture**At ANS, we value:* **Interdisciplinary approach**: We believe that knowledge is not limited to one field or discipline. Our programs are designed to foster connections between seemingly unrelated fields.* **International community**: Our students come from diverse backgrounds, and we strive to create an inclusive environment where everyone feels welcome.* **Creativity and innovation**: We encourage our students to think outside the box, explore new ideas, and develop innovative solutions.**Our Students**Meet some of our talented students who are shaping the future:[Image: A student project showcase]* **Students from Ukraine**: ANS has a long history of supporting students from Ukraine. Our program is designed to help them achieve their academic goals while navigating the challenges of war.* **Studying abroad**: We offer scholarships and programs that allow our students to study abroad, explore new cultures, and gain valuable international experience.**Our Career Opportunities**Are you ready to join our vibrant community? We offer:* **Jobs in various fields**: From teaching and research to administration and entrepreneurship, we have a range of career opportunities for everyone.* **Internships and collaborations**: We partner with companies and organizations worldwide to provide our students with hands-on experience and valuable networking opportunities.**Join the ANS Community**If you're passionate about learning, innovation, and creativity, join us at ANS! Apply now and become part of a dynamic community that will change your life for the better.[Image: A group of happy students celebrating]Visit our website to learn more and apply today![Link: https://www.anselblag.pl/](https://www.anselblag.pl/)