In [48]:
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from crawler import WebCrawler
import ast
import asyncio
import json

In [2]:
llm = ChatOllama(model="llama3.2", device="cuda")
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            You are a helpful assitant that takes in instructions and provides relevant keywords and filters that can be used to scrape a website for relevant data.

            For example, if the instructions are "We're making a chatbot for the HR in San Francisco" and the provided website is "https://www.sf.gov" you should use the website and context 
            to provide keywords in the format and of the type: ["Human Resources", "HR", "Employee Benefits", "Employment Practices", "Hiring process", "Job Opportunities", "Employee Rights", 
            "Employee assistance programs", "Workplace safety", "Employee training", "Labor Laws", "San Francisco employee benefits", "SF hiring process", "SF employee rights", "San Francisco labor laws", 
            "San Francisco workers' rights", "City employee assistance"].
            
            Use the context and the website to provide relevant information. Provide only 5 keywords and 5 keywords only. These keywords will be used to rank the URLs and 
            scrape the website for relevant data, so don't provide irrelevant keywords. Do not provide generic keywords. Provide keywords that will be useful 
            to rank website titles based on their relevance to the instructions. Only provide the list and nothing else.
            """
        ),
        ("human", "Instructions: {instructions}, website: {website}"),
    ]
)

In [3]:
chain = prompt | llm
instructions = "Find application information on university websites."
website = "https://ucsd.edu/"
response = chain.invoke(
    {
        "instructions": instructions,
        "website": website,
    }
)
print(response.content)

["University Admissions", "UCSD Application", "Application Requirements", "Admission Process", "UCSD Tuition Fees"]


In [4]:
keywords = ast.literal_eval(response.content)
print(keywords)

['University Admissions', 'UCSD Application', 'Application Requirements', 'Admission Process', 'UCSD Tuition Fees']


In [59]:
crawler = WebCrawler(base_url="https://apple.com/", depth_limit=2)
cache = await crawler.crawl()

Crawling: https://apple.com/, Depth: 1
Crawling complete. Pages visited: 1


In [6]:
cache

{'https://alumni.ucsd.edu/': ('UC San Diego Alumni Home',
  "This November 8-10, return to where it all began. Together. Tritons. Always. The UC San Diego Alumni Association introduces new executive officers and directors, led by Melody Gonzales ’02.  Expand your reach, worldwide, using exclusive tools that allow you to explore the possibilities and increase your networking capacity. As a UC San Diego graduate, you are automatically a part of the Alumni Association. No fees, no need to sign-up. All the perks. We're your resource to stay connected to the university and each other, to assist you as you grow professionally and inspire the next generation of alumni. At the heart of our campus, we are building our first-ever Triton Alumni and Welcome Center as a site for journeys home, including yours. The center will feature amenities for graduates with dedicated rooms for socializing and gathering, exhibition space to share our campus’s distinctive history, and a welcome center for introd

In [None]:
def get_relevance_score(url, title, content, instructions, keywords):
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """
                You are a helpful assistant that will take in a URL, title, content, instructions, and keywords and provide a relevance score 
                based on the relevance of the URL, title, and content to the instructions provided. If you do not think that the webpage would contain 
                information that the instructions are looking for, be sure to give the link a low score. Make sure to provide a score from 0-10 where 0 is not 
                relevant at all and 10 is extremely relevant. The relevance score should be based on the relevance of the URL, title, and content, 
                to the instructions provided. Be as accurate as possible. If there is no title or no content for the URL, give a score of 0.
                You must only return a number that is the score and no other text. Be very critical in your judgement.
                """
            ),
            (
                "human",
                """
                Instructions: {instructions}
                
                URL: {url}
                Title: {title}
                Content: {content}  # Limit content length for prompt efficiency
                """
            ),
        ]
    )
    chain = prompt | llm
    response = chain.invoke(
        {
            "instructions": instructions,
            "url": url,
            "title": title,
            "content": content,
        }
    )
    return float(response.content)

def filter_relevant_links(links_dict, instructions, keywords, threshold=9):
    relevant_links = {}
    for link, (title, content) in links_dict.items():
        score = get_relevance_score(link, title, content[:500], instructions, keywords)
        print(score)
        if score >= threshold:
            relevant_links[link] = (title, content)
    return relevant_links
        

In [12]:
filtered_links = filter_relevant_links(cache, instructions, keywords)

8.0
6.0
6.0
7.0
6.0
6.0
7.0
8.0
8.0
6.0
8.0
4.0
9.0
3.0
8.0
6.0
8.0
9.0
6.0
8.0
8.0
9.0
6.0
9.0
6.0
8.0
6.0
6.0
8.0
8.0
6.0
8.0
8.0
6.0
7.0
6.0
7.0
8.0
3.0
8.0
6.0
4.0
6.0
2.0
6.0
8.0
6.0
5.0
6.0
8.0
8.0
6.0
6.0
8.0
6.0
6.0
6.0
9.0
6.0
6.0
6.0
6.0
7.0
8.0
6.0
6.0
6.0
7.0
7.0
6.0
7.0
8.0
4.0
8.0
6.0
6.0
6.0
6.0
6.0
8.0
9.0
6.0
8.0
6.0
6.0
8.0
3.0
7.0
8.0
8.0
7.0
5.0
6.0
6.0
8.0
9.0
8.0
8.0
8.0
4.0
6.0
2.0
6.0
8.0
8.0
7.0
8.0
6.0
8.0
6.0
6.0
3.0
8.0
6.0
4.0
6.0
8.0
8.0
6.0
8.0
6.0
7.0
6.0
6.0
8.0
6.0
8.0
6.0
7.0
6.0
8.0
6.0
6.0
8.0
6.0
6.0
6.0
8.0
6.0
6.0
4.0
8.0
6.0
7.0
6.0
5.0
3.0
4.0
9.0
5.0
8.0
8.0
6.0
6.0
8.0
6.0
6.0
9.0
8.0
8.0
8.0
8.0
8.0
6.0
8.0
8.0
9.0
8.0
7.0
8.0
4.0
6.0
7.0
6.0
6.0
6.0
6.0
6.0
6.0
6.0
7.0
8.0
6.0
6.0
6.0
6.0
8.0
7.0
8.0
7.0
6.0
9.0
7.0
8.0
6.0
9.0
6.0
9.0
7.0
6.0
6.0
8.0
6.0
8.0
6.0
4.0
8.0
7.0
8.0
8.0
9.0
8.0
8.0
6.0
6.0
2.0
8.0
8.0
8.0
8.0
7.0
8.0
8.0
3.0
8.0
8.0
7.0
8.0
9.0
9.0
9.0
8.0
7.0
8.0
8.0
8.0
9.0
8.0
6.0
4.0
9.0
8.0
9.0
6.0
8.0
6.0
8.0
6.0
8.0
6.0


In [16]:
filtered_links = filter_relevant_links(filtered_links, instructions, keywords)

8.0
8.0
7.0
7.0
9.0
8.0
8.0
6.0
8.0
8.0
9.0
8.0
9.0
6.0
9.0
8.0
8.0
8.0
8.0
7.0
9.0
9.0
7.0
8.0
9.0
8.0
9.0
9.0
8.0
8.0
8.0
9.0
8.0
7.0
9.0
6.0
8.0
9.0
8.0
9.0
9.0
7.0
8.0
8.0
8.0
8.0
6.0
9.0
7.0
7.0
8.0
8.0
6.0
9.0
9.0
8.0
6.0
9.0
9.0
8.0
8.0
9.0
9.0
8.0
8.0
8.0
7.0
8.0
8.0
7.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
6.0
8.0
9.0
6.0
9.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
8.0
9.0
9.0
8.0
6.0
8.0
9.0
9.0
7.0
8.0
9.0
8.0
9.0
9.0
8.0
9.0
8.0
8.0
8.0
7.0
8.0
8.0
8.0
8.0
6.0
9.0
8.0
8.0
8.0
9.0
8.0
6.0
8.0
6.0
6.0
8.0
6.0
6.0
9.0
9.0
6.0
8.0
8.0
8.0
6.0
8.0
6.0
8.0
7.0
8.0
8.0
8.0
8.0
8.0
8.0
6.0
6.0
6.0
6.0
8.0
8.0
8.0
8.0
8.0
7.0
8.0
8.0
9.0
9.0
8.0
8.0
8.0
9.0
8.0
6.0
8.0
9.0
8.0
9.0
7.0
6.0
8.0
8.0
8.0
6.0
9.0
9.0
9.0
9.0
8.0
8.0
8.0
9.0
8.0
8.0
9.0
6.0
8.0
9.0
8.0
9.0
8.0
8.0
4.0
8.0
8.0
9.0
7.0
8.0
8.0
9.0
8.0
6.0
8.0
6.0
8.0
8.0
6.0
8.0
8.0


In [31]:
def info_to_output(instructions, url, title, content):
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """
                You are a helpful assistant that will take in an instruction, URL, title, and content and provide a structured output in the format of a JSON object. The JSON object 
                should contain 3 keys: "section", "content", and "url". The "section" key should contain a one word string that describes the section of the website 
                that the content is from. You should describe the content accurately and descriptively as it pertains to the instruction. The "url" key should contain the URL 
                of the website. If the content of the website does not have information that the instructions are looking for, you should provide an empty string for the "section" and "content" keys. You should 
                attempt to do this job in the most accurate way possible. You must only return the JSON object and no other text.
                """
            ),
            (
                "human",
                """
                Instructions: {instructions}
                
                URL: {url}
                Title: {title}
                Content: {content}
                """
            ),
        ]
    )
    chain = prompt | llm
    response = chain.invoke(
        {
            "instructions": instructions,
            "url": url,
            "title": title,
            "content": content,
        }
    )
    return response.content

In [50]:
ex_url = 'https://www.ucop.edu/local-human-resources/op-life/sexual-harassment-violence-support.html'
output = info_to_output(instructions, ex_url, *filtered_links[ex_url])
output = json.loads(output)