In [None]:
from dotenv import load_dotenv
import os
import requests
from openai import OpenAI
from bs4 import BeautifulSoup
import markdownify
import re

# Load variables via python-dotenv before reading them from the environment.
load_dotenv()

# Credentials and identifiers come from the environment; each is None when unset.
openai_api_key = os.environ.get("OPENAI_API_KEY")
google_api_key = os.environ.get("GOOGLE_API_KEY")
search_engine_id = os.environ.get("SEARCH_ENGINE_ID")

# Endpoint that the search helper in the next cell sends its requests to.
search_url = "https://customsearch.googleapis.com/customsearch/v1"

# Shared OpenAI client reused by the filtering step.
openai_client = OpenAI(api_key=openai_api_key)

In [None]:
def get_search_results(query, site="", timeout=10):
    """Run a query against the configured search endpoint and return the decoded JSON.

    Args:
        query: Search terms, sent as the ``q`` parameter.
        site: Optional site to restrict results to. When empty, no
            ``siteSearch``/``siteSearchFilter`` parameters are sent at all.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    params = {
        "q": query,
        "key": google_api_key,
        "cx": search_engine_id,
    }
    # Only attach the site restriction when one was actually requested;
    # "i" asks the API to include (rather than exclude) results from that site.
    if site:
        params["siteSearch"] = site
        params["siteSearchFilter"] = "i"

    # Without a timeout a stalled connection would block the kernel indefinitely.
    response = requests.get(search_url, params=params, timeout=timeout)
    # Surface quota/auth/bad-request errors here instead of handing an error
    # payload to callers that expect search results.
    response.raise_for_status()
    return response.json()

def get_page_content(results, timeout=10):
    """Download each search hit and reduce it to compact markdown text.

    Args:
        results: Decoded search response. Hits are read from its ``"items"``
            list, and each hit must provide a ``"link"`` URL. A response
            without an ``"items"`` key is treated as having no hits.
        timeout: Seconds to wait for each page download.

    Returns:
        A dict mapping each successfully fetched URL to its markdown text.
        Pages that fail to download or answer with an HTTP error status are
        reported via ``print`` and left out of the result.
    """
    content = dict()

    # .get() keeps an empty or error response from raising KeyError here.
    for item in results.get("items", []):
        link = item["link"]

        try:
            response = requests.get(link, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable page should not discard the pages already fetched.
            print(f"Skipping {link}: {exc}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        # Prefer the <main> element, then <body>, then the whole document.
        html = soup.find("main") or soup.find("body") or soup
        text = markdownify.markdownify(str(html), strip=["a", "img"])
        # Collapse runs of blank lines into a single newline.
        text = re.sub(r"\n\n+", "\n", text).strip()

        content[link] = text

    return content

def filter_content(query, content, model="gpt-4o-mini"):
    """Ask a chat model to pick out the parts of each page that relate to the query.

    One chat completion request is made per page. Each reply is printed and
    stored under the page's URL.

    Args:
        query: The user's question, embedded in the user message.
        content: Dict mapping URL to markdown text.
        model: Name of the chat model to call.

    Returns:
        A dict mapping every input URL to the model's reply. The system prompt
        asks the model to reply with nothing when a page is irrelevant, so
        values may be empty strings; a missing reply is stored as ``""``.
    """
    filtered_content = dict()

    prompt = """You are given a web page content formatted in markdown. Filter out any text in its original form that best relates to the user's query.
    If no content matches the user's query, respond nothing with no ellipses or characters."""

    for url, text in content.items():
        msgs = [
            {"role": "system", "content": prompt},
            {
                "role": "user", 
                "content": f"Here is the user's query: {query}\nHere is the markdown content: {text}"
            }
        ]

        completion = openai_client.chat.completions.create(model=model, messages=msgs)
        # message.content can be None; normalise so values are always str.
        answer = completion.choices[0].message.content or ""

        print(answer)
        filtered_content[url] = answer

    return filtered_content


In [None]:
# Example run: search for a course, restricted to pages on ucsc.edu.
query = "what is cse 144 at ucsc"
results = get_search_results(query, site="ucsc.edu")

# Fetch every hit; the URL -> markdown mapping is shown as the cell output.
content = get_page_content(results=results)
content

In [None]:
# Pass the fetched pages through the model-based filter; the returned dict is the cell output.
filter_content(query=query, content=content)