In [123]:
from dotenv import load_dotenv
import os
import requests
from openai import OpenAI, AsyncOpenAI
from bs4 import BeautifulSoup
import markdownify
import re
import json
import asyncio
import threading

load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
google_api_key = os.getenv("GOOGLE_API_KEY")
search_engine_id = os.getenv("SEARCH_ENGINE_ID")

openai_client = OpenAI(api_key=openai_api_key)
# async_openai_client = AsyncOpenAI(api_key=openai_api_key)

search_url = "https://customsearch.googleapis.com/customsearch/v1"

In [124]:
def get_search_results(query):
    # Get base domain of the university mentioned in the query
    prompt = """Identify the institution mentioned in this user query, and respond with only the base domain of the corresponding institution. \
    If no institution is found within the query, respond with nothing."""

    msgs = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": f"Here is the user's query: {query}"}
    ]

    response = openai_client.chat.completions.create(model="gpt-4o-mini", messages=msgs, temperature=0).choices[0].message.content
    site = response

    params = {
        "q": query+" -filetype:pdf -filetype:docx",
        "key": google_api_key,
        "cx": search_engine_id,
        "siteSearch": site,
        "siteSearchFilter": "i",
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(search_url, params=params, headers=headers)
    return response.json()

def get_page_content(results):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    content = dict()

    for item in results["items"]:
        link = item["link"]
        response = requests.get(link, headers=headers)
        
        soup = BeautifulSoup(response.text, "html.parser")
        title = soup.find("title").text if soup.find("title") else "Untitled"
        html = soup.find("main") or soup.find("body") or soup
        text = markdownify.markdownify(str(html), strip=["a", "img"])
        text = re.sub(r"\n\n+", "\n", text).strip()

        content[link] = (title, text)

    return content

def generate_filtered_content(query, content, filtered_content):
    prompt = """You are given a web page content formatted in markdown. Filter out any text in its original markdown form that best relates to the user's query. \
    If no content matches the user's query, respond nothing with no ellipses, characters, or words."""

    url, title, text = content

    msgs = [
        {"role": "system", "content": prompt},
        {
            "role": "user", 
            "content": f"Here is the user's query: {query}\nHere is the markdown content:\nTitle: {title}\nContent: {text}"
        }
    ]

    response = openai_client.chat.completions.create(model="gpt-4o-mini", messages=msgs).choices[0].message.content
    filtered_content[url] = (title, response)


def filter_content(query, content):
    filtered_content = dict()

    threads = []
    for url, (title, text) in content.items():
        filtered_content[url] = ""
        thread = threading.Thread(target=generate_filtered_content, args=(query, (url, title, text), filtered_content))
        thread.start()
        threads.append(thread)
    
    for thread in threads:
        thread.join()   

    return filtered_content

def generate_response(query, content):
    prompt = """You are an expert in answering questions about applying to universities for college applicants. \
    You are given a few documents formatted in markdown. Answer the student's question/query using only the information below. \
    Omit any irrelevant or duplicate information that might appear. There is no need to cite the sources; only provide a comprehensive answer."""
    
    context = ""
    for i, (url, data) in enumerate(content.items()):
        title, text = data
        if text == "" or text == "Nothing":
            continue

        context += f"Document {i}:\nTitle: {title}\nContent: {text}\n\n"
    
    msgs = [
            {"role": "system", "content": prompt},
            {
                "role": "user", 
                "content": f"Here is the student's question: {query}\n\nHere are the documents:\n{context}"
            }
        ]
    
    response = openai_client.chat.completions.create(model="gpt-4o-mini", messages=msgs).choices[0].message.content
    return response


In [125]:
query = "What is the 4+1 program for Computer Science at UCSC?"
results = get_search_results(query)
# get_html(results)
# results["items"]
content = get_page_content(results)
{url: text[:200] for url, text in content.items()}

{'https://grad.soe.ucsc.edu/contiguous_pathways': ("Contiguous Bachelor's/Master's Pathways | Graduate Studies",
 'https://sam.soe.ucsc.edu/41pathway': ('41pathway | sam.soe.ucsc.edu',
  "4+1 pathway into SciCAM\nWhy join the 4+1 pathway? \nIf you are interested in completing the MS in Scientific Computing and Applied Mathematics, there are many advantages in joining the 4+1 pathway\n* **Fast track application to graduate school**: no GRE required, minimalist essays, and the fees are reimbursed if you join SciCAM. Winter start is also possible if you are planning on completing your BS in the Fall.\n* **Ideal preparation**: The 4+1 program ensures that all students have taken the correct preparatory courses towards the MS.\n* **A head start**:\xa0Courses taken as undergraduates can also (sometimes) count towards your graduate degree, and enable you to take more advanced electives and/or start your thesis early.\nWho can join the 4+1 pathway?\nThe 4+1 pathway is ideally designed for **st

In [126]:
filtered_content = filter_content(query, content)
filtered_content

{'https://grad.soe.ucsc.edu/contiguous_pathways': ("Contiguous Bachelor's/Master's Pathways | Graduate Studies",
  "**Computer Science & Engineering**  \nThe Computer Science and Engineering Contiguous 5-Year Bachelor's/Master's Pathway offers a competitive edge to Baskin School of Engineering (BSOE) students who are completing their undergraduate degree at UC Santa Cruz by enabling those with advanced preparation to move directly from a BSOE undergraduate program to the Computer Science and Engineering (CSE) M.S. program. The path assists qualified enrolled students with a simplified graduate application process that does not require students to take the Graduate Record Examination (GRE) if they have and maintain an overall GPA of 3.0 or above by the end of their junior year until the completion of their bachelor's degree. With appropriate planning, the CSE Contiguous Pathway Program makes it possible to complete a bachelor's and a master's in as little as five years.  \nPlease note t

In [127]:
print(query)
print(generate_response(query, filtered_content))

What is the 4+1 program for Computer Science at UCSC?
The 4+1 program for Computer Science at UC Santa Cruz (UCSC) is known as the Contiguous Bachelor's/Master's Pathway in Computer Science and Engineering (CSE). This program allows qualified undergraduate students from the Baskin School of Engineering (BSOE) to transition directly into the Computer Science and Engineering Master’s program upon completing their undergraduate degree. 

Key features of the 4+1 program include:

- **Duration**: Students can complete both their bachelor’s and master’s degrees in as little as five years.
- **Application Process**: While the program provides a streamlined application process where students do not need to submit GRE scores if they maintain a GPA of 3.0 or higher, it does not guarantee admission to the CSE Master’s program. Applicants must still formally apply and be admitted.
- **Eligibility**: Students must declare a major within BSOE and meet specific GPA and academic requirements. They sho