In [2]:
from dotenv import load_dotenv
import os
import requests
from openai import OpenAI
from bs4 import BeautifulSoup
import markdownify
import re

# Endpoint of the Google Custom Search JSON API that get_search_results calls.
search_url = "https://customsearch.googleapis.com/customsearch/v1"

# Load variables from a .env file (python-dotenv) before reading them below.
load_dotenv()

# Each lookup yields None when the variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY")
google_api_key = os.environ.get("GOOGLE_API_KEY")
search_engine_id = os.environ.get("SEARCH_ENGINE_ID")

# Single client instance shared by every chat-completion call in the notebook.
openai_client = OpenAI(api_key=openai_api_key)

In [3]:
def get_search_results(query, site="", timeout=10):
    """Query the Google Custom Search JSON API and return the decoded response.

    Args:
        query: Search terms, sent as the ``q`` parameter.
        site: Optional domain to restrict results to (e.g. ``"ucsc.edu"``).
            When empty, no site restriction is sent.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The JSON body of the response, decoded into Python objects.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    params = {
        "q": query,
        "key": google_api_key,
        "cx": search_engine_id,
    }
    # Only send the site restriction when a site was actually given;
    # the "i" filter value asks the API to include (not exclude) that site.
    if site:
        params["siteSearch"] = site
        params["siteSearchFilter"] = "i"

    # Without a timeout, requests can block indefinitely on a stalled connection.
    response = requests.get(search_url, params=params, timeout=timeout)
    # Surface bad keys / quota errors here instead of as a KeyError on "items" later.
    response.raise_for_status()
    return response.json()

def get_page_content(results, timeout=10):
    """Download every search hit and convert its main content to compact markdown.

    Args:
        results: Decoded search response. Hits are read from its ``"items"``
            list; a response without that key (no hits, or an error payload)
            is treated as having no hits.
        timeout: Seconds to wait for each page before skipping it.

    Returns:
        Dict mapping each successfully fetched link to its markdown text.
        Pages that cannot be fetched, or that answer with an error status,
        are reported with ``print`` and left out of the result.
    """
    content = dict()

    # "items" is missing when there are no hits, so fall back to an empty list
    # instead of raising KeyError.
    for item in results.get("items") or []:
        link = item["link"]
        try:
            response = requests.get(link, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable page should not discard the pages already fetched.
            print(f"Skipping {link}: {exc}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        # Prefer the <main> element, then <body>, then the whole document.
        html = soup.find("main") or soup.find("body") or soup
        # Drop link and image markup while converting to markdown.
        text = markdownify.markdownify(str(html), strip=["a", "img"])
        # Collapse runs of blank lines into a single newline.
        text = re.sub(r"\n\n+", "\n", text).strip()

        content[link] = text

    return content

def filter_content(query, content, model="gpt-4o-mini"):
    """Ask the chat model to keep only the parts of each page relevant to the query.

    Args:
        query: The user's question.
        content: Dict mapping a URL to that page's markdown text.
        model: Name of the chat-completion model to call.

    Returns:
        Dict with the same URLs as ``content``. Each value is the model's reply
        with surrounding whitespace removed, or ``""`` when the page was blank
        or the model returned nothing.
    """
    filtered_content = dict()

    prompt = """You are given a web page content formatted in markdown. Filter out any text in its original form that best relates to the user's query.
    If no content matches the user's query, respond nothing with no ellipses or characters."""

    for url, text in content.items():
        if not text or not text.strip():
            # Nothing to filter, so skip the API call for a blank page.
            filtered_content[url] = ""
            continue

        msgs = [
            {"role": "system", "content": prompt},
            {
                "role": "user", 
                "content": f"Here is the user's query: {query}\nHere is the markdown content: {text}"
            }
        ]

        completion = openai_client.chat.completions.create(model=model, messages=msgs)
        reply = completion.choices[0].message.content
        # The reply may be None or whitespace-only; normalise both to "" so a
        # plain emptiness check downstream recognises "no relevant content".
        filtered_content[url] = (reply or "").strip()

    return filtered_content

def generate_response(query, content, model="gpt-4o-mini"):
    """Answer the query with the chat model, using the given documents as context.

    Args:
        query: The student's question.
        content: Dict mapping a URL to document text. Only the values are used;
            ``None``, empty, and whitespace-only documents are skipped.
        model: Name of the chat-completion model to call.

    Returns:
        The content of the model's first completion choice.
    """
    prompt = """You are an expert in answering questions about applying to universities for college applicants. \
        You are given a few documents formatted in markdown. Answer the student's question/query using only the information below. \
        Omit any irrelevant or duplicate information that might appear. There is no need to cite the sources; only provide a comprehensive answer."""

    # Drop blank documents before numbering so the labels stay consecutive
    # (numbering first would leave gaps such as "Document 0", "Document 2").
    documents = [text for text in content.values() if text and text.strip()]
    context = "".join(
        f"Document {i}:\n{text}\n\n" for i, text in enumerate(documents)
    )

    msgs = [
        {"role": "system", "content": prompt},
        {
            "role": "user",
            "content": f"Here is the student's question: {query}\n\nHere are the documents:\n{context}"
        }
    ]

    completion = openai_client.chat.completions.create(model=model, messages=msgs)
    return completion.choices[0].message.content


In [7]:
# Example question; the search is restricted to pages under ucsc.edu.
query = "What is the NLP MS program at UCSC"
results = get_search_results(query, "ucsc.edu")
# Fetch each hit and convert it to markdown text keyed by URL.
content = get_page_content(results)
# Preview only the first 200 characters of each page to keep the output short.
{url: text[:200] for url, text in content.items()}

{'https://nlp.ucsc.edu/': '* Search\n* Log In\n \n* Facebook\n* X\n \n \nnlp@ucsc.edu\n* Home\n* About Us\n* Program Overview\n* Apply\n* News\n* Contact\n \n \n \nAPPLICATIONS ARE NOW OPEN!\n--------------------------\nWe are currently accepting',
 'https://nlp.ucsc.edu/program-overview/': '* Search\n* Log In\n \n* Facebook\n* X\n \n \nnlp@ucsc.edu\n* Home\n* About Us\n* Program Overview\n* Apply\n* News\n* Contact\n \n \n \nPROGRAM OVERVIEW\n----------------\n**Natural Language Processing (NLP)** combines',
 'https://nlp.ucsc.edu/admissions/': '* Search\n* Log In\n \n* Facebook\n* X\n \n \nnlp@ucsc.edu\n* Home\n* About Us\n* Program Overview\n* Apply\n* News\n* Contact\n \n \n \nADMISSIONS\n----------\n**Get ready to apply for Fall 2025 admission consideration',
 'https://catalog.ucsc.edu/en/current/general-catalog/academic-units/baskin-engineering/computer-science-and-engineering/natural-language-processing-ms': 'Skip to main content\nCatalog Search\nSearch Options\nEntire Catalog\n

In [8]:
# Reduce every fetched page to the text the model judges relevant to the query.
filtered_content = filter_content(query, content)

In [9]:
# Echo the question, then show the answer generated from the filtered pages.
print(query)
generate_response(query, filtered_content)

What is the NLP MS program at UCSC


"The Natural Language Processing (NLP) Master of Science program at the University of California, Santa Cruz (UCSC) is an intensive, professional master's program designed to develop expertise in computer programs that understand and generate human language. The program typically spans 15-18 months and is tailored for those with a strong background in computer science, linguistics, and artificial intelligence.\n\nKey features of the program include:\n\n- **Curriculum**: It emphasizes practical skills through courses focusing on core NLP algorithms, machine learning, and data analytics. The program includes six core courses worth 30 credits, two elective courses for 10 credits, and a 15-unit capstone project that provides real-world experience through team-based projects addressing industry-relevant NLP challenges.\n\n- **Industry Collaboration**: Students benefit from close collaboration with an advisory board of industry scientists, guest lectures, mentoring, and networking opportunit