In [22]:
from dotenv import load_dotenv
import os
import requests
from openai import OpenAI, AsyncOpenAI
from bs4 import BeautifulSoup
import markdownify
import re
import json
import asyncio
import threading

load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
google_api_key = os.getenv("GOOGLE_API_KEY")
search_engine_id = os.getenv("SEARCH_ENGINE_ID")

openai_client = OpenAI(api_key=openai_api_key)
async_openai_client = AsyncOpenAI(api_key=openai_api_key)

search_url = "https://customsearch.googleapis.com/customsearch/v1"

In [None]:
def get_search_results(query):
    # Get base domain of the university mentioned in the query
    prompt = """Identify the institution mentioned in this user query, and respond with only the base domain of the corresponding institution. \
    If no institution is found within the query, respond with nothing."""

    msgs = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": f"Here is the user's query: {query}"}
    ]

    response = openai_client.chat.completions.create(model="gpt-4o-mini", messages=msgs, temperature=0).choices[0].message.content
    site = response

    params = {
        "q": query,
        "key": google_api_key,
        "cx": search_engine_id,
        "siteSearch": site,
        "siteSearchFilter": "i",
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(search_url, params=params, headers=headers)
    return response.json()

def get_page_content(results):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    content = dict()

    for item in results["items"]:
        link = item["link"]
        response = requests.get(link, headers=headers)
        
        soup = BeautifulSoup(response.text, "html.parser")
        html = soup.find("main") or soup.find("body") or soup
        text = markdownify.markdownify(str(html), strip=["a", "img"])
        text = re.sub(r"\n\n+", "\n", text).strip()

        content[link] = text

    return content

def generate_filtered_content(query, content, filtered_content):
    prompt = """You are given a web page content formatted in markdown. Filter out any text in its original form that best relates to the user's query.
    If no content matches the user's query, respond nothing with no ellipses, characters, or words."""

    url, text = content

    msgs = [
        {"role": "system", "content": prompt},
        {
            "role": "user", 
            "content": f"Here is the user's query: {query}\nHere is the markdown content: {text}"
        }
    ]

    response = openai_client.chat.completions.create(model="gpt-4o-mini", messages=msgs).choices[0].message.content
    filtered_content[url] = response


def filter_content(query, content):
    filtered_content = dict()

    threads = []
    for url, text in content.items():
        filtered_content[url] = ""
        thread = threading.Thread(target=generate_filtered_content, args=(query, (url, text), filtered_content))
        thread.start()
        threads.append(thread)
    
    for thread in threads:
        thread.join()   

    return filtered_content

def generate_response(query, content):
    prompt = """You are an expert in answering questions about applying to universities for college applicants. \
    You are given a few documents formatted in markdown. Answer the student's question/query using only the information below. \
    Omit any irrelevant or duplicate information that might appear. There is no need to cite the sources; only provide a comprehensive answer."""
    
    context = ""
    for i, (url, text) in enumerate(content.items()):
        if text == "":
            continue

        context += f"Document {i}:\n{text}\n\n"
    
    msgs = [
            {"role": "system", "content": prompt},
            {
                "role": "user", 
                "content": f"Here is the student's question: {query}\n\nHere are the documents:\n{context}"
            }
        ]
    
    response = openai_client.chat.completions.create(model="gpt-4o-mini", messages=msgs).choices[0].message.content
    return response


In [30]:
query = "What are some CS courses that I should already have taken to be considered for admission in the MS CS program at USC"
results = get_search_results(query)
# get_html(results)
# results["items"]
content = get_page_content(results)
{url: text[:200] for url, text in content.items()}

{'https://viterbigradadmission.usc.edu/programs/masters/msprograms/computer-science/ms-computer-science/': 'Programs  Master’s  Master’s Programs  Computer Science  MS in Computer Science\n#### Master of Science in Computer Science\n#### ApplicationDeadlines\nSpring:  \nSeptember 1\nFall:  \nDecember 15\nUSC GRADU',
 'https://viterbigradadmission.usc.edu/programs/masters/msprograms/computer-science/ms-cs-scientists-engineers/': 'Programs  Master’s  Master’s Programs  Computer Science  MS in Computer Science – Scientists and Engineers\n#### MS in Computer Science \\- Scientists and Engineers\n#### ApplicationDeadlines\nSpring:  \nS',
 'https://viterbiadmission.usc.edu/transfer/': 'Skip to main content\n* youtube\n* instagram\n* phone\n* email\n * PODCAST\n* BLOG\n* MEET STUDENTS\n* CONTACT US\nClose Search\n \n \n \nMenu   \n* EVENTS \\& VISITS\n* THE VITERBI EXPERIENCE\n\t+ About the School\n\t+ '}

In [31]:
filtered_content = filter_content(query, content)
filtered_content

{'https://viterbigradadmission.usc.edu/programs/masters/msprograms/computer-science/ms-computer-science/': 'Applicants are expected to have completed or be in process of completing an undergraduate degree in computer science or its equivalent. For those who do not, they should meet the eligibility criteria on our MSCS Eligibility Criteria page.',
 'https://gradadm.usc.edu/prospective-international-students/english-proficiency/': '',
 'https://www.cs.usc.edu/academic-programs/undergrad/': '',
 'https://gradadm.usc.edu/domestic-students/how-to-apply/': 'Nothing',
 'https://viterbigradadmission.usc.edu/programs/masters/faq/': 'Qualified applicants will have an undergraduate degree (Bachelor of Science or equivalent) in engineering/computer science, math, physics, or another hard science such as biology or chemistry, depending on the Master’s program. Please contact us if you have questions about your eligibility. We perform a holistic review of the entire application, including overall ac

In [32]:
print(query)
print(generate_response(query, filtered_content))

What are some CS courses that I should already have taken to be considered for admission in the MS CS program at USC
To be considered for admission to the MS CS program at USC, you should ideally have completed or be in progress of completing an undergraduate degree in computer science or a closely related field. If your degree is not in computer science, you will need coursework and demonstrated proficiency in at least three of the following core Computer Science courses:

1. Artificial Intelligence
2. Operating Systems
3. Compilers
4. Software Engineering
5. Computer Architecture
6. Database Systems
7. Computer Communications
8. Software Design in Computer Science
9. Computer Vision
10. 3D Graphics and Rendering
11. Geometric Modeling
12. Robotics

Additionally, you must have at least one of the following Computer Science related Mathematics courses:

- Algorithms
- Data Structures
- Discrete Math
- Discrete Structures
- Numerical Computation

It's also recommended to have a backgrou