In [None]:
pip install python-dotenv requests beautifulsoup4 openai chromadb

In [None]:
import os
import json
import uuid
from google.colab import userdata
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
import chromadb

# Constants
# The question the pipeline answers; main() rephrases it before searching.
USER_PROMPT = "How much is tuition at Colorado College in 2024-25?"
# API credentials, looked up by name through google.colab's `userdata.get`.
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
BRAVE_API_KEY = userdata.get("BRAVE_API_KEY")

# Set up OpenAI client
# Module-level client used by call_gpt4_turbo for chat-completion requests.
client = OpenAI(api_key=OPENAI_API_KEY)

def call_gpt4_turbo(messages):
    """
    Send a chat-completion request and return the reply text.

    NOTE: the function name is kept for backward compatibility; the request
    is actually sent to the "gpt-4o-mini" model, not GPT-4 Turbo.

    Args:
        messages: List of chat messages, each a dict with "role" and
            "content" keys, as accepted by the chat completions API.

    Returns:
        The content of the first choice with surrounding whitespace removed,
        or an empty string when the reply carries no text content.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages
    )
    # `message.content` is optional in the OpenAI SDK; guard against None so
    # a content-less reply does not crash with AttributeError on .strip().
    content = response.choices[0].message.content
    return content.strip() if content else ""

def rephrase_query(text):
    """
    Turn the user's free-form question into a concise search query.

    The text is passed to `call_gpt4_turbo` together with a system
    instruction asking for a short, search-friendly rephrasing, and the
    model's reply is returned unchanged.
    """
    system_message = {
        "role": "system",
        "content": "Rephrase the following as a concise search query:",
    }
    user_message = {"role": "user", "content": text}
    return call_gpt4_turbo([system_message, user_message])

def search_urls(query, timeout=10):
    """
    Use the Brave Search API to find relevant URLs for the query.

    Args:
        query: Search text; it is URL-encoded by `requests`, so characters
            such as '&', '#' or '+' are sent as part of the query.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        Up to the first 5 result URLs (an empty list if there are none).

    Raises:
        requests.exceptions.RequestException: On network failure, timeout,
            or a non-2xx response (e.g. invalid API key, rate limiting).
    """
    response = requests.get(
        "https://api.search.brave.com/res/v1/web/search",
        # Pass the query via `params` instead of string interpolation so
        # reserved characters cannot corrupt the request URL.
        params={"q": query},
        headers={"X-Subscription-Token": BRAVE_API_KEY},
        timeout=timeout,
    )
    # Surface auth/quota errors instead of silently returning no results.
    response.raise_for_status()

    payload = response.json()
    # `or` fallbacks also cover keys that are present but null.
    results = (payload.get('web') or {}).get('results') or []
    return [result['url'] for result in results[:5]]

def scrape_and_parse(urls, timeout=10):
    """
    Download each URL and extract its text with BeautifulSoup.

    URLs that fail to download (network error, timeout, non-2xx status) or
    that yield no text are reported and skipped, so the two returned lists
    always stay aligned.

    Args:
        urls: Iterable of page URLs to fetch.
        timeout: Seconds to wait for each page before giving up on it.

    Returns:
        A `(contents, sources)` tuple: the extracted text of every page that
        was scraped successfully, and the matching URLs in the same order.
    """
    contents = []
    sources = []
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"}

    for url in urls:
        try:
            # A timeout keeps one unresponsive host from hanging the run.
            response = requests.get(url, headers=headers, timeout=timeout)
            # Treat 4xx/5xx as failures so error pages are not indexed.
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error scraping {url}: {str(e)}")
            continue

        text = BeautifulSoup(response.content, 'html.parser').get_text()
        if not text.strip():
            # Nothing useful to embed (e.g. a script-only page).
            print(f"Skipping {url}: no text content")
            continue

        contents.append(text)
        sources.append(url)

    return contents, sources

def embed_into_chroma_db(contents, sources, collection_name):
    """
    Store the scraped content in a persistent Chroma collection.

    Each document is stored with its source URL as metadata. Document ids
    are derived from the source URL, so re-running the pipeline replaces a
    page's previous entry instead of adding a duplicate.

    Args:
        contents: Page texts to store.
        sources: Source URL of each text, aligned with `contents`.
        collection_name: Name of the collection to create or reuse.
    """
    # Named `chroma_client` to avoid shadowing the module-level OpenAI client.
    chroma_client = chromadb.PersistentClient()
    collection = chroma_client.get_or_create_collection(collection_name)

    # Keying by a URL-derived UUID makes writes idempotent across runs; the
    # dict also collapses a URL that appears more than once in this batch.
    records = {
        str(uuid.uuid5(uuid.NAMESPACE_URL, source)): (content, source)
        for content, source in zip(contents, sources)
    }
    if not records:
        # Nothing to store; the collection itself has still been created.
        return

    # One batched upsert instead of one call per document.
    collection.upsert(
        ids=list(records),
        documents=[content for content, _ in records.values()],
        metadatas=[{"source": source} for _, source in records.values()],
    )

def query_chroma_db(query, collection_name, top_k=5):
    """
    Query the Chroma collection for the documents most relevant to `query`.

    Args:
        query: Text to search for.
        collection_name: Name of an existing collection.
        top_k: Maximum number of documents to return.

    Returns:
        A `(documents, sources)` tuple with at most `top_k` documents and
        their source URLs; both lists are empty when the collection holds
        no documents or `top_k` is not positive.
    """
    # Named `chroma_client` to avoid shadowing the module-level OpenAI client.
    chroma_client = chromadb.PersistentClient()
    collection = chroma_client.get_collection(collection_name)

    # Never ask for more results than the collection contains.
    n_results = min(top_k, collection.count())
    if n_results <= 0:
        return [], []

    results = collection.query(
        query_texts=[query],
        n_results=n_results,
        include=["documents", "metadatas"]
    )

    # Results are nested per query text; only one query was submitted.
    documents = results["documents"][0]
    sources = [m["source"] for m in results["metadatas"][0]]
    return documents, sources

def generate_response(context, query):
    """
    Ask the chat model to answer `query` using the supplied `context`.

    The context and query are combined into a single user message, sent
    through `call_gpt4_turbo` along with a system instruction to answer
    accurately and concisely, and the model's reply is returned.
    """
    system_prompt = "You are a helpful assistant. Use the provided context to answer the user's query accurately and concisely."
    user_prompt = f"Context:\n{context}\n\nQuery: {query}\n\nResponse:"
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    return call_gpt4_turbo(conversation)

def main():
    """
    Run the pipeline end to end for USER_PROMPT and print the answer.

    Steps: rephrase the question, search for URLs, scrape them, store the
    text in Chroma, retrieve the most relevant documents, and ask the chat
    model to answer from that context. Stops early with a message when the
    search or the scraping step produces nothing to work with.
    """
    # Single definition so the write and read steps cannot drift apart.
    collection_name = "colorado_college_tuition"

    # Step 1: Rephrase the user's query
    rephrased_query = rephrase_query(USER_PROMPT)
    print(f"Rephrased Query: {rephrased_query}")

    # Step 2: Search for relevant URLs
    urls = search_urls(rephrased_query)
    if not urls:
        print("No search results found; cannot answer the query.")
        return

    # Step 3: Scrape and parse content from the URLs
    contents, sources = scrape_and_parse(urls)
    if not contents:
        # Without fresh content the model would answer from empty context.
        print("None of the search results could be scraped; cannot answer the query.")
        return

    # Step 4: Embed the scraped content into the Chroma database
    embed_into_chroma_db(contents, sources, collection_name)

    # Step 5: Query the Chroma database for relevant information
    relevant_docs, relevant_sources = query_chroma_db(rephrased_query, collection_name)

    # Step 6: Prepare the context for the response generation
    context = "\n".join(
        f"Source: {source}\nContent: {doc}"
        for doc, source in zip(relevant_docs, relevant_sources)
    )

    # Step 7: Generate the final response
    response = generate_response(context, rephrased_query)

    # Step 8: Print the results
    print("Response:")
    print(response)
    print("\nRelevant Sources:")
    print("\n".join(relevant_sources))

# Entry point: run the pipeline only when this module is executed directly,
# not when it is imported.
if __name__ == '__main__':
    main()