In [4]:
# importing all of the necessary libraries
import os
import re
import json
import html
import uuid
import asyncio
import aiohttp
import time
from bs4 import BeautifulSoup
from supabase import create_client, Client
from datetime import datetime
from langchain_openai import OpenAIEmbeddings
import nest_asyncio

# set environment variables for supabase and scraperapi
# Supabase is an online database that can handle a lot of data; it is where all of the data scraped is being stored
# Scraper API is essentially an API that rotates proxies that make it so that the website that is being scraped is not going to rate limit me. 
# the proxies it rotates through decieve the website, as they see it as multiple users making a lot of requests, not just one, so I cannot be rate limited.
os.environ["SUPABASE_URL"] = "supabase url"
os.environ["SUPABASE_KEY"] = "supabase key"
os.environ["SCRAPERAPI_KEY"] = "scraper api key"

url: str = os.environ.get("SUPABASE_URL")
key: str = os.environ.get("SUPABASE_KEY")
scraperapi_key = os.environ.get("SCRAPERAPI_KEY")

# create supabase client
supabase: Client = create_client(url, key)

# read all links from links.txt
# this file is where all of the category links are stored, with 552 categories on College Confidential in total
# in the code I ran, not the sample code, I looped through the links.txt file to get all the category links
links = ["https://talk.collegeconfidential.com/c/college-essays/43/none"]

for linksy in links:
    linky_list = [linksy]

    # essentially saying that it can make 25 requests at one time, signifigantly speeding up the scraping process
    semaphore = asyncio.Semaphore(25)

    async def fetch_json(session, url, retries=3):
        async with semaphore:
            for attempt in range(retries):
                try:
                    async with session.get(url) as response:
                        content_type = response.headers.get('Content-Type', '').lower()
                        # this is error handling, so whenever an error that says something like "you can't get your content" it returns the string error to the function
                        # when the string "error" is detected later in the code, it returns "null" for the content.
                        if 'html' in content_type:
                            print(f"Skipped HTML content at {url}")
                            return "error"  # skip HTML content

                        try:
                            return await response.json(content_type=None)
                        except aiohttp.ContentTypeError:
                            print(f"ContentTypeError: Skipped non-JSON content at {url}")
                            return "error" # skip HTML content
                        except json.JSONDecodeError:
                            print(f"JSONDecodeError: Failed to parse JSON at {url}")
                            return "error" # skip HTML content
                # whenever we get an error that is related to the College Confidential website or any other API we are spamming requests to gets tired, we wait for two minutes.
                except (aiohttp.ClientOSError, asyncio.TimeoutError, aiohttp.client_exceptions.ClientPayloadError) as e:
                    print(f"Error: {e} - Retrying in 2 minutes...")
                    time.sleep(120)
            print(f"Failed to fetch JSON from {url} after {retries} attempts")
            return "error"

    json_files = []

    #saves the JSON data later use for each category in a folder called json_data
    def save_json_data(json_files, category_url):
        # create the directory if it doesn't exist
        os.makedirs("json_data", exist_ok=True)
        # create a unique filename using category URL and time of publishing
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        unique_id = uuid.uuid4()
        filename = f"json_data/{category_url.replace('/', '_')}_{timestamp}_{unique_id}.json"
        with open(filename, "w") as save_file:
            json.dump(json_files, save_file, indent=6)

# async function to process each category
async def process_category(session, URL):
    base_url = URL + ".json?page="
    base = 'https://talk.collegeconfidential.com'
    category_url = URL

    # trim base URL from category URL
    if category_url.startswith(base):
        category_url = category_url[len(base):]

    # remove pagination and .json part from category URL
    category_url = re.sub(r'\.json\?page=.*', '', category_url)
    print(f"Processing category: {category_url}")

    page = 0
    slugs = []
    ids = []
    json_files = []  # initialize the json_files list

    # fetch data for up to 100 pages
    while page <= 100:
        url = base_url + str(page)
        print(url)
        data_json = await fetch_json(session, url)
        if not data_json:
            break

        json_files.append(data_json)  # save the JSON data

        topics = data_json.get('topic_list', {}).get('topics', [])
        if not topics:
            break

        # collect slugs and ids from topics
        for topic in topics:
            if 'slug' in topic and 'id' in topic:
                slugs.append(topic['slug'])
                ids.append(topic['id'])

        page += 1

    # save JSON data    
    save_json_data(json_files, category_url)

    # create list of full links
    full_links_list = [f"https://talk.collegeconfidential.com/t/{slug}/{id}" for slug, id in zip(slugs, ids)]
    print(f"Finished link list for category: {category_url}")

    content = []

    # fetch post data for each link
    async def fetch_post_data(url):
        scraperapi_url = f"http://api.scraperapi.com?api_key={scraperapi_key}&url={url}.json"
        data = await fetch_json(session, scraperapi_url)
        if not data:
            return
        
        if data == "error":
            content.append("null")
            return
        
        # extract first post from the data
        posts = data["post_stream"]["posts"]
        first_post = posts[0]
        cooked_html = first_post["cooked"]
        decoded_text = html.unescape(cooked_html)
        plain_text = re.sub(r'<.*?>', '', decoded_text)
        content.append(plain_text)
        
    await asyncio.gather(*[fetch_post_data(url) for url in full_links_list])
    print("Got the content from the entire list")

    times_for_each_chunk = []

    # fetch post time for each link
    async def fetch_post_time(url):
        scraperapi_url = f"http://api.scraperapi.com?api_key={scraperapi_key}&url={url}.json"
        json_response = await fetch_json(session, scraperapi_url)
        if json_response == "error":
            times_for_each_chunk.append("1970-09-08T21:37:25.000Z")
            return
        
        if not json_response:
            return
        
        posts = json_response['post_stream'].get('posts', [])
        if not posts:
            return
        published_time = posts[0].get('created_at')
        times_for_each_chunk.append(published_time)
        print(published_time)
        
    await asyncio.gather(*[fetch_post_time(url) for url in full_links_list])
    time.sleep(20)

    print("Finished extracting times")

    # combine slugs and content
    content_final_form = [f"{slug.replace('-', ' ')}\n\n{paragraph}" for paragraph, slug in zip(content, slugs)]
    print("Added the titles")

    embeddings_list = []

    # initialize the embedding model
    embed_model = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key="your open api key")
    x = 0
    for text in content_final_form:
        embeddings = embed_model.embed_query(text)
        embeddings_list.append(embeddings)
        print("got embedding, doing more")
        x += 1
        # for every 200 iterations, take a 3 second break to reduce chances of timing out
        if (x + 1) % 200 == 0:
            print(f"{x + 1} embeddings done")
            time.sleep(3)
            
    print("Embedded the embeddings")

    # insert rows into supabase
    for i in range(len(full_links_list)):
        guid = uuid.uuid4()
        row = {
            # a UUID is a unique numerical identifier for each thread 
            "UUID": str(guid),
            # url of the category
            "category_url": category_url,
            # url of the thread
            "thread_url": full_links_list[i],
            # the slug, or title of the post which-comes-in-this-form
            "slug": slugs[i],
            # the id of the thread in the College Confidential url
            "thread_id": ids[i],
            # the time the thread was created at
            "created_at": times_for_each_chunk[i],
            # the content of the first introductory post on the thread
            "content": content_final_form[i],
            # the embeddings of the thread
            # "Embeddings are vectors that represent real-world objects, like words, images, or videos, in a form that machine learning models can easily process." - Cloudflare
            # the embeddings for the content of each thread will later be used to create an AI chatbot that can answer questions similar to how questions are answered in threads on the College Confidential website
            "embeddings": embeddings_list[i],
            "json_file": None
        }
        #appending the row the the Supabase table
        supabase.table("your supabase table").insert(row).execute()
        # for every 200 iterations, take a 5 second break to reduce chances of timing out
        if (i + 1) % 200 == 0:
            print(f"{i + 1} rows appended, taking a 1-minute break...")
            time.sleep(5)

    print(f"Appended data to Supabase for category: {category_url}")

# main async function to run all tasks (it makes it so that it can do multiple requests in one go, and most of the portions in the code are run like that for efficiency)
async def main():
    async with aiohttp.ClientSession() as session:
        tasks = [process_category(session, URL) for URL in linky_list]
        await asyncio.gather(*tasks)

# Check if the code is running inside an existing event loop
if __name__ == "__main__":
    try:
        loop = asyncio.get_running_loop()
        if loop and loop.is_running():
            loop.create_task(main())
        else:
            asyncio.run(main())
    except RuntimeError:
        asyncio.run(main())

# To run this code you will need a local version of Supabase powered by a Docker container, and a whole lot of tokens from a proxy rotater if your want to run it on a large scale
# doing a sample run to show the code works, 10 pages of the "college essays" category

Processing category: /c/college-essays/43/none
https://talk.collegeconfidential.com/c/college-essays/43/none.json?page=0
https://talk.collegeconfidential.com/c/college-essays/43/none.json?page=1
https://talk.collegeconfidential.com/c/college-essays/43/none.json?page=2
https://talk.collegeconfidential.com/c/college-essays/43/none.json?page=3
https://talk.collegeconfidential.com/c/college-essays/43/none.json?page=4
https://talk.collegeconfidential.com/c/college-essays/43/none.json?page=5
https://talk.collegeconfidential.com/c/college-essays/43/none.json?page=6
https://talk.collegeconfidential.com/c/college-essays/43/none.json?page=7
https://talk.collegeconfidential.com/c/college-essays/43/none.json?page=8
https://talk.collegeconfidential.com/c/college-essays/43/none.json?page=9
https://talk.collegeconfidential.com/c/college-essays/43/none.json?page=10
Finished link list for category: /c/college-essays/43/none
Got the content from the entire list
2024-06-15T23:00:42.225Z
2024-02-16T01:48:

Task exception was never retrieved
future: <Task finished name='Task-2574' coro=<main() done, defined at /var/folders/y6/8sflfxlx4cn4ngv4161zjwyr0000gn/T/ipykernel_1998/2719421661.py:233> exception=Error PGRST204:
Message: Column 'json_file' of relation 'final_cc_threads' does not exist>
Traceback (most recent call last):
  File "/var/folders/y6/8sflfxlx4cn4ngv4161zjwyr0000gn/T/ipykernel_1998/2719421661.py", line 236, in main
    await asyncio.gather(*tasks)
  File "/var/folders/y6/8sflfxlx4cn4ngv4161zjwyr0000gn/T/ipykernel_1998/2719421661.py", line 224, in process_category
    supabase.table("final_cc_threads").insert(row).execute()
  File "/Users/jair/anaconda3/lib/python3.10/site-packages/postgrest/_sync/request_builder.py", line 78, in execute
    raise APIError(r.json())
postgrest.exceptions.APIError: {'code': 'PGRST204', 'details': None, 'hint': None, 'message': "Column 'json_file' of relation 'final_cc_threads' does not exist"}


got embedding, doing more
Embedded the embeddings


In [14]:
import os
import time
from supabase import create_client, Client
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from langchain_postgres.vectorstores import PGVector

# set environment variables directly in the script
os.environ["SUPABASE_URL"] = "your supabase url"
os.environ["SUPABASE_KEY"] = "your supabase key"

url = os.environ.get("SUPABASE_URL")
key = os.environ.get("SUPABASE_KEY")

print("creating supabase client...")
supabase = create_client(url, key)
print(f"supabase client created: {supabase}")

# initialize openai embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key="your openai key")

# connection details
connection = "postgresql://postgres:postgres@127.0.0.1:54322/postgres"
collection_name = "vector_store_cc"

# initialize pgvector store
print("initializing pgvector store...")
vectorstore = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)
print("pgvector store initialized.")

# function to update embeddings in the vector store
def update_vector_store_embeddings(starting_offset=0, batch_size=20):
    offset = starting_offset

    while True:
        try:
            print(f"fetching records from offset {offset} to {offset + batch_size - 1}...")
            response = supabase.table('final_cc_threads_duplicate').select('*').range(offset, offset + batch_size - 1).execute()
            records = response.data
            if not records:
                print("no more records to process. exiting loop.")
                break

            updated_docs = []
            for record in records:
                content = record['content']
                # generate embeddings for the content
                embedding_vector = embeddings.embed_query(content)
                doc = Document(
                    page_content=content,
                    metadata={
                        "id": record['UUID'],
                        "category_url": record['category_url'],
                        "thread_url": record['thread_url'],
                        "slug": record['slug'],
                        "thread_id": record['thread_id'],
                        "created_at": record['created_at'],
                        "embeddings": embedding_vector  # store the generated embeddings
                    }
                )
                updated_docs.append(doc)

            # update the vector store with the new embeddings
            vectorstore.add_documents(updated_docs)

            offset += batch_size
            print(f"processed records up to offset {offset}")

        except Exception as e:
            print(f"error encountered: {e}. waiting for 5 minutes before retrying...")
            time.sleep(10)  # wait for 5 minutes before retrying

update_vector_store_embeddings()
print("finished updating vector store embeddings.")

#below is an example batch of 250 docs

INFO:__main__:Initializing Supabase client...
INFO:__main__:Supabase client created: <supabase._sync.client.SyncClient object at 0x11c9fdc00>
INFO:__main__:Starting data extraction process...
INFO:__main__:Fetching records from offset 0 to 9...
INFO:httpx:HTTP Request: GET http://127.0.0.1:54321/rest/v1/final_cc_threads_duplicate?select=%2A&offset=0&limit=10 "HTTP/1.1 200 OK"
INFO:__main__:Processed batch 2 (offset 0), total documents: 10
INFO:__main__:Fetching records from offset 10 to 19...
INFO:httpx:HTTP Request: GET http://127.0.0.1:54321/rest/v1/final_cc_threads_duplicate?select=%2A&offset=10&limit=10 "HTTP/1.1 200 OK"
INFO:__main__:Processed batch 3 (offset 10), total documents: 20
INFO:__main__:Fetching records from offset 20 to 29...
INFO:httpx:HTTP Request: GET http://127.0.0.1:54321/rest/v1/final_cc_threads_duplicate?select=%2A&offset=20&limit=10 "HTTP/1.1 200 OK"
INFO:__main__:Processed batch 4 (offset 20), total documents: 30
INFO:__main__:Fetching records from offset 30 t

In [78]:
from langchain_cohere import CohereEmbeddings
from langchain_core.documents import Document
from langchain_postgres.vectorstores import PGVector
from langchain_openai import OpenAIEmbeddings


connection = "postgresql://postgres:postgres@127.0.0.1:54322/postgres"  # Uses psycopg3
collection_name = "vector_store_cc"

embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key="your openai key")

# initializing PGVector store
print("Initializing PGVector store...")
vectorstore = PGVector(
    embeddings = embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)
print("PGVector store initialized.")

Initializing PGVector store...
PGVector store initialized.


In [87]:
# showing the first three results in the similarity search
# these are the most similar threads to the question
search_results = vectorstore.similarity_search_with_score("What is the international student experience like at top colleges?", k=10)
for result, score in search_results:
    print(f"* [score={score:3f}] {result.page_content}")
    print()

* [score=0.335278809244524] the key for international students to get into top universities

I have a few questions for international students who are currently studying in the United States.

What country are you from originally?
What was your perception of studying in the United States before starting and how has it changed now as a current student?
How are you integrating your cultural identity into your academic, social, and residential lives in college?


* [score=0.395698070526123] rejection international students perspective

Which summer programs do you recommend for a high achiever who isn’t too clear about what he wants to do in long term but is aiming for an stimulating experience and help in getting into top 25 colleges.

* [score=0.409983407260961] chance an international student

Does anybody have any direct experience with this college ?  Is it hard to get in and what is the perceived reputation among the academic community ?

* [score=0.417191743850708] do top canadian 

In [89]:
import openai
import os
import aiohttp
import asyncio
import nest_asyncio

# set open ai key
openai.api_key = os.getenv("your openai key")

# get the first 10 responses in a thread
async def fetch_responses(session, thread_url):
    responses = []
    for page in range(1, 2): 
        url = f"{thread_url}.json?page={page}"
        async with session.get(url) as response:
            # verify if the request goes through
            if response.status == 200:
                data = await response.json()
                posts = data["post_stream"]["posts"]
                for post in posts[:10]: 
                    responses.append(post["cooked"]) # 'cooked' contains the post content
            else:
                print(f"Failed: {response.status}")
    return responses

#the question the similarity search is being done for
question = "What is the international student experience like at top colleges?"


async def main():
    # similarity search (the best 3 results)
    search_results = vectorstore.similarity_search(question, k=3)
    
    # Debugging: Print the search results to inspect their structure
    print("Search Results:", search_results)
    
    # get all the thread urls from the search results
    thread_urls = [result.metadata['thread_url'] for result in search_results]
    for url in thread_urls:
        print(url)
    print() 

    async with aiohttp.ClientSession() as session:
        # request the responses from the thread(s)
        tasks = [fetch_responses(session, url) for url in thread_urls]
        all_responses = await asyncio.gather(*tasks)
        all_responses_flat = [response for responses in all_responses for response in responses]
        
        # join all the responses
        combined_responses = "\n\n".join(all_responses_flat)
        print(combined_responses)
        return combined_responses

# make it so that you can run it in a notebook
nest_asyncio.apply()

combined_responses = asyncio.run(main())
print(combined_responses)


Search Results: [Document(page_content='the key for international students to get into top universities\n\nI have a few questions for international students who are currently studying in the United States.\n\nWhat country are you from originally?\nWhat was your perception of studying in the United States before starting and how has it changed now as a current student?\nHow are you integrating your cultural identity into your academic, social, and residential lives in college?\n', metadata={'id': '8e873f46-6ffc-4146-a32a-afb284f99160', 'slug': 'the-key-for-international-students-to-get-into-top-universities', 'thread_id': 1967954, 'created_at': '2009-01-30T17:49:15+00:00', 'embeddings': [0.022219154983758926, -0.012083372101187706, 0.039925601333379745, 0.06830816715955734, 0.007416280917823315, -0.021886639297008514, 0.022183528169989586, 0.022231029346585274, 0.005804175045341253, 0.001277363859117031, -0.02923760376870632, 0.026268716901540756, -0.08716654032468796, 0.013502500951290

In [92]:
import openai

# Directly set your OpenAI API key
client = OpenAI(
    api_key="your openai key"
)

response = client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[
    # setting the rules/parameters for the prompt
    {"role": "system", "content": "You are a knowledgeable college counselor designed to use information provided to you and then give a response based on the context and information you receive. Be sure to Provide the links given in the responses as well, with the context and usage behind them. Make answering the given question a priority above anything else. Avoid being repetitive."},
    # giving the context for the prompt
    {"role": "user", "content": f"Here is the question you need to answer and prioritize: {question}. Here is the context to assist in answering the question: \n\n{combined_responses} Note that the context is not related to the person asking the question, it is someone else's experience in a similar situation to the person asking the given question."}
  ]
)

response_message = response.choices[0].message.content
print(response_message )

The international student experience at top colleges can vary significantly based on a range of factors, including the institution's policies towards international students, financial aid availability, and the overall campus culture. Here are some key aspects of the international student experience at top colleges:

1. **Application Process**: As highlighted in the discussion, international students often face tougher admission standards. Top universities tend to have lower acceptance rates for international candidates compared to domestic students. Factors such as academic excellence (high GPA and test scores), unique extracurricular achievements, and demonstrated for financial support play critical roles in the application process.

2. **Financial Aid and Affordability**: A significant challenge for international students is securing financial aid. Many top universities in the U.S. do not offer substantial financial aid to international students. As discussed, schools like UCLA and U