In [1]:
import requests
import json
import time
import re
from bs4 import BeautifulSoup
import os
from google import genai
from google.genai import types
from google.cloud import aiplatform 
from dotenv import load_dotenv
from google.cloud import firestore
from google.cloud.aiplatform_v1.types import IndexDatapoint

# Get and clean SEC filing data

In [None]:
# Request headers sent with every SEC HTTP call made in this notebook.
# NOTE(review): the User-Agent value looks like a placeholder — replace it with a
# real name/contact before running; confirm against SEC's fair-access guidance.
HEADERS = {'User-Agent': 'YourName YourCompany your.email@example.com'}
# Ticker symbol that get_cik_from_ticker looks up and that prefixes chunk IDs.
TICKER = "SBET"
# Form types kept by parse_sec_data; filings of any other type are ignored.
TARGET_FORMS = ['10-K', '10-Q']

In [None]:
def get_cik_from_ticker():
    """
    Look up the SEC Central Index Key (CIK) for the module-level TICKER.

    Returns:
        The CIK as a 10-digit, zero-padded string, or None when SEC's ticker
        list could not be downloaded (the error is printed).

    Raises:
        ValueError: if TICKER is not present in the downloaded ticker list.
    """

    url = "https://www.sec.gov/files/company_tickers.json"
    try:
        response = requests.get(url, headers = HEADERS)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None

    wanted = TICKER.upper()
    for entry in data.values():
        if entry['ticker'] == wanted:
            # Pad to a fixed width of 10: the previous "current length + 3"
            # padding was only correct for CIKs that happen to have 7 digits.
            return str(entry['cik_str']).zfill(10)

    # Previously an unknown ticker surfaced as an UnboundLocalError on `cik`.
    raise ValueError(f"Ticker '{TICKER}' was not found in the SEC ticker list.")

def get_sec_data(cik):
    """
    Download the SEC submissions JSON for one company.

    Args:
        cik: CIK string inserted verbatim into the submissions URL
            (the endpoint expects the 10-digit, zero-padded form).

    Returns:
        The decoded JSON payload, or None when the request failed
        (the error is printed).
    """

    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    try:
        response = requests.get(url, headers = HEADERS)
        response.raise_for_status()
        # The payload used to be assigned to a local and dropped, so callers
        # always received None.
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None

def parse_sec_data(data):
    """
    Pick the filings whose form type is in TARGET_FORMS out of a submissions payload.

    Args:
        data: dict holding parallel, index-aligned lists under data["filings"]["recent"].

    Returns:
        A list of dicts with the keys form_type, accession_number,
        primary_document, filing_date and report_date.

    Raises:
        Exception: if the payload has no (or an empty) "recent" filings section.
    """

    recent = data.get("filings", {}).get("recent", {})
    if not recent:
        raise Exception("No filing data found.")

    # (output key, payload column) pairs, in the order the output dict is built.
    columns = (
        ("accession_number", "accessionNumber"),
        ("primary_document", "primaryDocument"),
        ("filing_date", "filingDate"),
        ("report_date", "reportDate"),
    )

    selected = []
    for idx, form in enumerate(recent.get("form", [])):
        if form not in TARGET_FORMS:
            continue

        record = {"form_type": form}
        try:
            for key, column in columns:
                record[key] = recent.get(column, [])[idx]
        except IndexError:
            # A column shorter than the "form" column means this row is incomplete.
            print(f"Warning: Data inconsistency at index {idx}. Skipping this filing.")
            continue

        selected.append(record)

    return selected

def construct_sec_url(cik, filing_details):
    """
    Build the EDGAR archive URL of a filing's primary document.

    Args:
        cik: CIK string; leading zeros are removed for the archive path.
        filing_details: dict with "accession_number" and "primary_document".

    Returns:
        The document URL as a string.

    Raises:
        Exception: if the accession number or the primary document is missing or empty.
    """

    numeric_cik = cik.lstrip("0")
    # The archive path uses the accession number without its dashes.
    accession = filing_details.get("accession_number", "").replace("-", "")
    document = filing_details.get("primary_document", "")

    if not (accession and document):
        raise Exception("Info is missing.")

    base = "https://www.sec.gov/Archives/edgar/data"
    return f"{base}/{numeric_cik}/{accession}/{document}"

def get_html_text(url):
    """
    Download a document and return its body as text.

    Args:
        url: address of the document to fetch.

    Returns:
        The response text, or an empty string when the request failed
        (the error is printed).
    """

    try:
        res = requests.get(url, headers = HEADERS)
        res.raise_for_status()
        return res.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        # The old code fell through to `return html_text` with the name never
        # bound, turning every network error into an UnboundLocalError.
        return ""

def html_table_to_markdown(table_tag):
    """
    Converts a BeautifulSoup table tag into a Markdown formatted string.
    This helps preserve the structure of financial data for the LLM.

    Args:
        table_tag: the parsed <table> element to convert.

    Returns:
        The Markdown table wrapped in blank lines. When the table has <th>
        cells they form the header line, and only rows with the same number
        of cells as the header are kept.
    """

    def cell_text(cell):
        # Flatten a cell to a single line of stripped text.
        return cell.get_text(strip = True).replace("\n", "")

    markdown_lines = []
    # process table headers
    headers = [cell_text(th) for th in table_tag.find_all("th")]

    if headers:
        markdown_lines.append("| " + " | ".join(headers) + " |")
        markdown_lines.append("| " + " | ".join(["---"] * len(headers)) + " |")

    # process table rows
    for row in table_tag.find_all("tr"):
        cells = [cell_text(cell) for cell in row.find_all(["td", "th"])]
        if not cells:
            continue

        if headers:
            # A row made only of <th> cells is the header row itself; it is
            # already on the first line and used to be repeated as a data row.
            if not row.find_all("td"):
                continue
            # only keep rows whose cell count matches the header
            if len(cells) != len(headers):
                continue

        markdown_lines.append("| " + " | ".join(cells) + " |")

    return "\n\n" + "\n".join(markdown_lines) + "\n\n"

def clean_html(html_text):
    """
    Turn raw filing HTML into plain text with tables rendered as Markdown.

    Args:
        html_text: HTML source; a falsy value yields an empty string.

    Returns:
        The extracted text with runs of blank lines collapsed to one.
    """

    if not html_text:
        return ""

    document = BeautifulSoup(html_text, "lxml")

    # Drop elements that carry no filing content.
    non_content_tags = ['script', 'style', 'header', 'footer', 'nav']
    for element in document.find_all(non_content_tags):
        element.decompose()

    # Swap each table for a text node holding its Markdown rendering.
    for table_tag in document.find_all("table"):
        rendered = html_table_to_markdown(table_tag)
        table_tag.replace_with(document.new_string(rendered))

    flattened = document.get_text(separator = "\n", strip = True)

    # Collapse runs of blank lines so the text stays readable.
    return re.sub(r'\n\n+', '\n\n', flattened)

def chunk_filing_by_section(cleaned_text, metadata):
    """
    Split filing text into chunks at every "Item <number>" style heading.

    Args:
        cleaned_text: plain text of the filing.
        metadata: dict copied into every chunk's metadata, plus a "section" key.

    Returns:
        A list of {"content", "metadata"} dicts. Text before the first heading
        becomes the "Introduction" chunk. A chunk is kept only when its body
        has more than 20 whitespace-separated words.
    """

    min_words = 20

    def build(text, section):
        return {"content": text, "metadata": {**metadata, "section": section}}

    # The capture group makes re.split keep the headings:
    # [intro, heading, body, heading, body, ...]
    pieces = re.split(r'(?i)(item\s*\d+[a-z]?\.?)', cleaned_text)

    sections = []

    preamble = pieces[0].strip()
    if len(preamble.split()) > min_words:
        sections.append(build(preamble, "Introduction"))

    headings = pieces[1::2]
    bodies = pieces[2::2]
    for position, raw_heading in enumerate(headings):
        heading = raw_heading.strip()
        body = bodies[position].strip() if position < len(bodies) else ""

        if len(body.split()) <= min_words:
            continue

        sections.append(build(f"{heading}\n\n{body}", heading))

    return sections

In [None]:
# Resolve the company, download its submissions index and keep the TARGET_FORMS entries.
cik = get_cik_from_ticker()
data = get_sec_data(cik)
extracted_data = parse_sec_data(data)

# Trial run: the one-element slice limits processing to the first matching filing.
for filing in extracted_data[:1]:
    url = construct_sec_url(cik, filing)
    # Stored before chunking so the URL is copied into every chunk's metadata.
    filing["url"] = url
    html_text = get_html_text(url)
    cleaned_text = clean_html(html_text)
    chunks = chunk_filing_by_section(cleaned_text, filing)



# Google Cloud and Gemini configurations

In [None]:
# Load variables from a .env file into the process environment.
load_dotenv()

# Settings read with os.environ[...] are mandatory: a missing variable raises KeyError here.
PROJECT_ID = os.environ["PROJECT_ID"]
REGION = "europe-west2"
GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]
GEMINI_EMBEDDING_MODEL = "models/embedding-001"
GEMINI_EMBEDDING_MODEL_DIMENSION = 768 # Gemini embedding-001 model has 768 dimensions
# Firestore collection that holds the chunk documents.
COLLECTION_NAME = os.environ["COLLECTION_NAME"]
# Display names used when the vector index and its endpoint are created and looked up.
INDEX_NAME = os.environ["INDEX_NAME"]
INDEX_ENDPOINT_NAME = os.environ["INDEX_ENDPOINT_NAME"]

# Shared Gemini client (embeddings and generation) and Vertex AI SDK defaults.
client = genai.Client(api_key = GEMINI_API_KEY)
aiplatform.init(project = PROJECT_ID, location = REGION)

### Retrieval part of Retrieval-Augmented Generation (RAG)

In [None]:
def store_chunk_in_firestore(chunk_id, chunk_data):
    """
    Write one chunk to the COLLECTION_NAME collection unless it is already there.

    Args:
        chunk_id: document ID to store the chunk under.
        chunk_data: dict written as the document body.

    Returns:
        None. Failures are printed rather than raised.
    """

    try:
        db = firestore.Client(PROJECT_ID)
        doc_ref = db.collection(COLLECTION_NAME).document(chunk_id)
        if doc_ref.get().exists:
            print(f"Document with ID: {chunk_id} already exists. Skipping write.")
            return
        doc_ref.set(chunk_data)
        # The message used to reference an undefined `collection_name`; the
        # resulting NameError was caught below and reported a successful
        # write as a failure.
        print(f"Successfully stored document with ID: {chunk_id} in collection '{COLLECTION_NAME}'")

    except Exception as e:
        print(f"Error storing document {chunk_id}: {e}")

def get_chunk_in_firestore(chunk_id):
    """
    Read one chunk document from the COLLECTION_NAME collection.

    Args:
        chunk_id: ID of the document to read.

    Returns:
        The document as a dict, or None when it does not exist or the read
        failed (both cases are printed).
    """

    try:
        collection = firestore.Client(PROJECT_ID).collection(COLLECTION_NAME)
        snapshot = collection.document(chunk_id).get()

        if not snapshot.exists:
            print(f"No document found with ID: {chunk_id}")
        else:
            print(f"Successfully retrieved document: {chunk_id}")

            return snapshot.to_dict()
    except Exception as err:
        print(f"Error retrieving document {chunk_id}: {err}")

def create_and_deploy_vector_index(chunks_for_index):
    """
    Creates a Vertex AI Vector Search Index, an Index Endpoint, and deploys the index.
    This is a one-time setup process.

    Args:
        chunks_for_index: not used by this function; kept so existing calls keep working.

    Returns:
        The index ID, i.e. the last path segment of the index resource name.

    Raises:
        Exception: any SDK error other than the "already exists" /
            "has been deployed" cases, re-raised unchanged.
    """

    # Create the index
    try:
        vector_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
            display_name = INDEX_NAME,
            dimensions = GEMINI_EMBEDDING_MODEL_DIMENSION,
            approximate_neighbors_count = 150,
            distance_measure_type = "DOT_PRODUCT_DISTANCE",
            index_update_method = "STREAM_UPDATE"
        )
    except Exception as e:
        if "already exists" not in str(e):
            raise
        print(f"Index '{INDEX_NAME}' already exists. Reusing it.")
        # Fetch the existing index; without this, `vector_index` and
        # `index_id` were unbound on the reuse path and the function crashed.
        vector_index = aiplatform.MatchingEngineIndex.list(filter = f'display_name="{INDEX_NAME}"')[0]

    index_id = vector_index.resource_name.split("/")[-1]

    # Create an Index Endpoint
    try:
        index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
            display_name = INDEX_ENDPOINT_NAME, public_endpoint_enabled = True
        )
    except Exception as e:
        if "already exists" not in str(e):
            raise
        print(f"Endpoint '{INDEX_ENDPOINT_NAME}' already exists. Reusing it.")
        index_endpoint = aiplatform.MatchingEngineIndexEndpoint.list(filter = f'display_name="{INDEX_ENDPOINT_NAME}"')[0]

    # Deploy the Index to the Endpoint
    try:
        # A unique ID for this deployment
        deployed_index_id = f"gemini_deployed_{int(time.time())}"
        index_endpoint.deploy_index(
            index = vector_index, deployed_index_id = deployed_index_id
        )
    except Exception as e:
        if "has been deployed" not in str(e):
            raise
        print("Index is already deployed to this endpoint.")

    return index_id

def generate_embeddings_and_prepare_datapoints(chunks):
    """
    Embed every chunk with Gemini and wrap each vector in an IndexDatapoint.

    Args:
        chunks: list of chunk dicts with "content" and "metadata"
            (the metadata supplies "form_type" and "accession_number").

    Returns:
        A list of IndexDatapoint objects. A chunk that fails is reported on
        stdout and left out.
    """

    prepared = []
    for position, item in enumerate(chunks):
        try:
            embeddings = client.models.embed_content(
                model = GEMINI_EMBEDDING_MODEL,
                contents = item["content"],
                config = types.EmbedContentConfig(task_type = "RETRIEVAL_DOCUMENT"),
            ).embeddings
            vector = embeddings[0].values

            # Only the ID and the vector go into the index.
            meta = item["metadata"]
            point_id = f"{TICKER}-{meta['form_type']}-{meta['accession_number']}-{position}"
            prepared.append(
                IndexDatapoint(datapoint_id = point_id, feature_vector = vector)
            )
        except Exception as err:
            print(f"Error generating embedding for chunk {position}: {err}")

    return prepared

def upload_datapoints_to_vertex_ai(index_resource_name, datapoints):
    """
    Upsert datapoints into an existing Vertex AI Vector Search index.

    Args:
        index_resource_name: resource name used to open the index.
        datapoints: datapoints handed to the index's upsert call.

    Raises:
        Exception: wraps any error raised while opening the index or upserting.
    """

    try:
        target_index = aiplatform.MatchingEngineIndex(index_name = index_resource_name)
        target_index.upsert_datapoints(datapoints = datapoints)
    except Exception as err:
        raise Exception(err)

def get_index_id():
    """
    Return the ID of the first index whose display name equals INDEX_NAME.

    Returns:
        The last path segment of the matching index's resource name.

    Raises:
        Exception: if the lookup fails or no index matches.
    """

    try:
        matches = aiplatform.MatchingEngineIndex.list(
            filter = f'display_name="{INDEX_NAME}"'
        )
        first_resource = matches[0].resource_name

        return first_resource.rsplit("/", 1)[-1]

    except Exception as err:
        raise Exception(f"An error occurred: {err}")

In [None]:
# Store every chunk (content + metadata) in Firestore.
# The document ID uses the same "<ticker>-<form>-<accession>-<position>" format
# as the datapoint IDs built in generate_embeddings_and_prepare_datapoints, so
# an ID returned by the vector search can be used to read the chunk back.
for i, chunk in enumerate(chunks):
    # Named chunk_id instead of `id` so the builtin id() is not shadowed in
    # the notebook's global namespace.
    chunk_id = f"{TICKER}-{chunk['metadata']['form_type']}-{chunk['metadata']['accession_number']}-{i}"
    store_chunk_in_firestore(chunk_id, chunk)

# get metadata of the chunks
# doc = get_chunk_in_firestore("chunk_id")

In [None]:
# one-time setup (creates the index and the endpoint, then deploys the index)
# index_id = create_and_deploy_vector_index(chunks)

# Look up the existing index by display name, then embed every chunk.
index_id = get_index_id()
datapoints = generate_embeddings_and_prepare_datapoints(chunks)

# Nothing is uploaded when no embedding could be generated.
if datapoints:
    index_resource_name = f"projects/{PROJECT_ID}/locations/{REGION}/indexes/{index_id}" 
    upload_datapoints_to_vertex_ai(index_resource_name, datapoints)

### Augmented Generation part of RAG

In [29]:
# Gemini model passed to generate_content when the final answer is produced.
GEMINI_GENERATIVE_MODEL = "gemini-2.5-flash"

In [30]:
def find_relevant_chunks(query, index_id, num_results = 5):
    """
    Takes a user's query, generates an embedding, and finds the most similar chunks in the Vertex AI Vector Search index.

    Args:
        query: the question text to embed.
        index_id: ID of the index to search. It is mapped to the matching
            deployed index on the endpoint; if none matches, it is passed
            through as the deployed index ID.
        num_results: number of neighbours to request.

    Returns:
        A list of matched datapoint IDs, or None when embedding or querying
        failed (the error is printed).
    """

    try:
        response = client.models.embed_content(
            model = GEMINI_EMBEDDING_MODEL,
            contents = query,
            config = types.EmbedContentConfig(task_type = "RETRIEVAL_QUERY"),
        ).embeddings
        embedding_vector = response[0].values
    except Exception as e:
        print(f"Can't generate embeddings for the query: {e}")
        return None

    try:
        # INDEX_ENDPOINT_NAME is the display name the endpoint is created
        # with, so resolve it by display name first. Fall back to treating it
        # as a resource name / ID for setups that store that instead.
        endpoints = aiplatform.MatchingEngineIndexEndpoint.list(
            filter = f'display_name="{INDEX_ENDPOINT_NAME}"'
        )
        if endpoints:
            index_endpoint = endpoints[0]
        else:
            index_endpoint = aiplatform.MatchingEngineIndexEndpoint(index_endpoint_name = INDEX_ENDPOINT_NAME)

        # find_neighbors needs the ID of the *deployment*, which differs from
        # the index ID; look up the deployment that serves this index.
        deployed_index_id = index_id
        for deployed in index_endpoint.deployed_indexes:
            if deployed.index.split("/")[-1] == index_id:
                deployed_index_id = deployed.id
                break

        response = index_endpoint.find_neighbors(
            deployed_index_id = deployed_index_id,
            # One inner list per query vector; a bare vector is not a valid batch.
            queries = [embedding_vector],
            num_neighbors = num_results
        )
        matched_ids = [neighbor.id for neighbor in response[0]] if response else []
        print(f"Found {len(matched_ids)} matching document IDs.")

        return matched_ids
    except Exception as e:
        print(f"Error querying Vertex AI: {e}")
        return None

def retrieve_chunks_from_firestore(chunk_ids):
    """
    Retrieves the full text and metadata for a list of chunk IDs from Firestore.

    Args:
        chunk_ids: iterable of document IDs to read.

    Returns:
        A list of chunk dicts in the order of chunk_ids. IDs whose document
        is missing or could not be read are left out.
    """

    retrieved_chunks = []
    for chunk_id in chunk_ids:
        doc = get_chunk_in_firestore(chunk_id)
        # get_chunk_in_firestore yields None for a missing document or a
        # failed read; keeping those entries made later chunk['content']
        # lookups fail.
        if doc is not None:
            retrieved_chunks.append(doc)

    return retrieved_chunks

def augemnt_and_generate_answer(query, context_chunks):
    """
    Build a context-grounded prompt from the retrieved chunks and ask Gemini to answer.

    Args:
        query: the user's question.
        context_chunks: chunk dicts whose "content" values form the context.

    Returns:
        The model's answer text, or None when generation failed (the error is printed).
    """

    separator = "\n---\n"
    context_str = separator.join(piece['content'] for piece in context_chunks)

    prompt = f"""
    You are a helpful financial analyst assistant. Answer the following question based ONLY on the context provided below.
    If the context does not contain the answer, say "I cannot answer this question based on the provided context."

    QUESTION:
    {query}

    CONTEXT:
    {context_str}

    ANSWER:
    """

    try:
        return client.models.generate_content(
            model = GEMINI_GENERATIVE_MODEL,
            contents = prompt
        ).text
    except Exception as err:
        print(f"Error generating answer with Gemini: {err}")
        return

def ask_question(query):
    """
    The main function that orchestrates the entire RAG pipeline.

    Finds the chunk IDs closest to the query (using the module-level
    index_id), loads those chunks from Firestore, generates an answer and
    prints the answer followed by its sources.

    Args:
        query: the user's question.

    Returns:
        None. All output is printed.
    """

    # Was assigned to the misspelt `relavant_ids` but read as `relevant_ids`,
    # which raised NameError on every call that found matches.
    relevant_ids = find_relevant_chunks(query, index_id)
    if not relevant_ids:
        print("Could not find any relevant documents.")
        return

    # Drop IDs that could not be loaded so the steps below only see real chunks.
    context_data = [
        chunk for chunk in retrieve_chunks_from_firestore(relevant_ids) if chunk
    ]
    if not context_data:
        print("Could not retrieve document content from Firestore.")
        return

    # Was assigned to `final_ans` but printed as `final_answer` (NameError).
    final_answer = augemnt_and_generate_answer(query, context_data)
    print("\n--- FINAL ANSWER ---")
    print(final_answer)
    print("\n--- SOURCES ---")
    for chunk in context_data:
        print(f"- {chunk['metadata']['form_type']}, Report Date: {chunk['metadata']['report_date']}, Section: {chunk['metadata']['section']}, URL: {chunk['metadata']['url']}")

In [None]:
# Question to run through the RAG pipeline; left empty here, fill it in before running.
user_question = ""
    
# Prints the generated answer and the sources it was based on.
ask_question(user_question)