In [7]:
import requests
import time
import re
import os
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from google import genai
from google.genai import types
from google.cloud import aiplatform 
from google.cloud import firestore
from google.cloud.aiplatform_v1.types import IndexDatapoint
from datetime import datetime
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices raised by the
# libraries imported above — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [11]:
# Execute the companion notebooks so that the names they define are available in this notebook.
# NOTE(review): the recorded execution counts show this cell was run after the
# configuration cell that follows it in the document; presumably the helpers read
# constants such as TICKER/HEADERS only at call time — confirm the notebook
# survives Restart Kernel -> Run All in document order.
%run ./data_scraping.ipynb
%run ./data_retrieval.ipynb
%run ./data_augmentation.ipynb

# Google Cloud and Gemini configurations

In [3]:
# Load variables from a local .env file into the environment before they are read below.
load_dotenv()

# NOTE(review): placeholder identity — presumably sent as the User-Agent header by the
# scraping helpers; replace with real contact details before running. TODO confirm usage.
HEADERS = {'User-Agent': 'YourName YourCompany your.email@example.com'}
# Ticker of the company under analysis; also used as the prefix of the output report filenames.
TICKER = "SBET"
# Filing form types of interest (presumably consumed by the scraping helpers — verify).
TARGET_FORMS = ['10-K', '10-Q']

# Required settings: each os.environ[...] lookup raises KeyError when the variable is missing.
PROJECT_ID = os.environ["PROJECT_ID"]
REGION = "europe-west2"
GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]
GEMINI_EMBEDDING_MODEL = "models/embedding-001"
GEMINI_EMBEDDING_MODEL_DIMENSION = 768 # Gemini embedding-001 model has 768 dimensions
GEMINI_GENERATIVE_MODEL = "gemini-2.5-flash"
COLLECTION_NAME = os.environ["COLLECTION_NAME"]
INDEX_NAME = os.environ["INDEX_NAME"]
INDEX_ENDPOINT_NAME = os.environ["INDEX_ENDPOINT_NAME"]

# Gemini API client shared by the generation code in this notebook.
client = genai.Client(api_key = GEMINI_API_KEY)
# Set the default project and location for subsequent Vertex AI SDK calls.
aiplatform.init(project = PROJECT_ID, location = REGION)

In [12]:
# get the SEC filing data from data_scraping.ipynb
# Steps: resolve the company's CIK and name, download its filing data, parse it,
# and split the parsed filings into chunks.
# NOTE(review): get_cik_and_name_from_ticker() takes no arguments, so it presumably
# reads TICKER from the notebook namespace — confirm in data_scraping.ipynb.

cik, company_name = get_cik_and_name_from_ticker()
data = get_sec_data(cik)
extracted_data = parse_sec_data(data)
# `chunks` is consumed later as a mapping (via .items()) whose values are sequences of chunks.
chunks = get_filing_chunks(extracted_data)

In [None]:
# Persist every chunk to Firestore under an ID of the form "<filing key>-<chunk position>".

for filing_key, filing_chunks in chunks.items():
    for position, chunk_payload in enumerate(filing_chunks):
        store_chunk_in_firestore(f"{filing_key}-{position}", chunk_payload)

In [None]:
# one-time setup
# NOTE(review): the ID returned by this call is overwritten by get_index_id() two
# lines below, and the call runs on every execution of this cell despite the
# "one-time" label — presumably it should be skipped once the index exists.
# Confirm against create_and_deploy_vector_index before re-running.
index_id = create_and_deploy_vector_index()

index_id = get_index_id()
# Embed the chunks and wrap them as datapoints (helper defined outside this notebook view).
datapoints = generate_embeddings_and_prepare_datapoints(chunks)
# Fully qualified resource name of the index, built from project, region and index ID.
index_resource_name = f"projects/{PROJECT_ID}/locations/{REGION}/indexes/{index_id}"

# Skip the upload when no datapoints were produced.
if datapoints: 
    upload_datapoints_to_vertex_ai(index_resource_name, datapoints)

Upserting datapoints MatchingEngineIndex index: projects/633076006955/locations/europe-west2/indexes/3905025497209241600
MatchingEngineIndex index Upserted datapoints. Resource name: projects/633076006955/locations/europe-west2/indexes/3905025497209241600


# Generation part of Retrieval-Augmented Generation (RAG)

In [None]:
def augemnt_and_generate_answer(query, system_prompt, context_chunks, temperature):
    """
    Builds a single prompt from the retrieved context and the user's question and sends it to Gemini.

    Args:
        query: The question text, appended after the context.
        system_prompt: Passed to the model as its system instruction.
        context_chunks: Iterable of mappings; each must provide a 'content' entry.
        temperature: Sampling temperature forwarded to the model.

    Returns:
        The text of the model response, or None (after printing the error) if the
        generation call raises.
    """

    passages = [item['content'] for item in context_chunks]
    prompt_parts = (
        "**CONTEXT:**\n",
        "\n---\n".join(passages),
        "\n\n---\n\n**QUESTION:**\n\n",
        query,
        "\n\n**ANSWER:**",
    )
    prompt = "".join(prompt_parts)

    try:
        result = client.models.generate_content(
            model = GEMINI_GENERATIVE_MODEL,
            contents = prompt,
            config = types.GenerateContentConfig(
                system_instruction = system_prompt,
                temperature = temperature
            )
        )
        # Reading .text stays inside the try so that failures there are reported the same way.
        return result.text
    except Exception as err:
        print(f"Error generating answer with Gemini: {err}")
        return None

def create_safe_filename(query, max_words = 5):
    """
    Helper function to create a short, safe filename fragment from a user query.

    Punctuation is removed, the remaining text is split on whitespace and
    underscores, and the first `max_words` words are joined with underscores
    in lower case.

    Args:
        query: Free-form question text.
        max_words: Maximum number of words kept in the result (default 5).

    Returns:
        The lower-cased, underscore-joined summary, or "untitled" when the query
        contains no usable characters.
    """

    # Keep only word characters, whitespace and hyphens.
    cleaned = re.sub(r'[^\w\s-]', '', query)
    # Split on whitespace AND underscores and drop empty pieces, so that leading or
    # trailing blanks and repeated underscores neither count as words nor produce
    # leading/doubled underscores in the filename.
    words = [word for word in re.split(r'[\s_]+', cleaned) if word]
    summary = '_'.join(words[:max_words]).lower()
    # Never return an empty fragment: it would yield a filename like "TICKER__date.md".
    return summary or "untitled"

def save_analysis_to_markdown(query, final_ans, context_data):
    """
    Writes the question, the generated answer and the cited sources to a markdown report in ./output.

    The file is named [TICKER]_[QUERY_SUMMARY]_[YYYY_MM_DD].md. Errors raised while
    writing the file are printed rather than propagated.

    Args:
        query: The question that was asked.
        final_ans: The generated answer text.
        context_data: Iterable of mappings; an optional 'metadata' mapping supplies
            'form_type', 'report_date', 'section' and 'url' ('N/A' when absent).
    """

    today = datetime.now().strftime("%Y-%m-%d")
    date_tag = today.replace('-', '_')
    filename = f"{TICKER}_{create_safe_filename(query)}_{date_tag}.md"

    output_dir = "output"
    os.makedirs(output_dir, exist_ok = True)
    filepath = os.path.join(output_dir, filename)

    # Report header, question and answer sections.
    lines = [
        f"# Financial Analysis for {TICKER}",
        f"**Analysis Date:** {today}",
        "\n## Question:",
        f"{query}",
        "\n---",
        "\n## Generated Analysis:",
        f"{final_ans}",
        "\n---",
        "\n## Sources",
    ]

    # One bullet group per source chunk.
    for source in context_data:
        meta = source.get('metadata', {})
        lines.extend([
            f"- **Form:** {meta.get('form_type', 'N/A')}",
            f"  - **Report Date:** {meta.get('report_date', 'N/A')}",
            f"  - **Section:** {meta.get('section', 'N/A')}",
            f"  - **URL:** {meta.get('url', 'N/A')}",
        ])

    try:
        with open(filepath, 'w', encoding = 'utf-8') as report:
            report.write("\n".join(lines))
        print(f"\nSuccessfully saved analysis to: {filepath}")
    except Exception as err:
        print(f"Error saving file: {err}")

def ask_question(query, question_no, temperature = 0.2):
    """
    Runs one question through the whole RAG flow: vector search, Firestore lookup,
    answer generation and markdown export.

    Always returns None; progress and failures are reported through printed messages
    and the saved report. `question_no` is accepted but not used by the body, and the
    system prompt is read from the module-level `system_prompt` variable.
    """

    matched_ids = find_relevant_chunks(query)
    if not matched_ids:
        print("Could not find any relevant documents.")
        return None

    retrieved_chunks = retrieve_chunks_from_firestore(matched_ids)
    if not retrieved_chunks:
        print("Could not retrieve document content from Firestore.")
        return None

    answer = augemnt_and_generate_answer(query, system_prompt, retrieved_chunks, temperature)
    if not answer:
        # Nothing to save when generation failed or produced no text.
        return None

    save_analysis_to_markdown(query, answer, retrieved_chunks)

# Prompt management

In [17]:
# PromptLayer credentials and the identifier of the prompt template to fetch.
# Both are required: a missing variable raises KeyError here.
PROMPTLAYER_API_KEY = os.environ["PROMPTLAYER_API_KEY"]
PROMPT_TEMPLATE_IDENTIFIER = os.environ["PROMPT_TEMPLATE_IDENTIFIER"]

In [18]:
def get_prompt():
    """
    Fetches the prompt template from PromptLayer and splits it into system and user prompts.

    Returns:
        tuple: (system_prompt, user_prompts) — the system prompt text and the list of
        user prompt texts in template order. If several system messages exist, the
        last one wins.

    Raises:
        requests.HTTPError: if PromptLayer answers with an error status.
        requests.Timeout: if PromptLayer does not answer within the timeout.
        ValueError: if no system prompt or no user prompt is found in the template.
    """

    def _first_text(message):
        """Returns the text of the message's first content part, or "" when it has none."""
        parts = message.get("content")
        if isinstance(parts, list) and parts and isinstance(parts[0], dict):
            return parts[0].get("text") or ""
        return ""

    url = f"https://api.promptlayer.com/prompt-templates/{PROMPT_TEMPLATE_IDENTIFIER}"
    headers = {
        "X-API-KEY": PROMPTLAYER_API_KEY,
        "Content-Type": "application/json"
    }

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.post(url, headers = headers, timeout = 30)
    response.raise_for_status()
    data = response.json()

    # Tolerate a missing or null template/messages entry; the check below reports it.
    messages = (data.get("prompt_template") or {}).get("messages") or []

    system_prompt = ""
    user_prompts = []
    for m in messages:
        role = m.get("role")
        text = _first_text(m)
        if role == "system":
            system_prompt = text
        elif role == "user" and text:
            # Skip user messages without text so no empty question is sent downstream.
            user_prompts.append(text)

    if not system_prompt or not user_prompts:
        raise ValueError("System or User prompts not found in the PromptLayer response.")

    return system_prompt, user_prompts

In [None]:
# Fetch the prompts once; ask_question reads `system_prompt` from the notebook namespace.
system_prompt, user_prompts = get_prompt()

# Run the full RAG pipeline for every user prompt, numbering the questions from 1.

for question_no, prompt_template in enumerate(user_prompts, start = 1):
    ask_question(prompt_template.format(company_name = company_name), question_no)

Found 5 matching document IDs.
Successfully retrieved document: SBET-10-Q-0001641172-25-010881-5
Successfully retrieved document: SBET-10-Q-0001641172-25-010881-8
Successfully retrieved document: SBET-10-Q-0001641172-25-010881-4
Successfully retrieved document: SBET-10-Q-0001641172-25-024107-4
Successfully retrieved document: SBET-10-Q-0001641172-25-024107-8

Successfully saved analysis to: output/SBET_according_to_the_managements_discussion_2025_08_21.md
Found 5 matching document IDs.
Successfully retrieved document: SBET-10-Q-0001641172-25-010881-8
Successfully retrieved document: SBET-10-Q-0001641172-25-024107-8
Successfully retrieved document: SBET-10-Q-0001641172-25-010881-5
Successfully retrieved document: SBET-10-Q-0001641172-25-024107-16
Successfully retrieved document: SBET-10-Q-0001641172-25-010881-4

Successfully saved analysis to: output/SBET_summarize_sharplink_gaming_incs_strategy_2025_08_21.md
Found 5 matching document IDs.
Successfully retrieved document: SBET-10-Q-0001