In [1]:
import os

from langchain_core.prompts import ChatPromptTemplate
from langchain.schema.document import Document

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_community.document_loaders import SeleniumURLLoader
from langchain_community.vectorstores import Qdrant
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    GoogleGenerativeAIEmbeddings,
    HarmBlockThreshold,
    HarmCategory,
)
from dotenv import load_dotenv
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# portfolio_constants is the constants file which contains 
# these 2 variables. The constants file can be found below.
from portfolio_constants import VECTOR_DB_COLLECTION, portfolio

# Connection settings and credentials come from the environment, which
# load_dotenv() in the first cell populates from a .env file.
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
# GOOGLE_API_KEY is not referenced again in the visible cells.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Splitter settings shared by both chunking passes (web page and LLM-filtered text).
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Chat model used to filter the scraped page down to earnings-report text.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    safety_settings={
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    },
)
# Embedding model handed to the Qdrant vector store when documents are inserted.
Embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Earnings-report article per ticker, plus the quarter/year tag stored with its chunks.
# NOTE(review): the MSFT and WMT URLs mention "q2 ... 2024" and "q4-2024" while both
# entries are tagged Q4 2023 -- confirm the tags are intentionally calendar-normalized.
cnbc_quarter_report = {
    "GOOGL": {
        "report_link": "https://www.cnbc.com/2024/01/30/alphabet-googl-q4-earnings-report-2023.html",
        "quarter": "Q4",
        "year": "2023",
    },
    "AAPL": {
        "report_link": "https://www.cnbc.com/2023/11/02/apple-aapl-earnings-report-q4-2023.html",
        "quarter": "Q4",
        "year": "2023",
    },
    "TSLA": {
        "report_link": "https://www.cnbc.com/2024/01/24/tesla-tsla-earnings-q4-2023.html",
        "quarter": "Q4",
        "year": "2023",
    },
    "MSFT": {
        "report_link": "https://www.cnbc.com/2024/01/30/microsoft-msft-q2-earnings-report-2024.html",
        "quarter": "Q4",
        "year": "2023",
    },
    "WMT": {
        "report_link": "https://www.cnbc.com/2024/02/20/walmart-wmt-q4-2024-earnings-.html",
        "quarter": "Q4",
        "year": "2023",
    },
}

In [3]:
# To isolate the earning report, every chunk of the scraped webpage is passed through the LLM, which keeps
# relevant information like financial data, key metrics, and analyst commentary while discarding distractions
# such as navigation menus, page reference links, banner ads, social media widgets, contact information,
# and legal disclaimers.
def extract_content(url):
    """Return the earnings-report text the LLM extracts from the page at `url`.

    The page is scraped and chunked by `chunk_web_data`; each chunk is sent to
    `llm` with an extraction prompt. Responses containing the `**NA**` marker
    (the prompt's signal for "not part of the earning report") are dropped.

    Args:
        url: Address of the earnings-report web page.

    Returns:
        A string with the kept responses joined by newlines; an empty string
        when no chunk produced usable content.
    """
    template = """You are an experienced equity research analyst and you do a fantastic job of extracting company's earning information from the `company's earning report`. 

        You are instructed that, if the given text doesnot belong to `company's earning report` then ignore the text and return only the text `**NA**`.

        You are instructed to extract the exact lines from the `company's earning report` as it is. Don't update or modify the extracted lines.

        Below is the `company's earning report`:
        {earning_report}
    """

    # The prompt and chain do not depend on the chunk, so build them once.
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | llm

    extracted_parts = []
    for doc in chunk_web_data(url):
        # Send only the chunk's text. Formatting the Document object itself would
        # place its string form (metadata included) into the prompt.
        data = chain.invoke({"earning_report": doc.page_content}).content
        if "**NA**" in data:
            continue
        extracted_parts.append(data)

    # Separate the per-chunk responses so the end of one does not fuse with the
    # start of the next.
    return "\n".join(extracted_parts)

In [4]:
# The scraped webpage is split into small overlapping documents so that each piece can be handed to the LLM,
# which strips the noise surrounding the financial data.
def chunk_web_data(url):
    """Scrape `url` and return its content split into chunk documents.

    Chunk size and overlap are taken from CHUNK_SIZE and CHUNK_OVERLAP.
    """
    page_documents = scrape_content(url)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    return splitter.split_documents(page_documents)

In [5]:
# Selenium (through SeleniumURLLoader) loads the content of the webpage at the given URL.
def scrape_content(url):
    """Load the page at `url` and return whatever documents the loader produces."""
    return SeleniumURLLoader(urls=[url]).load()


In [6]:
# The LLM-filtered text is broken down into smaller documents before it is stored in the Qdrant vector store.
# The company ticker, quarter, and year of the earning report are combined into a single `source` metadata
# value (for example `TSLA-Q4-2023`), which helps in fetching the relevant information later.
def chunk_text_data(text, ticker, quarter, year):
    """Split `text` into chunk documents tagged with a `<ticker>-<quarter>-<year>` source.

    Args:
        text: The filtered earnings-report text.
        ticker, quarter, year: Strings joined with "-" to form the `source` metadata value.

    Returns:
        The documents produced by splitting a single Document wrapping `text`.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    source_tag = "-".join((ticker, quarter, year))
    report_document = Document(
        page_content=text,
        metadata={"source": source_tag},
    )
    return splitter.split_documents([report_document])

In [7]:
# Inserts the given documents into the Qdrant DB collection.
def insert_data_to_vector_store(docs):
    """Embed `docs` and store them in the VECTOR_DB_COLLECTION collection on Qdrant.

    Args:
        docs: List of documents to insert. An empty (or None) batch is skipped
            with a printed notice, because there is nothing to embed or store.

    Returns:
        None.
    """
    if not docs:
        # Happens when every chunk of a page was rejected by the LLM filter.
        print("No documents to insert into Qdrant; skipping.")
        return

    Qdrant.from_documents(
        docs,
        Embeddings,
        url=QDRANT_URL,
        prefer_grpc=True,
        api_key=QDRANT_API_KEY,
        collection_name=VECTOR_DB_COLLECTION,
    )


In [8]:
# Main flow: for every portfolio entry, fetch and filter the earning report, chunk the filtered text,
# and store the chunks in the vector store.
for entry in portfolio:
    ticker = entry["ticker"]
    company_name = entry["company_name"]

    # A portfolio ticker without a configured report is skipped, so one missing
    # entry does not abort the remaining companies with a KeyError.
    report_dict = cnbc_quarter_report.get(ticker)
    if report_dict is None:
        print("No earning report configured for " + ticker + ", skipping")
        continue

    report_link = report_dict["report_link"]
    year = report_dict["year"]
    quarter = report_dict["quarter"]

    print("Extracting content for: ", company_name)
    extracted_text_content = extract_content(report_link)

    print("Chunking document for " + ticker + "-" + quarter + "-" + year)
    chunked_docs = chunk_text_data(extracted_text_content, ticker, quarter, year)

    # Nothing survived the LLM filter (or the page was empty): there is nothing to store.
    if not chunked_docs:
        print("No report content extracted for " + company_name + ", skipping insert")
        continue

    print("Inserting Report to Qdrant for " + company_name)
    insert_data_to_vector_store(chunked_docs)


Extracting content for:  Tesla
Chunking document for TSLA-Q4-2023
Inserting Report to Qdrant for Tesla
