# Env Setup

**Install dependencies on Colab**

In [1]:
!pip install "langchain>=0.2" langchain-community langchain-openai qdrant-client "pypdf>=4.0.0" "langchain-qdrant" gdown


Collecting langchain-community
  Downloading langchain_community-0.3.30-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.3.33-py3-none-any.whl.metadata (2.4 kB)
Collecting qdrant-client
  Downloading qdrant_client-1.15.1-py3-none-any.whl.metadata (11 kB)
Collecting pypdf>=4.0.0
  Downloading pypdf-6.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-qdrant
  Downloading langchain_qdrant-0.2.1-py3-none-any.whl.metadata (1.4 kB)
Collecting requests<3,>=2 (from langchain>=0.2)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting portalocker<4.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-non

In [None]:
# Qdrant releases: https://github.com/qdrant/qdrant/releases
# To start the Qdrant binary, use PowerShell on Windows or the regular Terminal on macOS.
# On macOS, paths are written with "/" instead of "\".
# cd C:\qdrant   <- replace with your own path
# .\qdrant.exe   <- run this in the terminal


**Import libs**

In [2]:
import hashlib
import uuid
import os
import zipfile
import gdown
from pathlib import Path
from typing import List, Dict, Any

# --- LangChain core
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# --- Loaders
from langchain_community.document_loaders import TextLoader, PyPDFLoader

# --- OpenAI embeddings (LangChain)
from langchain_openai import OpenAIEmbeddings

# --- Qdrant
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
from langchain_qdrant import QdrantVectorStore


**Set up keys and paths**

In [3]:
# Credentials are taken from the environment and never stored in the notebook.
# SECURITY: a real Qdrant API key used to be hardcoded in this cell. Treat that
# key as leaked and rotate it in the Qdrant console.
import getpass

# Qdrant endpoint; an already exported QDRANT_URL takes precedence over this default.
os.environ.setdefault(
    "QDRANT_URL",
    "https://23c58579-503f-4a87-93a0-2e5b386c65f0.europe-west3-0.gcp.cloud.qdrant.io:6333",
)

# Prompt (without echoing the input) only for secrets that are not set yet.
# An empty answer leaves the variable unset instead of storing an empty string.
for _secret_name in ("OPENAI_API_KEY", "QDRANT_API_KEY"):
    if not os.environ.get(_secret_name):
        _secret_value = getpass.getpass(f"{_secret_name}: ")
        if _secret_value:
            os.environ[_secret_name] = _secret_value

# Local directory the PDF files are downloaded to and read from.
DATA_DIR = Path("./data")
# Name of the Qdrant collection that stores the chunk vectors.
COLLECTION_NAME = "my_rag_collection"

# Embeddings (OpenAI)
EMBEDDING_MODEL = "text-embedding-3-large"  # -small (cheaper)
# Chat model used to generate the final answer.
CHAT_MODEL = "gpt-4o-mini"



**Download dataset**

In [6]:

# Shared Google Drive folder that holds the PDF files for the knowledge base.
URL = "https://drive.google.com/drive/folders/1wQQUmLxxqXABXzkj1D3je9HTPOMcHgrt?usp=share_link"

DATA_DIR.mkdir(parents=True, exist_ok=True)

# Downloads all files from the folder; the call's return value is shown as the cell output.
gdown.download_folder(URL, output=str(DATA_DIR), quiet=False)



Retrieving folder contents


Processing file 1EX4S0bcJRDgh7VgjLlkmzni_VZOAiBpj NYSE_CS_2016.pdf
Processing file 1zKCyWiseB5wcyDBoAmbfR6XIpxrrs6PK NYSE_CS_2017.pdf
Processing file 1pZL2w-2HX9hlryndqjZTBO_3qFHLcDYy NYSE_CS_2018.pdf
Processing file 1xlbQvR_5OQ1JxDTIg6RVC2fYpKZwVGHc NYSE_CS_2019.pdf
Processing file 1c2oHH-ch9rRIKb019gqnfldcJOuD7oxb NYSE_CS_2023.pdf


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1EX4S0bcJRDgh7VgjLlkmzni_VZOAiBpj
To: /content/data/NYSE_CS_2016.pdf
100%|██████████| 6.94M/6.94M [00:00<00:00, 28.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1zKCyWiseB5wcyDBoAmbfR6XIpxrrs6PK
To: /content/data/NYSE_CS_2017.pdf
100%|██████████| 6.61M/6.61M [00:00<00:00, 28.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1pZL2w-2HX9hlryndqjZTBO_3qFHLcDYy
To: /content/data/NYSE_CS_2018.pdf
100%|██████████| 5.91M/5.91M [00:00<00:00, 23.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1xlbQvR_5OQ1JxDTIg6RVC2fYpKZwVGHc
To: /content/data/NYSE_CS_2019.pdf
100%|██████████| 5.35M/5.35M [00:00<00:00, 32.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1c2oHH-ch9rRIKb019gqnfldcJOuD7oxb
To: /content/data/NYSE_CS_2023.pdf
100%|██████████| 3.26M/3.26M [00:00<00:00, 23.8MB/s]
Download completed


['data/NYSE_CS_2016.pdf',
 'data/NYSE_CS_2017.pdf',
 'data/NYSE_CS_2018.pdf',
 'data/NYSE_CS_2019.pdf',
 'data/NYSE_CS_2023.pdf']

In [7]:
# List the downloaded files through DATA_DIR instead of a shell command with a
# hard-coded path, so the cell follows the configured directory on any platform.
sorted(path.name for path in DATA_DIR.iterdir())

NYSE_CS_2016.pdf  NYSE_CS_2018.pdf  NYSE_CS_2023.pdf
NYSE_CS_2017.pdf  NYSE_CS_2019.pdf


# Populate vectordb (qdrant)

**Chunking params setup**

In [8]:
# Chunking params (passed to RecursiveCharacterTextSplitter in make_chunks)
CHUNK_SIZE = 800          # number of characters
CHUNK_OVERLAP = 100       # number of characters
# Separators should be tuned after analyzing the structure of the source files;
# the choice may significantly affect the quality of RAG responses.
SEPARATORS = ["\n\n", "\n", " ", ""]

# Batching writes (performance): number of chunks sent per add_documents call
WRITE_BATCH_SIZE = 256


**Utils functions**

In [9]:
def load_documents(data_dir: Path) -> List[Document]:
    """
    Loads every PDF file found under data_dir (searched recursively).

    Files are processed in sorted path order so repeated runs return the
    documents in the same order. The ".pdf" suffix is matched
    case-insensitively, and entries that are not regular files (for example a
    directory whose name ends in ".pdf") are skipped.

    Args:
        data_dir: Root directory to scan.

    Returns:
        The Documents produced by PyPDFLoader for each file. Each Document
        keeps the loader's metadata and additionally carries: source (path of
        the PDF), filetype (".pdf") and page (the loader's value, None when
        the loader did not set one). An empty list when no PDF is found.
    """
    docs: List[Document] = []

    pdf_paths = sorted(
        path
        for path in data_dir.rglob("*")
        if path.is_file() and path.suffix.lower() == ".pdf"
    )

    for pdf_path in pdf_paths:
        pdf_docs = PyPDFLoader(str(pdf_path)).load()
        for d in pdf_docs:
            # Start from the loader's metadata, then overwrite/add our own keys.
            d.metadata = {
                **d.metadata,
                "source": str(pdf_path),
                "filetype": ".pdf",
                "page": d.metadata.get("page"),
            }
        docs.extend(pdf_docs)

    return docs


def make_chunks(docs: List[Document]) -> List[Document]:
    """
    Splits the given documents into chunks.

    A RecursiveCharacterTextSplitter is configured from the module-level
    CHUNK_SIZE, CHUNK_OVERLAP and SEPARATORS settings, with
    add_start_index=True, and applied to all documents at once.
    """
    splitter_options = {
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
        "separators": SEPARATORS,
        "add_start_index": True,
    }
    text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
    chunked_docs = text_splitter.split_documents(docs)
    return chunked_docs


def deterministic_id(text: str, metadata: dict) -> str:
    """
    Builds a stable UUID v5 string for a chunk.

    The UUID name is "<source>|<page>|<sha256 of text>", where source and page
    are read from metadata (empty string when the key is absent), hashed in
    the URL namespace. The same inputs always give the same ID.

    NOTE(review): two chunks with identical text on the same page of the same
    source map to the same ID.
    """
    key_fields = [str(metadata.get(field, "")) for field in ("source", "page")]
    content_digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    key_fields.append(content_digest)

    chunk_uuid = uuid.uuid5(uuid.NAMESPACE_URL, "|".join(key_fields))
    return str(chunk_uuid)


def ensure_collection(client: QdrantClient, collection_name: str, vector_size: int) -> None:
    """
    Creates the collection if it does not exist yet; otherwise does nothing.

    For simplicity the collection holds one vector per record and uses cosine
    distance.

    Args:
        client: Qdrant client used for the existence check and the creation.
        collection_name: Name of the collection to ensure.
        vector_size: Dimensionality of the stored vectors.
    """
    if client.collection_exists(collection_name):
        return

    # create_collection rather than recreate_collection: the latter is
    # deprecated and drops an existing collection first, which is never wanted
    # in a function whose job is only to make sure the collection exists.
    client.create_collection(
        collection_name=collection_name,
        vectors_config=rest.VectorParams(size=vector_size, distance=rest.Distance.COSINE),
    )



**Main function**

In [10]:
def populate_vector_store():
    """
    Loads the PDFs from DATA_DIR, chunks them, and writes the chunks to the
    Qdrant collection COLLECTION_NAME, embedded with EMBEDDING_MODEL.

    Connection settings are read from the environment variables
    OPENAI_API_KEY, QDRANT_URL and QDRANT_API_KEY. Chunk IDs come from
    deterministic_id, so a re-run is expected to overwrite existing points
    rather than duplicate them.

    Raises:
        RuntimeError: if OPENAI_API_KEY is not set, if the vector size of
            EMBEDDING_MODEL is unknown, if DATA_DIR does not exist, or if no
            documents could be loaded from it.
    """
    openai_api_key = os.environ.get("OPENAI_API_KEY")
    if not openai_api_key:
        raise RuntimeError("Setup OPENAI_API_KEY.")

    qdrant_url = os.environ.get("QDRANT_URL")
    qdrant_api_key = os.environ.get('QDRANT_API_KEY')

    embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, api_key=openai_api_key)

    # Vector size per supported embedding model; extend this table when adding a model.
    embedding_dims = {
        "text-embedding-3-large": 3072,
        "text-embedding-3-small": 1536,
    }
    dim = embedding_dims.get(EMBEDDING_MODEL)
    if dim is None:
        raise RuntimeError("Setup embeddings shape.")

    client = QdrantClient(
        url=qdrant_url,
        api_key=qdrant_api_key,
        timeout=60,
    )

    # Load pdfs
    if not DATA_DIR.exists():
        raise RuntimeError(f"Directory {DATA_DIR} does not exist. Create {DATA_DIR} and add some files!")

    base_docs = load_documents(DATA_DIR)
    if not base_docs:
        raise RuntimeError(f"Empty directory {DATA_DIR}. Add some .pdf files.")

    # Chunking
    chunks = make_chunks(base_docs)

    # Idempotent IDs: the same chunk always gets the same point ID
    ids = [deterministic_id(doc.page_content, doc.metadata) for doc in chunks]

    # Setup Qdrant collection
    ensure_collection(client, COLLECTION_NAME, dim)
    vectorstore = QdrantVectorStore(
        client=client,
        collection_name=COLLECTION_NAME,
        embedding=embeddings,
    )

    # Save chunks to Qdrant in batches of WRITE_BATCH_SIZE
    for i in range(0, len(chunks), WRITE_BATCH_SIZE):
        batch_docs = chunks[i : i + WRITE_BATCH_SIZE]
        batch_ids = ids[i : i + WRITE_BATCH_SIZE]
        vectorstore.add_documents(batch_docs, ids=batch_ids)

    # Report what was written rather than "new vectors": points whose ID already
    # exists are overwritten, and chunks sharing an ID end up as a single point.
    print(
        f"\nFinished. Collection: {COLLECTION_NAME}, "
        f"chunks upserted: {len(chunks)} (unique IDs: {len(set(ids))})"
    )





**Populate vector storage**

In [11]:
populate_vector_store()


Finished. Collection: my_rag_collection, new vectors in collection: 13097


# Ask database (RAG)

In [12]:
import os
from typing import List, Tuple, Optional, Dict, Any, Iterable
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels
from langchain_qdrant import QdrantVectorStore
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema import Document

**Utils**

In [13]:
## Utils

def _format_docs(docs: List[Document]) -> str:
    """Combines chunks into a single context with short [i] tags corresponding to sources."""
    parts: List[str] = []
    for i, d in enumerate(docs, 1):
        src = d.metadata.get("source", "unknown")
        page = d.metadata.get("page")
        tag = f"{src}" + (f":{page}" if page is not None else "")
        parts.append(f"{d.page_content}\n[[{i}] {tag}]")
    return "\n\n---\n\n".join(parts)


def _pretty_sources(docs_with_scores: List[Tuple[Document, float]], topk: int = 4) -> List[str]:
    """Returns a unique list of sources (file[:page]) with score."""
    out: List[str] = []
    seen = set()
    for doc, score in docs_with_scores:
        src = doc.metadata.get("source", "unknown")
        page = doc.metadata.get("page")
        label = f"{src}" + (f":{page}" if page is not None else "")
        if label in seen:
            continue
        seen.add(label)
        out.append(f"{label} (score={score:.4f})")
        if len(out) >= topk:
            break
    return out


def _build_vectorstore(client: Optional[QdrantClient] = None) -> QdrantVectorStore:
    """
    Creates a VectorStore on an existing Qdrant collection.

    Args:
        client: Qdrant client to reuse. When None, a new client is created
            from the QDRANT_URL and QDRANT_API_KEY environment variables with
            a 60 second timeout.

    Returns:
        A QdrantVectorStore bound to COLLECTION_NAME that embeds with
        EMBEDDING_MODEL, using the key from OPENAI_API_KEY.
    """
    # Only build a client when the caller did not supply one; previously the
    # argument was always overwritten, so a passed-in client was ignored.
    if client is None:
        client = QdrantClient(
            url=os.environ.get("QDRANT_URL"),
            api_key=os.environ.get('QDRANT_API_KEY'),
            timeout=60,
        )

    embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, api_key=os.environ.get("OPENAI_API_KEY"))
    return QdrantVectorStore(client=client, collection_name=COLLECTION_NAME, embedding=embeddings)


def build_retriever(
    client: Optional[QdrantClient] = None,
    mode: str = "mmr",
    k: int = 6,
    **kwargs
):
    """
    Builds a retriever on top of the QdrantVectorStore.

    mode: search type handed to as_retriever, "mmr" or "similarity"
    k: number of chunks for the context
    **kwargs: extra search options, e.g. lambda_mult, fetch_k, filter (Qdrant Filter).
        They are merged last, so they override the defaults computed here.

    In "mmr" mode the defaults are fetch_k = max(10, 3 * k) and lambda_mult = 0.5.
    """
    store = _build_vectorstore(client)

    search_kwargs = {"k": k}
    if mode == "mmr":
        search_kwargs["fetch_k"] = max(10, 3 * k)
        search_kwargs["lambda_mult"] = kwargs.pop("lambda_mult", 0.5)

    # Caller-supplied options win over the defaults above.
    search_kwargs.update(kwargs)
    return store.as_retriever(search_type=mode, search_kwargs=search_kwargs)


def _build_prompt_and_chain(retriever) -> Any:
    """
    Assembles the RAG chain: retriever → prompt → ChatOpenAI → text.

    The chain's input is passed through unchanged as the question, and is also
    fed to the retriever, whose results are rendered by _format_docs into the
    prompt's context.
    """
    system_message = (
        "You are a helpful assistant. Answer concisely using ONLY the provided context. "
        "If the answer is not in the context, say you don't know. "
        "Use short citations like [1], [2] that correspond to the provided chunks."
    )
    human_message = "Question: {question}\n\nContext:\n{context}"
    prompt = ChatPromptTemplate.from_messages(
        [("system", system_message), ("human", human_message)]
    )

    llm = ChatOpenAI(model=CHAT_MODEL, temperature=0)

    chain_inputs = {
        "context": retriever | _format_docs,
        "question": RunnablePassthrough(),
    }
    return chain_inputs | prompt | llm | StrOutputParser()

**Main function**

In [14]:
def rag_answer(
    question: str,
    client: Optional[QdrantClient] = None,
    mode: str = "mmr",
    k: int = 6,
    topk_sources: int = 4
) -> Tuple[str, List[str]]:
    """
    Answers the question with the RAG chain and lists the best-scoring sources.

    Returns (answer, sources_list): answer is the chain's text output and
    sources_list holds up to topk_sources entries formatted by _pretty_sources.
    """
    store = _build_vectorstore(client)
    rag_chain = _build_prompt_and_chain(build_retriever(client=client, mode=mode, k=k))

    answer = rag_chain.invoke(question)

    # NOTE(review): the sources are taken from a separate similarity search, not
    # from the documents the chain retrieved, so they may differ from the chunks
    # behind the [i] citations in the answer (notably in "mmr" mode).
    n_candidates = max(topk_sources, k)
    scored_hits = store.similarity_search_with_score(question, k=n_candidates)
    return answer, _pretty_sources(scored_hits, topk=topk_sources)



**Ask RAG**

In [22]:
# Run one end-to-end query and print the answer followed by its sources.
question = "Focus your analysis through the CAMELS framework on these leading indicators - Capital adequacy & buffers: CET1, Tier 1, Total capital ratios, Leverage ratio, Pillar 2 requirements, MDA headroom, buffer adequacy vs. requirements"
ans, srcs = rag_answer(question)

print("\n=== ANSWER ===\n", ans)
print("\n=== SOURCES ===")
for source_line in srcs:
    print("-", source_line)



=== ANSWER ===
 **Capital Adequacy & Buffers Analysis through CAMELS Framework:**

1. **CET1 Ratio**: The Common Equity Tier 1 (CET1) ratio has shown a declining trend from 6.175% in 2016 to 4.5% in 2020, indicating a potential weakening in capital adequacy over the years [1].

2. **Tier 1 and Total Capital Ratios**: The Tier 1 capital ratio has increased from 1.825% in 2016 to 3.5% in 2020, while the total capital ratio has improved from 10.75% in 2016 to 14.3% in 2020, suggesting a strengthening of overall capital position despite the CET1 decline [1].

3. **Leverage Ratio**: The BIS Tier 1 leverage ratio was reported at 5.5% as of the end of 2019, which is a critical measure of capital adequacy relative to total exposure [6].

4. **Pillar 2 Requirements**: The capital management framework includes internal capital targets consistent with the risk profile, indicating adherence to Pillar 2 requirements [4].

5. **MDA Headroom**: The analysis does not provide specific figures for MDA 