In [4]:
from __future__ import annotations
import os, json, numpy as np, pandas as pd
from pathlib import Path
from typing import Dict, Any, List
from sentence_transformers import SentenceTransformer
# FAISS is an optional dependency. RetrieverIndex only touches `faiss` when
# `_FAISS_OK` is true, so a missing install must not abort the cell here:
# it simply selects the NumPy fallback backend.
try:
    import faiss  # type: ignore
    _FAISS_OK = True
except ImportError:
    faiss = None  # type: ignore
    _FAISS_OK = False


class RetrieverIndex:
    """
    Semantic retriever over an on-disk document store.

    Loads a FAISS index if possible; otherwise loads/creates a local NumPy index:
      - uses documents.parquet and metadata.parquet
      - builds/saves embeddings.npz on first run if not present

    Attributes set by ``__init__``:
      - store_dir: directory holding the store files
      - embedder:  SentenceTransformer (CPU) used for queries and, when the
                   NumPy cache has to be built, for documents
      - backend:   "faiss" or "numpy"
      - meta/docs: DataFrames read from metadata.parquet / documents.parquet
      - index:     FAISS index (FAISS backend only)
      - emb:       float32 embedding matrix (NumPy backend only)
    """
    def __init__(self, store_dir: Path, embed_model_name: str):
        """
        Open the store in ``store_dir`` and choose the search backend.

        Args:
            store_dir: directory containing metadata.parquet and
                documents.parquet, optionally index.faiss and embeddings.npz.
            embed_model_name: model name handed to SentenceTransformer.

        Raises:
            FileNotFoundError: if either parquet file is missing.
            ValueError: if the FAISS backend is selected but the 'id' column
                is missing from metadata or documents.
        """
        self.store_dir = Path(store_dir)
        self.embedder = SentenceTransformer(embed_model_name, device="cpu")

        self.index_faiss = self.store_dir / "index.faiss"
        self.meta_path   = self.store_dir / "metadata.parquet"
        self.docs_path   = self.store_dir / "documents.parquet"
        self.emb_npz     = self.store_dir / "embeddings.npz"  # optional

        if not self.meta_path.exists() or not self.docs_path.exists():
            raise FileNotFoundError(
                f"Missing metadata/documents parquet in: {self.store_dir}\n"
                f"Expected files: {self.meta_path.name}, {self.docs_path.name}"
            )

        self.meta = pd.read_parquet(self.meta_path)
        self.docs = pd.read_parquet(self.docs_path)

        # Choose backend: FAISS only when the library imported AND an index file exists.
        self.backend = "faiss" if _FAISS_OK and self.index_faiss.exists() else "numpy"

        if self.backend == "faiss":
            self.index = faiss.read_index(str(self.index_faiss))
            # Ensure ids column exists in meta/docs
            if "id" not in self.meta.columns or "id" not in self.docs.columns:
                raise ValueError("Parquet metadata/documents must contain an 'id' column aligned to FAISS IDs.")
        else:
            # NumPy fallback: we need embeddings matrix
            if self.emb_npz.exists():
                # NOTE(review): a cached matrix is assumed to have one row per
                # document in ascending-id order (the order used when this
                # class builds the file) -- confirm for externally made caches.
                self.emb = np.load(self.emb_npz)["emb"].astype(np.float32)
            else:
                # Build embeddings once, save to disk
                texts = self.docs.sort_values("id")["text"].tolist()
                embs: List[np.ndarray] = []
                B = 512  # documents handed to encode() per call
                for i in range(0, len(texts), B):
                    batch = self.embedder.encode(
                        texts[i:i+B],
                        batch_size=64,
                        normalize_embeddings=True,
                        convert_to_numpy=True,
                        show_progress_bar=False,
                    )
                    embs.append(batch.astype(np.float32))
                self.emb = np.vstack(embs)
                np.savez_compressed(self.emb_npz, emb=self.emb)

            # Align meta/docs by id ascending so that positional row i of both
            # frames corresponds to row i of the embedding matrix.
            self.meta = self.meta.sort_values("id").reset_index(drop=True)
            self.docs = self.docs.sort_values("id").reset_index(drop=True)

    def search(self, query: str, k: int = 10) -> List[Dict[str, Any]]:
        """
        Return up to ``k`` hits for ``query``.

        Each hit is the document's metadata row as a dict, plus:
          - "id":    the document id
          - "score": similarity as a Python float
          - "text":  the first 240 characters of the document text

        Fewer than ``k`` hits are returned when the store holds fewer
        documents; ``k <= 0`` yields an empty list. On the NumPy backend hits
        are sorted by descending score; on the FAISS backend they keep the
        order returned by the index.
        """
        if k <= 0:
            return []

        q = self.embedder.encode([query], normalize_embeddings=True, convert_to_numpy=True).astype(np.float32)
        rows: List[Dict[str, Any]] = []

        if self.backend == "faiss":
            sims, ids = self.index.search(q, k)
            id_to_row = self.meta.set_index("id")
            txt_map = self.docs.set_index("id")["text"]
            for s, i in zip(sims[0], ids[0]):
                doc_id = int(i)
                if doc_id < 0:
                    # FAISS pads the result with label -1 when it finds fewer
                    # than k neighbours; those slots carry no document.
                    continue
                md = id_to_row.loc[doc_id].to_dict()
                # set_index() moved "id" out of the row; restore it so both
                # backends return the same keys.
                md["id"] = doc_id
                md["score"] = float(s)
                md["text"] = txt_map.loc[doc_id][:240]
                rows.append(md)
            return rows

        # cosine with normalized vectors => inner product
        sims = (self.emb @ q[0])
        n = sims.shape[0]
        if n == 0:
            return rows
        if k < n:
            # Partial selection of the k best; only valid while k < n.
            idx = np.argpartition(-sims, k)[:k]
        else:
            # Asking for everything (or more): rank all rows instead of
            # letting argpartition raise on an out-of-range kth.
            idx = np.arange(n)
        top = idx[np.argsort(-sims[idx])]
        for i in top:
            md = self.meta.iloc[int(i)].to_dict()
            md["score"] = float(sims[i])
            md["text"] = self.docs.iloc[int(i)]["text"][:240]
            rows.append(md)
        return rows

    def info(self) -> str:
        """One-line summary: active backend and the number of metadata rows."""
        n = len(self.meta)
        return f"RetrieverIndex(backend={self.backend}, vectors={n})"

  from tqdm.autonotebook import tqdm, trange


In [7]:
import time
from typing import List, Dict, Any
import yfinance as yf
import feedparser

from config import VECTOR_FAISS_DIR, EMBEDDING_MODEL, TOP_K

class DataAgent:
    """
    Collects the inputs for one question from three sources: the local vector
    index, recent price history via yfinance, and Google News RSS headlines.
    """
    def __init__(self):
        """Build the retriever from the configured store directory and model."""
        self.index = RetrieverIndex(VECTOR_FAISS_DIR, EMBEDDING_MODEL)

    # ---------- Vector retrieval ----------
    def retrieve(self, query: str, k: int = TOP_K) -> List[Dict[str, Any]]:
        """Return the top-``k`` index hits for ``query``."""
        return self.index.search(query, k=k)

    # ---------- Live prices ----------
    def fetch_prices(
        self,
        symbols: List[str],
        period: str = "1mo",
        interval: str = "1d",
        last_n: int = 5,
        sleep_s: float = 0.15,
    ) -> Dict[str, List[Dict[str, Any]]]:
        """
        Fetch recent price rows for each symbol (best effort).

        Args:
            symbols: ticker symbols passed to ``yf.Ticker``.
            period, interval: forwarded to ``Ticker.history``.
            last_n: number of most recent rows kept per symbol.
            sleep_s: pause after every request, successful or not.

        Returns:
            Mapping symbol -> list of row dicts with keys date, open, high,
            low, close, adj_close, volume. Symbols with no data, or whose
            fetch failed, are absent; a failure emits a RuntimeWarning
            instead of being dropped silently.
        """
        import warnings  # function-scope import, as _rss_url does for urllib

        renames = {
            "Open":"open","High":"high","Low":"low","Close":"close",
            "Adj Close":"adj_close","Volume":"volume"
        }
        wanted = ["date","open","high","low","close","adj_close","volume"]

        out: Dict[str, List[Dict[str, Any]]] = {}
        for s in symbols:
            try:
                df = yf.Ticker(s).history(period=period, interval=interval, auto_adjust=False)
                if not df.empty:
                    df = df.rename_axis("date").reset_index()
                    # Drop timezone info so the dates are plain naive timestamps.
                    df["date"] = pd.to_datetime(df["date"]).dt.tz_localize(None)
                    df = df.rename(columns=renames)
                    # Fall back to the raw close when no adjusted close is provided.
                    if "adj_close" not in df.columns and "close" in df.columns:
                        df["adj_close"] = df["close"]
                    out[s] = df[wanted].tail(last_n).to_dict(orient="records")
            except Exception as exc:
                # Stay best-effort (one bad ticker must not sink the batch),
                # but make the failure visible to whoever runs the cell.
                warnings.warn(
                    f"fetch_prices: skipping {s!r}: {exc!r}",
                    RuntimeWarning,
                    stacklevel=2,
                )
            finally:
                # Throttle after every request, including empty/failed ones.
                time.sleep(sleep_s)
        return out

    # ---------- Google News RSS ----------
    @staticmethod
    def _rss_url(q: str, hl: str = "en-IN", gl: str = "IN", ceid: str = "IN:en") -> str:
        """Build the Google News RSS search URL for ``q`` (query is URL-quoted)."""
        from urllib.parse import quote_plus
        return f"https://news.google.com/rss/search?q={quote_plus(q)}&hl={hl}&gl={gl}&ceid={ceid}"

    def fetch_rss(self, queries: List[str], per_query_limit: int = 30, sleep_s: float = 0.25) -> List[Dict[str, Any]]:
        """
        Collect headlines for each query, de-duplicated by link across queries.

        Args:
            queries: search phrases, one feed request each.
            per_query_limit: maximum entries taken from each feed.
            sleep_s: pause after each feed request.

        Returns:
            List of dicts with keys title, link, published, source_feed, query.
            Entries without a link, or whose link was already seen, are skipped.
        """
        items, seen = [], set()
        for q in queries:
            feed_url = self._rss_url(q)
            feed = feedparser.parse(feed_url)
            entries = feed.entries[:per_query_limit] if getattr(feed, "entries", None) else []
            for e in entries:
                link = (e.get("link") or "").strip()
                if not link or link in seen: continue
                seen.add(link)
                items.append({
                    "title": (e.get("title") or "").strip(),
                    "link": link,
                    "published": (e.get("published") or e.get("updated") or "").strip(),
                    "source_feed": feed_url,
                    "query": q,
                })
            time.sleep(sleep_s)
        return items

    # ---------- One-call pipeline ----------
    def run_pipeline(self, user_query: str, tickers: List[str], rss_queries: List[str], k: int = TOP_K) -> Dict[str, Any]:
        """
        Run retrieval, price fetch and RSS fetch and bundle the results.

        Returns a dict with keys index_info, query, retrieval, prices, rss.
        """
        return {
            "index_info": self.index.info(),
            "query": user_query,
            "retrieval": self.retrieve(user_query, k=k),
            "prices": self.fetch_prices(tickers),
            "rss": self.fetch_rss(rss_queries),
        }

In [9]:
from config import NSE_TICKERS, RSS_QUERIES

# Build the agent (this loads the vector index) and report which backend is active.
agent = DataAgent()
print(agent.index.info())

# One demo question, answered from retrieval + live prices + news headlines.
user_query = "Impact of recent RBI repo rate decisions on Indian bank margins and credit growth"
bundle = agent.run_pipeline(
    user_query,
    tickers=NSE_TICKERS,
    rss_queries=RSS_QUERIES,
    k=10,
)

# Top retrieved chunks: rank, score, title, then URL and a short text preview.
print("\n=== Retrieval (top chunks) ===")
top_chunks = bundle["retrieval"][:5]
for rank, chunk in enumerate(top_chunks, start=1):
    title = chunk.get('title','')
    print(f"[{rank}] score={chunk['score']:.4f} | {title}")
    print(chunk.get("url",""))
    preview = (chunk.get("text","") or "")[:220].replace("\n"," ")
    print(preview + "...\n")

# Price rows per symbol, exactly as returned by the pipeline.
print("=== Prices (last 5 rows) ===")
for symbol, price_rows in bundle["prices"].items():
    print(symbol, "→", price_rows)

# First few headlines from the RSS fetch.
print("\n=== RSS (sample) ===")
sample_headlines = bundle["rss"][:5]
for headline in sample_headlines:
    print("-", headline["title"], "->", headline["link"])

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


RetrieverIndex(backend=faiss, vectors=3544)

=== Retrieval (top chunks) ===
[1] score=0.8395 | RBI Repo Rate cut : A golden opportunity for homebuyers
http://www.dailypioneer.com/2025/columnists/rbi---s-repo-rate-cut--a-golden-opportunity-for-homebuyers.html
RBI Repo Rate cut : A golden opportunity for homebuyers...

[2] score=0.8279 | Kaushik Das : Expect another RBI rate cut in support of economic growth
https://www.livemint.com/opinion/online-views/kaushik-das-expect-another-rbi-rate-cut-in-support-of-economic-growth-sanjay-malhotra-repo-rate-interest-rate-crr-vrr-11740638380241.html
Kaushik Das: Expect another RBI rate cut in support of economic growth - This April may see the Indian central bank’s monetary policy panel cut its repo rate by 25 basis points. Current economic dynamics also suggest we’...

[3] score=0.8177 | Banks shed bulk deposits for healthy net interest margins
https://www.financialexpress.com/business/banking-finance-banks-shed-bulk-deposits-for-healthy-net-inter