In [1]:
!unzip '/content/data.zip'

Archive:  /content/data.zip
   creating: data/
   creating: data/api_ninjas/
  inflating: data/api_ninjas/cryptoprice.csv  
  inflating: data/api_ninjas/marketcap.csv  
  inflating: data/api_ninjas/stockprice.csv  
   creating: data/macro/
  inflating: data/macro/cpi.jsonl    
   creating: data/news/
  inflating: data/news/google_news.csv  
   creating: data/news_articles/
  inflating: data/news_articles/articles.jsonl  
   creating: data/news_gdelt/
  inflating: data/news_gdelt/gdelt_news.csv  
   creating: data/prices/
  inflating: data/prices/HDFCBANK_NS.parquet  
  inflating: data/prices/ICICIBANK_NS.parquet  
  inflating: data/prices/INFY_NS.parquet  
  inflating: data/prices/RELIANCE_NS.parquet  
  inflating: data/prices/TCS_NS.parquet  


In [2]:
# Colab: ensure GPU is on (Runtime → Change runtime type → GPU)
!nvidia-smi

Sun Sep 14 15:41:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Keep numpy pinned, upgrade hnswlib to a compatible build
!pip -q install -U "pip<24.3"
!pip -q uninstall -y numpy hnswlib chromadb > /dev/null
!pip -q install "numpy==1.26.4"
!pip -q install "hnswlib==0.8.1" "chromadb==0.5.5"
!pip -q install "sentence-transformers==3.0.1" "pandas==2.2.2" "tqdm==4.66.5" "jsonlines==4.0.0"

import os, time
print("✅ Installed compatible versions. Restarting runtime…")
time.sleep(1)
os.kill(os.getpid(), 9)

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement hnswlib==0.8.1 (from versions: 0.3.2.0, 0.3.4, 0.4.0, 0.5.0, 0.5.1, 0.5.2, 0.6.0, 0.6.1, 0.6.2, 0.7.0, 0.8.0)[0m[31m
[0m[31mERROR: No matching distribution found for hnswlib==0.8.1[0m[31m
[0m✅ Installed compatible versions. Restarting runtime…


In [7]:
!pip -q install hnswlib chromadb

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m142.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
!pip -q install transformers

In [1]:
import os, json, jsonlines, pandas as pd
from tqdm import tqdm

DATA_DIR = "/content/data"
ART_JSONL = os.path.join(DATA_DIR, "news_articles", "articles.jsonl")
GDELT_CSV = os.path.join(DATA_DIR, "news_gdelt", "gdelt_news.csv")
GNEWS_CSV = os.path.join(DATA_DIR, "news", "google_news.csv")

PERSIST_DIR = "/content/vectorstore/news_v1"
COLLECTION_NAME = "india_news_v1"

for p in [ART_JSONL, GDELT_CSV, GNEWS_CSV]:
    print("OK" if os.path.exists(p) else "MISSING", "→", p)

OK → /content/data/news_articles/articles.jsonl
OK → /content/data/news_gdelt/gdelt_news.csv
OK → /content/data/news/google_news.csv


In [2]:
def load_articles_jsonl(path):
    out = []
    if not os.path.exists(path): return out
    with jsonlines.open(path, "r") as reader:
        for obj in reader:
            text = (obj.get("text") or "").strip()
            if not text: continue
            out.append({
                "url": (obj.get("url") or "").strip(),
                "title": (obj.get("title") or "").strip(),
                "published": (obj.get("seendate") or "").strip(),
                "domain": (obj.get("domain") or "").strip(),
                "source": "articles.jsonl",
                "text": text
            })
    return [r for r in out if r["url"] and r["text"]]

def load_news_csv(path, title_col="title", link_col="url", published_col="published"):
    out = []
    if not os.path.exists(path): return out
    df = pd.read_csv(path)
    # normalize col names
    cols = {c.lower(): c for c in df.columns}
    tcol = cols.get(title_col, cols.get("title"))
    lcol = cols.get(link_col, cols.get("link", cols.get("resolved_url", "url")))
    pcol = cols.get(published_col, cols.get("seendate", cols.get("published", "published_at")))
    for _, r in df.iterrows():
        url = str(r.get(lcol, "")).strip() if lcol in r else ""
        title = str(r.get(tcol, "")).strip() if tcol in r else ""
        published = str(r.get(pcol, "")).strip() if pcol in r else ""
        if not url or not title: continue
        out.append({
            "url": url, "title": title, "published": published,
            "domain": str(r.get("domain","")).strip(),
            "source": os.path.basename(path),
            "text": title  # fallback text
        })
    return out

art_recs   = load_articles_jsonl(ART_JSONL)
gdelt_recs = load_news_csv(GDELT_CSV, title_col="title", link_col="url", published_col="seendate")
gnews_recs = load_news_csv(GNEWS_CSV, title_col="title", link_col="link", published_col="published")

len(art_recs), len(gdelt_recs), len(gnews_recs)

(300, 21894, 337)

In [3]:
from collections import defaultdict

def dedup_keep_longest(*lists):
    bucket = defaultdict(list)
    for L in lists:
        for r in L:
            bucket[r["url"]].append(r)
    out = []
    for url, items in bucket.items():
        out.append(max(items, key=lambda x: len(x.get("text",""))))
    return out

records = dedup_keep_longest(art_recs, gdelt_recs, gnews_recs)
# Keep full-text; add at most 2k headline-only items to improve coverage
full = [r for r in records if len(r["text"]) >= 200]
short = [r for r in records if len(r["text"]) < 200][:2000]
records = full + short
len(records)

2519

In [4]:
# Install once if transformers isn't present (Colab usually has it via sentence-transformers)
# !pip -q install transformers

import re
from transformers import AutoTokenizer

# Use the same model family as your embedder for token counting
tok = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")

_sentence_splitter = re.compile(r'(?<=[\.\?\!])\s+(?=[A-Z0-9“"])')

def split_into_sentences(text: str):
    text = re.sub(r'\s+', ' ', text).strip()
    # quickly bail if very short
    if len(text) < 400:
        return [text]
    parts = _sentence_splitter.split(text)
    # merge tiny fragments back to previous sentence
    merged = []
    for p in parts:
        if merged and len(p) < 40:
            merged[-1] += " " + p
        else:
            merged.append(p)
    return merged

def chunk_by_tokens(sentences, max_tokens=256, overlap_tokens=32):
    chunks, cur, cur_tokens = [], [], 0
    for s in sentences:
        t = len(tok.encode(s, add_special_tokens=False))
        if t > max_tokens:  # very long sentence → hard-split by commas/spaces
            subparts = re.split(r'([,;:–-])', s)
            for sp in subparts:
                tt = len(tok.encode(sp, add_special_tokens=False))
                if cur_tokens + tt <= max_tokens:
                    cur.append(sp); cur_tokens += tt
                else:
                    if cur: chunks.append("".join(cur).strip())
                    # start new with overlap from tail of previous
                    if overlap_tokens and chunks:
                        # take tail tokens from previous chunk
                        tail = tok.encode(chunks[-1], add_special_tokens=False)[-overlap_tokens:]
                        tail_text = tok.decode(tail, skip_special_tokens=True)
                        cur, cur_tokens = [tail_text, sp], len(tail) + tt
                    else:
                        cur, cur_tokens = [sp], tt
            continue

        # normal sentence
        if cur_tokens + t <= max_tokens:
            cur.append(s); cur_tokens += t
        else:
            if cur: chunks.append(" ".join(cur).strip())
            # overlap
            tail = []
            if overlap_tokens and chunks:
                tail_tok = tok.encode(chunks[-1], add_special_tokens=False)[-overlap_tokens:]
                tail = [tok.decode(tail_tok, skip_special_tokens=True)]
                cur_tokens = len(tail_tok)
            else:
                cur_tokens = 0
            cur = tail + [s]
            cur_tokens += t

    if cur: chunks.append(" ".join(cur).strip())
    return chunks

def recursive_sentence_token_chunk(text, max_tokens=256, overlap_tokens=32):
    sents = split_into_sentences(text)
    return chunk_by_tokens(sents, max_tokens=max_tokens, overlap_tokens=overlap_tokens)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
docs, metas, ids = [], [], []
for idx, r in enumerate(records):  # records loaded from articles.jsonl
    chunks = recursive_sentence_token_chunk(r["text"], max_tokens=256, overlap_tokens=32)
    for j, ch in enumerate(chunks):
        docs.append(ch)
        metas.append({
            "url": r["url"], "title": r["title"], "published": r["published"],
            "domain": r["domain"], "source": r["source"], "chunk": j
        })
        ids.append(f"{idx}:{j}")

print("chunks:", len(docs), "avg_len_tokens≈",
      sum(len(tok.encode(d, add_special_tokens=False)) for d in docs)//max(1,len(docs)))

Token indices sequence length is longer than the specified maximum sequence length for this model (651 > 512). Running this sequence through the model will result in indexing errors


chunks: 3544 avg_len_tokens≈ 94


In [9]:
import torch
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

device = "cuda" if torch.cuda.is_available() else "cpu"
embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device=device)

client = chromadb.PersistentClient(path=PERSIST_DIR, settings=Settings(anonymized_telemetry=False))
# clean rebuild
try: client.delete_collection(COLLECTION_NAME)
except: pass
collection = client.create_collection(name=COLLECTION_NAME, metadata={"hnsw:space":"cosine"})

BATCH = 512
for i in tqdm(range(0, len(docs), BATCH)):
    batch_docs = docs[i:i+BATCH]
    batch_ids  = ids[i:i+BATCH]
    batch_m    = metas[i:i+BATCH]
    embs = embed_model.encode(batch_docs, batch_size=64, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=False)
    collection.add(documents=batch_docs, embeddings=embs, metadatas=batch_m, ids=batch_ids)

print("✅ Chroma build complete. Persisted at:", PERSIST_DIR)

100%|██████████| 7/7 [00:18<00:00,  2.68s/it]

✅ Chroma build complete. Persisted at: /content/vectorstore/news_v1





In [10]:
def search(q, k=5):
    q_emb = embed_model.encode([q], normalize_embeddings=True, convert_to_numpy=True)
    res = collection.query(query_embeddings=q_emb, n_results=k, include=["metadatas","documents","distances"])
    for i in range(len(res["ids"][0])):
        score = 1 - res["distances"][0][i]
        md = res["metadatas"][0][i]
        print(f"\n[{i+1}] score={score:.4f} | {md.get('title','')}\n{md.get('url','')}\n{res['documents'][0][i][:240]} ...")

search("impact of RBI repo rate hikes on Indian bank margins", k=5)


[1] score=0.8295 | RBI Repo Rate cut : A golden opportunity for homebuyers
http://www.dailypioneer.com/2025/columnists/rbi---s-repo-rate-cut--a-golden-opportunity-for-homebuyers.html
RBI Repo Rate cut : A golden opportunity for homebuyers ...

[2] score=0.8230 | Retail inflation to come below RBI 4 . 4 % estimates in Jan - Mar quarter at 3 . 8 %: Report
https://economictimes.indiatimes.com/news/economy/indicators/retail-inflation-to-come-below-rbis-4-4-estimates-in-jan-mar-quarter-at-3-8-report/articleshow/119000696.cms
, including advanced economies, but india has largely managed to steer its inflation trajectory quite well. the rbi had kept the repo rate elevated to keep inflation contained. The repo rate is the rate of interest at which the RBI lends to ...

[3] score=0.8188 | Check latest loan interest rates after RBI repo rate cut
https://www.moneycontrol.com/news/business/personal-finance/these-banks-have-revised-personal-loan-interest-rates-after-rbi-repo-rate-cut-check-full-li

In [11]:
!zip -r /content/vectorstore.zip /content/vectorstore

  adding: content/vectorstore/ (stored 0%)
  adding: content/vectorstore/news_v1/ (stored 0%)
  adding: content/vectorstore/news_v1/ac3316c6-0efa-4eba-9b37-fdbdacddeccc/ (stored 0%)
  adding: content/vectorstore/news_v1/ac3316c6-0efa-4eba-9b37-fdbdacddeccc/index_metadata.pickle (deflated 65%)
  adding: content/vectorstore/news_v1/ac3316c6-0efa-4eba-9b37-fdbdacddeccc/link_lists.bin (deflated 84%)
  adding: content/vectorstore/news_v1/ac3316c6-0efa-4eba-9b37-fdbdacddeccc/header.bin (deflated 61%)
  adding: content/vectorstore/news_v1/ac3316c6-0efa-4eba-9b37-fdbdacddeccc/length.bin (deflated 85%)
  adding: content/vectorstore/news_v1/ac3316c6-0efa-4eba-9b37-fdbdacddeccc/data_level0.bin (deflated 13%)
  adding: content/vectorstore/news_v1/chroma.sqlite3 (deflated 54%)


In [12]:
from google.colab import files
files.download('/content/vectorstore.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>