In [1]:
import os
import time
from pypdf import PdfReader
import camelot
import chromadb
from chromadb.config import Settings
import google.generativeai as genai
from google.api_core.exceptions import ResourceExhausted
from google.cloud import storage  # for optional GCS upload
from tiktoken import get_encoding
from chromadb import PersistentClient

# --- Pipeline configuration --------------------------------------------------
# Input document and on-disk location of the Chroma vector store.
PDF_PATH = "ltimindtree_annual_report.pdf"
CHROMA_DIR = "./chromadb_dir"

# Settings read from the environment. GCS_BUCKET_NAME stays None when unset;
# GOOGLE_API_KEY is normalised to an empty string when unset or empty.
GCS_BUCKET_NAME = os.environ.get("GCS_BUCKET_NAME")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or ""

# Embedding model name, number of chunks per embedding request, and number of
# tokens per text chunk (all consumed by later cells).
GENIE_MODEL = "gemini-embedding-exp-03-07"
BATCH_SIZE, CHUNK_TOKENS = 32, 800

ModuleNotFoundError: No module named 'pypdf'

In [None]:
# Open the report and collect the extracted text of every page, in page order.
reader = PdfReader(PDF_PATH)
raw_pages = []
for pdf_page in reader.pages:
    raw_pages.append(pdf_page.extract_text())

In [None]:
# Split each page into windows of at most CHUNK_TOKENS tokens and decode every
# window back to text. `chunks` and `metadatas` are parallel lists: entry k of
# `metadatas` records the 1-based page number and the window's 0-based position
# within that page for entry k of `chunks`.
enc = get_encoding("cl100k_base")
chunks, metadatas = [], []
for page_num, page in enumerate(raw_pages, start=1):
    token_ids = enc.encode(page)
    windows = [
        token_ids[start : start + CHUNK_TOKENS]
        for start in range(0, len(token_ids), CHUNK_TOKENS)
    ]
    for chunk_id, window in enumerate(windows):
        chunks.append(enc.decode(window))
        metadatas.append({"page": page_num, "chunk_id": chunk_id})

In [None]:
# Authenticate the Gemini SDK with the key read from the environment.
genai.configure(api_key=GOOGLE_API_KEY)

# The GCS upload is optional, so only build a storage client when a bucket is
# configured. Constructing storage.Client() resolves Google Cloud credentials,
# and doing that unconditionally can stop the notebook here on machines that
# never upload anything. The only consumer of `storage_client` is guarded by
# the same GCS_BUCKET_NAME check.
storage_client = (
    storage.Client(project=os.getenv("HACKAI_GCS_PROJECT"))
    if GCS_BUCKET_NAME
    else None
)



In [None]:
# Embed every chunk, BATCH_SIZE chunks per request. `embeddings` ends up
# parallel to `chunks` (entry k is the vector for chunks[k]).
embeddings = []
for i in range(0, len(chunks), BATCH_SIZE):
    batch = chunks[i : i + BATCH_SIZE]
    while True:
        try:
            resp = genai.embed_content(model=GENIE_MODEL, content=batch)
            break
        except ResourceExhausted:
            # Quota errors are retried (without limit) after a fixed pause.
            print("Quota hit—sleeping 60s…")
            time.sleep(60)
    batch_embs = resp.get("embeddings") or resp.get("embedding")
    # Fail loudly instead of hitting an opaque TypeError in extend(None) or
    # silently misaligning embeddings with chunks when the response does not
    # contain exactly one vector per input.
    if not batch_embs or len(batch_embs) != len(batch):
        got = "no embeddings" if not batch_embs else f"{len(batch_embs)} embeddings"
        raise ValueError(
            f"Embedding response for chunks {i}..{i + len(batch) - 1} returned "
            f"{got}; expected {len(batch)}"
        )
    embeddings.extend(batch_embs)
    # Short pause between requests.
    time.sleep(1.0)


In [None]:
# Display the number of text chunks built by the chunking cell.
len(chunks)

46

In [None]:
# Open the on-disk Chroma store and create (or fetch) the collection,
# configured for cosine distance.
client = PersistentClient(path=CHROMA_DIR)
collection = client.get_or_create_collection(
    name="annual_report",
    metadata={"hnsw:space": "cosine"}
)

# The three lists are parallel; zip() would silently drop trailing items if
# they disagreed in length, so refuse to index misaligned data.
if not (len(chunks) == len(embeddings) == len(metadatas)):
    raise ValueError(
        "chunks, embeddings and metadatas must have the same length, got "
        f"{len(chunks)}, {len(embeddings)} and {len(metadatas)}"
    )

# Write records in slices of BATCH_SIZE instead of one call per record.
# IDs are the chunk positions ("0", "1", ...). upsert creates missing ids and
# updates existing ones, so re-running this cell against an existing
# collection refreshes the stored records rather than leaving stale ones.
for start in range(0, len(chunks), BATCH_SIZE):
    stop = min(start + BATCH_SIZE, len(chunks))
    collection.upsert(
        ids=[str(idx) for idx in range(start, stop)],
        documents=chunks[start:stop],
        embeddings=embeddings[start:stop],
        metadatas=metadatas[start:stop],
    )


In [None]:
def upload_dir_to_gcs(local_dir: str, bucket_name: str, gcs_prefix: str = "chromadb"):
    """Upload every file under ``local_dir`` to a GCS bucket.

    Walks ``local_dir`` recursively and uploads each file through the
    module-level ``storage_client``, printing one line per uploaded file.

    Args:
        local_dir: Directory whose files are uploaded.
        bucket_name: Name of the destination bucket.
        gcs_prefix: Prefix placed, followed by ``/``, in front of each file's
            path relative to ``local_dir`` to form the blob name.
    """
    bucket = storage_client.bucket(bucket_name)
    for root, _, files in os.walk(local_dir):
        for fname in files:
            local_path = os.path.join(root, fname)
            # os.path.relpath uses the platform separator (a backslash on
            # Windows); blob names should use "/" so nested files keep their
            # folder structure in the bucket.
            rel_path = os.path.relpath(local_path, local_dir).replace(os.sep, "/")
            blob_path = f"{gcs_prefix}/{rel_path}"
            blob = bucket.blob(blob_path)
            blob.upload_from_filename(local_path)
            print(f"Uploaded {local_path} to gs://{bucket_name}/{blob_path}")

# Upload the Chroma directory only when a bucket name was supplied through the
# GCS_BUCKET_NAME environment variable; otherwise this cell does nothing.
if GCS_BUCKET_NAME:
    upload_dir_to_gcs(CHROMA_DIR, GCS_BUCKET_NAME)

In [None]:
# Run Camelot's "stream" parser over all pages, then key each table's `.df`
# by its 1-based position: "table_1", "table_2", ...
tables = camelot.read_pdf(PDF_PATH, pages="all", flavor="stream")
table_dict = {}
for position, table in enumerate(tables, start=1):
    table_dict[f"table_{position}"] = table.df

In [None]:
# (Re‑define or import the function if needed)

def upload_dir_to_gcs(local_dir: str, bucket_name: str, gcs_prefix: str = "chromadb"):
    """Upload every file under ``local_dir`` to a GCS bucket.

    Self-contained variant: imports ``google.cloud.storage`` locally and builds
    its own client, using the ``HACKAI_GCS_PROJECT`` environment variable as
    the project. Prints one line per uploaded file.

    Args:
        local_dir: Directory whose files are uploaded (walked recursively).
        bucket_name: Name of the destination bucket.
        gcs_prefix: Prefix placed, followed by ``/``, in front of each file's
            path relative to ``local_dir`` to form the blob name.
    """
    from google.cloud import storage
    client = storage.Client(project=os.getenv("HACKAI_GCS_PROJECT"))
    bucket = client.bucket(bucket_name)
    for root, _, files in os.walk(local_dir):
        for fname in files:
            local_path = os.path.join(root, fname)
            # os.path.relpath uses the platform separator (a backslash on
            # Windows); blob names should use "/" so nested files keep their
            # folder structure in the bucket.
            rel_path = os.path.relpath(local_path, local_dir).replace(os.sep, "/")
            blob_path = f"{gcs_prefix}/{rel_path}"
            bucket.blob(blob_path).upload_from_filename(local_path)
            print(f"Uploaded {local_path} → gs://{bucket_name}/{blob_path}")

# Call it: upload the local Chroma directory to the "report-vectors" bucket.
# Both arguments are literals, so this call does not read CHROMA_DIR or
# GCS_BUCKET_NAME.
upload_dir_to_gcs("./chromadb_dir", "report-vectors")




Uploaded ./chromadb_dir/chroma.sqlite3 → gs://report-vectors/chromadb/chroma.sqlite3
Uploaded ./chromadb_dir/423f205e-9cd3-4efc-ade2-fb98896d2ce0/data_level0.bin → gs://report-vectors/chromadb/423f205e-9cd3-4efc-ade2-fb98896d2ce0/data_level0.bin
Uploaded ./chromadb_dir/423f205e-9cd3-4efc-ade2-fb98896d2ce0/length.bin → gs://report-vectors/chromadb/423f205e-9cd3-4efc-ade2-fb98896d2ce0/length.bin
Uploaded ./chromadb_dir/423f205e-9cd3-4efc-ade2-fb98896d2ce0/link_lists.bin → gs://report-vectors/chromadb/423f205e-9cd3-4efc-ade2-fb98896d2ce0/link_lists.bin
Uploaded ./chromadb_dir/423f205e-9cd3-4efc-ade2-fb98896d2ce0/header.bin → gs://report-vectors/chromadb/423f205e-9cd3-4efc-ade2-fb98896d2ce0/header.bin
