## Load JSON → Split → Embed → Save to Pinecone (Vector DB)

In [None]:
import os
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# 📁 Folder containing JSON files (each with a list of PDF pages)
folder_path = os.path.abspath("../01_data_gathering_logic")

# 🧠 Text splitter configuration
splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=200)

# 📚 All resulting Document objects
all_documents = []

# 🔁 Process each JSON file.
# os.listdir() returns names in arbitrary order; sorting keeps the chunk order
# (and therefore the positional vector IDs assigned later) stable across runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(folder_path, filename)
    print(f"\n📄 Processing: {filename}")

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            entries = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"❌ Failed to load {filename} | Error: {e}")
        continue

    # The loop below calls .get() on each entry, so the file must hold a list
    # of objects; anything else is reported and skipped instead of crashing.
    if not isinstance(entries, list):
        print(f"⚠️ Skipping {filename}: expected a JSON list, got {type(entries).__name__}")
        continue

    print(f"✅ Loaded {len(entries)} entries")

    for entry in entries:
        if not isinstance(entry, dict):
            continue

        # The "Text" field labels the source; fall back to the file name.
        pdf_name = entry.get("Text", filename)
        page_number = entry.get("page_number", None)

        # "Details" may be missing, null, or a non-string value in the JSON.
        raw_text = entry.get("Details")
        if not isinstance(raw_text, str):
            continue
        raw_text = raw_text.strip()
        if not raw_text:
            continue

        for chunk in splitter.split_text(raw_text):
            all_documents.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "source": pdf_name,
                        "page_number": page_number
                    }
                )
            )

print(f"\n📊 Total Chunks Created: {len(all_documents)}")

# Optional: Preview first few
# for i, doc in enumerate(all_documents[:3], 1):
#     print(f"\n🔹 Chunk {i}:")
#     print(f"Metadata: {doc.metadata}")
#     print(f"Content: {doc.page_content[:200]}")


📄 Processing: angelone_quick_10_links_support_data.json
✅ Loaded 10 entries

📄 Processing: angelone_support_full_data.json
✅ Loaded 17 entries

📄 Processing: insurance_pdfs_flat.json
✅ Loaded 26 entries

📊 Total Chunks Created: 462


Embedding

In [2]:
import numpy as np
from typing import List, Optional
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

def generate_embeddings(
    texts: List[str],
    model_name: str = "all-MiniLM-L6-v2",
    batch_size: int = 32,
    device: Optional[str] = None  # e.g., "cuda" or "cpu"
) -> np.ndarray:
    """Encode ``texts`` with a SentenceTransformer model, batch by batch.

    Args:
        texts: Strings to embed. Row ``k`` of the result corresponds to ``texts[k]``.
        model_name: Name passed to ``SentenceTransformer``.
        batch_size: Number of texts sent to ``model.encode`` per call (>= 1).
        device: Device string forwarded to ``SentenceTransformer`` (``None``
            leaves the choice to the library).

    Returns:
        The vertically stacked batch outputs, one row per input text, or an
        empty array (``np.array([])``) when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If encoding any batch fails. Skipping the batch would
            shift every later row, so the result would no longer line up with
            ``texts``.
    """
    if not texts:
        print("⚠️ No input texts provided.")
        return np.array([])

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Load the model on the specified device
    model = SentenceTransformer(model_name, device=device)

    batch_outputs = []

    for start in tqdm(range(0, len(texts), batch_size), desc="🔄 Generating embeddings"):
        batch = texts[start:start + batch_size]

        try:
            batch_outputs.append(
                model.encode(
                    batch,
                    show_progress_bar=False,
                    convert_to_numpy=True,
                    normalize_embeddings=True  # normalized vectors suit cosine similarity
                )
            )
        except Exception as e:
            # Fail loudly: a dropped batch would silently misalign rows and texts.
            raise RuntimeError(
                f"Embedding failed for batch {start // batch_size} "
                f"(texts {start}-{start + len(batch) - 1}): {e}"
            ) from e

    return np.vstack(batch_outputs)

# Embed the text of every chunk collected in all_documents.
texts = [document.page_content for document in all_documents]
embeddings = generate_embeddings(texts)

# An empty array means nothing was embedded; otherwise report count and width.
if embeddings.size == 0:
    print("❌ Failed to generate embeddings.")
else:
    print(f"\n✅ Generated {embeddings.shape[0]} embeddings of dimension {embeddings.shape[1]}")




🔄 Generating embeddings: 100%|██████████| 15/15 [00:07<00:00,  1.97it/s]


✅ Generated 462 embeddings of dimension 384





Pinecone Vector DB | Upsert the embeddings and save the chunk texts

In [3]:
import os
import json
import numpy as np
import pinecone
from typing import List
from dotenv import load_dotenv

# Load environment variables (python-dotenv) and read the Pinecone API key.
load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if PINECONE_API_KEY is None or PINECONE_API_KEY == "":
    raise EnvironmentError("Missing PINECONE_API_KEY in environment variables.")

# Client used by every Pinecone helper below.
pinecone_client = pinecone.Pinecone(api_key=PINECONE_API_KEY)

# Index settings passed to create_index().
INDEX_NAME = "vector-search-index"
DIMENSION = 384  # vector width the index is created with
METRIC = "cosine"

# Serverless placement of the index.
CLOUD = "aws"
REGION = "us-east-1"


def create_index_if_not_exists() -> None:
    """Ensure the index named ``INDEX_NAME`` exists, creating it when absent.

    Prints whether the index was created or was already there.

    Raises:
        RuntimeError: If listing or creating the index fails.
    """
    try:
        existing_names = pinecone_client.list_indexes().names()

        # Nothing to do when the index is already listed.
        if INDEX_NAME in existing_names:
            print(f"ℹ️ Index already exists: {INDEX_NAME}")
            return

        serverless_spec = pinecone.ServerlessSpec(cloud=CLOUD, region=REGION)
        pinecone_client.create_index(
            name=INDEX_NAME,
            dimension=DIMENSION,
            metric=METRIC,
            spec=serverless_spec,
        )
        print(f"✅ Created index: {INDEX_NAME}")
    except Exception as e:
        raise RuntimeError(f"Error creating index: {e}")


def get_index():
    """Return the client's handle for the index named ``INDEX_NAME``.

    Raises:
        RuntimeError: If the client fails to produce the index handle.
    """
    try:
        index_handle = pinecone_client.Index(INDEX_NAME)
    except Exception as e:
        raise RuntimeError(f"Error connecting to index: {e}")
    return index_handle


def index_embeddings(embeddings: np.ndarray, texts: List[str], batch_size: int = 100) -> None:
    """Upsert the embedding rows into the Pinecone index in batches.

    Each vector's ID is its row position rendered as a string (``"0"``,
    ``"1"``, ...), so ID ``k`` refers to ``texts[k]``. ``texts`` itself is only
    used to validate the lengths; it is not sent to the index.

    Args:
        embeddings: Array whose rows are the vectors to upsert.
        texts: The texts the rows were computed from, in the same order.
        batch_size: Maximum number of vectors per upsert call (>= 1).

    Raises:
        ValueError: If ``embeddings`` and ``texts`` differ in length, or if
            ``batch_size`` is smaller than 1. A negative value would otherwise
            upsert nothing while still reporting success.
    """
    if len(embeddings) != len(texts):
        raise ValueError("Embeddings and texts length mismatch.")
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    index = get_index()
    total = len(embeddings)

    for batch_no, start in enumerate(range(0, total, batch_size), start=1):
        batch_vectors = embeddings[start:start + batch_size]
        # Pair each row with its global position as the ID.
        upsert_data = [
            (str(start + offset), vec.tolist())
            for offset, vec in enumerate(batch_vectors)
        ]

        index.upsert(vectors=upsert_data)
        print(f"⬆️ Upserted batch {batch_no}: {len(upsert_data)} vectors")

    print(f"✅ Indexed {total} vectors.")


def save_texts_to_json(texts: List[str], file_path: str = "documents.json") -> None:
    """Write ``texts`` to ``file_path`` as a UTF-8 JSON array.

    The output is indented by two spaces and non-ASCII characters are written
    as-is rather than escaped.

    Raises:
        IOError: If the file cannot be written or the data cannot be serialized.
    """
    try:
        with open(file_path, mode="w", encoding="utf-8") as out_file:
            json.dump(texts, out_file, indent=2, ensure_ascii=False)
        print(f"💾 Saved texts to {file_path}")
    except Exception as e:
        raise IOError(f"Error saving texts: {e}")


# Run the indexing pipeline with the `embeddings` and `texts` variables
# produced by the embedding cell earlier in this notebook.
if __name__ == "__main__":
    create_index_if_not_exists()
    index_embeddings(embeddings=embeddings, texts=texts)
    save_texts_to_json(texts=texts)

ℹ️ Index already exists: vector-search-index
⬆️ Upserted batch 1: 100 vectors
⬆️ Upserted batch 2: 100 vectors
⬆️ Upserted batch 3: 100 vectors
⬆️ Upserted batch 4: 100 vectors
⬆️ Upserted batch 5: 62 vectors
✅ Indexed 462 vectors.
💾 Saved texts to documents.json
