# Load JSON → split → embed → save FAISS

In [3]:
import os
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Folder containing JSON files
folder_path = r"C:\Users\abdullah.shahid\Desktop\Python\2025\April-May\customer-support-rag\data"

# Configure text splitter (chunk_size=400 with an overlap of 100 between neighbours)
splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=100)

# Accumulates one Document per chunk, across every file
all_documents = []

# Process each JSON file in the folder.
# os.listdir() returns names in arbitrary order; sorting keeps the position of
# every chunk (later used as its vector id) reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(folder_path, filename)
    print(f"\n📄 Processing file: {filename}")

    # Load JSON
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    print(f"✅ Loaded {len(data)} entries")

    # Process each entry in the JSON
    for entry in data:
        # Label stored in each chunk's metadata; rename the key (e.g. to
        # "pdf_name") if that matches your schema.
        text = entry.get("Text", "")

        # `or ""` also covers an explicit JSON null, which .get()'s default
        # does not (the key exists, so None would be returned and .strip() fail).
        details = (entry.get("Details") or "").strip()

        if not details:
            continue  # Skip empty entries

        # Split details into chunks and wrap each one in a Document
        for chunk in splitter.split_text(details):
            all_documents.append(Document(
                page_content=chunk,
                metadata={"text": text}
            ))

print(f"\n🔍 Total Chunks Created: {len(all_documents)}")

# Preview the first chunk (optional)
# if all_documents:
    # preview = all_documents[0].page_content[:200].replace('\n', ' ')
    # print(f"📝 Sample Chunk Preview: {preview}")



📄 Processing file: angelone_quick_10_links_support_data.json
✅ Loaded 10 entries

📄 Processing file: angelone_support_full_data.json
✅ Loaded 17 entries

📄 Processing file: insurance_pdfs_text.json
✅ Loaded 26 entries

🔍 Total Chunks Created: 1840


In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm  # For showing progress bars during batch processing

def generate_embeddings(texts, model_name="all-MiniLM-L6-v2", batch_size=32):
    """Encode *texts* into a 2-D array holding one embedding row per text.

    Args:
        texts: Sequence of strings to embed; row ``i`` of the result belongs
            to ``texts[i]``.
        model_name: Name passed to ``SentenceTransformer`` to load the model.
        batch_size: Number of texts encoded per ``model.encode`` call.

    Returns:
        ``np.ndarray`` with ``len(texts)`` rows. For empty input an array of
        shape ``(0, dim)`` is returned so callers can still read ``shape[1]``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        RuntimeError: If any batch fails to encode. Skipping the batch instead
            would shift every later row and silently break the positional
            pairing between embeddings and texts.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer.")

    model = SentenceTransformer(model_name)

    # np.vstack([]) raises, so handle "nothing to embed" explicitly.
    if len(texts) == 0:
        dim = model.get_sentence_embedding_dimension()
        return np.empty((0, dim), dtype=np.float32)

    # Per-batch results, stacked into a single array at the end
    embeddings = []

    # Process texts in batches
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch = texts[i:i + batch_size]

        try:
            batch_embeddings = model.encode(batch, show_progress_bar=False)
        except Exception as e:
            raise RuntimeError(
                f"Error processing batch {i // batch_size}: {e}"
            ) from e

        embeddings.append(batch_embeddings)

    # Concatenate the per-batch arrays row-wise
    return np.vstack(embeddings)

# Extract the chunk text from every LangChain Document, preserving order so
# that texts[i] and row i of the embedding array refer to the same chunk
texts = [doc.page_content for doc in all_documents]

# Embed all chunks using generate_embeddings' default model and batch size
embeddings = generate_embeddings(texts)

# Report the number of rows (vectors) and columns (dimensions) produced
print(f"Generated {embeddings.shape[0]} embeddings with {embeddings.shape[1]} dimensions each.")


Generating embeddings: 100%|██████████| 58/58 [00:35<00:00,  1.63it/s]

Generated 1840 embeddings with 384 dimensions each.





In [8]:
import faiss
import numpy as np
import json

# Step 1: Create FAISS Index
def create_faiss_index(embeddings):
    """Build an exact (flat) L2-distance FAISS index over *embeddings*.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dim)``.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of *embeddings*, in order.

    Raises:
        ValueError: If *embeddings* is not 2-dimensional.
    """
    # FAISS's Python bindings expect a C-contiguous float32 matrix; coerce so
    # float64 or non-contiguous inputs work too (no copy when already suitable).
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n, dim), got {vectors.ndim}-D."
        )

    # Embedding dimension (depends on the model)
    dim = vectors.shape[1]

    # Create the FAISS index (L2 distance metric)
    index = faiss.IndexFlatL2(dim)

    # Add embeddings to the index
    index.add(vectors)

    return index

# Step 2: Save FAISS Index to Disk
def save_faiss_index(index, index_file="faiss_index.index"):
    """Write *index* to *index_file* with ``faiss.write_index`` and print a confirmation.

    Args:
        index: The FAISS index to persist.
        index_file: Destination path; defaults to ``faiss_index.index``.
    """
    faiss.write_index(index, index_file)
    print(f"FAISS index saved to {index_file}")

# Step 3: Save Text Data to Disk
def save_texts(texts, text_file="documents.json"):
    """Dump *texts* as JSON into *text_file* (UTF-8, non-ASCII kept verbatim).

    Prints a confirmation line naming the file once the write has finished.
    """
    with open(text_file, mode="w", encoding="utf-8") as out_file:
        json.dump(texts, out_file, ensure_ascii=False)

    print(f"Text data saved to {text_file}")


# Build the index from the embeddings computed earlier, then persist both the
# index and the chunk texts using each helper's default output file name
faiss_index = create_faiss_index(embeddings)
save_faiss_index(faiss_index)
save_texts(texts)

# ntotal is the number of vectors currently stored in the index
print(f"FAISS index contains {faiss_index.ntotal} vectors.")


FAISS index saved to faiss_index.index
Text data saved to documents.json
FAISS index contains 1840 vectors.


In [22]:
import pinecone
import numpy as np
import json
import os
from dotenv import load_dotenv
from typing import List, Any

# Populate the process environment via python-dotenv before reading settings
load_dotenv()

# The Pinecone API key is mandatory: stop immediately when it is absent or empty
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if PINECONE_API_KEY is None or PINECONE_API_KEY == "":
    raise ValueError("PINECONE_API_KEY not found in environment variables.")

# Settings for the target index
INDEX_NAME = "vector-search-index"
DIMENSION = 384  # Must equal the embedding width (e.g., all-MiniLM-L6-v2)
METRIC = "cosine"  # Similarity metric used for semantic search

# Client used by every Pinecone helper in this cell
pinecone_client = pinecone.Pinecone(api_key=PINECONE_API_KEY)

# Check if index exists, create if it doesn't
def create_pinecone_index():
    """Create the Pinecone index named ``INDEX_NAME`` unless it already exists.

    The index is created with ``DIMENSION`` and ``METRIC`` as a serverless
    index on AWS ``us-east-1``. A status line is printed in either case.

    Raises:
        RuntimeError: If listing or creating the index fails. The underlying
            exception is attached as ``__cause__``.
    """
    try:
        if INDEX_NAME not in pinecone_client.list_indexes().names():
            pinecone_client.create_index(
                name=INDEX_NAME,
                dimension=DIMENSION,
                metric=METRIC,
                spec=pinecone.ServerlessSpec(cloud="aws", region="us-east-1")
            )
            print(f"Created Pinecone index '{INDEX_NAME}'.")
        else:
            print(f"Index '{INDEX_NAME}' already exists.")
    except Exception as e:
        # Chain explicitly so the traceback marks the original error as the cause
        raise RuntimeError(f"Failed to create Pinecone index: {e}") from e

# Connect to the Pinecone index
def get_pinecone_index():
    """Return a handle to the Pinecone index named ``INDEX_NAME``.

    Raises:
        RuntimeError: If the client cannot provide the index handle. The
            underlying exception is attached as ``__cause__``.
    """
    try:
        return pinecone_client.Index(INDEX_NAME)
    except Exception as e:
        # Chain explicitly so the traceback marks the original error as the cause
        raise RuntimeError(f"Failed to connect to Pinecone index '{INDEX_NAME}': {e}") from e

# Step 1: Index Vectors into Pinecone
def index_vectors(embeddings: np.ndarray, texts: List[str], batch_size: int = 100) -> None:
    """Upsert *embeddings* into the Pinecone index in batches.

    Each vector's id is its row position as a string (``"0"``, ``"1"``, ...).
    *texts* is used only to verify that there is exactly one text per vector.

    Args:
        embeddings: 2-D array with one embedding per row.
        texts: Texts the embeddings were computed from, in the same order.
        batch_size: Maximum number of vectors sent per upsert call.

    Raises:
        RuntimeError: On any failure: invalid arguments, connection problems
            or a failed upsert. The underlying exception is attached as
            ``__cause__``.
    """
    try:
        # Validate before touching the network. A negative step would make
        # range() yield nothing, reporting success without upserting anything.
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer.")

        total_vectors = len(embeddings)

        # Ensure embeddings and texts have the same length
        if total_vectors != len(texts):
            raise ValueError("Number of embeddings and texts must match.")

        index = get_pinecone_index()

        # Batch upsert to handle large datasets
        for i in range(0, total_vectors, batch_size):
            batch_embeddings = embeddings[i:i + batch_size]
            # ids continue the global row numbering across batches
            to_upsert = [
                (str(i + offset), vec.tolist())
                for offset, vec in enumerate(batch_embeddings)
            ]
            index.upsert(vectors=to_upsert)
            print(f"Upserted batch {i // batch_size + 1} ({len(batch_embeddings)} vectors).")

        print(f"Successfully indexed {total_vectors} vectors.")
    except Exception as e:
        raise RuntimeError(f"Failed to index vectors: {e}") from e

# Step 2: Save Text Data to Disk
def save_texts(texts: List[str], text_file: str = "documents.json") -> None:
    """Write *texts* to *text_file* as pretty-printed UTF-8 JSON.

    Args:
        texts: Strings to persist, saved as a JSON array in the given order.
        text_file: Destination path; defaults to ``documents.json``.

    Raises:
        RuntimeError: If the file cannot be opened or the data cannot be
            serialised. The underlying exception is attached as ``__cause__``.
    """
    try:
        with open(text_file, "w", encoding="utf-8") as f:
            json.dump(texts, f, ensure_ascii=False, indent=2)
        print(f"Text data saved to {text_file}.")
    except Exception as e:
        # Chain explicitly so the traceback marks the original error as the cause
        raise RuntimeError(f"Failed to save texts to {text_file}: {e}") from e

# Example usage
# Example usage
if __name__ == "__main__":
    # `embeddings` and `texts` are not defined in this cell; they must already
    # exist at module level. Uncomment the two lines below to try the flow
    # with mock data instead.
    # embeddings = np.random.rand(10, DIMENSION)  # Mock embeddings (10 vectors of 384 dims)
    # texts = [f"Document {i}" for i in range(10)]  # Mock text data

    # Create index if it doesn't exist
    create_pinecone_index()

    # Upsert the vectors, then write the texts to the default documents.json
    index_vectors(embeddings, texts)
    save_texts(texts)

Index 'vector-search-index' already exists.
Upserted batch 1 (10 vectors).
Successfully indexed 10 vectors.
Text data saved to documents.json.
