In [1]:
from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(
    model="llama3.2",
)

In [2]:
import os
import uuid
from langchain_ollama import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
from langchain.vectorstores import Pinecone as PineconeStore
from tqdm import tqdm


In [3]:
PINECONE_API_KEY = os.environ["PINECONE_KEY"]
PINECONE_INDEX_NAME = "documentations"

In [4]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

In [5]:
DOCS_DIR = "../docs"

In [6]:
from langchain_pinecone import PineconeVectorStore
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX_NAME)


In [7]:
# === SCAN AND PROCESS DOCS ===
print("📦 Scanning files...")

all_txt_files = []
for root, _, files in os.walk(DOCS_DIR):
    for file in files:
        if file.endswith(".txt"):
            all_txt_files.append(os.path.join(root, file))

print(f"🔍 Found {len(all_txt_files)} text files to process\n")

📦 Scanning files...
🔍 Found 148 text files to process



In [9]:
# === INITIALIZE VECTOR STORE ===
vectorstore = PineconeVectorStore(
    index=index,
    embedding=embeddings,
    text_key="text"
)

In [14]:
# === LOOP AND UPSERT IN BATCHES ===
BATCH_SIZE = 200
MAX_PAYLOAD_SIZE = 2 * 1024 * 1024  # 2MB
import json
import sys 
for file_path in tqdm(all_txt_files, desc="📤 Uploading files", unit="file"):
    file_name = os.path.basename(file_path)
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    chunks = splitter.split_text(text)
    if not chunks:
        continue

    ids = [str(uuid.uuid4()) for _ in chunks]
    metadatas = [{"source": file_name}] * len(chunks)

    # === Batch Dynamically Based on Payload Size ===
    batch_texts, batch_ids, batch_metas, size_acc = [], [], [], 0
    for chunk, _id, meta in zip(chunks, ids, metadatas):
        # Estimate size of this item
        embedding = embeddings.embed_query(chunk)
        payload = json.dumps({"id": _id, "values": embedding, "metadata": meta})
        payload_size = sys.getsizeof(payload)

        if size_acc + payload_size >= MAX_PAYLOAD_SIZE and batch_texts:
            # Flush batch
            vectorstore.add_texts(texts=batch_texts, ids=batch_ids, metadatas=batch_metas)
            batch_texts, batch_ids, batch_metas, size_acc = [], [], [], 0

        # Add to current batch
        batch_texts.append(chunk)
        batch_ids.append(_id)
        batch_metas.append(meta)
        size_acc += payload_size

    # Flush final batch
    if batch_texts:
        vectorstore.add_texts(texts=batch_texts, ids=batch_ids, metadatas=batch_metas)

print("✅ All files uploaded to Pinecone without exceeding size limit.")

📤 Uploading files: 100%|██████████| 148/148 [27:23<00:00, 11.10s/file] 

✅ All files uploaded to Pinecone without exceeding size limit.



