In [1]:
import os
import pickle
import hashlib
import warnings
from tqdm import tqdm
from dotenv import load_dotenv

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from sentence_transformers import SentenceTransformer
from langchain_core.embeddings import Embeddings

import faiss

# === Setup ===
warnings.filterwarnings("ignore")
load_dotenv()
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

# === Config ===
PDF_FOLDER = "Data"
CHUNK_SAVE_DIR = "chunk_batches"
VECTORSTORE_DIR = "vectorstore_parts"
MERGED_VECTORSTORE_PATH = "vectorstore_merged"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150
EMBED_MODEL_NAME = "intfloat/e5-base-v2"

os.makedirs(CHUNK_SAVE_DIR, exist_ok=True)
os.makedirs(VECTORSTORE_DIR, exist_ok=True)

# === Custom Embedding Wrapper ===
class SentenceTransformerEmbeddings(Embeddings):
    def __init__(self, model_name="intfloat/e5-base-v2"):
        self.model = SentenceTransformer(model_name)
        self.model_name = model_name  # Track model name
        
    def embed_documents(self, texts):
        # Add prefix required by E5 models for passages
        if "e5" in self.model_name:
            texts = ["passage: " + text for text in texts]
        return self.model.encode(texts, convert_to_numpy=True).tolist()
    
    def embed_query(self, text):
        # Add prefix required by E5 models for queries
        if "e5" in self.model_name:
            text = "query: " + text
        return self.model.encode(text, convert_to_numpy=True).tolist()

# === Helper: List PDF files ===
def list_pdf_files(folder):
    return [
        os.path.join(root, file)
        for root, _, files in os.walk(folder)
        for file in files if file.endswith(".pdf")
    ]

# === Helper: Check if .pkl is up to date ===
def is_pickle_up_to_date(pdf_path, pkl_path):
    return os.path.exists(pkl_path) and os.path.getmtime(pdf_path) <= os.path.getmtime(pkl_path)

# === Chunk a single PDF and save ===
def chunk_pdf(pdf_path):
    print(f"📄 Processing: {os.path.basename(pdf_path)}")
    try:
        loader = PyMuPDFLoader(pdf_path)
        pages = loader.load()
        for page in pages:
            page.metadata["source"] = os.path.basename(pdf_path)
    except Exception as e:
        print(f"❌ Error loading {pdf_path}: {e}")
        return []

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", ".", " ", ""],
        add_start_index=True,
        keep_separator=True
    )

    chunks = splitter.split_documents(pages)
    for chunk in chunks:
        content_hash = hashlib.md5(chunk.page_content.encode()).hexdigest()[:8]
        chunk.metadata["chunk_id"] = f"{chunk.metadata['source']}_{chunk.metadata.get('page', 0)}_{content_hash}"

    # Save as .pkl
    pkl_name = os.path.join(CHUNK_SAVE_DIR, f"{os.path.basename(pdf_path)}.pkl")
    with open(pkl_name, "wb") as f:
        pickle.dump(chunks, f)
    print(f"💾 Saved {len(chunks)} chunks to {pkl_name}")
    return chunks

# === Load all chunks ===
def load_all_chunks():
    all_chunks = []
    for file in os.listdir(CHUNK_SAVE_DIR):
        if file.endswith(".pkl"):
            with open(os.path.join(CHUNK_SAVE_DIR, file), "rb") as f:
                chunks = pickle.load(f)
                all_chunks.extend(chunks)
    print(f"📦 Loaded total {len(all_chunks)} chunks from {CHUNK_SAVE_DIR}")
    return all_chunks

# === Create FAISS vectorstore ===
def create_vectorstore(chunks, embeddings):
    sample_vec = embeddings.embed_query("test")
    index = faiss.IndexFlatL2(len(sample_vec))
    vs = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={}
    )
    vs.add_documents(chunks)
    return vs

# === Save vectorstore ===
def save_vectorstore(vs, path):
    vs.save_local(path)
    print(f"✅ Vectorstore saved to: {path}")

# === MAIN ===
if __name__ == "__main__":
    print("🚀 Starting vectorization pipeline...")

    embeddings = SentenceTransformerEmbeddings(model_name=EMBED_MODEL_NAME)
    pdf_files = list_pdf_files(PDF_FOLDER)

    for pdf_path in tqdm(pdf_files, desc="📂 Checking PDFs"):
        pkl_path = os.path.join(CHUNK_SAVE_DIR, f"{os.path.basename(pdf_path)}.pkl")
        if is_pickle_up_to_date(pdf_path, pkl_path):
            print(f"✅ Skipping (cached): {os.path.basename(pdf_path)}")
        else:
            chunk_pdf(pdf_path)

    # Reload and merge everything
    print("🔄 Rebuilding merged vectorstore from all .pkl chunks...")
    all_chunks = load_all_chunks()
    vectorstore = create_vectorstore(all_chunks, embeddings)
    save_vectorstore(vectorstore, MERGED_VECTORSTORE_PATH)

    print("🎉 Done! All PDFs chunked and merged into final vectorstore.")


🚀 Starting vectorization pipeline...


📂 Checking PDFs: 100%|██████████████████████████████████████████████████████████████████████████| 2/2 [00:00<?, ?it/s]


✅ Skipping (cached): temp.pdf
✅ Skipping (cached): temp_uploaded.pdf
🔄 Rebuilding merged vectorstore from all .pkl chunks...
📦 Loaded total 56 chunks from chunk_batches
✅ Vectorstore saved to: vectorstore_merged
🎉 Done! All PDFs chunked and merged into final vectorstore.


# NEW

In [8]:
import os
import pickle
import hashlib
import warnings
import re
from tqdm import tqdm
from dotenv import load_dotenv

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import TokenTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from sentence_transformers import SentenceTransformer
from langchain_core.embeddings import Embeddings
from transformers import AutoTokenizer
import faiss

# === Setup ===
warnings.filterwarnings("ignore")
load_dotenv()
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

# === Config ===
PDF_FOLDER = "Publications_PDFs/AFO-50"
CHUNK_SAVE_DIR = "chunk_batches2"
VECTORSTORE_DIR = "vectorstore_parts2"
MERGED_VECTORSTORE_PATH = "vectorstore_merged2"
TOKEN_CHUNK_SIZE = 256  # Optimal for E5 models
TOKEN_OVERLAP = 38      # 15% overlap
EMBED_MODEL_NAME = "intfloat/e5-base-v2"
TOKENIZER_NAME = "intfloat/e5-base-v2"  # Same as model for consistency

os.makedirs(CHUNK_SAVE_DIR, exist_ok=True)
os.makedirs(VECTORSTORE_DIR, exist_ok=True)

# === Custom Embedding Wrapper ===
class SentenceTransformerEmbeddings(Embeddings):
    def __init__(self, model_name="intfloat/e5-base-v2"):
        self.model = SentenceTransformer(model_name)
        self.model_name = model_name
        
    def embed_documents(self, texts):
        # Add E5-specific prefix for passages
        if "e5" in self.model_name.lower():
            texts = ["passage: " + text for text in texts]
        return self.model.encode(texts, 
                                 convert_to_numpy=True, 
                                 batch_size=32, 
                                 show_progress_bar=False).tolist()
    
    def embed_query(self, text):
        # Add E5-specific prefix for queries
        if "e5" in self.model_name.lower():
            text = "query: " + text
        return self.model.encode(text, convert_to_numpy=True).tolist()

# === Helper: List PDF files ===
def list_pdf_files(folder):
    return [
        os.path.join(root, file)
        for root, _, files in os.walk(folder)
        for file in files if file.endswith(".pdf")
    ]

# === Helper: Check if .pkl is up to date ===
def is_pickle_up_to_date(pdf_path, pkl_path):
    return os.path.exists(pkl_path) and os.path.getmtime(pdf_path) <= os.path.getmtime(pkl_path)

# === Clean PDF content ===
def clean_page_content(page_content):
    """Remove common PDF artifacts and noise"""
    # Remove headers/footers with page numbers
    page_content = re.sub(r'Page \d+ of \d+', '', page_content, flags=re.IGNORECASE)
    # Remove isolated numbers (likely page numbers)
    page_content = re.sub(r'^\d+$', '', page_content, flags=re.MULTILINE)
    # Remove repeated headers
    page_content = re.sub(r'^.*\n\1$', '', page_content, flags=re.MULTILINE)
    # Remove excessive whitespace
    page_content = re.sub(r'\s+', ' ', page_content).strip()
    return page_content

# === Validate chunk quality ===
def is_valid_chunk(chunk):
    """Filter out low-quality chunks"""
    content = chunk.page_content.strip()
    # Remove chunks that are too short
    if len(content) < 50:
        return False
    # Remove chunks that are mostly numbers
    if sum(c.isdigit() for c in content) / len(content) > 0.3:
        return False
    # Remove chunks that are all uppercase (likely headers)
    if content.isupper() and len(content) < 100:
        return False
    return True

# === Chunk a single PDF and save ===
def chunk_pdf(pdf_path):
    print(f"📄 Processing: {os.path.basename(pdf_path)}")
    try:
        loader = PyMuPDFLoader(pdf_path)
        pages = loader.load()
        
        # Clean each page and update metadata
        for page in pages:
            page.metadata["source"] = os.path.basename(pdf_path)
            page.page_content = clean_page_content(page.page_content)
    except Exception as e:
        print(f"❌ Error loading {pdf_path}: {e}")
        return []

    # Initialize token-based splitter
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
    splitter = TokenTextSplitter(
        encoding_name=TOKENIZER_NAME,
        chunk_size=TOKEN_CHUNK_SIZE,
        chunk_overlap=TOKEN_OVERLAP,
        add_start_index=True,
        disallowed_special=(tokenizer.all_special_tokens)
    )

    chunks = splitter.split_documents(pages)
    
    # Add chunk IDs and filter low-quality chunks
    valid_chunks = []
    for chunk in chunks:
        content = chunk.page_content.strip()
        if not content:
            continue
            
        content_hash = hashlib.md5(content.encode()).hexdigest()[:8]
        chunk.metadata["chunk_id"] = f"{chunk.metadata['source']}_{chunk.metadata.get('page', 0)}_{content_hash}"
        
        if is_valid_chunk(chunk):
            valid_chunks.append(chunk)

    # Save as .pkl
    pkl_name = os.path.join(CHUNK_SAVE_DIR, f"{os.path.basename(pdf_path)}.pkl")
    with open(pkl_name, "wb") as f:
        pickle.dump(valid_chunks, f)
        
    print(f"💾 Saved {len(valid_chunks)}/{len(chunks)} chunks to {pkl_name}")
    return valid_chunks

# === Load all chunks ===
def load_all_chunks():
    all_chunks = []
    for file in os.listdir(CHUNK_SAVE_DIR):
        if file.endswith(".pkl"):
            with open(os.path.join(CHUNK_SAVE_DIR, file), "rb") as f:
                chunks = pickle.load(f)
                all_chunks.extend(chunks)
                
    print(f"📦 Loaded total {len(all_chunks)} chunks from {CHUNK_SAVE_DIR}")
    return all_chunks

# === Create FAISS vectorstore ===
def create_vectorstore(chunks, embeddings):
    # Create optimized FAISS index
    print("⚙️ Creating vectorstore (this may take a while)...")
    vs = FAISS.from_documents(
        documents=chunks,
        embedding=embeddings,
        # batch_size=128,  # Process in batches for efficiency
        normalize_L2=True  # Important for cosine similarity
    )
    return vs

# === Save vectorstore ===
def save_vectorstore(vs, path):
    vs.save_local(path)
    print(f"✅ Vectorstore saved to: {path}")

# === MAIN ===
if __name__ == "__main__":
    print("🚀 Starting optimized PDF vectorization pipeline...")
    print(f"Using embedding model: {EMBED_MODEL_NAME}")
    print(f"Token chunk size: {TOKEN_CHUNK_SIZE}, Overlap: {TOKEN_OVERLAP}")

    # Initialize embeddings
    embeddings = SentenceTransformerEmbeddings(model_name=EMBED_MODEL_NAME)
    
    # Get PDF files
    pdf_files = list_pdf_files(PDF_FOLDER)
    print(f"Found {len(pdf_files)} PDF files in {PDF_FOLDER}")

    # Process PDFs
    for pdf_path in tqdm(pdf_files, desc="📂 Processing PDFs"):
        pkl_path = os.path.join(CHUNK_SAVE_DIR, f"{os.path.basename(pdf_path)}.pkl")
        if is_pickle_up_to_date(pdf_path, pkl_path):
            print(f"✅ Skipping (cached): {os.path.basename(pdf_path)}")
        else:
            chunk_pdf(pdf_path)

    # Reload and merge everything
    print("\n🔄 Rebuilding merged vectorstore from all chunks...")
    all_chunks = load_all_chunks()
    
    # Analyze chunk distribution
    chunk_lengths = [len(c.page_content) for c in all_chunks]
    if chunk_lengths:
        avg_len = sum(chunk_lengths) / len(chunk_lengths)
        max_len = max(chunk_lengths)
        min_len = min(chunk_lengths)
        print(f"📊 Chunk stats: Avg {avg_len:.1f} chars | Min {min_len} | Max {max_len}")
    else:
        print("⚠️ No chunks found! Check PDF processing")
        exit(1)

    # Create and save vectorstore
    vectorstore = create_vectorstore(all_chunks, embeddings)
    save_vectorstore(vectorstore, MERGED_VECTORSTORE_PATH)

    print("\n🎉 Pipeline complete! All PDFs processed and indexed.")
    print(f"Total chunks: {len(all_chunks)}")
    print(f"Vectorstore saved to: {MERGED_VECTORSTORE_PATH}")

🚀 Starting optimized PDF vectorization pipeline...
Using embedding model: intfloat/e5-base-v2
Token chunk size: 256, Overlap: 38
Found 0 PDF files in Publications_PDFs/AFO-50


📂 Processing PDFs: 0it [00:00, ?it/s]


🔄 Rebuilding merged vectorstore from all chunks...
📦 Loaded total 56 chunks from chunk_batches2
📊 Chunk stats: Avg 889.3 chars | Min 270 | Max 999
⚙️ Creating vectorstore (this may take a while)...





✅ Vectorstore saved to: vectorstore_merged2

🎉 Pipeline complete! All PDFs processed and indexed.
Total chunks: 56
Vectorstore saved to: vectorstore_merged2


## Updated

In [9]:
import os
import pickle
import hashlib
import warnings
import re
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import TokenTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from sentence_transformers import SentenceTransformer
from langchain_core.embeddings import Embeddings
from transformers import AutoTokenizer
import faiss

# === Setup ===
warnings.filterwarnings("ignore")
load_dotenv()
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

# === Config ===
PDF_FOLDER = "Data"
CHUNK_SAVE_DIR = "chunk_batches2"
MERGED_VECTORSTORE_PATH = "vectorstore_merged2"
TOKEN_CHUNK_SIZE = 256
TOKEN_OVERLAP = 38
EMBED_MODEL_NAME = "intfloat/e5-base-v2"
TOKENIZER_NAME = "intfloat/e5-base-v2"
EMBED_BATCH_SIZE = 32  # Optimized batch size for embedding generation

os.makedirs(CHUNK_SAVE_DIR, exist_ok=True)

# === Custom Embedding Wrapper ===
class SentenceTransformerEmbeddings(Embeddings):
    def __init__(self, model_name="intfloat/e5-base-v2"):
        self.model = SentenceTransformer(model_name)
        self.model_name = model_name
        print(f"✅ Loaded embedding model: {model_name}")
        
    def embed_documents(self, texts):
        # Add E5-specific prefix for passages
        if "e5" in self.model_name.lower():
            texts = ["passage: " + text for text in texts]
            
        # Batch processing for efficiency
        embeddings = []
        for i in range(0, len(texts), EMBED_BATCH_SIZE):
            batch = texts[i:i+EMBED_BATCH_SIZE]
            embeddings.extend(self.model.encode(batch, 
                                              convert_to_numpy=True, 
                                              show_progress_bar=False).tolist())
        return embeddings
    
    def embed_query(self, text):
        # Add E5-specific prefix for queries
        if "e5" in self.model_name.lower():
            text = "query: " + text
        return self.model.encode(text, convert_to_numpy=True).tolist()

# === Helper: List PDF files ===
def list_pdf_files(folder):
    pdfs = [
        os.path.join(root, file)
        for root, _, files in os.walk(folder)
        for file in files if file.endswith(".pdf")
    ]
    print(f"🔍 Found {len(pdfs)} PDF files in {folder}")
    return pdfs

# === Helper: Check if .pkl is up to date ===
def is_pickle_up_to_date(pdf_path, pkl_path):
    if not os.path.exists(pkl_path):
        return False
    return os.path.getmtime(pdf_path) <= os.path.getmtime(pkl_path)

# === Clean PDF content ===
def clean_page_content(page_content):
    """Remove common PDF artifacts and noise"""
    # Remove headers/footers with page numbers
    page_content = re.sub(r'Page \d+ of \d+', '', page_content, flags=re.IGNORECASE)
    # Remove isolated numbers (likely page numbers)
    page_content = re.sub(r'^\d+$', '', page_content, flags=re.MULTILINE)
    # Remove excessive whitespace
    page_content = re.sub(r'\s+', ' ', page_content).strip()
    return page_content

# === Validate chunk quality ===
def is_valid_chunk(chunk):
    """Filter out low-quality chunks"""
    content = chunk.page_content.strip()
    # Remove chunks that are too short
    if len(content) < 50:
        return False
    # Remove chunks that are mostly numbers
    if sum(c.isdigit() for c in content) / len(content) > 0.3:
        return False
    return True

# === Chunk a single PDF and save ===
def chunk_pdf(pdf_path):
    filename = os.path.basename(pdf_path)
    print(f"📄 Processing: {filename}")
    try:
        loader = PyMuPDFLoader(pdf_path)
        pages = loader.load()
        
        # Clean each page and update metadata
        for page in pages:
            page.metadata["source"] = filename
            page.page_content = clean_page_content(page.page_content)
    except Exception as e:
        print(f"❌ Error loading {filename}: {e}")
        return []

    # Initialize token-based splitter
    try:
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, local_files_only=True)
        splitter = TokenTextSplitter(
            encoding_name=TOKENIZER_NAME,
            chunk_size=TOKEN_CHUNK_SIZE,
            chunk_overlap=TOKEN_OVERLAP,
            add_start_index=True,
            disallowed_special=(tokenizer.all_special_tokens)
        )
    except Exception as e:
        print(f"⚠️ Tokenizer error, using fallback: {e}")
        splitter = TokenTextSplitter(
            chunk_size=TOKEN_CHUNK_SIZE,
            chunk_overlap=TOKEN_OVERLAP,
            add_start_index=True
        )

    chunks = splitter.split_documents(pages)
    
    # Add chunk IDs and filter low-quality chunks
    valid_chunks = []
    for chunk in chunks:
        content = chunk.page_content.strip()
        if not content:
            continue
            
        content_hash = hashlib.md5(content.encode()).hexdigest()[:8]
        chunk.metadata["chunk_id"] = f"{chunk.metadata['source']}_{chunk.metadata.get('page', 0)}_{content_hash}"
        
        if is_valid_chunk(chunk):
            valid_chunks.append(chunk)

    # Save as .pkl
    pkl_name = os.path.join(CHUNK_SAVE_DIR, f"{filename}.pkl")
    with open(pkl_name, "wb") as f:
        pickle.dump(valid_chunks, f)
        
    print(f"💾 Saved {len(valid_chunks)}/{len(chunks)} chunks to {pkl_name}")
    return valid_chunks

# === Load all chunks ===
def load_all_chunks():
    all_chunks = []
    pkl_files = [f for f in os.listdir(CHUNK_SAVE_DIR) if f.endswith(".pkl")]
    
    if not pkl_files:
        print("⚠️ No chunk files found in {CHUNK_SAVE_DIR}")
        return all_chunks
        
    for file in pkl_files:
        with open(os.path.join(CHUNK_SAVE_DIR, file), "rb") as f:
            chunks = pickle.load(f)
            all_chunks.extend(chunks)
                
    print(f"📦 Loaded {len(all_chunks)} chunks from {len(pkl_files)} files")
    return all_chunks

# === Create FAISS vectorstore with batching ===
def create_vectorstore(chunks, embeddings):
    print("⚙️ Creating vectorstore (this may take a while)...")
    
    # Create empty FAISS index
    sample_embedding = embeddings.embed_query("test")
    dimension = len(sample_embedding)
    index = faiss.IndexFlatL2(dimension)
    docstore = InMemoryDocstore()
    index_to_docstore_id = {}
    
    vectorstore = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id,
        normalize_L2=True
    )
    
    # Add documents in batches
    batch_size = 128
    for i in tqdm(range(0, len(chunks), batch_size), desc="Indexing chunks"):
        batch = chunks[i:i+batch_size]
        vectorstore.add_documents(batch)
        
    return vectorstore

# === Save vectorstore ===
def save_vectorstore(vs, path):
    vs.save_local(path)
    print(f"✅ Vectorstore saved to: {path}")

# === MAIN ===
if __name__ == "__main__":
    print("🚀 Starting optimized PDF vectorization pipeline...")
    print(f"Using embedding model: {EMBED_MODEL_NAME}")
    print(f"Token chunk size: {TOKEN_CHUNK_SIZE}, Overlap: {TOKEN_OVERLAP}")

    # Initialize embeddings
    embeddings = SentenceTransformerEmbeddings(model_name=EMBED_MODEL_NAME)
    
    # Get PDF files
    pdf_files = list_pdf_files(PDF_FOLDER)

    # Process PDFs
    for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
        pkl_path = os.path.join(CHUNK_SAVE_DIR, f"{os.path.basename(pdf_path)}.pkl")
        if is_pickle_up_to_date(pdf_path, pkl_path):
            print(f"✅ Skipping (cached): {os.path.basename(pdf_path)}")
        else:
            chunk_pdf(pdf_path)

    # Reload and merge everything
    print("\n🔄 Loading all chunks...")
    all_chunks = load_all_chunks()
    
    if not all_chunks:
        print("❌ No chunks found! Exiting.")
        exit(1)
        
    # Analyze chunk distribution
    chunk_lengths = [len(c.page_content) for c in all_chunks]
    avg_len = sum(chunk_lengths) / len(chunk_lengths)
    max_len = max(chunk_lengths)
    min_len = min(chunk_lengths)
    print(f"📊 Chunk stats: Avg {avg_len:.1f} chars | Min {min_len} | Max {max_len}")

    # Create and save vectorstore
    vectorstore = create_vectorstore(all_chunks, embeddings)
    save_vectorstore(vectorstore, MERGED_VECTORSTORE_PATH)

    print("\n🎉 Pipeline complete!")
    print(f"Total documents processed: {len(pdf_files)}")
    print(f"Total chunks indexed: {len(all_chunks)}")
    print(f"Vectorstore saved to: {MERGED_VECTORSTORE_PATH}")

🚀 Starting optimized PDF vectorization pipeline...
Using embedding model: intfloat/e5-base-v2
Token chunk size: 256, Overlap: 38
✅ Loaded embedding model: intfloat/e5-base-v2
🔍 Found 2 PDF files in Data


Processing PDFs: 100%|█████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 2003.49it/s]


✅ Skipping (cached): temp.pdf
✅ Skipping (cached): temp_uploaded.pdf

🔄 Loading all chunks...
📦 Loaded 56 chunks from 2 files
📊 Chunk stats: Avg 889.3 chars | Min 270 | Max 999
⚙️ Creating vectorstore (this may take a while)...


Indexing chunks: 100%|███████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.56s/it]

✅ Vectorstore saved to: vectorstore_merged2

🎉 Pipeline complete!
Total documents processed: 2
Total chunks indexed: 56
Vectorstore saved to: vectorstore_merged2





In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load model and tokenizer
model_name = "intfloat/e5-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Define a test sentence (prefix with "query:" or "passage:" as required)
text = "query: What is artificial intelligence?"

# Tokenize input
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

# Forward pass
with torch.no_grad():
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0]  # CLS token embedding

print("Embedding shape:", embeddings.shape)
print("Embedding:", embeddings)


Embedding shape: torch.Size([1, 768])
Embedding: tensor([[-1.5627e-02, -3.0900e-01, -8.8487e-01, -1.8523e-01,  8.5564e-01,
          2.2566e-03, -3.6019e-02,  1.0778e+00, -4.0582e-01, -1.0489e-01,
         -5.3202e-01,  8.3078e-01, -1.2765e+00, -1.0724e-01, -5.3104e-01,
          3.6555e-01,  5.5917e-01, -6.9107e-01,  4.8836e-01, -2.3172e-01,
         -7.5056e-01, -3.9663e-01,  1.5038e-01,  4.8764e-01,  7.5376e-02,
         -4.3533e-01, -2.8811e-01,  3.9659e-01, -8.2029e-01, -7.9105e-01,
          5.1959e-02, -1.2434e-01,  3.4547e-02, -8.4559e-01, -8.3917e-01,
          4.2003e-01, -9.8905e-01,  4.4341e-02, -7.1568e-01,  3.9877e-01,
         -3.4694e-01, -2.2894e-01, -3.5760e-01,  9.6516e-01, -9.9262e-01,
         -1.4718e-01, -9.1589e-01,  4.8273e-01,  8.5412e-03, -2.8197e-01,
         -6.3935e-01,  4.2913e-01,  5.7181e-01, -3.0930e-01,  1.2736e-01,
          4.4413e-01, -8.0833e-02, -8.9270e-01, -8.1229e-01, -1.4990e-02,
          7.2220e-01,  2.6272e-01,  1.1360e-01, -2.4762e-01,  4

In [6]:
from sentence_transformers import SentenceTransformer

# This downloads the model and all required files
model = SentenceTransformer("intfloat/e5-base-v2")

# Save it locally
model.save("./e5-base-v2-local")


In [7]:
!zip -r e5-base-v2-local.zip e5-base-v2-local


'zip' is not recognized as an internal or external command,
operable program or batch file.


In [8]:
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("intfloat/e5-base-v2")
tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-base-v2")

model.save_pretrained("./e5-base-v2-local")
tokenizer.save_pretrained("./e5-base-v2-local")


('./e5-base-v2-local\\tokenizer_config.json',
 './e5-base-v2-local\\special_tokens_map.json',
 './e5-base-v2-local\\vocab.txt',
 './e5-base-v2-local\\added_tokens.json',
 './e5-base-v2-local\\tokenizer.json')