# 📚 VectorVet Demo Notebook  
  
This notebook demonstrates embedding generation, loading embeddings, computing metrics, and summarizing results using the custom `VectorVet` toolkit. 

---    
## 🔧 Setup and Imports  

In [None]:
# Standard Libraries  
import sys  
from pathlib import Path  
  
# Data Science Libraries  
import numpy as np  
import pandas as pd  
from tqdm.auto import tqdm  
  
# Text Processing  
from sklearn.datasets import fetch_20newsgroups  
from langchain.text_splitter import RecursiveCharacterTextSplitter  
  
# Embedding Generation  
from llama_cpp import Llama  
  
# Custom VectorVet Modules
# The project root is taken to be the parent of the kernel's working directory,
# i.e. this assumes the notebook is started from a folder one level below the
# repository root.
PROJECT_ROOT = Path.cwd().parent
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))
  
from vectorvet.core.loader import load_multiple_embeddings
from vectorvet.core.metrics import run_all_metrics
from vectorvet.core.summarizer import summarize_to_dataframe
from vectorvet.core.utils import timer
  
# Display Configuration
# A max_columns of None lifts pandas' limit on displayed columns, so wide
# summary tables are rendered without truncation.
pd.options.display.max_columns = None

---  
  
## 🗃️ Load and Chunk the Dataset  
  
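We'll use the well-known 20 Newsgroups dataset for this demo and split each post into chunks of at most 1,024 characters (with a 50-character overlap). To keep the demo fast, only the first 1,000 chunks are embedded.  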

In [None]:
# Fetch the 20 Newsgroups training split, asking scikit-learn to strip
# headers, footers and quoted replies from every post.
news = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))

# Pair every non-blank post with its position in `news.data`. Enumerating
# *before* filtering keeps "original_index" usable for looking up the matching
# entry in `news.data` / `news.target`; enumerating the filtered list instead
# would shift every index after the first blank post.
indexed_texts = [
    (doc_idx, doc) for doc_idx, doc in enumerate(news.data) if doc.strip()
]
texts = [doc for _, doc in indexed_texts]

# Initialize a text splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=50)

# Chunk the dataset: one record per chunk, remembering which post it came from
# and its position within that post.
chunked_data = []
for doc_idx, text in tqdm(indexed_texts, desc="Chunking texts"):
    for chunk_idx, chunk in enumerate(splitter.split_text(text)):
        chunked_data.append({
            "original_index": doc_idx,
            "chunk_index": chunk_idx,
            "chunk": chunk
        })

# Create DataFrame of chunks
chunked_df = pd.DataFrame(chunked_data)
chunk_texts = chunked_df["chunk"].tolist()

print(f"✅ Total chunks created: {len(chunk_texts)}")

In [None]:
# Cap the number of chunks that get embedded so the demo finishes in reasonable
# time. Set MAX_CHUNKS to None to embed every chunk (a None slice bound keeps
# the whole list). Note that only `chunk_texts` is shortened; `chunked_df`
# still holds every chunk.
MAX_CHUNKS = 1000
chunk_texts = chunk_texts[:MAX_CHUNKS]

---  
  
## 📌 Generate Embeddings for Each Model  
  
We'll generate embeddings with several local GGUF models and save each model's embedding matrix to disk for further analysis. Models whose embedding file already exists are skipped.  

In [None]:
# Define paths
MODEL_DIR = PROJECT_ROOT / "models"
EMB_DIR = PROJECT_ROOT / "embeddings"
EMB_DIR.mkdir(exist_ok=True, parents=True)

# List of models to embed with (file names are resolved relative to MODEL_DIR)
MODELS = [
    "Phi-3-mini-4k-instruct-q4.gguf",
    "Llama-3.2-1B-Instruct.Q6_K.gguf",
    "Llama-3.1-8b-instruct-q6_k.gguf",
    "phi-2.Q6_K.gguf",
]

# Generate embeddings: one .npy matrix per model. An existing output file acts
# as a cache, so re-running the cell only embeds the models that are missing.
for fname in MODELS:
    model_path = MODEL_DIR / fname
    model_name = model_path.stem
    out_file = EMB_DIR / f"{model_name}_20news_chunks.npy"

    if out_file.exists():
        print(f"✔ {out_file.name} already exists – skipping")
        continue

    print(f"→ Embedding with {model_name} …")
    llm = Llama(
        model_path=str(model_path),
        n_gpu_layers=-1,
        embedding=True,
    )

    # The embedding width is fixed per model, so query it once up front.
    n_embd = llm.n_embd()
    embs = np.zeros((len(chunk_texts), n_embd), dtype=np.float32)
    # Tracks which rows actually received an embedding; rows that are skipped
    # below would otherwise stay all-zero and be saved as if they were real.
    embedded = np.zeros(len(chunk_texts), dtype=bool)

    with timer(f"Embedding generation for {model_name}"):
        for i, txt in enumerate(tqdm(chunk_texts, desc=f"Embedding ({model_name})")):
            emb = llm.embed(txt)
            emb = np.array(emb)

            # A multi-dimensional result (presumably one vector per token —
            # confirm against the llama_cpp docs) is averaged over its first
            # axis to get a single vector for the chunk.
            if emb.ndim > 1:
                emb = emb.mean(axis=0)  # Average if needed

            emb = emb.flatten()

            if emb.shape[0] != n_embd:
                print(f"Warning: Skipping text {i} due to embedding size mismatch: {emb.shape}")
                continue

            embs[i] = emb
            embedded[i] = True

    # Drop the rows of skipped chunks instead of persisting zero vectors, which
    # would distort downstream statistics (a zero vector has no defined cosine
    # similarity). After dropping, row i no longer maps to chunk_texts[i].
    n_skipped = int((~embedded).sum())
    if n_skipped:
        print(f"Warning: dropping {n_skipped} skipped chunk(s) from {out_file.name}")
        embs = embs[embedded]

    np.save(out_file, embs)
    print(f"✔ Saved embeddings to {out_file}")

    # Drop the reference now so the model can be released before the next one
    # is constructed; otherwise the old instance stays bound to `llm` until the
    # next Llama(...) call has already finished loading.
    del llm

---  
  
## 📊 Compute Metrics and Summarize Results  
  
Now let's load the embeddings we've saved, compute intrinsic embedding metrics, and summarize results into a tidy dataframe.  

In [None]:
# Find saved embeddings: map each model name (the file stem with the
# "_20news_chunks" suffix stripped) to the path of its .npy file.
# Path.glob yields entries in a filesystem-dependent order, so the matches are
# sorted to make the order of the discovered models reproducible across
# machines and runs.
files = {
    p.stem.split("_20news_chunks")[0]: str(p)
    for p in sorted(EMB_DIR.glob("*_20news_chunks.npy"))
}

print("🗂️ Embedding sets detected:", list(files.keys()))

# Load embeddings
# NOTE(review): presumably returns a dict of model name -> embedding matrix
# (it is iterated with .items() below) — confirm against vectorvet.core.loader.
embs = load_multiple_embeddings(files)

# Compute all metrics, timing each model separately
results = {}
for name, mat in embs.items():
    print(f"\n📐 Running metrics for: {name}")
    with timer(f"Metrics computation for {name}"):
        results[name] = run_all_metrics(mat)

# Summarize results into DataFrame
summary_df = summarize_to_dataframe(results)

# Display summarized metrics nicely (kept as the last expression so the
# notebook renders the styled table)
summary_df.style.format(precision=3)

---  
  
## ✅ Final Results  
  
The resulting table summarizes the embedding quality across all models, making it easy to compare and interpret metrics like isotropy, hubness, clustering quality, and pairwise cosine similarity.  
  
🎉 **You're all set!**  