# 📓 Embedding Generation

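This notebook handles **data loading, chunking, embedding generation, and saving embeddings**: it loads the 20 Newsgroups training split, splits each post into overlapping text chunks, embeds every chunk with several local GGUF models through `llama_cpp`, and saves one `.npy` embedding matrix per model.  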
  
## 🔧 Setup and Imports  

In [None]:
import sys  
from pathlib import Path  
  
import numpy as np  
import pandas as pd  
from tqdm.auto import tqdm  
  
from sklearn.datasets import fetch_20newsgroups  
from langchain.text_splitter import RecursiveCharacterTextSplitter  
from llama_cpp import Llama  
  
# Path setup
# Assumes the notebook is executed from a directory one level below the
# project root (e.g. <root>/notebooks), so the parent of the cwd is the root.
PROJECT_ROOT = Path.cwd().parent
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Project-local import: only resolvable once PROJECT_ROOT is on sys.path.
from vectorvet.core.utils import timer

# Create directories
MODEL_DIR = PROJECT_ROOT / "models"  # read-only here: model files are looked up in this directory
EMB_DIR = PROJECT_ROOT / "embeddings"  # output location for the saved .npy matrices
EMB_DIR.mkdir(exist_ok=True, parents=True)

## 🗃️ Load and Chunk Dataset 

In [None]:
# Load the 20 Newsgroups training split with headers, footers and quotes
# removed, then drop posts that are empty or whitespace-only.
news = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))
texts = [doc for doc in news.data if doc.strip()]

# Recursive splitter configured with chunk_size=1024 and chunk_overlap=50.
splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=50)

# One record per chunk.
# NOTE: "original_index" is the position of the post inside the filtered
# `texts` list, not inside `news.data`.
chunked_data = [
    {"original_index": doc_pos, "chunk_index": piece_pos, "chunk": piece}
    for doc_pos, doc in enumerate(tqdm(texts, desc="Chunking texts"))
    for piece_pos, piece in enumerate(splitter.split_text(doc))
]

# Tabular view of the chunks plus a flat list of chunk strings for embedding.
chunked_df = pd.DataFrame(chunked_data)
chunk_texts = chunked_df["chunk"].tolist()
print(f"✅ Total chunks created: {len(chunk_texts)}")

## 📌 Generate and Save Embeddings

In [None]:
# GGUF model files (looked up in MODEL_DIR) used to embed the chunks
MODELS = [
    "Phi-3-mini-4k-instruct-q4.gguf",
    "Llama-3.2-1B-Instruct.Q6_K.gguf",
    "Llama-3.1-8b-instruct-q6_k.gguf",
    "phi-2.Q6_K.gguf",
]

# Verify models exist -- fail fast before any expensive embedding work starts
for fname in MODELS:
    model_path = MODEL_DIR / fname
    if not model_path.exists():
        raise FileNotFoundError(f"Model file not found: {model_path}")

# Generate embeddings: one (len(chunk_texts), n_embd) float32 matrix per model
for fname in MODELS:
    model_path = MODEL_DIR / fname
    model_name = model_path.stem
    out_file = EMB_DIR / f"{model_name}_20news_chunks.npy"

    # A file from a previous run acts as a cache; delete it to force a re-run.
    if out_file.exists():
        print(f"✔ {out_file.name} already exists – skipping")
        continue

    print(f"→ Embedding with {model_name} …")
    llm = Llama(
        model_path=str(model_path),
        n_gpu_layers=-1,  # llama-cpp-python: -1 offloads all layers to the GPU
        embedding=True,
    )

    try:
        # Loop-invariant: query the embedding width once instead of per chunk.
        n_embd = llm.n_embd()
        embs = np.zeros((len(chunk_texts), n_embd), dtype=np.float32)
        skipped = 0

        with timer(f"Embedding generation for {model_name}"):
            for i, txt in enumerate(tqdm(chunk_texts, desc=f"Embedding ({model_name})")):
                emb = np.asarray(llm.embed(txt))

                # A 2-D result is treated as one vector per row and mean-pooled
                # into a single vector for the chunk.
                if emb.ndim > 1:
                    emb = emb.mean(axis=0)

                emb = emb.flatten()

                # Rows that are skipped here stay all-zero in the output matrix.
                if emb.shape[0] != n_embd:
                    print(f"⚠️ Warning: Skipping text {i} due to embedding size mismatch: {emb.shape}")
                    skipped += 1
                    continue

                embs[i] = emb
    finally:
        # Release the model before the next one is loaded; otherwise the old
        # instance stays alive until `llm` is rebound, i.e. while the next
        # model is already being loaded. `close()` is only present in newer
        # llama-cpp-python releases, hence the getattr fallback.
        close_model = getattr(llm, "close", None)
        if callable(close_model):
            close_model()
        del llm

    if skipped:
        print(f"⚠️ {skipped} of {len(chunk_texts)} chunks were skipped and left as zero vectors")

    # Write to a temporary file and rename it into place, so an interrupted
    # save cannot leave a truncated file that the cache check above would
    # later accept. A file handle is passed so numpy keeps the ".tmp" name.
    tmp_file = out_file.with_name(out_file.name + ".tmp")
    with open(tmp_file, "wb") as fh:
        np.save(fh, embs)
    tmp_file.replace(out_file)
    print(f"✔ Saved embeddings to {out_file}")