# Embedding Analysis

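This notebook explores embedding generation and similarity scoring in the RAG (retrieval-augmented generation) pipeline: it embeds a sample of document chunks with the `all-MiniLM-L6-v2` sentence-transformer model, compares chunks pairwise using cosine similarity, and scores a few test queries against the first chunk.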

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
import json

# Instantiate the sentence-embedding model used by the later cells
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')
print(
    f"Embedding dimension: {model.get_sentence_embedding_dimension()}"
)

In [None]:
# Load some chunks for analysis
# The file is JSON Lines: one JSON object per line. Decode it as UTF-8
# explicitly (the platform default encoding may differ) and skip blank lines,
# which json.loads would otherwise reject with a JSONDecodeError.
with open("../data/chunks.jsonl", "r", encoding="utf-8") as f:
    chunks = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(chunks)} chunks")

# Sample some chunks for visualization
sample_chunks = chunks[:100]  # First 100 chunks (fewer if the file has fewer)
sample_texts = [chunk['text'] for chunk in sample_chunks]

# Generate embeddings (one per sampled text, in the same order)
embeddings = model.encode(sample_texts)
print(f"Generated embeddings shape: {embeddings.shape}")

In [None]:
# Test similarity between different chunks
def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors.

    Computed as ``dot(a, b) / (||a|| * ||b||)``.

    Args:
        a: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        b: Second vector, same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b``. If either vector has
        zero norm the angle is undefined; 0.0 is returned in that case instead
        of the ``nan`` (plus RuntimeWarning) a 0/0 division would produce.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the degenerate case: a zero vector has no direction.
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

# Pairwise comparison among the first four chunks (or fewer, if fewer exist)
print("Similarity between chunks:")
n_compare = min(4, len(embeddings))
index_pairs = [
    (i, j)
    for i in range(n_compare)
    for j in range(i + 1, n_compare)
]
for i, j in index_pairs:
    sim = cosine_similarity(embeddings[i], embeddings[j])
    print(f"Chunk {i} vs Chunk {j}: {sim:.4f}")

In [None]:
# Embed a few sample queries and score each against the first chunk
test_queries = ["transformer architecture", "deep learning models", "computer vision"]

# Entry k of query_embeddings is the embedding of test_queries[k]
query_embeddings = model.encode(test_queries)

print("Query similarities to first chunk:")
first_chunk_embedding = embeddings[0]
for query, query_embedding in zip(test_queries, query_embeddings):
    sim = cosine_similarity(query_embedding, first_chunk_embedding)
    print(f"'{query}' similarity to first chunk: {sim:.4f}")