# 🧠 Exploratory Data Analysis (EDA) of Document Embeddings

In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import pandas as pd

# Load the embeddings and document chunks.
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so these paths should only ever point at files from a trusted pipeline.
with open('../embeddings/embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)

with open('../embeddings/doc_chunks.pkl', 'rb') as f:
    doc_chunks = pickle.load(f)

# Convert to numpy array
embedding_matrix = np.array(embeddings)

# The analysis below reads shape[1], reduces over axis=1 and computes pairwise
# similarities, all of which need a 2-D matrix. Fail here with a clear
# message instead of an opaque IndexError on shape[1].
if embedding_matrix.ndim != 2:
    raise ValueError(
        "Expected a 2-D (n_chunks, dim) embedding matrix, got shape "
        f"{embedding_matrix.shape}; the embeddings file may be empty or "
        "contain vectors of differing lengths."
    )

# Chunks are looked up by embedding row index further down, so a length
# mismatch would silently pair an embedding with the wrong text.
if len(doc_chunks) != len(embedding_matrix):
    print(
        f"Warning: {len(doc_chunks)} document chunks but "
        f"{len(embedding_matrix)} embeddings; chunk lookups by row index may be wrong."
    )

print(f"Loaded {len(embedding_matrix)} embeddings with dimension {embedding_matrix.shape[1]}")


## 📊 Embedding Distribution & Dimensionality

In [None]:
# Overlay value histograms (with KDE curves) for the leading embedding
# dimensions on a single figure; at most five dimensions are drawn.
plt.figure(figsize=(12, 6))
n_dims_shown = min(5, embedding_matrix.shape[1])
# Transposing yields one row per dimension, so each item is a full column.
for i, dim_values in enumerate(embedding_matrix.T[:n_dims_shown]):
    sns.histplot(dim_values, kde=True, bins=30, label=f'Dim {i+1}')
plt.title("Distribution of Embedding Values Across Dimensions")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.legend()
plt.show()


## 🔍 Cosine Similarity Matrix

In [None]:
# Compute pairwise cosine similarity between all embeddings
similarity_matrix = cosine_similarity(embedding_matrix)

# Display a heatmap for a subset only: the first rows/columns of the matrix,
# capped at 20 so the plot stays readable.
subset_size = min(20, similarity_matrix.shape[0])
plt.figure(figsize=(10, 8))
sns.heatmap(similarity_matrix[:subset_size, :subset_size], cmap="coolwarm", annot=False)
# Report the real subset in the title: it is the first `subset_size` chunks in
# file order (not a ranking), and may be fewer than 20 for a small corpus.
plt.title(f"Cosine Similarity Matrix (First {subset_size} Chunks)")
plt.show()


## 🧩 Inspect Most Similar Document Chunks

In [None]:
# Find the most similar pair of different chunks.
if similarity_matrix.shape[0] < 2:
    # A pair needs two distinct chunks; with fewer there is nothing to compare.
    print("Need at least two chunks to find a most similar pair.")
else:
    # Mask self-similarity on a copy so the shared similarity_matrix is left
    # intact. The mask is -inf rather than 0 because cosine similarity can be
    # negative: with 0 on the diagonal, a matrix whose off-diagonal entries
    # are all negative would report a chunk as most similar to itself.
    masked_similarity = similarity_matrix.copy()
    np.fill_diagonal(masked_similarity, -np.inf)
    max_sim_idx = np.unravel_index(np.argmax(masked_similarity), masked_similarity.shape)
    print(f"Most similar pair: Chunk {max_sim_idx[0]} and Chunk {max_sim_idx[1]}")
    print("\n--- Chunk 1 ---\n")
    print(doc_chunks[max_sim_idx[0]])
    print("\n--- Chunk 2 ---\n")
    print(doc_chunks[max_sim_idx[1]])
    print(f"Cosine Similarity: {similarity_matrix[max_sim_idx]:.4f}")


## 📈 Embedding Norm Distribution

In [None]:
# Per-row L2 (Euclidean) norm: one value per embedding.
embedding_norms = np.linalg.norm(embedding_matrix, axis=1)

# Histogram with KDE of the norms; labels are set on the Axes that
# histplot returns.
plt.figure(figsize=(8, 4))
norm_axes = sns.histplot(embedding_norms, kde=True, bins=30)
norm_axes.set_title("L2 Norm Distribution of Embeddings")
norm_axes.set_xlabel("Norm")
norm_axes.set_ylabel("Frequency")
plt.show()
