In [14]:
import json
import voyageai
import re
import math
from typing import Optional, Any, List, Dict, Tuple
from dotenv import load_dotenv
from pathlib import Path

load_dotenv()
client = voyageai.Client()
console = Path("/home/jx-creator/Projects/console")
tutorials = console / "tutorials"

In [15]:
# Chunk by a set number of characters
def chunk_by_char(text, chunk_size=300, overlap=30):
   chunks: list = []
   start_index: int = 0
   text_length = len(text)

   while start_index < text_length:
      end_index: int = min(start_index + chunk_size, text_length)

      chunk_text = text[start_index:end_index]
      chunks.append(chunk_text)

      start_index: int = (
         end_index - overlap if end_index < text_length else text_length
      )

   return chunks

In [16]:
# Chunk by sentence
def chunk_by_sentence(text, max_per_chunk=5, overlap=1):
   sentences = re.split(r"(?<=[.!?])\s+", text)

   chunks: list = []
   start: int = 0
   length: int = len(sentences)

   while start < length:
      end: int = min(start + max_per_chunk, length)

      current = sentences[start:end]
      chunks.append(" ".join(current))

      start += max_per_chunk - overlap

      if start < 0:
         start = 0
   
   return chunks

In [17]:
# Chunk by section
def chunk_by_section(document):
   pattern = r"\n## "
   return re.split(pattern, document)

In [18]:
with open(tutorials / "rag" / "report.md", "r") as f:
   text = f.read()

chunks = chunk_by_section(text)

In [19]:
chunks[2]

'Table of Contents\n\n1.  Executive Summary\n2.  Table of Contents\n3.  Methodology\n4.  Section 1: Medical Research - Understanding XDR-471 Syndrome\n5.  Section 2: Software Engineering - Project Phoenix Stability Enhancements\n6.  Section 3: Financial Analysis - Q3 Performance and Outlook\n7.  Section 4: Scientific Experimentation - Characterization of Material Composite XT-5\n8.  Section 5: Legal Developments - Navigating IP Precedents and Regulatory Shifts\n9.  Section 6: Product Engineering - Finalizing Model Zircon-5 Specifications\n10. Section 7: Historical Research - Re-evaluating the Galveston Accords (1921)\n11. Section 8: Project Management - Progress on Project Cerberus Phase 2B\n12. Section 9: Pharmaceutical Development - Compound CTX-204b Phase IIa Update\n13. Section 10: Cybersecurity Analysis - Incident Response Report\n14. Future Directions\n'

In [20]:
def generate_embedding(chunks, model="voyage-3-large", input_type="query"):
   is_list: bool = isinstance(chunks, list)
   input: list = chunks if is_list else [chunks]
   result = client.embed(input, model=model, input_type=input_type)
   return result.embeddings if is_list else result.embeddings[0]


In [None]:
#class VectorIndex:
#   def __init__(
#         self,
#         distance_metric: str = "cosine",
#         embedding_fn=None,
#   ):
#      self.vectors: List[List[float]] = []
#      self.documents: List[Dict[str, Any]] = []
#      self.vec_dim: Optional[int] = None
#      if distance_metric not in ["cosine", "euclidian"]:
#         raise ValueError("distance_metric must be one of either 'cosine' or 'euclidean'")
#      self.metric: str = distance_metric
#      self.embedding = embedding_fn
#   
#   def add_document(self, document: Dict[str, Any]):
#      if not self.embedding:
#         raise ValueError(
#            "The embedding function was not provided during initialization."
#         )
#      if not isinstance(document, dict):
#         raise TypeError("Document must be a dictionary")
#      if "content" not in document:
#         raise ValueError("The document dictionary must contain a 'content' key.")
#      
#      content = document["content"]
#      if not isinstance(content, str):
#         raise TypeError("Document 'content' must be a string.")
#      
#      vector = self.embedding(content)
#      self.add_vector(vector=vector, document=document)

In [21]:
# VectorIndex implementation
import math
from typing import Optional, Any, List, Dict, Tuple


class VectorIndex:
    def __init__(
        self,
        distance_metric: str = "cosine",
        embedding_fn=None,
    ):
        self.vectors: List[List[float]] = []
        self.documents: List[Dict[str, Any]] = []
        self._vector_dim: Optional[int] = None
        if distance_metric not in ["cosine", "euclidean"]:
            raise ValueError("distance_metric must be 'cosine' or 'euclidean'")
        self._distance_metric = distance_metric
        self._embedding_fn = embedding_fn

    def add_document(self, document: Dict[str, Any]):
        if not self._embedding_fn:
            raise ValueError(
                "Embedding function not provided during initialization."
            )
        if not isinstance(document, dict):
            raise TypeError("Document must be a dictionary.")
        if "content" not in document:
            raise ValueError(
                "Document dictionary must contain a 'content' key."
            )

        content = document["content"]
        if not isinstance(content, str):
            raise TypeError("Document 'content' must be a string.")

        vector = self._embedding_fn(content)
        self.add_vector(vector=vector, document=document)

    def search(
        self, query: Any, k: int = 1
    ) -> List[Tuple[Dict[str, Any], float]]:
        if not self.vectors:
            return []

        if isinstance(query, str):
            if not self._embedding_fn:
                raise ValueError(
                    "Embedding function not provided for string query."
                )
            query_vector = self._embedding_fn(query)
        elif isinstance(query, list) and all(
            isinstance(x, (int, float)) for x in query
        ):
            query_vector = query
        else:
            raise TypeError(
                "Query must be either a string or a list of numbers."
            )

        if self._vector_dim is None:
            return []

        if len(query_vector) != self._vector_dim:
            raise ValueError(
                f"Query vector dimension mismatch. Expected {self._vector_dim}, got {len(query_vector)}"
            )

        if k <= 0:
            raise ValueError("k must be a positive integer.")

        if self._distance_metric == "cosine":
            dist_func = self._cosine_distance
        else:
            dist_func = self._euclidean_distance

        distances = []
        for i, stored_vector in enumerate(self.vectors):
            distance = dist_func(query_vector, stored_vector)
            distances.append((distance, self.documents[i]))

        distances.sort(key=lambda item: item[0])

        return [(doc, dist) for dist, doc in distances[:k]]

    def add_vector(self, vector, document: Dict[str, Any]):
        if not isinstance(vector, list) or not all(
            isinstance(x, (int, float)) for x in vector
        ):
            raise TypeError("Vector must be a list of numbers.")
        if not isinstance(document, dict):
            raise TypeError("Document must be a dictionary.")
        if "content" not in document:
            raise ValueError(
                "Document dictionary must contain a 'content' key."
            )

        if not self.vectors:
            self._vector_dim = len(vector)
        elif len(vector) != self._vector_dim:
            raise ValueError(
                f"Inconsistent vector dimension. Expected {self._vector_dim}, got {len(vector)}"
            )

        self.vectors.append(list(vector))
        self.documents.append(document)

    def _euclidean_distance(
        self, vec1: List[float], vec2: List[float]
    ) -> float:
        if len(vec1) != len(vec2):
            raise ValueError("Vectors must have the same dimension")
        return math.sqrt(sum((p - q) ** 2 for p, q in zip(vec1, vec2)))

    def _dot_product(self, vec1: List[float], vec2: List[float]) -> float:
        if len(vec1) != len(vec2):
            raise ValueError("Vectors must have the same dimension")
        return sum(p * q for p, q in zip(vec1, vec2))

    def _magnitude(self, vec: List[float]) -> float:
        return math.sqrt(sum(x * x for x in vec))

    def _cosine_distance(self, vec1: List[float], vec2: List[float]) -> float:
        if len(vec1) != len(vec2):
            raise ValueError("Vectors must have the same dimension")

        mag1 = self._magnitude(vec1)
        mag2 = self._magnitude(vec2)

        if mag1 == 0 and mag2 == 0:
            return 0.0
        elif mag1 == 0 or mag2 == 0:
            return 1.0

        dot_prod = self._dot_product(vec1, vec2)
        cosine_similarity = dot_prod / (mag1 * mag2)
        cosine_similarity = max(-1.0, min(1.0, cosine_similarity))

        return 1.0 - cosine_similarity

    def __len__(self) -> int:
        return len(self.vectors)

    def __repr__(self) -> str:
        has_embed_fn = "Yes" if self._embedding_fn else "No"
        return f"VectorIndex(count={len(self)}, dim={self._vector_dim}, metric='{self._distance_metric}', has_embedding_fn='{has_embed_fn}')"

In [22]:
embeddings = generate_embedding(chunks)
store = VectorIndex()
for embedding, chunk in zip(embeddings, chunks):
   store.add_vector(embedding, {"content": chunk})

In [23]:
user_embedding = generate_embedding("What did the software engineering dept do last year?")
results = store.search(user_embedding, 2)
for doc, distance in results:
   print(distance, "\n", doc["content"][0:200], "\n")

0.48331830503085815 
 Section 2: Software Engineering - Project Phoenix Stability Enhancements

The Software Engineering division dedicated considerable effort to improving the stability and performance of the core systems 

0.4888823735702068 
 Future Directions

This year's cross-domain insights underscore the interconnectedness of our diverse research and operational activities. The stability enhancements achieved in Software Engineering ( 

