In [140]:
import pandas as pd
import numpy as np

In [141]:
import ollama

In [142]:
from typing import List

In [143]:
# Ollama model name; get_embedding() uses it as the default `model` argument.
# NOTE(review): load_embeddings() reads 4096 vector columns, so presumably the
# model chosen here must produce 4096-dimensional embeddings — confirm before switching.
model = "granite3.1-dense:8b"
#model = "llama3-chatqa:8b"
#model = "nomic-embed-text"

In [144]:
def get_embedding(text: str, model: str=model) -> List[float]:
    """
    Embed a single piece of text with Ollama and return its vector.

    text is the string to embed.
    model is the Ollama model name; its default is the value the notebook-level
        `model` variable had when this cell was executed.

    Returns the first entry of the "embeddings" list in the Ollama response.
    """
    response = ollama.embed(model=model, input=text)
    vectors = response["embeddings"]
    return vectors[0]

In [147]:
def load_embeddings(filename: str, max_dim: int = 4095) -> dict[tuple[str, str, str], List[float]]:
    """
    Read the document embeddings and their keys from a CSV.

    filename is the path to a CSV with at least these named columns:
        "title", "mistress", "synopsis", "0", "1", ... up to str(max_dim).
    max_dim is the highest embedding column index to read; the default of 4095
        yields 4096-dimensional vectors.

    Returns a dict that maps each row's (title, mistress, synopsis) tuple to that
    row's embedding as a list, in file order. If two rows share the same key, the
    later row replaces the earlier one.

    Raises KeyError if any of the expected columns is missing from the CSV.
    """
    df = pd.read_csv(filename)

    vector_columns = [str(i) for i in range(max_dim + 1)]
    # Extract the whole embedding matrix in one step rather than doing one label
    # lookup per cell per row via iterrows().
    vectors = df[vector_columns].to_numpy().tolist()

    keys = zip(df["title"], df["mistress"], df["synopsis"])
    return dict(zip(keys, vectors))

In [148]:
# Load the stored vectors into a dict keyed by (title, mistress, synopsis).
# The path is relative, so the CSV must be reachable from the kernel's working directory.
document_embeddings = load_embeddings("k9_story_vectors_4096.csv")

In [149]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Compute the dot product of two vectors.

    The result equals the cosine similarity only when both inputs have length 1;
    nothing in this function checks or enforces that.
    """
    left = np.array(x)
    right = np.array(y)
    return np.dot(left, right)

In [150]:
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """
    Return the cosine of the angle between two equal-length vectors.

    a and b are sequences of numbers with the same length.

    Returns dot(a, b) / (|a| * |b|). When that denominator is zero (either vector
    is empty or all zeros) the angle is undefined and 0.0 is returned instead of
    dividing by zero.

    Raises ValueError if the lengths differ; zip() would otherwise truncate the dot
    product to the shorter vector while the norms still covered the full vectors.
    """
    if len(a) != len(b):
        raise ValueError(
            f"vectors must have the same length, got {len(a)} and {len(b)}"
        )

    dot_product = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x ** 2 for x in a) ** 0.5
    norm_b = sum(x ** 2 for x in b) ** 0.5

    denominator = norm_a * norm_b
    if denominator == 0:
        return 0.0
    return dot_product / denominator

In [151]:
def order_document_sections_by_query_similarity(query: str, contexts: dict[tuple[str, str, str], list[float]]) -> list[tuple[float, tuple[str, str, str]]]:
    """
    Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings
    to find the most relevant sections.

    query is the text to embed with get_embedding().
    contexts maps each document key to its pre-calculated embedding.

    Returns a list of (similarity, key) pairs, sorted by similarity in descending order.
    Pairs with equal similarity are ordered by comparing their keys, also descending,
    because the pairs are sorted as whole tuples.
    """
    query_embedding = get_embedding(query)

    document_similarities = sorted(
        (
            (vector_similarity(query_embedding, doc_embedding), doc_index)
            for doc_index, doc_embedding in contexts.items()
        ),
        reverse=True,
    )

    return document_similarities

In [152]:
# Example query: the cell output is the ranked list of (similarity, key) pairs, highest similarity first.
order_document_sections_by_query_similarity("What did you do to the Captain's parrot?", document_embeddings)

[(0.7971727128028463,
  ('The Armageddon Factor',
   'Romana',
   "Using their diminished size, the Doctor Master and Drax smuggle themselves inside me into the Shadow's lair. I was a Trojan dog! Drax again uses the stabiliser, this time to return them to their normal size. The Doctor Master snatches the partially assembled Key and the final segment and disappears with mistress Romana and me in the TARDIS, assembling the Key to save the day!")),
 (0.7891117512992056,
  ('The Pirate Planet',
   'Romana',
   "My master used the TARDIS to disrupt Zanak's materialisation around Earth while the Mentiads sabotaged the engines. I killed the Polyphase Avatron, the Captain's deadly robot parrot. The Captain's plan failed and my master sent Calufrax off into the time/space continuum where we later picked it up. I was proud to have been part of this mission and I was glad that we were able to save many planets from being destroyed by Queen Xanxia.")),
 (0.7842829937513309,
  ('School Reunion',
  