In [4]:
from langchain.docstore import InMemoryDocstore
from langchain.embeddings import LlamaCppEmbeddings
from langchain.retrievers import TimeWeightedVectorStoreRetriever
from langchain.schema import BaseLanguageModel, Document
from langchain.vectorstores import FAISS
import numpy as np
import faiss

In [5]:
# Location of the quantized 13B LLaMA ggml weights file handed to LlamaCppEmbeddings.
# NOTE(review): this is an absolute, machine-specific path; change it to wherever
# your own copy of the model lives before running the notebook.
model_bin = "/media/captdishwasher/Samshmung/horenbergerb/llama/llama.cpp/models/llama/converted/llama_13b_ggml_q4_3.bin"

In [9]:
def normalize(v):
    """Scale a vector to unit Euclidean (L2) length.

    Args:
        v: A sequence or array of numbers (for example the list returned by
            an embedding model).

    Returns:
        A NumPy array equal to ``v`` divided by its L2 norm. A vector whose
        norm is zero has no direction to preserve, so it is returned
        unscaled, but still as a NumPy array so callers always receive the
        same type.
    """
    # Coerce once so both branches return an ndarray regardless of input type.
    vec = np.asarray(v)
    norm = np.linalg.norm(vec)
    if norm == 0:
        return vec
    return vec / norm

# Single shared llama.cpp embedding model, constructed once when this cell
# runs and reused by every call to embedding_function below.
embeddings_model = LlamaCppEmbeddings(
    model_path=model_bin,
    n_ctx=2048,
    n_threads=6,
    n_batch=512,
)


def embedding_function(query):
    """Embed ``query`` with ``embeddings_model`` and pass the result through ``normalize``."""
    return normalize(embeddings_model.embed_query(query))

def relevance_score_fn(score: float) -> float:
    """Map a squared Euclidean distance between unit vectors to a similarity in [0, 1].

    The raw embeddings have no fixed magnitude, so ``embedding_function``
    scales each one to unit length. FAISS then reports the squared Euclidean
    distance between such vectors, which ranges from 0 (same direction) to
    4 (opposite directions). This function rescales that range linearly so
    that 0 -> 1.0 and 4 -> 0.0; for unit vectors that is the same as
    ``(1 + cosine_similarity) / 2``.

    Args:
        score: Squared Euclidean distance, expected to lie in [0, 4].

    Returns:
        The similarity, clamped to [0, 1].
    """
    similarity = (4.0 - score) / 4
    # Floating-point rounding can leave a distance marginally outside [0, 4];
    # clamp so the documented [0, 1] range always holds.
    return min(1.0, max(0.0, similarity))


def create_new_memory_retriever(embedding_size: int = 5120, k: int = 15):
    """Create a new vector store retriever unique to the agent.

    Every call builds a fresh FAISS L2 index and an empty in-memory
    docstore, so retrievers returned by separate calls share no documents.

    Args:
        embedding_size: Dimensionality of the vectors produced by
            ``embedding_function``. The default of 5120 matches the
            ``n_embd`` value reported when the 13B model is loaded; pass a
            different value when embedding with another model.
        k: Forwarded to ``TimeWeightedVectorStoreRetriever`` as its ``k``
            setting.

    Returns:
        A ``TimeWeightedVectorStoreRetriever`` wrapping the new vector store.
    """
    index = faiss.IndexFlatL2(embedding_size)
    vectorstore = FAISS(
        embedding_function,
        index,
        InMemoryDocstore({}),
        {},
        relevance_score_fn=relevance_score_fn,
    )
    return TimeWeightedVectorStoreRetriever(
        vectorstore=vectorstore,
        other_score_keys=["importance"],
        k=k,
    )

llama.cpp: loading model from /media/captdishwasher/Samshmung/horenbergerb/llama/llama.cpp/models/llama/converted/llama_13b_ggml_q4_3.bin
llama_model_load_internal: format     = ggjt v1 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 5120
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 40
llama_model_load_internal: n_layer    = 40
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 6 (mostly Q4_3)
llama_model_load_internal: n_ff       = 13824
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 13B
llama_model_load_internal: ggml ctx size =  73.73 KB
llama_model_load_internal: mem required  = 11359.03 MB (+ 3216.00 MB per state)
llama_init_from_file: kv self size  = 3200.00 MB
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA =

In [15]:
# Start from a brand-new retriever so this experiment has its own index.
memory_retriever = create_new_memory_retriever()

# Sample memories to index; each string becomes one Document.
memories = [
    "peepee poopoo",
    "stinky doodoo",
    "flowers",
    "filthy and putrid",
    "Bob Ross",
]
documents = [Document(page_content=text) for text in memories]

# Kept as the cell's final expression so the notebook displays what it returns.
memory_retriever.add_documents(documents)


llama_print_timings:        load time =  3236.76 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  3104.24 ms /     8 tokens (  388.03 ms per token)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings:       total time =  3105.98 ms

llama_print_timings:        load time =  3236.76 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  2970.63 ms /     7 tokens (  424.38 ms per token)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings:       total time =  2972.03 ms

llama_print_timings:        load time =  3236.76 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  2541.01 ms /     3 tokens (  847.00 ms per token)
llama_print_timings

['0843d5e8-c1f1-4450-ab1c-b9f8e960cf22',
 'a60630e3-cb21-4519-8799-856c252de0fb',
 'f51935a2-1487-40ad-98c4-e9495444ca5e',
 '2abdaf55-193d-42e0-8996-98b2ff34f0fd',
 '0ed19bf7-6281-44d4-8519-7d770d59a626']

In [16]:
# Ask the retriever for its salient documents for this query text. The cell
# output is a dict keyed by buffer index whose values are (Document, score) pairs.
memory_retriever.get_salient_docs("yucky, nasty")


llama_print_timings:        load time =  3236.76 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  2819.97 ms /     6 tokens (  470.00 ms per token)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings:       total time =  2821.72 ms


{3: (Document(page_content='filthy and putrid', metadata={'last_accessed_at': datetime.datetime(2023, 4, 29, 16, 32, 38, 358177), 'created_at': datetime.datetime(2023, 4, 29, 16, 32, 38, 358177), 'buffer_idx': 3}),
  0.9033975228667259),
 1: (Document(page_content='stinky doodoo', metadata={'last_accessed_at': datetime.datetime(2023, 4, 29, 16, 32, 38, 358177), 'created_at': datetime.datetime(2023, 4, 29, 16, 32, 38, 358177), 'buffer_idx': 1}),
  0.8958556354045868),
 2: (Document(page_content='flowers', metadata={'last_accessed_at': datetime.datetime(2023, 4, 29, 16, 32, 38, 358177), 'created_at': datetime.datetime(2023, 4, 29, 16, 32, 38, 358177), 'buffer_idx': 2}),
  0.8700922429561615),
 0: (Document(page_content='peepee poopoo', metadata={'last_accessed_at': datetime.datetime(2023, 4, 29, 16, 32, 38, 358177), 'created_at': datetime.datetime(2023, 4, 29, 16, 32, 38, 358177), 'buffer_idx': 0}),
  0.8667032569646835),
 4: (Document(page_content='Bob Ross', metadata={'last_accessed_at

In [17]:
# Build a plain FAISS store (no time-weighted retriever on top) with the same
# ingredients used inside create_new_memory_retriever, to inspect the raw
# relevance scores.
embedding_size = 5120
index = faiss.IndexFlatL2(embedding_size)
vectorstore = FAISS(
    embedding_function,
    index,
    InMemoryDocstore({}),
    {},
    relevance_score_fn=relevance_score_fn,
)

# Index the sample documents, then display the scored matches for a test query.
vectorstore.add_documents(documents)
vectorstore.similarity_search_with_relevance_scores('stinky yucky')


llama_print_timings:        load time =  3236.76 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  3107.75 ms /     8 tokens (  388.47 ms per token)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings:       total time =  3109.13 ms

llama_print_timings:        load time =  3236.76 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  2948.27 ms /     7 tokens (  421.18 ms per token)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings:       total time =  2949.61 ms

llama_print_timings:        load time =  3236.76 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  2635.03 ms /     3 tokens (  878.34 ms per token)
llama_print_timings

[(Document(page_content='filthy and putrid', metadata={}), 0.7402871549129486),
 (Document(page_content='stinky doodoo', metadata={}), 0.7228175699710846),
 (Document(page_content='peepee poopoo', metadata={}), 0.7214316427707672),
 (Document(page_content='Bob Ross', metadata={}), 0.6299707293510437)]

In [None]:
# Sanity check: compare the distance FAISS reports with one computed by hand.
# NOTE(review): memory_content1 and memory_content2 are not defined in any cell
# visible in this notebook; define them before running this cell on a fresh kernel.
# Each embedding is wrapped in a list so the resulting float32 array has one row.
embedding1 = np.array([embedding_function(memory_content1)], dtype=np.float32)
embedding2 = np.array([embedding_function(memory_content2)], dtype=np.float32)
# Look up the 4 nearest stored vectors to embedding1 in the module-level `index`.
# NOTE(review): the -1 indices / 3.4e38 scores in the saved output presumably
# mean the index held fewer than 4 vectors at the time — confirm.
scores, indices = index.search(embedding1, k=4)
print(scores)
print(indices)

# Squared Euclidean distance between the two embeddings, computed directly for
# comparison with the scores printed above.
print(np.square(np.linalg.norm(embedding1 - embedding2)))


llama_print_timings:        load time =  2283.82 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  2186.93 ms /     8 tokens (  273.37 ms per token)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings:       total time =  2189.45 ms


[[0.0000000e+00 2.2389171e-01 3.4028235e+38 3.4028235e+38]]
[[ 0  1 -1 -1]]
0.22389188



llama_print_timings:        load time =  2283.82 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  2043.72 ms /     7 tokens (  291.96 ms per token)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings:       total time =  2045.95 ms
