semantic_similarity + (1.0 - decay_rate) ^ hours_passed_from_last_access


In [1]:
from datetime import datetime, timedelta

import faiss
from langchain.docstore import InMemoryDocstore
from langchain.retrievers import TimeWeightedVectorStoreRetriever
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Low decay rate

In [2]:
# Define your embedding model
embeddings_model = OpenAIEmbeddings()
# Initialize the vectorstore as empty
embedding_size = 1536
index = faiss.IndexFlatL2(embedding_size)
vectorstore = FAISS(embeddings_model, index, InMemoryDocstore({}), {})
retriever = TimeWeightedVectorStoreRetriever(
    vectorstore=vectorstore, decay_rate=0.0000000000000000000000001, k=1
)

In [7]:
yesterday = datetime.now() - timedelta(days=1)
retriever.add_documents(
    [Document(page_content="hello world", metadata={"last_accessed_at": yesterday})]
)
retriever.add_documents([Document(page_content="hello foo")])

['75b60237-8377-404a-a695-7f76ee57dcbd']

In [6]:
yesterday

datetime.datetime(2024, 2, 20, 22, 25, 17, 377619)

In [8]:
# "Hello World" is returned first because it is most salient, and the decay rate is close to 0., meaning it's still recent enough
retriever.get_relevant_documents("hello world")

[Document(page_content='hello world', metadata={'last_accessed_at': datetime.datetime(2024, 2, 21, 22, 27, 4, 94798), 'created_at': datetime.datetime(2024, 2, 21, 22, 25, 17, 377619), 'buffer_idx': 0})]

# High decay rate


In [9]:
# Define your embedding model
embeddings_model = OpenAIEmbeddings()
# Initialize the vectorstore as empty
embedding_size = 1536
index = faiss.IndexFlatL2(embedding_size)
vectorstore = FAISS(embeddings_model, index, InMemoryDocstore({}), {})
retriever = TimeWeightedVectorStoreRetriever(
    vectorstore=vectorstore, decay_rate=0.999, k=1
)

In [10]:
yesterday = datetime.now() - timedelta(days=1)
retriever.add_documents(
    [Document(page_content="hello world", metadata={"last_accessed_at": yesterday})]
)
retriever.add_documents([Document(page_content="hello foo")])

['5df3269d-74f3-4e79-8629-6678188589c3']

In [11]:
# "Hello Foo" is returned first because "hello world" is mostly forgotten
retriever.get_relevant_documents("hello world")

[Document(page_content='hello foo', metadata={'last_accessed_at': datetime.datetime(2024, 2, 21, 22, 29, 31, 721749), 'created_at': datetime.datetime(2024, 2, 21, 22, 29, 21, 445575), 'buffer_idx': 1})]

# Virtual time

In [12]:
import datetime

from langchain.utils import mock_now

In [13]:
# Notice the last access time is that date time
with mock_now(datetime.datetime(2024, 2, 3, 10, 11)):
    print(retriever.get_relevant_documents("hello world"))

OverflowError: (34, 'Result too large')