In [2]:
from langchain.llms import OpenAI

### In-mem cache

In [3]:
import langchain
from langchain.cache import InMemoryCache
langchain.llm_cache = InMemoryCache()

In [4]:
# To make the caching really obvious, lets use a slower model.
llm = OpenAI(model_name="text-davinci-002", n=2, best_of=2)

In [5]:
%%time
# The first time, it is not yet in cache, so it should take longer
llm("Tell me a joke")

CPU times: user 15.1 ms, sys: 3.45 ms, total: 18.6 ms
Wall time: 1.81 s


"\n\nWhy don't scientists trust atoms?\nBecause they make up everything."

In [6]:
%%time
# The second time it is, so it goes faster
llm("Tell me a joke")

CPU times: user 144 µs, sys: 95 µs, total: 239 µs
Wall time: 244 µs


"\n\nWhy don't scientists trust atoms?\nBecause they make up everything."

### SQLite cache

In [7]:
!rm .langchain.db

rm: cannot remove '.langchain.db': No such file or directory


In [8]:
# We can do the same thing with a SQLite cache
from langchain.cache import SQLiteCache
langchain.llm_cache = SQLiteCache(database_path=".langchain.db")

In [9]:
%%time
# The first time, it is not yet in cache, so it should take longer
llm("Tell me a joke")

CPU times: user 27.9 ms, sys: 98 µs, total: 28 ms
Wall time: 2.95 s


'\n\nWhy did the chicken cross the road?\n\nTo get to the other side.'

In [10]:
%%time
# The second time it is, so it goes faster
llm("Tell me a joke")

CPU times: user 2.74 ms, sys: 3.2 ms, total: 5.94 ms
Wall time: 4.98 ms


'\n\nWhy did the chicken cross the road?\n\nTo get to the other side.'

In [16]:
%%time
# Not exact match, should take long again
llm("Tell me a nice joke")

CPU times: user 3.05 ms, sys: 0 ns, total: 3.05 ms
Wall time: 2.59 ms


'\n\nWhy did the chicken cross the road?\n\nTo get to the other side.'

### RedisSemanticCache

In [23]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.cache import RedisSemanticCache


langchain.llm_cache = RedisSemanticCache(
    redis_url="redis://localhost:6379",
    embedding=OpenAIEmbeddings()
)


In [25]:
%%time
# The first time, it is not yet in cache, so it should take longer
llm("Tell me a joke")

CPU times: user 25.7 ms, sys: 6.31 ms, total: 32 ms
Wall time: 2.41 s


'\n\nWhy did the chicken cross the road?\n\nTo get to the other side.'

In [26]:
%%time
# The second time, while not a direct hit, the question is semantically similar to the original question,
# so it uses the cached result!
llm("Tell me one joke")

CPU times: user 10.7 ms, sys: 706 µs, total: 11.4 ms
Wall time: 312 ms


'\n\nWhy did the chicken cross the road?\n\nTo get to the other side.'

In [48]:
# explicit playing with the embeddings
import numpy as np

emb = langchain.llm_cache.embedding

pv1 = np.array(emb.embed_query("Tell me a joke"), dtype=float)
pv2 = np.array(emb.embed_query("Tell me one joke"), dtype=float)
pvZ = np.array(emb.embed_query("I once saw a platypus cooking dinner"), dtype=float)

print('pv1 . pv1 = %.4f', np.dot(pv1, pv1))
print('pv2 . pv2 = %.4f', np.dot(pv2, pv2))
print('pvZ . pvZ = %.4f', np.dot(pvZ, pvZ))
print('')
print('pv1 . pv2 = %.4f', np.dot(pv1, pv2))
print('pv1 . pvZ = %.4f', np.dot(pv1, pvZ))

pv1 . pv1 = %.4f 0.9999999850451275
pv2 . pv2 = %.4f 0.999999981578638
pvZ . pvZ = %.4f 1.0000000035189067

pv1 . pv2 = %.4f 0.9666529021629459
pv1 . pvZ = %.4f 0.7769778358531974


### GPTCache

#### Exact match

In [49]:
import gptcache
from gptcache.processor.pre import get_prompt
from gptcache.manager.factory import get_data_manager
from langchain.cache import GPTCache

# Avoid multiple caches using the same file, causing different llm model caches to affect each other
i = 0
file_prefix = "data_map"

def init_gptcache_map(cache_obj: gptcache.Cache):
    global i
    cache_path = f'{file_prefix}_{i}.txt'
    cache_obj.init(
        pre_embedding_func=get_prompt,
        data_manager=get_data_manager(data_path=cache_path),
    )
    i += 1

langchain.llm_cache = GPTCache(init_gptcache_map)

In [51]:
%%time
# The first time, it is not yet in cache, so it should take longer
llm("Tell me a joke")

CPU times: user 15.4 ms, sys: 0 ns, total: 15.4 ms
Wall time: 973 ms


'\n\nWhy did the chicken cross the road?\n\nTo get to the other side.'

In [52]:
%%time
# The second time it is, so it goes faster
llm("Tell me a joke")

CPU times: user 938 µs, sys: 0 ns, total: 938 µs
Wall time: 952 µs


'\n\nWhy did the chicken cross the road?\n\nTo get to the other side.'

In [79]:
# Some inspection to play with this gptcache hidden in the belly of the langchain stuff
gpCache = langchain.llm_cache.gptcache_dict["[('_type', 'openai'), ('best_of', 2), ('frequency_penalty', 0), ('logit_bias', {}), ('max_tokens', 256), ('model_name', 'text-davinci-002'), ('n', 2), ('presence_penalty', 0), ('request_timeout', None), ('stop', None), ('temperature', 0.7), ('top_p', 1)]"]
# Force file write :)
gpCache.flush()

# In this case you can check that...
gpcache = list(langchain.llm_cache.gptcache_dict.values())[0]
gpcache.embedding_func('aaa')
# ... this is the identity (exact cache!)

'aaa'

#### Similarity caching

In [88]:
import gptcache
from gptcache.processor.pre import get_prompt
from gptcache.manager.factory import get_data_manager
from langchain.cache import GPTCache
from gptcache.manager import get_data_manager, CacheBase, VectorBase
from gptcache import Cache
from gptcache.embedding import Onnx
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation

# Avoid multiple caches using the same file, causing different llm model caches to affect each other
i = 0
file_prefix = "data_map"
llm_cache = Cache()


def init_gptcache_map(cache_obj: gptcache.Cache):
    global i
    cache_path = f'{file_prefix}_{i}.txt'
    onnx = Onnx()
    cache_base = CacheBase('sqlite')
    vector_base = VectorBase('faiss', dimension=onnx.dimension)
    data_manager = get_data_manager(cache_base, vector_base, max_size=10, clean_size=2)
    cache_obj.init(
        pre_embedding_func=get_prompt,
        embedding_func=onnx.to_embeddings,
        data_manager=data_manager,
        similarity_evaluation=SearchDistanceEvaluation(),
    )
    i += 1

langchain.llm_cache = GPTCache(init_gptcache_map)

In [89]:
# this is what the similarity cache uses under the hood:
from gptcache.embedding import Onnx
Onnx().to_embeddings('Today is a sunny day.')
# a (768,) ndarray

array([-2.43441667e-01,  2.71626980e-01, -1.36018773e-01, -1.08390815e+00,
        5.44992136e-01,  1.36763834e+00, -5.77469025e-01, -2.02428940e-01,
       -3.35534077e-01,  2.22559914e-01, -1.42762621e+00,  1.84675753e-02,
        9.58186403e-01,  5.22948060e-01, -2.30463110e-01, -2.61095968e-01,
        6.06494715e-01, -1.94902738e-01, -7.64107425e-01,  9.42087896e-01,
       -1.16278707e+00,  2.88439989e-01,  5.45596303e-02, -2.44581941e-01,
       -1.10461770e+00, -7.75067637e-01,  3.64013982e-01,  9.96997796e-01,
       -2.71601271e-01, -1.62657753e-01, -6.61603360e-01, -4.46847323e-02,
        5.54294888e-01,  3.27579556e-02, -5.79971708e-02, -5.30550126e-02,
        5.74585985e-01, -3.45220606e-01, -6.97960613e-01, -3.38827940e-01,
        1.13999067e+00,  2.61081062e-01,  4.74922201e-01, -1.98334778e+00,
       -1.98838400e-01,  4.93143775e-01, -1.98280379e-01, -7.35078990e-01,
       -4.20276333e-01, -2.00566373e-01, -7.73501173e-01, -2.71565950e-01,
        1.41680222e-01, -

In [91]:
%%time
# The first time, it is not yet in cache, so it should take longer
llm("Tell me a joke")

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17.6/17.6 MB 3.6 MB/s eta 0:00:00
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4
CPU times: user 2.06 s, sys: 78.4 ms, total: 2.14 s
Wall time: 19.9 s


'\n\nWhy did the chicken cross the road?\n\nTo get to the other side.'

In [92]:
%%time
# This is an exact match, so it finds it in the cache
llm("Tell me a joke")

CPU times: user 1.71 s, sys: 44.5 ms, total: 1.75 s
Wall time: 265 ms


'\n\nWhy did the chicken cross the road?\n\nTo get to the other side.'

In [93]:
%%time
# This is not an exact match, but semantically within distance so it hits!
llm("Tell me joke")

CPU times: user 1.45 s, sys: 4.36 ms, total: 1.45 s
Wall time: 232 ms


'\n\nWhy did the chicken cross the road?\n\nTo get to the other side.'

### SQLAlchemy

In [110]:
# You can use SQLAlchemyCache to cache with any SQL database supported by SQLAlchemy.

from langchain.cache import SQLAlchemyCache
from sqlalchemy import create_engine

engine = create_engine("postgresql://postgres:cachepwd@172.17.0.2:5432/postgres")
langchain.llm_cache = SQLAlchemyCache(engine)

In [112]:
%%time
# The first time, it is not yet in cache, so it should take longer
llm("Tell me a joke")

CPU times: user 15.4 ms, sys: 8.04 ms, total: 23.4 ms
Wall time: 758 ms


'\n\nWhy did the chicken cross the road?\n\nTo get to the other side.'

In [113]:
%%time
# This is an exact match, so it finds it in the cache
llm("Tell me a joke")

CPU times: user 4.6 ms, sys: 0 ns, total: 4.6 ms
Wall time: 3.69 ms


'\n\nWhy did the chicken cross the road?\n\nTo get to the other side.'

In [114]:
%%time
# This is not an exact match, but semantically within distance so it hits!
llm("Tell me joke")

CPU times: user 17 ms, sys: 0 ns, total: 17 ms
Wall time: 1.03 s


'\n\nWhy did the chicken cross the road?\n\nTo get to the other side.'

##### SQLAlchemy with custom schema

In [116]:
# You can define your own declarative SQLAlchemyCache child class to customize the schema used for caching. For example, to support high-speed fulltext prompt indexing with Postgres, use:

from sqlalchemy import Column, Integer, String, Computed, Index, Sequence
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy_utils import TSVectorType
from langchain.cache import SQLAlchemyCache

Base = declarative_base()


class FulltextLLMCache(Base):  # type: ignore
    """Postgres table for fulltext-indexed LLM Cache"""

    __tablename__ = "llm_cache_fulltext"
    id = Column(Integer, Sequence('cache_id'), primary_key=True)
    prompt = Column(String, nullable=False)
    llm = Column(String, nullable=False)
    idx = Column(Integer)
    response = Column(String)
    prompt_tsv = Column(TSVectorType(), Computed("to_tsvector('english', llm || ' ' || prompt)", persisted=True))
    __table_args__ = (
        Index("idx_fulltext_prompt_tsv", prompt_tsv, postgresql_using="gin"),
    )

engine = create_engine("postgresql://postgres:cachepwd@172.17.0.2:5432/postgres")
langchain.llm_cache = SQLAlchemyCache(engine, FulltextLLMCache)

  Base = declarative_base()


Note: this uses (not investigated too much) the [TSVECTOR](https://docs.sqlalchemy.org/en/20/dialects/postgresql.html#full-text-search) Postgres type which should be the basis for a full-text search (does not seem to relate to "vector search", however. Indeed, out of the box, this is not semantical similarity)

In [120]:
%%time
# This is not an exact match, but semantically within distance so it hits!
llm("Tell me joke")

CPU times: user 1.84 ms, sys: 73 µs, total: 1.92 ms
Wall time: 1.65 ms


"\n\nWhy don't scientists trust atoms?\nBecause they make up everything."

In [122]:
%%time
# This is not an exact match, but semantically within distance so it hits!
llm("Tell me a joke")

CPU times: user 1.46 ms, sys: 115 µs, total: 1.58 ms
Wall time: 1.43 ms


"\n\nWhy couldn't the bicycle stand up by itself? Because it was...two tired!"