In [1]:
import daft
import numpy as np
import vexpresso
from langchain.chains import VectorDBQA
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from vexpresso import transformation
from vexpresso.retriever import NumpyRetriever

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the source transcript from disk; `documents` feeds the splitter cell below.
source_path = 'data/state_of_the_union.txt'
loader = TextLoader(source_path)
documents = loader.load()

In [3]:
# Break the loaded documents into chunks (chunk_size=1000, no overlap between chunks).
splitter_options = {"chunk_size": 1000, "chunk_overlap": 0}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
texts = text_splitter.split_documents(documents)

In [4]:
# Column-oriented payload for vexpresso: one "texts" column holding every chunk's page_content.
raw = dict(texts=[chunk.page_content for chunk in texts])

In [5]:
# Sanity check: display the first chunk produced by the splitter.
texts[0]

Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.', metadata={'source':

In [6]:
# Build a vexpresso collection from the raw column dict ({"texts": [...]}).
collection = vexpresso.create(data=raw)

2023-06-07 00:57:46.113 | INFO     | daft.context:runner:88 - Using PyRunner


In [7]:
# Preview the collection before embedding (the recorded output lists five rows).
collection.show(5)

texts Utf8,vexpresso_index Int64
"Madam Speaker, Madam Vice President, our First Lady and S...",0
Groups of citizens blocking tanks with their bodies. Ever...,1
Putin’s latest attack on Ukraine was premeditated and unp...,2
We are inflicting pain on Russia and supporting the peopl...,3
And tonight I am announcing that we will join our allies ...,4


In [8]:
# Embedding client used by `embed_fn` below.
# NOTE(review): no API key is passed here, so credentials presumably come from
# the environment — confirm before running on a fresh kernel.
embeddings = OpenAIEmbeddings()

In [9]:
def embed_fn(content) -> np.ndarray:
    """Embed a batch of texts with the notebook-level `embeddings` client.

    Parameters
    ----------
    content
        The batch of texts to embed, forwarded unchanged to
        ``embeddings.embed_documents``. Presumably a list of strings taken
        from the "texts" column — TODO confirm what vexpresso passes in.

    Returns
    -------
    np.ndarray
        Whatever ``embed_documents`` returns, wrapped in ``np.array``. In the
        recorded run each row's vector shows up as a float64 array of
        shape (1536,).
    """
    # `np` comes from the notebook's import cell rather than a mid-notebook import.
    vectors = embeddings.embed_documents(content)
    return np.array(vectors)

In [10]:
# Describe the embedding step first, then run it.
# Note: this rebinds `collection`, so later cells see the version that carries
# the extra "text_embeddings" column.
embedding_plan = collection.embed("texts", embedding_fn=embed_fn, to="text_embeddings")
collection = embedding_plan.execute()

In [11]:
# Preview again: the recorded output now includes a `text_embeddings` column
# holding one np.ndarray per row.
collection.show(5)

texts Utf8,vexpresso_index Int64,text_embeddings Python
"Madam Speaker, Madam Vice President, our First Lady and S...",0,"<np.ndarray shape=(1536,) dtype=float64>"
Groups of citizens blocking tanks with their bodies. Ever...,1,"<np.ndarray shape=(1536,) dtype=float64>"
Putin’s latest attack on Ukraine was premeditated and unp...,2,"<np.ndarray shape=(1536,) dtype=float64>"
We are inflicting pain on Russia and supporting the peopl...,3,"<np.ndarray shape=(1536,) dtype=float64>"
And tonight I am announcing that we will join our allies ...,4,"<np.ndarray shape=(1536,) dtype=float64>"


In [12]:
# Expose the collection to LangChain; the result is passed as `vectorstore=` below.
# NOTE(review): the two positional arguments look like the content column and
# the embedding column respectively — confirm against the vexpresso API.
vecdb = collection.to_langchain("texts", "text_embeddings")

In [13]:
# Question-answering chain: an OpenAI LLM over the vexpresso-backed vector store.
# NOTE(review): newer LangChain releases appear to favour RetrievalQA over
# VectorDBQA — check the pinned version before migrating.
llm = OpenAI()
qa = VectorDBQA.from_chain_type(llm=llm, chain_type="stuff", vectorstore=vecdb)



In [14]:
# Ask a question about the transcript; the chain's answer is the cell output.
query = "What did the president say about Ketanji Brown Jackson"
qa.run(query)

' The president said that Ketanji Brown Jackson is one of the nation’s top legal minds, a former top litigator in private practice, a former federal public defender, from a family of public school educators and police officers, a consensus builder, and has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.'