# Basic example from the getting-started guide

In [1]:
from langchain.chains import RetrievalQA

In [2]:
from langchain.document_loaders import TextLoader

# Loader for state_of_the_union.txt, configured with UTF-8 encoding.
loader = TextLoader(
    'state_of_the_union.txt',
    encoding='utf8',
)

In [3]:
from langchain.indexes import VectorstoreIndexCreator

In [4]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI

# Explicit LLM instance with temperature pinned to 0; later cells pass it to index.query.
myLLM = OpenAI(temperature=0)

# Explicit embedding model; handed to VectorstoreIndexCreator in the next cell.
myEmbeddings = OpenAIEmbeddings()

In [5]:
# Slow step (several seconds): embeddings are calculated for the split text chunks.
index = (
    VectorstoreIndexCreator(embedding=myEmbeddings)
    .from_loaders([loader])
)

In [6]:
# Ask the index a question and answer it with the explicitly configured LLM.
query = "What did the president say about Ketanji Brown Jackson"
index.query(question=query, llm=myLLM)

" The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers. He also said that she is a consensus builder and has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans."

In [7]:
# Same question, but the result is a dict that also reports the source(s) used.
# Pass the explicit LLM here as well, matching the plain index.query cell, so this
# cell does not depend on the library choosing a model implicitly.
# NOTE(review): newer LangChain releases appear to require `llm` for this call — confirm
# against the installed version.
query = "What did the president say about Ketanji Brown Jackson"
index.query_with_sources(query, llm=myLLM)

{'question': 'What did the president say about Ketanji Brown Jackson',
 'answer': " The president said that he nominated Circuit Court of Appeals Judge Ketanji Brown Jackson, one of the nation's top legal minds, to continue Justice Breyer's legacy of excellence, and that she has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.\n",
 'sources': 'state_of_the_union.txt'}

In [8]:
# Inspect the vector store behind the index: its concrete class, then its attribute names.
vs = index.vectorstore
print(type(vs), dir(vs), sep="\n")

<class 'langchain.vectorstores.chroma.Chroma'>
['_Chroma__query_collection', '_LANGCHAIN_DEFAULT_COLLECTION_NAME', '__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_client', '_client_settings', '_collection', '_embedding_function', '_persist_directory', '_similarity_search_with_relevance_scores', 'aadd_documents', 'aadd_texts', 'add_documents', 'add_texts', 'afrom_documents', 'afrom_texts', 'amax_marginal_relevance_search', 'amax_marginal_relevance_search_by_vector', 'as_retriever', 'asearch', 'asimilarity_search', 'asimilarity_search_by_vector', 'delete_collection', 'from_documents', 'from_texts', 'get', 'max_marginal_relevance_search', 'max_marg

In [9]:
# Wrap the index's vector store in a retriever and inspect its class and attribute names.
# NOTE(review): the name `re` would shadow the stdlib `re` module if that is imported in this kernel.
re = index.vectorstore.as_retriever()
print(type(re), dir(re), sep="\n")

<class 'langchain.vectorstores.base.VectorStoreRetriever'>
['Config', '__abstractmethods__', '__annotations__', '__class__', '__class_vars__', '__config__', '__custom_root_type__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__exclude_fields__', '__fields__', '__fields_set__', '__format__', '__ge__', '__get_validators__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__include_fields__', '__init__', '__init_subclass__', '__iter__', '__json_encoder__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__post_root_validators__', '__pre_root_validators__', '__pretty__', '__private_attributes__', '__reduce__', '__reduce_ex__', '__repr__', '__repr_args__', '__repr_name__', '__repr_str__', '__rich_repr__', '__schema_cache__', '__setattr__', '__setstate__', '__signature__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__try_update_forward_refs__', '__validators__', '__weakref__', '_abc_impl', '_calculate_keys', '_copy_and_set_values', '_decompo

# More in-depth version of the getting-started example

In [10]:
# Loads a single langchain.schema.Document whose metadata contains 'source' etc.
documents = loader.load()

In [11]:
from langchain.text_splitter import CharacterTextSplitter

# Split the loaded documents into chunks using chunk_size=1000 and chunk_overlap=200.
text_splitter = CharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
texts = text_splitter.split_documents(documents)

In [12]:
# 42 items, each of type langchain.schema.Document.
# (This bare expression is not the cell's last line, so its value is not displayed.)
len(texts)
# Show two consecutive chunks, separated by a ruler line, to make the overlap visible.
print(' '.join([texts[12].page_content, '\n=================\n', texts[13].page_content]))

Invest in America. Educate Americans. Grow the workforce. Build the economy from the bottom up  
and the middle out, not from the top down.  

Because we know that when the middle class grows, the poor have a ladder up and the wealthy do very well. 

America used to have the best roads, bridges, and airports on Earth. 

Now our infrastructure is ranked 13th in the world. 

We won’t be able to compete for the jobs of the 21st Century if we don’t fix that. 

That’s why it was so important to pass the Bipartisan Infrastructure Law—the most sweeping investment to rebuild America in history. 

This was a bipartisan effort, and I want to thank the members of both parties who worked to make it happen. 

We’re done talking about infrastructure weeks. 

We’re going to have an infrastructure decade. 

It is going to transform America and put us on a path to win the economic competition of the 21st Century that we face with the rest of the world—particularly with China. 
 It is going to transform

In [13]:
from langchain.embeddings import OpenAIEmbeddings
# Embedding model passed to Chroma.from_documents in the next cell.
embeddings = OpenAIEmbeddings()

In [14]:
from langchain.vectorstores import Chroma

# Build a Chroma vector store from the split chunks using the embedding model above.
db = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
)

In [15]:
# Expose the Chroma store as a retriever; the RetrievalQA chain below takes it as its retriever.
retriever = db.as_retriever()

In [16]:
# Build the question-answering chain by hand: chain type "stuff" on top of the Chroma retriever.
# Reuse myLLM (temperature=0) instead of constructing a fresh OpenAI() with default settings,
# so this cell uses the same explicitly configured model as the index.query cell.
qa = RetrievalQA.from_chain_type(llm=myLLM, chain_type="stuff", retriever=retriever)

In [17]:
# Run the same question through the hand-built RetrievalQA chain; the answer is the cell's output.
query = "What did the president say about Ketanji Brown Jackson"
qa.run(query)

" The President said that she is one of our nation's top legal minds and that she will continue Justice Breyer's legacy of excellence. He also said that she is a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers. He said she is a consensus builder and has received a broad range of support."