# Document Question Answering

An example of using Chroma DB and LangChain to do question answering over documents.

In [18]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import VectorDBQA
from langchain.document_loaders import TextLoader

import sqlalchemy
import pandas as pd

In [13]:
# import functions from a file stored in a different directory
import sys
sys.path.append('/home/ubuntu/work/therapeutic_accelerator/scripts/utils')
sys.path.append('/home/ubuntu/work/therapeutic_accelerator/scripts/database')

from db_tools import db_connection
from utils import import_config

config, keys = import_config()

engine = db_connection(
    password=keys["postgres"], host=config["database"]["host"])

## Load documents

Load documents to do question answering over. If you want to do this over your documents, this is the section you should replace.

In [22]:
# Retreive Full Text from Table
table_name = "fulltext"

sql = sqlalchemy.text(
    f""" 
    SELECT * FROM {table_name} LIMIT 10;
    """
)

with engine.connect() as conn:
    query = conn.execute(sql)
    full_text = pd.DataFrame(query.fetchall())

# full_text.head()

example = full_text.loc[0, 'text']

# loader = TextLoader('state_of_the_union.txt')
# documents = loader.load()

## Split documents

Split documents into small chunks. This is so we can find the most relevant chunks for a query and pass only those into the LLM.

In [33]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=0)

# texts = text_splitter.split_documents(example)

texts = text_splitter.split_text(example)
texts

['OPEN ACCESS EDITED BY\n\n\nAlexander Nikolaevich Orekhov \nGeorgy Guria \nAnton G Kutikhin \nIvan Melnikov \nZufar Gabbasov zufargabbasov@yandex.ru \nMelnikov I \nKozlov S \nPogorelova O \nTripoten M \nKhamchieva L \nSaburova O \nAvtaeva Y \nZvereva M \nMatroze E \nKuznetsova T \nProkofieva L \nBalakhonova T \nGabbasov Z \n\nInstitute for Aterosclerosis Research\nRussian Academy of Medical Sciences\nNational Research Centre for Haematology\nRussia REVIEWED BY, Russia, Russia',
 'CORRESPONDENCE\nSPECIALTY SECTION\nDepartment of Ultrasound Diagnostics\nLaboratory of Cell Hemostasis, National Medical Research Centre of Cardiology named after academician E.I. Chazov of the Ministry of Health of the Russian Federation, Moscow, Russia, Laboratory of Gas Exchange, Biomechanics and Barophysiology, State Scientific Center of the Russian Federation -The Institute of Biomedical Problems of the Russian Academy of Sciences, Moscow, Russia, Laboratory of Problems of Atherosclerosis, National Medic

## Initialize ChromaDB

Create embeddings for each chunk and insert into the Chroma vector database.

In [35]:
os.environ["OPENAI_API_KEY"] = keys["openai"]

embeddings = OpenAIEmbeddings()
vectordb = Chroma.from_documents(texts, embeddings)

NameError: name 'os' is not defined

## Create the chain

Initialize the chain we will use for question answering.

In [None]:
qa = VectorDBQA.from_chain_type(
    llm=OpenAI(), chain_type="stuff", vectorstore=vectordb)

## Ask questions!

Now we can use the chain to ask questions!

In [None]:
query = "What did the president say about Ketanji Brown Jackson"
qa.run(query)