# Document Question Answering

An example of using Chroma DB and LangChain to do question answering over documents.

In [None]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from langchain.embeddings import OpenAIEmbeddings

from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

from langchain.llms import OpenAI
from langchain.chains import VectorDBQA
from langchain.document_loaders import TextLoader

import sqlalchemy
import pandas as pd

In [None]:
# Make the project's helper modules importable: they live in directories
# outside this notebook's own import path.
import sys

sys.path.extend([
    '/home/ubuntu/work/therapeutic_accelerator/scripts/utils',
    '/home/ubuntu/work/therapeutic_accelerator/scripts/database',
])

from db_tools import db_connection
from utils import import_config

# import_config() returns the settings mapping and the secrets mapping.
config, keys = import_config()

# Database handle used by the query cell below.
engine = db_connection(
    host=config["database"]["host"],
    password=keys["postgres"],
)

## Load documents

Load documents to do question answering over. If you want to do this over your own documents, this is the section you should replace.

In [None]:
# Retrieve full text from the table
table_name = "fulltext"

# table_name is a fixed literal here. Identifiers cannot be bound as query
# parameters, so never build this string from user-supplied input.
sql = sqlalchemy.text(
    f""" 
    SELECT * FROM {table_name} LIMIT 10;
    """
)

with engine.connect() as conn:
    query = conn.execute(sql)
    # Take the column names from the result metadata instead of relying on
    # pandas to infer them from the row objects; the 'text' lookup below
    # depends on the columns being named.
    column_names = list(query.keys())
    full_text = pd.DataFrame(query.fetchall(), columns=column_names)

# full_text.head()

# Fail with a clear message instead of an opaque KeyError on row label 0.
if full_text.empty:
    raise ValueError(f"No rows returned from table {table_name!r}")

# The first row's 'text' value is the single document used by later cells.
example = full_text.loc[0, 'text']

# loader = TextLoader('state_of_the_union.txt')
# documents = loader.load()

## Split documents

Split documents into small chunks. This is so we can find the most relevant chunks for a query and pass only those into the LLM.

In [None]:
# Split on blank lines first, then on single newlines, with chunk_size=1000
# and no overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n'],
    chunk_overlap=0,
    chunk_size=1000,
)

# `example` is a plain string, so wrap it in a list for create_documents
# (the commented-out alternative was split_documents(example)).
texts = text_splitter.create_documents([example])
texts

In [None]:
from llama_index import Document, VectorStoreIndex
import re

# Put each chunk on a single line: newlines become spaces ...
text_list = [p.page_content.replace("\n", " ") for p in texts]
# ... then every run of two or more whitespace characters collapses to one
# space. The pattern is a raw string: "\s" in a normal string literal is an
# invalid escape sequence and triggers a warning on current Python versions.
documents = [Document(text=re.sub(r"\s\s+", " ", t)) for t in text_list]

## Initialize ChromaDB

Create embeddings for each chunk and insert them into the Chroma vector database.

In [None]:
# # create T5 model for summarization
# from transformers import (
#     T5Tokenizer,
#     TFT5Model,
#     TFT5ForConditionalGeneration)

# T5tokens = T5Tokenizer.from_pretrained("t5-base")

# # tokenize text
# T5tokens.tokenize(example)


In [None]:
from transformers import AutoModel, AutoTokenizer

# Load the "allenai/specter" checkpoint: the model weights and the tokenizer
# that goes with them. Both names are used by the embedding classes below.
model = AutoModel.from_pretrained("allenai/specter")
tokenizer = AutoTokenizer.from_pretrained("allenai/specter")

class specter_embeddings:
    """Embed text with a tokenizer/model pair using the first-token vector.

    The embedding of each input sequence is the hidden state of its first
    token (index 0 along the sequence axis of ``last_hidden_state``).
    """

    def __init__(self, tokenizer, model):
        """Store the callable tokenizer and model used by ``embed_documents``."""
        self.tokenizer = tokenizer
        self.model = model

    def embed_documents(self, text):
        """Return one first-token embedding per input sequence.

        Args:
            text: The input forwarded unchanged to the tokenizer (a string or
                a batch of strings, whatever the tokenizer accepts).

        Returns:
            ``last_hidden_state[:, 0, :]`` of the model output, i.e. a 2-D
            array/tensor with one row per tokenized sequence.
        """
        # Fixed: the original tokenized an undefined name (`item`) instead of
        # the `text` argument, so every call raised NameError.
        inputs = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )
        result = self.model(**inputs)
        # Keep only position 0 of every sequence in the batch.
        embeddings = result.last_hidden_state[:, 0, :]
        return embeddings


In [None]:
# Bind the loaded tokenizer and model into a single embedding helper.
spect_embeds = specter_embeddings(tokenizer=tokenizer, model=model)

In [None]:
# # SPECTER (allenai/specter) is a document-embedding model for scientific papers.
# # Use a pipeline as a high-level helper
# from transformers import pipeline

# pipe = pipeline("feature-extraction", model="allenai/specter", tokenizer="allenai/specter")
# inputs = pipe.tokenizer(example, padding=True, truncation=True, return_tensors="pt", max_length=512)
# pipe.model(**inputs).last_hidden_state[:, 0, :]

# pipe.model(**pipe.tokenizer(example))

In [None]:
import chromadb

# Create a Chroma client with default settings (no arguments are passed).
client = chromadb.Client()

In [None]:
import os

# Export the OpenAI key from the `keys` mapping returned by import_config()
# as an environment variable. Presumably the OpenAI LLM constructed later
# reads it from there — confirm.
os.environ["OPENAI_API_KEY"] = keys["openai"]

from chromadb.api.types import Documents, EmbeddingFunction, Embeddings
from chromadb.utils import embedding_functions


In [None]:
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings

class MyEmbeddingFunction(EmbeddingFunction):
    """Embedding function backed by a tokenizer/model pair.

    Each text is embedded as the hidden state of its first token and returned
    as a plain list of floats, so the result matches the ``Embeddings``
    return annotation rather than being a list of framework tensors.
    """

    def __init__(self, model, tokenizer):
        """Store the model and tokenizer; note the order is (model, tokenizer)."""
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, input: Documents) -> Embeddings:
        """Embed a batch of documents when the instance is called directly.

        The parameter is named ``input`` (shadowing the builtin on purpose):
        presumably newer Chroma releases pass it by that keyword while older
        ones pass it positionally — confirm against the installed version.
        """
        return self.embed_documents(input)

    def embed_documents(self, texts: Documents) -> Embeddings:
        """Return one embedding (a list of floats) per entry in ``texts``."""
        embeddings = []
        for text in texts:
            # Use the instance's own tokenizer/model; the original silently
            # ignored them and read the notebook-level globals instead.
            inputs = self.tokenizer(
                text.replace("\n", " "),
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512,
            )
            result = self.model(**inputs)
            # [:, 0, :] keeps the first token of each sequence; a single text
            # yields one row, so take row 0 and convert it to Python floats.
            first_token = result.last_hidden_state[:, 0, :]
            embeddings.append(first_token[0].detach().tolist())

        return embeddings

    def embed_query(self, text):
        """Embed a single query string with the same procedure as documents.

        Presumably LangChain vector stores call this at search time —
        confirm against the installed LangChain version.
        """
        return self.embed_documents([text])[0]

In [None]:
# A second default-configured client, plus a collection whose embeddings are
# produced by the custom embedding function.
chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection(
    name="fulltext_specter",
    embedding_function=MyEmbeddingFunction(model=model, tokenizer=tokenizer),
)

In [None]:
# MyEmbeddingFunction.__init__ takes (model, tokenizer); the original call
# passed (tokenizer, model) positionally. Keyword arguments keep the two from
# being transposed.
vectordb = Chroma.from_documents(
    texts,
    MyEmbeddingFunction(model=model, tokenizer=tokenizer),
    persist_directory='/home/ubuntu/work/therapeutic_accelerator/chroma/fulltext',
)

: 

In [None]:
# specter_ef = embedding_functions.HuggingFaceEmbeddingFunction(
#     api_key=keys['huggingFace'], # Replace with your own HuggingFace API key
#     model_name="allenai/specter"
# )

# huggingface_collection = client.create_collection(name="specter_embeddings", embedding_function=specter_ef)

# # embeddings = OpenAIEmbeddings()
# # def embedding_fn(text):
# #     return spect_embeds.embed_documents(text)

# vectordb = Chroma.from_documents(texts, specter_ef, persist_directory='/home/ubuntu/work/therapeutic_accelerator/chroma/fulltext')

In [None]:
# NOTE(review): in the code visible here `specter_ef` is only assigned inside
# the commented-out cell above, so this line raises NameError unless that
# assignment is restored — confirm whether this cell is still needed.
dir(specter_ef)

## Create the chain

Initialize the chain we will use for question answering.

In [None]:
# Build the question-answering chain: chain type "stuff", a default-configured
# OpenAI LLM, and the Chroma store created above as the vector store.
qa = VectorDBQA.from_chain_type(
    vectorstore=vectordb,
    chain_type="stuff",
    llm=OpenAI(),
)

## Ask questions!

Now we can use the chain to ask questions!

In [None]:
# NOTE(review): this question looks left over from the State of the Union
# example (see the commented-out TextLoader in the loading cell); the data
# indexed here comes from the "fulltext" table — confirm and replace with a
# question about that corpus.
query = "What did the president say about Ketanji Brown Jackson"
# Run the chain on the question; the bare call displays the answer in the notebook.
qa.run(query)