In [2]:
import os
import openai

# Read the OpenAI API key from the environment instead of hardcoding it in the
# notebook; os.environ[...] raises KeyError if OPENAI_API_KEY is not set.
openai.api_key  = os.environ['OPENAI_API_KEY']

In [28]:
# print(openai.api_key)

# STR Chatbot

In [None]:
# pip install langchain
# pip install openai
# pip install pypdf
# pip install tiktoken
# pip install lark
# pip install scikit-learn
# pip install chromadb
# pip install panel

In [None]:
# !source ~/.bash_profile

### Overview 

Recall the overall workflow for retrieval augmented generation (RAG):

![](images/lesson-2-overview.png)

## Setup
`Document Loading`, `Splitting` and `Storage`.

### Read PDFs

In [5]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load PDFs
loaders = [
    # Duplicate documents on purpose - messy data
    PyPDFLoader("docs/NBK268647.pdf"),
    PyPDFLoader("docs/NBK268647.pdf"),
    PyPDFLoader("docs/NBK1438.pdf"),
    PyPDFLoader("docs/NBK1513.pdf")
]

# Flatten the documents returned by every loader into a single list.
docs = []
for loader in loaders:
    docs.extend(loader.load())

# Split the loaded documents into overlapping chunks. chunk_size and
# chunk_overlap are measured with the splitter's length function
# (character count unless a different one is supplied).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150
)

splits = text_splitter.split_documents(docs)

# Only the last expression of a cell is displayed, so report both counts here
# (the original bare `len(docs)` in the middle of the cell showed nothing).
len(docs), len(splits)

In [6]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding object that is passed to the Chroma vector store when it is built below.
embedding = OpenAIEmbeddings()

### Store database

In [7]:
from langchain.vectorstores import Chroma

In [8]:
# Directory handed to Chroma as `persist_directory` when the vector store is created.
persist_directory = 'docs/chroma/'

In [9]:
import shutil

# Remove old database files, if any, so the store is rebuilt from scratch.
# shutil.rmtree replaces the shell-only `!rm -rf`, so the cell also works where
# `rm` is unavailable, and it reuses `persist_directory` instead of repeating
# the path. ignore_errors=True mirrors `rm -rf`, which does not fail when the
# directory does not exist.
shutil.rmtree(persist_directory, ignore_errors=True)

In [10]:
# Build the Chroma vector store from the chunks in `splits`, embedding them with
# `embedding` and using `persist_directory` as the store's directory.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [11]:
# Print the number of entries in the underlying Chroma collection.
# NOTE(review): `_collection` is a private attribute and may change between
# langchain versions — confirm it exists in the installed version.
print(vectordb._collection.count())

222



Let's load our vectorDB. 

In [12]:
# import os
# import openai

# openai.api_key  = os.environ['OPENAI_API_KEY']

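The code below pins the OpenAI model version used when the course was filmed (`gpt-3.5-turbo-0301`) until its deprecation date, which was set for September 2023 at the time of writing; from that date on it falls back to `gpt-3.5-turbo`.
LLM responses always vary somewhat, but they may differ significantly when a different model version is used.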

In [13]:
import datetime

# The dated snapshot is selected only before the cut-over date; from
# 2 September 2023 onwards the generic model name is used instead.
cutoff_date = datetime.date(2023, 9, 2)
current_date = datetime.datetime.now().date()
llm_name = "gpt-3.5-turbo-0301" if current_date < cutoff_date else "gpt-3.5-turbo"
print(llm_name)

gpt-3.5-turbo


In [14]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Re-open the store from its directory instead of rebuilding it from documents:
# the same directory and a fresh embedding object are handed to the Chroma
# constructor.
embedding = OpenAIEmbeddings()
persist_directory = 'docs/chroma/'
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

In [15]:
# Sanity check after re-opening the store: print the collection's entry count.
# NOTE(review): `_collection` is a private attribute — confirm it exists in the
# installed langchain version.
print(vectordb._collection.count())

222


In [16]:
question = "What are major disorders addressed in this corpus?"
# k=3 requests three results. Note that `docs` is rebound here: from this point
# on it holds the search results, not the documents loaded from the PDFs.
docs = vectordb.similarity_search(question,k=3)
len(docs)

3

In [17]:
# Display the first document returned by the similarity search.
docs[0]

Document(page_content='Table 3. continued from previous page.\nFeatureFrequency\nComment\nNearly all Common\xa01Infrequent\nDisinhibition ● Impulsivity, socially unacceptable behavior, risk taking\nApathy ● Indifference,  lack of interest\nDelusions/hallucinations ● Often  bizarre delusions, mostly visual hallucinations\nPsychosis ● Psychosis, often  as initial symptom\nAnxiety ● Generalized stress & apprehension\nRepetitive, compulsive behavior ● Often  complex, ritualistic behaviors mimicking OCD\nPreference for sweet food ●↑ craving for sweet foods\nMotor symptoms\nUpper MND ● Weakness, spasticity, altered muscle tone\nLower MND ● Weakness, fasciculations, atrophy\nBulbar involvementDysarthria ● Motor language deficit\nDysphagia ● Problems swallowing food &/or liquids\nParkinsonism ●Extrapyramidal findings  such as resting tremor, rigidity, \nakinesia\nMND = motor neuron disease; OCD = obsessive compulsive disorder\n1. Features are ranked as common if present in >33%, if frequency w

In [18]:
from langchain.chat_models import ChatOpenAI

# Chat model passed to every QA chain below; `llm_name` comes from the date check above.
# NOTE(review): temperature=0 is presumably chosen to keep answers as repeatable
# as possible — confirm.
llm = ChatOpenAI(model_name=llm_name, temperature=0)

### RetrievalQA chain

In [19]:
from langchain.chains import RetrievalQA

In [20]:
# Question-answering chain built from `llm` and the vector store's retriever.
# No chain_type is passed, so the default chain type is used.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

In [21]:
# Same question that was used for the raw similarity search earlier.
question = "What are major disorders addressed in this corpus?"

# The chain is called with a dict whose "query" key carries the question.
result = qa_chain({"query": question})

In [22]:
# The chain's answer is stored under the 'result' key of the returned mapping.
print(result['result'])

The major disorders addressed in this corpus include frontotemporal dementia (FTD), amyotrophic lateral sclerosis (ALS), and the C9orf72-FTD/ALS spectrum.


### Prompt

In [23]:
from langchain.prompts import PromptTemplate

# Build prompt. {context} and {question} are the template's two placeholders;
# the instructions cap the answer at three sentences and require a fixed sign-off.
# NOTE(review): presumably the chain fills {context} with the retrieved text —
# confirm against the RetrievalQA documentation.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)


In [24]:
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [25]:
# Probe with a disease the corpus may not cover, to exercise the prompt's
# "say that you don't know" instruction.
result = qa_chain({"query": "Is cancer a disease that is discussed?"})
print(result['result'])

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-CuYsiQfrp7oMZCd9pjziMV09 on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-CuYsiQfrp7oMZCd9pjziMV09 on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/

Based on the given context, there is no mention or indication that cancer is being discussed. Therefore, it cannot be determined whether cancer is a disease that is discussed. Thanks for asking!


In [26]:
# Counterpart to the previous query: ALS was named in the earlier answer as one
# of the major disorders in the corpus.
result = qa_chain({"query": "Is ALS a disease that is discussed?"})
print(result['result'])

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-CuYsiQfrp7oMZCd9pjziMV09 on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..


KeyboardInterrupt: 

### RetrievalQA chain types

So far the qa chain has used the "stuff" type, which is the default. There are other types of chains that can be used.

If you wish to experiment on the `LangChain plus platform`:

 * Go to [langchain plus platform](https://www.langchain.plus/) and sign up
 * Create an API key from your account's settings
 * Use this API key in the code below   
 * Uncomment the code  
 Note: the endpoint shown in the video differs from the one below. Use the one below.

In [None]:
#import os
#os.environ["LANGCHAIN_TRACING_V2"] = "true"
#os.environ["LANGCHAIN_ENDPOINT"] = "https://api.langchain.plus"
#os.environ["LANGCHAIN_API_KEY"] = "..." # replace dots with your api key

In [None]:
# Same LLM and retriever as before, but with chain_type="map_reduce" instead of
# the default chain type.
qa_chain_map_reduce = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type="map_reduce"
)

In [None]:
# `question` is reused by the "refine" cell that follows.
question = "Is dementia a disease that is discussed?"
result = qa_chain_map_reduce({"query": question})
# A bare last expression displays the answer as the cell's output.
result["result"]

In [None]:
# Answer the same `question` as the previous cell, this time with the "refine"
# chain type, and display the answer as the cell's output.
qa_chain_refine = RetrievalQA.from_chain_type(
    llm,
    chain_type="refine",
    retriever=vectordb.as_retriever(),
)
result = qa_chain_refine({"query": question})
result["result"]

### RetrievalQA limitations
 
QA fails to preserve conversational history.

In [None]:
# Plain RetrievalQA chain again (no custom prompt, default chain type). The next
# two cells use it to show that it keeps no conversational history.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

In [None]:
# First turn of the two-question history demonstration.
question = "Is cancer a disease that is discussed?"
result = qa_chain({"query": question})
result["result"]

In [None]:
# Follow-up that names no disease, so it only makes sense with the previous
# question as context; the chain receives just this query.
question = "What is the age of onset (typical, min, max)?"
result = qa_chain({"query": question})
result["result"]

The point is simply that the model does not have access to past questions or answers. Preserving conversational history is covered in the next section.