In [1]:
# https://www.youtube.com/watch?v=3yPBVii7Ct0&list=PLG-xg5pFIsQXSGzlrHvpqGH5EFGVBz2-3&index=4

In [2]:
%%capture
!pip -q install langchain openai tiktoken chromadb pypdf

# Setup OpenAI API Key

In [3]:
import os

os.environ["OPENAI_API_KEY"] = "KEY"

# Import required langchain modules

In [4]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, PyPDFLoader, DirectoryLoader
from langchain.chat_models import ChatOpenAI

# Load and process multiple docs

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Load and process the text files

# loader = DirectoryLoader('/content/drive/MyDrive/data/webpages/', glob="./*.txt", loader_cls=TextLoader)
loader = DirectoryLoader('/content/drive/MyDrive/data/pdfs/', glob="./*.pdf", loader_cls=PyPDFLoader)

In [7]:
documents = loader.load()

In [8]:
#splitting the text into
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [9]:
len(texts)

1059

In [10]:
texts[3]

Document(page_content='looking for huge waves while some may just be looking for windless classic waves. \nJust like surf check, ISPO will be providing', metadata={'source': '/content/drive/MyDrive/data/pdfs/coming-to-ucsd-guide.pdf', 'page': 1})

# Create DB

In [11]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'

## here we are using OpenAI embeddings but in future we will swap out to local embeddings
embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

In [12]:
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# Create chain

In [13]:
# Set up the turbo LLM
turbo_llm = ChatOpenAI(
    temperature=0,
    model_name='gpt-3.5-turbo'
)

In [14]:
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=turbo_llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

# Helper function

In [25]:
import textwrap

In [28]:
# Text formatting for the response
def process_llm_response(llm_response):
    result = llm_response['result']
    result = 'Response: ' + result
    wrapped_result = textwrap.wrap(result, width=80)
    for line in wrapped_result:
        print(line)
    print('\nSources:')
    for source in llm_response['source_documents']:
        print(source.metadata['source'])
    print('\n\n')

# Chat prompts

In [16]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[0].prompt.template)

Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
{context}


In [17]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[1].prompt.template)

{question}


# Usage

In [31]:
def main():
    while True:
        query = input("Enter your query (or 'q' to quit): ")
        if query == 'q':
            break
        llm_response = qa_chain(query)
        process_llm_response(llm_response)

if __name__ == "__main__":
    main()

Enter your query (or 'q' to quit): How should I prepare for my arrival at UCSD?
Response: To prepare for your arrival at UCSD, you should visit
iNewStudent.ucsd.edu. There, you can find valuable immigration information and
easy checklists to help you prepare. Additionally, you should contact your
department (if you are a graduate student) or college to get further assistance.
It is important to have your immigration documents ready upon arrival.

Sources:
/content/drive/MyDrive/data/pdfs/ispo-welcome-guide.pdf
/content/drive/MyDrive/data/pdfs/coming-to-ucsd-guide.pdf
/content/drive/MyDrive/data/pdfs/ispo-welcome-guide.pdf



Enter your query (or 'q' to quit): What all immigration documents do I need?
Response: Based on the context provided, the immigration documents you need are:
- I-94 Record - Original Passport - Social Security (SSN) Card or Social
Security Denial (Form SSA-L676) - Residency documents (have to show your name
and the address)  Please note that this list may not be ex

# Streamlit

In [31]:
%%capture
!pip install streamlit

In [52]:
%%writefile app.py

import streamlit as st

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, UnstructuredFileLoader, PyPDFLoader, DirectoryLoader
from langchain.chat_models import ChatOpenAI

loader = DirectoryLoader('/content/drive/MyDrive/data/pdfs/', glob="./*.pdf", loader_cls=PyPDFLoader)

documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'

## here we are using OpenAI embeddings but in future we will swap out to local embeddings
embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)


retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# Set up the turbo LLM
turbo_llm = ChatOpenAI(
    temperature=0,
    model_name='gpt-3.5-turbo'
)

# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=turbo_llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

## Cite sources
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

def app():
    st.title("Chat UI")
    query = st.text_input("Enter your query")
    if query:
        llm_response = qa_chain(query)
        st.write(f"Response: {llm_response['result']}")
        st.write(f"\n\nSources:")
        for source in llm_response['source_documents']:
            st.write(source.metadata)

app()

Overwriting app.py


In [53]:
!streamlit run /content/app.py &>/content/url.txt &

In [54]:
!npx localtunnel --port 8501

[K[?25hnpx: installed 22 in 4.416s
your url is: https://silver-olives-learn.loca.lt
^C


In [None]:
# Copy external URL (without the port) from url.txt and paste it on the URL shown above