[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jabascal/chat_with_data_app/blob/main/notebook/chat_to_your_data_medium.ipynb)

# Chat to any source of data with LangChain and OpenAI

## Requirements

In [None]:
# Set to False when running outside Google Colab to trigger the local setup below.
mode_colab = True
if mode_colab is False:
    # Local installation: create a virtual environment and install the
    # dependencies listed in requirements.txt.
    # NOTE(review): IPython normally runs each `!` command in its own
    # subshell, so the `source` line probably does not keep the venv active
    # for the following `pip install` — confirm.
    !python -m venv venv
    !source venv/bin/activate
    !pip install -r requirements.txt

## Read the data

In [None]:
# Which example to load: "pdf", "url" or "youtube".
example_type = "url"

# Source location of each supported example. The key doubles as the
# document type that `read_doc` dispatches on.
example_sources = {
    "url": "https://en.wikipedia.org/wiki/Cinque_Terre",
    "pdf": "./data/paper.pdf",
    # Alternative video: "https://www.youtube.com/watch?v=PNVgh4ZSjCw"
    "youtube": "https://www.youtube.com/watch?v=W0DM5lcj6mw",
}

if example_type not in example_sources:
    # Fail here instead of leaving doc_type/doc_path undefined (or holding
    # stale values from an earlier run of this cell).
    raise ValueError(
        f"Unsupported example_type {example_type!r}; "
        f"expected one of {sorted(example_sources)}"
    )

doc_type = example_type
doc_path = example_sources[example_type]

In [None]:
import re

# Collapse runs of blank lines (common in scraped web pages)
def clear_blank_lines(docs):
    """Collapse every run of three or more newlines to exactly two.

    Each document's ``page_content`` is rewritten in place, and the same
    ``docs`` object that was passed in is returned.
    """
    squeeze = re.compile(r"\n\n\n+").sub
    for document in docs:
        document.page_content = squeeze("\n\n", document.page_content)
    return docs

# Read document with langchain.document_loaders
def read_doc(doc_type, doc_path):
    """Load a document with the LangChain loader that matches ``doc_type``.

    Args:
        doc_type: One of ``"pdf"``, ``"url"`` or ``"youtube"``.
        doc_path: File path (pdf) or URL (url, youtube) of the source.

    Returns:
        The documents returned by the loader, after ``clear_blank_lines``
        has been applied to them. A short preview of the first document is
        printed when at least one document was loaded.

    Raises:
        ValueError: If ``doc_type`` is not one of the supported values.
    """
    supported_types = ("pdf", "url", "youtube")
    if doc_type not in supported_types:
        # Without this check the function would fail later on an unbound
        # `docs` variable, which hides the real cause.
        raise ValueError(
            f"Unsupported doc_type {doc_type!r}; expected one of {supported_types}"
        )

    # Loader imports stay local so that only the selected backend is imported.
    if doc_type == "pdf":
        from langchain.document_loaders import PyPDFLoader
        loader = PyPDFLoader(doc_path)
    elif doc_type == "url":
        from langchain.document_loaders import WebBaseLoader
        loader = WebBaseLoader(doc_path)
    else:  # "youtube"
        from langchain.document_loaders.blob_loaders.youtube_audio import \
            YoutubeAudioLoader
        from langchain.document_loaders.generic import GenericLoader
        from langchain.document_loaders.parsers import OpenAIWhisperParser
        # The audio loader writes to this directory; its output is handed to
        # the Whisper parser.
        save_path = "./downloads"
        loader = GenericLoader(YoutubeAudioLoader([doc_path], save_path), OpenAIWhisperParser())
    docs = loader.load()

    # Collapse runs of blank lines (common in web pages)
    clear_blank_lines(docs)

    print(f"Loaded {len(docs)} pages/documents")
    if docs:
        # Only preview when something was loaded; indexing an empty list
        # would raise IndexError.
        print(f"First page: {docs[0].metadata}")
        print(docs[0].page_content[:500])
    return docs

def pretty_print_docs(docs, question=None):
    """Print a separator, the optional question, then every document.

    For each document its 1-based position, metadata and page content are
    printed. ``question`` is only shown when it is truthy.
    """
    separator = "-" * 100
    print(f"\n{separator}\n")
    if question:
        print(f"Question: {question}")

    for number, document in enumerate(docs, start=1):
        print(f"Document {number}:\n\nMetadata: {document.metadata}\n")
        print(document.page_content)
    print("\n")

# Load the example selected above; `docs` is the input of the splitting step below
docs = read_doc(doc_type, doc_path)

## Split the data into chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking configuration
chunk_size = 1500
chunk_overlap = 150
add_start_index = True

# Build the splitter from the configuration above and apply it to the
# loaded documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
)
docs_split = text_splitter.split_documents(docs)

# Report the number of chunks and show the first one as a sanity check
print(f"Split into {len(docs_split)} chunks")
first_chunk = docs_split[0]
print(f"First chunk: {first_chunk.metadata}")
print(first_chunk.page_content)

## OpenAI API key

In [None]:
import os
import json
import openai

# Directory and file name of the JSON file holding the OpenAI credentials.
# NOTE(review): the directory is hard-coded to one user's home folder —
# adjust `user` / `path_file_key` for your own machine.
user = 'abascal'
path_file_key = f'/home/{user}/Projects/openai'
name_file_key = "openai_key.json"

def read_key_from_file(path_file, name_file_key):
    """Load OpenAI credentials from a JSON file and register them.

    The file must contain a JSON object with the keys ``"organization"``
    and ``"api_key"``. Both values are stored on the ``openai`` module.

    Args:
        path_file: Directory that contains the key file.
        name_file_key: Name of the JSON key file inside ``path_file``.

    Returns:
        The API key read from the file.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If ``"organization"`` or ``"api_key"`` is missing.
    """
    with open(os.path.join(path_file, name_file_key), 'r', encoding='utf-8') as f:
        org_data = json.load(f)

    openai.organization = org_data['organization']
    openai.api_key = org_data['api_key']
    # Return the key so that a caller assigning the result gets the key
    # rather than None.
    return openai.api_key

# Read the OpenAI credentials from the key file; the call sets
# `openai.organization` and `openai.api_key`, which later cells use
openai_key = read_key_from_file(path_file_key, name_file_key)

## Create a vector database

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DocArrayInMemorySearch
#from langchain.vectorstores import Chroma

# Embedding model, authenticated with the key stored on the openai module
embedding = OpenAIEmbeddings(openai_api_key=openai.api_key)

# Build the in-memory vector database from the chunks
db = DocArrayInMemorySearch.from_documents(docs_split, embedding=embedding)

# Alternative store (needs the commented-out Chroma import above):
#db = Chroma.from_texts(docs_split, embedding=embedding)

## LLM and retrievers

In [None]:
from langchain.chat_models import ChatOpenAI
#from langchain.llms import OpenAI

# Name of the chat model to use
llm_name = "gpt-3.5-turbo"

# Instantiate the chat model with the key stored on the openai module
# (completion-model alternative: OpenAI(temperature=0, openai_api_key=...))
llm = ChatOpenAI(
    model_name=llm_name, temperature=0, openai_api_key=openai.api_key
)

## Chain

In [None]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
# Conversation memory, exposed to the chain under the "chat_history" key
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Conversational QA chain: the LLM plus a retriever over the vector database
qa_chain = ConversationalRetrievalChain.from_llm(
    llm, retriever=db.as_retriever(), memory=memory
)

## Chat

In [None]:
# Start interaction
qa_on = True  # Set to False to skip the interactive loop
while qa_on:
    # Prompt the user for a question; surrounding whitespace is ignored so
    # that e.g. "end chat " still ends the session
    question = input("Ask a question or type 'end chat': ").strip()

    if question.lower() == "end chat":
        break
    if not question:
        # Nothing was typed: ask again instead of sending an empty query
        continue

    # Run QA chain
    result = qa_chain({"question": question})
    print(f"Answer: {result['answer']}")

## Build a chat app with Gradio

In [None]:
def qa_call(input):
    """Send ``input`` to ``qa_chain`` as the question and return its full result."""
    return qa_chain({"question": input})

def qa_answer(input):
    """Run ``qa_call`` on ``input`` and return only the ``'answer'`` entry."""
    result = qa_call(input)
    return result['answer']

def qa_history(input):
    """Ask ``input`` and return the whole conversation as "Q:"/"A:" lines.

    The chain's ``chat_history`` is read as alternating question/answer
    messages; a trailing message without a partner is left out.
    """
    history = qa_chain({"question": input})['chat_history']
    # Even positions hold the questions, odd positions the answers
    questions, answers = history[0::2], history[1::2]
    return "".join(
        "Q: " + asked.content + "\n" + "A: " + replied.content + "\n"
        for asked, replied in zip(questions, answers)
    )

In [None]:
import gradio as gr

# Single-textbox app: every submission shows the full formatted Q/A history
# returned by `qa_history`
demo = gr.Interface(
    fn=qa_history,
    inputs=[gr.Textbox(label="User question", lines=2)],
    outputs=[gr.Textbox(label="Chat answer", lines=4)],
    title="Chat to your data",
    description=f"Ask questions about your data to {llm_name}!",
    allow_flagging="never",
    examples=[
        "Summarize the document",
        "Can you provide details about ...",
        "Can you explain what is ...?",
    ],
)
demo.launch()

## Same chat app with Gradio's ChatInterface

In [None]:
def qa_input_msg_history(input, history):
    """Chat callback taking the message and the history; returns the answer.

    ``history`` is not used here: the displayed history is managed
    internally by ChatInterface, and the answer comes from ``qa_answer``.
    """
    return qa_answer(input)

In [None]:
# Re-create the memory and the QA chain so this app starts with a new,
# empty ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)
qa_chain = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=db.as_retriever(),
    memory=memory
)

# Chat-style app: `qa_input_msg_history` returns the answer for each message
demo = gr.ChatInterface(
    fn=qa_input_msg_history,
    title="Chat to your data",
    description=f"Ask questions about your data to {llm_name}!",
    examples=[
        "Summarize the document",
        "Can you provide details about ...",
        "Can you explain what is ...?",
    ],
)
demo.launch()