In [None]:
%%writefile requirements.txt
langchain
unstructured
pandas
chromadb
tiktoken
openai

In [None]:
# Install the Python packages listed in requirements.txt.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt -q
# Tesseract OCR and the Poppler utilities are system packages; -y answers apt's
# confirmation prompt so the cell cannot stop and wait for input.
!sudo apt-get install -y tesseract-ocr
!sudo apt-get install -y poppler-utils
%pip install pdf2image pytesseract
%pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu111/torch1.10/index.html
# This notebook calls pinecone.init(), which was removed in pinecone-client 3.x,
# so stay on the 2.x line.
%pip install -qU "pinecone-client<3"
%pip install adaptive
%pip install gradio

In [None]:
import os

import gradio as gr
import pinecone
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone

# Credentials and the index name are read from the environment so that real
# secrets never have to be saved inside the notebook. When a variable is not
# set, the original '---' placeholder is used, exactly as before.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", '---')
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", '---')
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", '---')
index_name = os.environ.get("PINECONE_INDEX_NAME", "---")

# Configure the Pinecone client once for the whole notebook.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_API_ENV
)

# Shared LLM and question-answering chain used by `responces`.
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")

def load_pdf_document(file_path):
    """Load the PDF at ``file_path`` and return the loader's result.

    Args:
        file_path: Location of the PDF, handed unchanged to
            ``UnstructuredPDFLoader``.

    Returns:
        Whatever ``UnstructuredPDFLoader(file_path).load()`` produces.
    """
    return UnstructuredPDFLoader(file_path).load()

def split_document_to_chunks(documents, chunk_size=1000, chunk_overlap=0):
    """Split loaded documents into smaller chunks.

    Args:
        documents: Documents to split, passed to the splitter's
            ``split_documents`` method.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
            Defaults to 1000, the value previously hard-coded here.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.
            Defaults to 0, the value previously hard-coded here.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

def documentsearch(texts, embeddings, index_name):
    """Create a Pinecone vector store from the text of the given chunks.

    Args:
        texts: Iterable of chunk objects; only their ``page_content``
            attribute is used.
        embeddings: Embedding object passed through to ``Pinecone.from_texts``.
        index_name: Name of the Pinecone index, passed as ``index_name``.

    Returns:
        The object returned by ``Pinecone.from_texts``.
    """
    page_contents = []
    for chunk in texts:
        page_contents.append(chunk.page_content)
    return Pinecone.from_texts(page_contents, embeddings, index_name=index_name)

def responces(query, docsearch):
    """Answer ``query`` using documents retrieved from ``docsearch``.

    Args:
        query: The question text.
        docsearch: Object exposing ``similarity_search``; it is called with
            the query and ``include_metadata=True``.

    Returns:
        The value returned by the module-level ``chain.run`` when given the
        retrieved documents and the question.
    """
    matching_docs = docsearch.similarity_search(query, include_metadata=True)
    answer = chain.run(input_documents=matching_docs, question=query)
    return answer

# Session state shared between calls of the Gradio callback.
docsearch = None          # vector store built from the most recently indexed PDF
history = []              # chat transcript as (speaker, text) tuples
indexed_file_path = None  # path of the PDF that `docsearch` was built from


def chatbot(file, question):
    """Gradio callback: index an uploaded PDF and answer a question about it.

    Args:
        file: The uploaded file, or ``None``. Either an object with a ``name``
            attribute holding the path, or the path itself as a string.
        question: The user's question. ``None``, empty and whitespace-only
            values are ignored.

    Returns:
        The module-level ``history`` list of ``(speaker, text)`` tuples,
        including the new exchange when one took place.
    """
    global docsearch, indexed_file_path

    if file is not None:
        # Accept both a temp-file wrapper (has .name) and a plain path string.
        file_path = getattr(file, "name", file)
        # The file input stays populated between questions. Only rebuild the
        # index when a different file arrives; otherwise every question would
        # re-embed the whole PDF and upload its chunks to the index again.
        if file_path != indexed_file_path:
            data = load_pdf_document(file_path)
            texts = split_document_to_chunks(data)
            embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
            docsearch = documentsearch(texts, embeddings, index_name)
            # Recorded only after indexing succeeded, so a failed attempt is retried.
            indexed_file_path = file_path

    # A text box yields "" rather than None when left empty, so test for content.
    if docsearch is not None and question and question.strip():
        # Get the answer first: if the call fails, the transcript is left
        # without a dangling user turn.
        response = responces(question, docsearch)
        history.append(("User", question))
        history.append(("Bot", response))
    return history

# Gradio interface around `chatbot`: a file input and a text input,
# with the returned history shown through the "list" output.
iface = gr.Interface(
    fn=chatbot,
    inputs=["file", "text"],
    outputs="list",
)

def clear_chat():
    """Reset the shared chat transcript.

    Rebinds the module-level ``history`` to a new empty list.

    Returns:
        The new, empty ``history`` list, so the function can be used as a
        callback whose output refreshes the transcript display.
    """
    # NOTE(review): nothing in the visible code connects this function to a UI
    # control; it has to be hooked up (e.g. to a button) to be reachable.
    global history
    history = []
    return history

# Start the Gradio app for the `iface` interface built earlier in this cell.
iface.launch()
