In [1]:
import gradio
import tiktoken, os

# from streamlit_chat import message
from langchain.memory import ConversationBufferMemory
from langchain.callbacks import get_openai_callback
from langchain.memory import StreamlitChatMessageHistory

from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
def load_and_process_data(data_dir):
    """Load every PDF found directly inside ``data_dir``.

    Directory entries are visited in sorted name order; entries whose name
    does not end in ``.pdf`` are skipped. Each PDF is read with
    ``PyPDFLoader(...).load_and_split()`` and the resulting documents are
    appended to one flat list. Progress is printed after every file.

    Args:
        data_dir: Path of the directory to scan (not recursive).

    Returns:
        list: The documents from all PDFs, in file order. Empty when the
        directory contains no PDF files.
    """
    # Filter up front so the progress counter and the summary only count PDFs,
    # and so a directory without PDFs cannot leave a loop variable unbound.
    pdf_list = sorted(name for name in os.listdir(data_dir) if name.endswith(".pdf"))

    doc_list = []
    for idx, pdf_name in enumerate(pdf_list, start=1):
        loader = PyPDFLoader(os.path.join(data_dir, pdf_name))
        doc_list.extend(loader.load_and_split())
        print(f"{idx}/{len(pdf_list)} {pdf_name} loaded")

    # Report files and documents separately: one PDF yields several documents.
    print(f"Total {len(pdf_list)} PDF files loaded ({len(doc_list)} documents)")
    return doc_list

def tiktoken_len(text):
    """Return the number of tokens in ``text`` under the ``cl100k_base`` encoding."""
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))



def get_text_chunks(text):
    """Split loaded documents into overlapping chunks.

    Args:
        text: The documents to split (passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``).

    Returns:
        The chunks produced by the splitter.
    """
    # Sizes are measured with tiktoken_len, i.e. in tokens rather than characters.
    splitter_options = {
        "chunk_size": 900,
        "chunk_overlap": 100,
        "length_function": tiktoken_len,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(text)


def get_vectorstore(text_chunks):
    """Embed ``text_chunks`` and index them in a FAISS vector store.

    Args:
        text_chunks: Documents to embed and index.

    Returns:
        The FAISS store built by ``FAISS.from_documents``.
    """
    # Embeddings run on CPU and are normalized at encode time.
    embedding_model = HuggingFaceEmbeddings(
        model_name="jhgan/ko-sroberta-multitask",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )
    return FAISS.from_documents(text_chunks, embedding_model)


def get_conversation_chain(vetorestore, openai_api_key):
    """Build a conversational retrieval chain on top of a vector store.

    Args:
        vetorestore: Vector store exposing ``as_retriever``. The parameter
            name (including its spelling) is kept so existing keyword callers
            keep working.
        openai_api_key: API key handed to ``ChatOpenAI``.

    Returns:
        A ``ConversationalRetrievalChain`` that keeps its own chat history in
        a ``ConversationBufferMemory`` and returns the answer under the
        ``answer`` key together with the source documents.
    """
    llm = ChatOpenAI(
        openai_api_key=openai_api_key,
        model_name="gpt-3.5-turbo-1106",
        temperature=0,
    )

    # Only retriever options are passed here; a misspelled ``vervose`` keyword
    # used to be forwarded as well, which is not a retriever setting. Chain
    # logging is controlled by ``verbose=True`` on the chain below.
    retriever = vetorestore.as_retriever(search_type="mmr")

    # output_key tells the memory which of the chain's outputs to store,
    # since the chain also returns source documents.
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        output_key="answer",
    )

    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        memory=memory,
        # Pass the stored history through unchanged instead of re-formatting it.
        get_chat_history=lambda h: h,
        return_source_documents=True,
        verbose=True,
    )

    return conversation_chain

In [9]:
# Read the API key from the environment so no secret is stored in the notebook.
# Set OPENAI_API_KEY before starting the kernel; a missing variable raises KeyError.
# NOTE(review): a key was previously committed in this cell and must be treated
# as compromised -- revoke it with the provider.
openai_api_key = os.environ["OPENAI_API_KEY"]
data_dir = "data/"

# Pipeline: load PDFs -> split into chunks -> embed and index.
doc_list = load_and_process_data(data_dir)
text_chunks = get_text_chunks(doc_list)
vetorestore = get_vectorstore(text_chunks)


1/22 5-2-1-1(1지도서).pdf loaded
2/22 5-2-1-1(2지도서).pdf loaded
3/22 5-2-1-1(3지도서).pdf loaded
4/22 5-2-1-1(4지도서).pdf loaded
5/22 5-2-1-1(5지도서).pdf loaded
6/22 5-2-1-1(6지도서).pdf loaded
7/22 5-2-1-1(7지도서).pdf loaded
8/22 5-2-1-2(10지도서).pdf loaded
9/22 5-2-1-2(11지도서).pdf loaded
10/22 5-2-1-2(12지도서).pdf loaded
11/22 5-2-1-2(13지도서).pdf loaded
12/22 5-2-1-2(14지도서).pdf loaded
13/22 5-2-1-2(8지도서).pdf loaded
14/22 5-2-1-2(9지도서).pdf loaded
15/22 5-2-1-3(15지도서).pdf loaded
16/22 5-2-1-3(16지도서).pdf loaded
17/22 5-2-1-3(17지도서).pdf loaded
18/22 5-2-1-3(18지도서).pdf loaded
19/22 5-2-1-3(19-20지도서).pdf loaded
20/22 5-2-1-3(21지도서).pdf loaded
21/22 5-2-1-3(22지도서).pdf loaded
22/22 5-2-1-3(23-24지도서).pdf loaded
Total 22 documents loaded


In [None]:
# Build the retrieval chain once; it is kept as a module-level object for later cells.
conversation = get_conversation_chain(
    vetorestore=vetorestore,
    openai_api_key=openai_api_key,
)

In [None]:
import gradio as gr

def predict(input, history, chain=None):
    """Answer one user message and return the updated chat transcript.

    The question is sent to the retrieval chain built earlier in this
    notebook (the module-level ``conversation``) rather than to an OpenAI
    client, because no ``client`` or ``model_id`` is defined anywhere in the
    notebook.

    Args:
        input: The text the user submitted.
        history: List of ``{"role", "content"}`` dicts. It is mutated in
            place: the user message and the answer are appended.
        chain: Optional callable taking ``{"question": str}`` and returning a
            mapping with an ``"answer"`` key. Defaults to ``conversation``.

    Returns:
        tuple: ``(messages, history)`` where ``messages`` is a list of
        ``(user_text, assistant_text)`` pairs for the chatbot widget.

    Note:
        ``history`` is used for display only. The default chain keeps its own
        memory, so conversational context lives in that chain object.
    """
    if chain is None:
        # Looked up at call time so this cell can be defined before the chain exists.
        chain = conversation

    history.append({"role": "user", "content": input})

    result = chain({"question": input})
    response = result["answer"]
    history.append({"role": "assistant", "content": response})

    # Start at index 1: history[0] is the system entry, after which entries
    # alternate user / assistant.
    messages = [
        (history[i]["content"], history[i + 1]["content"])
        for i in range(1, len(history), 2)
    ]
    return messages, history


# Seed entry for the per-session history that predict() receives and returns.
initial_history = [
    {
        "role": "system",
        "content": "ChatSNS is a chatbot really friendly and concise.",
    }
]

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="ChatBot")
    state = gr.State(value=initial_history)
    with gr.Row():
        txt = gr.Textbox(
            show_label=False,
            placeholder="챗봇에게 아무거나 물어보세요",
        )
    # Pressing Enter in the textbox runs predict and refreshes both the
    # chatbot widget and the stored history.
    txt.submit(fn=predict, inputs=[txt, state], outputs=[chatbot, state])

# share=True requests a public link; debug=True keeps the cell attached.
demo.launch(debug=True, share=True)