<a href="https://colab.research.google.com/github/kentny/chat-doc/blob/main/Section7/pdf_document_qa_bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 事前準備
---


In [None]:
# Install the notebook's third-party dependencies.
# NOTE(review): versions are unpinned, and the cells below use the legacy
# `langchain.*` import paths and `vectordb.persist()` — presumably a newer
# release could break them; consider pinning known-good versions.
!pip install openai \
           langchain \
           pypdf \
           chromadb \
           tiktoken

## OpenAI API Keyを設定する

In [2]:
import os

# Paste your OpenAI API key between the quotes. Leaving it blank keeps a key
# that is already present in the environment instead of overwriting it with
# an empty string.
openai_api_key = ''

if openai_api_key or 'OPENAI_API_KEY' not in os.environ:
    # With no key anywhere, the variable is still defined as '' exactly as
    # before, so later lookups of os.environ['OPENAI_API_KEY'] do not fail.
    os.environ['OPENAI_API_KEY'] = openai_api_key

## 定数を設定する

In [3]:
# Common parent directory shared by both paths below
_drive_dir = '/content/drive/MyDrive'

# Path of the PDF document to analyze
file_path = f'{_drive_dir}/第211回国会衆議院環境委員会第1号令和5年3月7日.pdf'

# Path of the vector database
vectorstore_chroma_path = f'{_drive_dir}/vectorestore/local_chroma'

# PDF文書を分析する
---

## PDF文書を取り込む

In [None]:
import os
import shutil
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

# Open the PDF located at file_path.
loader = PyPDFLoader(file_path)

# Split on the Japanese full stop ('。') with the chunk settings given here.
text_splitter = CharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    separator='。',
)

# Load the document and split it into chunks in a single call.
docs = loader.load_and_split(text_splitter=text_splitter)

## 文書をベクトル化し、ベクトルデータベースに保存する

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

embeddings = OpenAIEmbeddings()

# Rebuild from scratch: remove whatever is already stored at the target path.
if not os.path.exists(vectorstore_chroma_path):
    print("The database does not exist.")
else:
    shutil.rmtree(vectorstore_chroma_path)
    print("The database has been deleted.")

# Build the Chroma store from the split documents and persist it to disk.
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory=vectorstore_chroma_path,
)
vectordb.persist()

# 質問から回答を作成する
---

In [5]:
from typing import List
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma


def generate_answer(query: str) -> str:
    """Answer ``query`` from the documents found by ``_similarity_search``.

    The retrieved documents are handed as context to a question-answering
    chain backed by ``ChatOpenAI``; the prompt asks for an answer in
    Japanese. The answer is printed with an ``answer: `` prefix and returned.

    Args:
        query: The question to answer.

    Returns:
        The text produced by the chain.
    """
    relevant_docs = _similarity_search(query)

    # Written line by line so the significant whitespace (the trailing space
    # on the first line and the four-space indents) stays explicit.
    prompt_text = (
        "Use the following pieces of context to answer the question at the end. \n"
        "    If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
        "\n"
        "    {context}\n"
        "\n"
        "    Question: {question}\n"
        "    Answer in JAPANESE:"
    )
    qa_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=prompt_text,
    )
    qa_chain = load_qa_chain(ChatOpenAI(temperature=0.7), prompt=qa_prompt)

    answer = qa_chain.run(input_documents=relevant_docs, question=query)
    print(f"answer: {answer}")
    return answer


def generate_answer_with_source(query: str) -> str:
    """Answer ``query`` and ask the model to include a "SOURCES" part.

    The documents found by ``_similarity_search`` are passed to a
    QA-with-sources chain backed by ``ChatOpenAI``; the prompt asks for a
    response in Japanese. Both the raw chain result and the extracted answer
    are printed before the answer is returned.

    Args:
        query: The question to answer.

    Returns:
        The ``output_text`` entry of the chain result.
    """
    relevant_docs = _similarity_search(query)

    # Written line by line so the significant whitespace (the trailing space
    # on the first line and the four-space indents) stays explicit.
    prompt_text = (
        'Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES"). \n'
        "    If you don't know the answer, just say that you don't know. Don't try to make up an answer.\n"
        '    ALWAYS return a "SOURCES" part in your answer.\n'
        "    Respond in JAPANESE.\n"
        "\n"
        "    QUESTION: {question}\n"
        "    =========\n"
        "    {summaries}\n"
        "    =========\n"
        "    FINAL ANSWER IN JAPANESE:"
    )
    sources_prompt = PromptTemplate(
        input_variables=["summaries", "question"],
        template=prompt_text,
    )
    sources_chain = load_qa_with_sources_chain(
        ChatOpenAI(temperature=0.7),
        prompt=sources_prompt,
    )

    chain_inputs = {"input_documents": relevant_docs, "question": query}
    result = sources_chain(chain_inputs, return_only_outputs=True)
    answer = result["output_text"]
    print(f"result: {result}")
    print(f"answer: {answer}")
    return answer


def _similarity_search(query: str) -> List[Document]:
    """Run a similarity search for ``query`` against the persisted store.

    A Chroma store is opened at ``vectorstore_chroma_path`` with OpenAI
    embeddings on every call, and the search is asked for 5 results.

    Args:
        query: Text to search for.

    Returns:
        The documents returned by the vector store.
    """
    store = Chroma(
        persist_directory=vectorstore_chroma_path,
        embedding_function=OpenAIEmbeddings(),
    )
    return store.similarity_search(query, k=5)

## 質問をする

In [None]:
# Enter the question
question = input()

# NOTE: generate_answer already prints the answer with an "answer: " prefix,
# so the text appears a second time when it is printed here.
answer = generate_answer(question)
print(answer)