<a href="https://colab.research.google.com/github/kentny/chat-doc/blob/main/Section5/pdf_document_similarity_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 事前準備
---


In [None]:
# Install the third-party packages this notebook depends on
# (IPython shell escape; the backslashes continue one single pip command).
!pip install openai \
           langchain \
           pypdf \
           chromadb \
           tiktoken

## OpenAI API Keyを設定

In [3]:
import os

# Export the OpenAI API key through the process environment.
# The value is an empty placeholder here.
os.environ.update({'OPENAI_API_KEY': ''})

# PDF文書を分析する
---

## PDF文書を取り込む

In [None]:
import os
import shutil
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

# PDF to analyse; the path is relative to the notebook's working directory.
file_path = '第211回国会衆議院環境委員会第1号令和5年3月7日.pdf'

# Split on the Japanese full stop, with chunk_size=100 and chunk_overlap=20.
text_splitter = CharacterTextSplitter(
    separator='。',
    chunk_size=100,
    chunk_overlap=20,
)

# Load the PDF and cut it into chunks with the splitter configured above.
loader = PyPDFLoader(file_path)
docs = loader.load_and_split(text_splitter=text_splitter)

## 文書をベクトル化し、ベクトルデータベースに保存する

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma


# Embedding wrapper handed to Chroma to vectorise the chunks.
embeddings = OpenAIEmbeddings()

# Directory in which the Chroma database is persisted.
vectorstore_chroma_path = "./vectorestore/local_chroma"

# Remove any database directory left by a previous run before rebuilding it.
if not os.path.exists(vectorstore_chroma_path):
    print("The database does not exist.")
else:
    shutil.rmtree(vectorstore_chroma_path)
    print("The database has been deleted.")

# Build the vector store from the chunks and write it to disk.
vectordb = Chroma.from_documents(
    docs,
    embeddings,
    persist_directory=vectorstore_chroma_path,
)
vectordb.persist()

# 質問と類似の情報を見つける
---

In [7]:
from typing import List
from langchain.docstore.document import Document


def _similarity_search(query: str, k: int = 5) -> List[Document]:
    """Return the stored documents most similar to *query*.

    Opens the Chroma database persisted at ``vectorstore_chroma_path`` and
    runs a similarity search against it.

    Args:
        query: Text to search for.
        k: Number of documents to request from the vector store. Defaults
            to 5, the value that used to be hard-coded.

    Returns:
        The documents returned by ``Chroma.similarity_search``.
    """
    # Distinct local names so the notebook-level ``embeddings`` and
    # ``vectordb`` variables are not shadowed inside the function.
    embedding_function = OpenAIEmbeddings()
    store = Chroma(
        persist_directory=vectorstore_chroma_path,
        embedding_function=embedding_function,
    )
    return store.similarity_search(query, k=k)

## 質問をする

In [None]:
# Read the question from standard input.
question = input()

documents = _similarity_search(question)

# Plain loop: print() is called only for its side effect, so there is no
# list worth building (and the name `_` is left untouched).
for doc in documents:
    print(doc)