# 準備

Google Colabを使用している場合はドライブをマウントする

In [None]:
# Colab only: mount Google Drive at /content/drive so files stored there can be
# read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

必要パッケージのインストール

In [None]:
%pip install -qU langchain_community
%pip install -qU pypdf
%pip install -qU nltk
%pip install -qU langchain_openai langchain_chroma
%pip install -qU unstructured

APIキーの指定

In [None]:
# Provide the OpenAI API key through the OPENAI_API_KEY environment variable.
# A key that is already configured is left untouched; otherwise it is requested
# interactively, so the secret never has to be written into the notebook.
import os
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

# データの前処理

## データの読み込み

In [None]:
from langchain_core.documents import Document

## PDFの読み込み

In [None]:
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
import os

def load_PDFs_from_dir(dir_path:str)->list[Document]:
  """Load the files matching ``*.pdf`` in a directory.

  Args:
    dir_path: Directory handed to ``PyPDFDirectoryLoader``.

  Returns:
    The list of documents produced by the loader's ``load()`` call.
  """
  return PyPDFDirectoryLoader(dir_path, glob="*.pdf").load()

def load_PDF_from_path(file_path:str)->list[Document]:
  """Load a single PDF file.

  Args:
    file_path: Path handed to ``PyPDFLoader``.

  Returns:
    The list of documents produced by the loader's ``load()`` call.
  """
  return PyPDFLoader(file_path).load()

In [None]:
# Directory that is searched for PDF files.
DIR_PATH = "/content/sample_data"
# Load the PDFs found there; the result is reused by the inspection cell below.
docs = load_PDFs_from_dir(DIR_PATH)

In [None]:
# Inspect the loaded PDF documents. As the first message states, the list length
# is the total page count rather than the number of PDF files.
print(f'docs配列の長さ（pdfの合計ファイル数ではなく、pdfの合計ページ数）：\n{len(docs)} \n---------')
# The labels describe the FIRST document, so index 0 is used (index 1 would show
# the second one), matching the HTML and Markdown inspection cells.
print(f'1つ目のドキュメントのメタデータ（ソースのパスと、ページ番号が書かれていることに着目）：\n{docs[0].metadata} \n---------')
print(f'1つ目のドキュメントの中身：\n{docs[0].page_content}')

## HTMLファイルの読み込み

In [None]:
from langchain_community.document_loaders import BSHTMLLoader
import os
import glob

def load_html_from_dir(dir_path:str) -> list[Document]:
  """Load every ``*.html`` file directly inside a directory.

  Args:
    dir_path: Directory to search (not recursive).

  Returns:
    The documents of all matched files, concatenated in sorted path order.
    An empty list when no file matches.
  """
  documents = []
  # glob.glob returns matches in arbitrary order; sorting makes the result
  # (and therefore docs[0] in the inspection cell) reproducible between runs.
  for file_path in sorted(glob.glob(os.path.join(dir_path, '*.html'))):
    documents.extend(BSHTMLLoader(file_path).load())
  return documents

def load_html_from_path(file_path:str) -> list[Document]:
  """Load a single HTML file.

  Args:
    file_path: Path handed to ``BSHTMLLoader``.

  Returns:
    The list of documents produced by the loader's ``load()`` call.
  """
  return BSHTMLLoader(file_path).load()

In [None]:
# Directory that is searched for HTML files.
DIR_PATH = "/content/sample_data"
# Load the HTML files found there; this rebinds `docs` for the inspection cell below.
docs = load_html_from_dir(DIR_PATH)

In [None]:
# Inspect the loaded HTML documents: list length, then the metadata and text of
# the first entry. docs[0] is read directly, so at least one document must exist.
print(f'docs配列の長さ：\n{len(docs)} \n ---------')
print(f'1つ目のドキュメントのメタデータ：\n{docs[0].metadata} \n---------')
print(f'1つ目のドキュメントの中身：\n{docs[0].page_content}')

## Markdownファイルの読み込み

In [None]:
# Both loaders come from langchain_community, the package this notebook installs.
# The legacy `langchain.document_loaders` path is deprecated and is not available
# in every LangChain release.
from langchain_community.document_loaders import DirectoryLoader, UnstructuredMarkdownLoader

def load_markdowns_from_dir(dir_path:str)->list[Document]:
  """Load the files matching ``*.md`` in a directory.

  Args:
    dir_path: Directory handed to ``DirectoryLoader``.

  Returns:
    The list of documents produced by the loader's ``load()`` call, using
    ``DirectoryLoader``'s default per-file loader.
  """
  loader = DirectoryLoader(dir_path, glob="*.md")
  return loader.load()

def load_markdown_from_path(file_path:str)->list[Document]:
  """Load a single Markdown file.

  Args:
    file_path: Path handed to ``UnstructuredMarkdownLoader``.

  Returns:
    The list of documents produced by the loader's ``load()`` call.
  """
  return UnstructuredMarkdownLoader(file_path).load()

In [None]:
# Directory that is searched for Markdown files.
DIR_PATH = "/content/sample_data"
docs = load_markdowns_from_dir(DIR_PATH)  # list of Document objects

In [None]:
# Inspect the loaded Markdown documents: list length, then the metadata and text
# of the first entry. docs[0] is read directly, so at least one document must exist.
print(f'docs配列の長さ：\n{len(docs)} \n ---------')
print(f'1つ目のドキュメントのメタデータ：\n{docs[0].metadata} \n---------')
print(f'1つ目のドキュメントの中身：\n{docs[0].page_content}')

# Chunk分け

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Splitter used by the chunking cell below to cut documents into small chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,       # maximum chunk length, as measured by length_function
    chunk_overlap=20,     # length shared between neighbouring chunks
    length_function=len,  # len() on a str counts characters
    keep_separator=False  # presumably drops the matched separator from the chunk text; confirm in the splitter docs
)

In [None]:
# Placeholder value: replace it with the path of the directory that holds the PDF files.
PDF_DIR_PATH = "PDFファイルが入ったディレクトリのパス"
# Reload `docs` from that directory; these are the documents that get chunked next.
docs = load_PDFs_from_dir(PDF_DIR_PATH)

In [None]:
# Split every loaded document with the splitter configured above.
chunks = text_splitter.split_documents(docs)

In [None]:
# Show the chunk count and the first three chunks so the overlap between
# consecutive chunks can be checked by eye. Indexes 0-2 are read directly, so
# this cell needs at least three chunks.
print(f"chunkの数：{len(chunks)}\n---------")
print(f"1つ目のchunkの中身：\n{chunks[0]}\n---------")
print("文章がchunk_overlap分重複していることを確認：")
print(f"2つ目のchunkの中身：\n{chunks[1]}")
print(f"3つ目のchunkの中身：\n{chunks[2]}\n---------")

# データのベクトル化

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# Embedding model used to vectorize the chunks and, later, the query.
embeddings_model = OpenAIEmbeddings(model='text-embedding-3-small')

# Chroma collection configured with an on-disk directory (relative path).
vector_db = Chroma(
        collection_name='rag_app_collection',
        embedding_function=embeddings_model,
        persist_directory='database/ChromaDB'
        )
# NOTE(review): because a persist_directory is set, re-running this cell
# presumably adds the same chunks to the stored collection again; confirm and
# clear the directory between runs if duplicates are unwanted.
vector_db.add_documents(documents=chunks)

In [None]:
# Question used for retrieval.
query='担当者の連絡先を教えて'
# Fetch up to k=5 results; each entry is unpacked later as a (document, score)
# pair. score_threshold=0.1 is passed as the relevance cut-off; presumably
# results scoring below it are dropped (confirm in the vector store docs).
contexts = vector_db.similarity_search_with_relevance_scores(query,
                                                             k=5,
                                                             score_threshold=0.1)

In [None]:
# Dump the raw retrieval result for inspection.
print(f"質問と関連性が高いコンテキスト：{contexts}")

# 回答生成

プロンプト作成

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt template with three placeholders: the conversation so far, the
# retrieved context and the user's question.
PROMPT = """
You are a helpful assistant. Answer the following questions based on the given context:
chat history: {CHAT_HISTORY}

context: {CONTEXT}

Answer the following questions based on the given context:
query: {QUERY}
"""
# Conversation so far, as a list of {'role': ..., 'content': ...} dicts. It is
# kept as a list (its rendered text gets its own name) so a later cell can
# still iterate over the messages.
chat_history = []
prompt_template = ChatPromptTemplate.from_template(PROMPT)
chat_history_text = '\n\n'.join(f"{message['role']}: {message['content']}" for message in chat_history)
# Each retrieval result is a (document, score) pair; only the text goes into the prompt.
context_str = '\n'.join(f"CONTEXT {idx}:\n{res.page_content}" for idx, (res, _score) in enumerate(contexts))
# `prompt` is the fully rendered prompt text that the following cells print and send.
prompt = prompt_template.format(CHAT_HISTORY=chat_history_text, CONTEXT=context_str, QUERY=query)

In [None]:
# Show the rendered prompt before it is sent to the model.
print(prompt)

チャットモデルによる回答生成

In [None]:
# ChatOpenAI is provided by langchain_openai, the package this notebook installs.
# The legacy `langchain.chat_models` path is deprecated and is not available in
# every LangChain release.
from langchain_openai import ChatOpenAI

# Initialize the OpenAI chat model via LangChain, reading the key from the
# environment variable configured in the setup section.
chat_model = ChatOpenAI(model="gpt-4", temperature=0, api_key=os.environ["OPENAI_API_KEY"])

In [None]:
# Send the rendered prompt to the OpenAI API through the chat model.
response = chat_model.invoke(prompt)

# Prints the whole response object, not only its text content.
print(response)

In [None]:
# Placeholder template: '略' means "omitted". Put the full template text here,
# with the {CHAT_HISTORY}, {CONTEXT} and {QUERY} placeholders, before relying
# on the answer.
PROMPT = """
略
"""

# chat_history may still be a list of {'role', 'content'} dicts, or may already
# have been rendered to text by an earlier cell. Only render it when it is not
# a string yet; indexing the characters of a string with ['role'] would fail.
if not isinstance(chat_history, str):
  chat_history = '\n\n'.join(f"{message['role']}: {message['content']}" for message in chat_history)
user_prompt = 'この企画の担当者の連絡先を教えてください。'

# Hand the model the retrieved text rather than the repr of the raw
# (document, score) tuples.
context_str = '\n'.join(f"CONTEXT {idx}:\n{doc.page_content}" for idx, (doc, _score) in enumerate(contexts))

prompt = ChatPromptTemplate.from_template(PROMPT)
prompt = prompt.format(CHAT_HISTORY=chat_history,CONTEXT=context_str, QUERY=user_prompt)

# Build the reference list. Only PDF-derived documents carry a 'page' entry, and
# the PDF loader counts pages from 0, so 1 is added for display. Documents
# without a page number are listed by source only.
source_entries = []
for doc, _score in contexts:
  entry = f"{doc.metadata['source']}"
  page = doc.metadata.get('page')
  if isinstance(page, int):
    entry += f",{page + 1}ページ"
  source_entries.append(entry)
sources = '\n '.join(source_entries)

chat_model = ChatOpenAI(model='gpt-4o-mini')
response = chat_model.invoke(prompt)
print(response.content,'\n\n 参考文献： \n',sources)
