<a href="https://colab.research.google.com/github/jiruneko/AdvancedLAG/blob/main/AdvancedLAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Quietly install/upgrade langchain, langchain-openai, openai and tiktoken.
!pip -q install --upgrade langchain langchain-openai openai tiktoken

In [None]:
from google.colab import userdata
import os

# Read the OpenAI API key from the Colab secret named OPENAI_API_KEY and copy
# it into this process's environment variables.
api_key = userdata.get('OPENAI_API_KEY')
if not api_key:
    # Explicit raise rather than `assert`, so the check cannot be optimised away.
    raise RuntimeError("Colabのシークレットに OPENAI_API_KEY を登録してください。")
os.environ['OPENAI_API_KEY'] = api_key

print("OpenAI API キーは設定されています ✅")

In [None]:
# Install pinned versions of langchain-core, langchain-openai,
# langchain-community, GitPython, langchain-chroma and tavily-python.
# NOTE(review): a later cell force-reinstalls langchain-core>=0.3.80 and
# langchain-openai>=0.3.33, which conflicts with the ==0.3.0 / ==0.2.0 pins
# here — confirm which version set is actually intended.
!pip install langchain-core==0.3.0 langchain-openai==0.2.0 \
     langchain-community==0.3.0 GitPython==3.1.43 \
     langchain-chroma==0.1.4 tavily-python==0.5.0

In [None]:
from langchain_community.document_loaders import GitLoader

def file_filter(file_path: str) -> bool:
    """Return True when ``file_path`` ends with the ``.mdx`` suffix."""
    wanted_suffix = ".mdx"
    is_mdx = file_path.endswith(wanted_suffix)
    return is_mdx

# Load documents from the LangChain GitHub repository (branch "master", local
# checkout at ./langchain), keeping only paths accepted by file_filter.
loader = GitLoader(
    repo_path="./langchain",
    clone_url="https://github.com/langchain-ai/langchain",
    file_filter=file_filter,
    branch="master",
)

documents = loader.load()

# Report how many documents were loaded.
document_count = len(documents)
print(document_count)

In [None]:
# Quietly install langchain-chroma, chromadb, langchain-openai,
# langchain-text-splitters and tiktoken.
!pip -q install langchain-chroma chromadb langchain-openai langchain-text-splitters tiktoken

In [None]:
# Batch-splitting utilities: documents are grouped so that the summed token
# count of each batch stays under 300,000 (the batching default is 240,000).
import tiktoken
enc = tiktoken.get_encoding("cl100k_base")

def tok_len(s: str) -> int:
    """Return the number of cl100k_base tokens in ``s``.

    Special-token markers that appear literally in the text (for example
    ``<|endoftext|>``) are counted as ordinary text. With tiktoken's default
    settings ``encode`` raises ``ValueError`` on such input, which would abort
    batching for any document that merely mentions one of those markers.
    """
    token_ids = enc.encode(s, disallowed_special=())
    return len(token_ids)

def batches_by_token_limit(docs, limit=240_000, token_counter=None):
    """Yield consecutive batches of ``docs`` grouped under a token budget.

    Documents are taken in order and accumulated until adding the next one
    would push the batch's summed token count above ``limit``; the batch is
    then yielded and a new one is started with that document.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.
        limit: Maximum summed token count per batch.
        token_counter: Optional callable mapping a string to its token count.
            Defaults to ``tok_len`` when omitted.

    Yields:
        Non-empty lists of documents, preserving the input order.

    Note:
        A single document whose own count exceeds ``limit`` is not split or
        dropped; it is yielded alone in a batch that exceeds the budget.
    """
    count_tokens = tok_len if token_counter is None else token_counter
    current, running_total = [], 0
    for doc in docs:
        cost = count_tokens(doc.page_content)
        # Flush before overflowing, but never emit an empty batch.
        if current and running_total + cost > limit:
            yield current
            current, running_total = [], 0
        current.append(doc)
        running_total += cost
    if current:
        yield current


In [None]:
pip install -U --force-reinstall --no-cache-dir \
  "langchain-core>=0.3.80" \
  "langchain-openai>=0.3.33" \
  "langchain>=0.3.10" \
  "langchain-community>=0.3.10" \
  "langchain-text-splitters>=0.3.2"

In [None]:
from pathlib import Path
from chromadb import PersistentClient
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# On-disk location of the Chroma data and the name of the collection to use.
PERSIST_DIR = "./chroma_langchain_docs"
COLLECTION = "langchain_docs_mdx"

# Embedding function handed to the vector store.
embedder = OpenAIEmbeddings(model="text-embedding-3-small")

# Open the collection through a persistent client (it is created if missing).
client = PersistentClient(path=PERSIST_DIR)
db = Chroma(
    collection_name=COLLECTION,
    embedding_function=embedder,
    client=client,
)

print("DB ready")

In [None]:
# `chunks` is consumed below but was never defined anywhere in this notebook,
# so a fresh top-to-bottom run failed with NameError. Build it here by
# splitting the documents loaded earlier.
from langchain_text_splitters import RecursiveCharacterTextSplitter

# NOTE(review): chunk_size / chunk_overlap are assumed values, not recovered
# from the original notebook — tune them for your retrieval needs.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(documents)

# Add the chunks to the vector store in fixed-size batches, reporting progress.
# NOTE(review): the collection is persistent, so re-running this cell
# presumably appends the same chunks again — confirm before re-indexing.
BATCH = 100
for batch_no, start in enumerate(range(0, len(chunks), BATCH), start=1):
    batch = chunks[start:start + BATCH]
    db.add_documents(batch)
    print(f"added batch {batch_no}: {len(batch)} docs")

retriever = db.as_retriever()
print("OK: retriever ready")


In [None]:
pip install -U langchain-openai

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Added workaround: import BaseCache so that a forward reference can be
# resolved (per the original author's note).
from langchain_core.caches import BaseCache  # noqa: F401  unused, but the import itself is needed

from langchain_openai import ChatOpenAI
# Rebuild the ChatOpenAI model definition, forcing the rebuild and not raising
# on errors. Intended to run only once.
ChatOpenAI.model_rebuild(force=True, raise_errors=False)  # run once

# Prompt text instructing the model to answer using only the supplied context;
# the {context} and {question} placeholders are filled in at invocation time.
_rag_template = '''\
以下の文脈だけを踏まえて質問に回答してください。

文脈:"""
{context}
"""
質問:{question}
'''
prompt = ChatPromptTemplate.from_template(_rag_template)

# Chat model that produces the answer (temperature 0).
model = ChatOpenAI(temperature=0, model="gpt-4o-mini")

# The retriever returns a list of documents, so flatten it into one string
# before handing it to the prompt.
def format_docs(docs):
    """Join each document's ``page_content``, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

retriever = db.as_retriever()

# Fan the incoming question out to both prompt variables: it is passed through
# unchanged as "question" and, via the retriever and format_docs, becomes the
# stringified "context".
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
chain = chain_inputs | prompt | model | StrOutputParser()

answer = chain.invoke("LangChainの概要を教えて")
print(answer)


In [None]:
pip list | grep langchain

In [None]:
pip list | grep pydantic