# RAG with BigQuery 
## 参考にした記事
https://cloud.google.com/blog/ja/products/ai-machine-learning/rag-with-bigquery-and-langchain-in-cloud
## 参考にしたNotebook
https://github.com/GoogleCloudPlatform/generative-ai/blob/b5c2d85557d877bc99bf18fdf549423dc54bb108/gemini/use-cases/retrieval-augmented-generation/rag_qna_langchain_bigquery_vector_search.ipynb

# 準備

In [None]:
# Install LangChain, its Vertex AI / Google community integrations, and the
# Google Cloud BigQuery client library.
!pip install --upgrade --quiet tiktoken langchain langchain_google_vertexai google-cloud-bigquery pypdf langchain_community langchain_google_community 

# Extra packages for the test-query section, which converts a query result
# into a pandas DataFrame.
!pip install --upgrade db-dtypes pandas


# Optional: install the gcloud CLI (via Homebrew) if it is not available yet.
# !brew install google-cloud-sdk

In [None]:
# Restart the kernel once the installs have finished so that the freshly
# installed packages can be imported in the cells that follow.
import IPython

# do_shutdown(True) asks the running kernel to shut down and restart.
IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
# Log in with gcloud to create Application Default Credentials on this machine.
!gcloud auth application-default login

In [None]:
from google.cloud import bigquery
import pandas as pd

In [None]:
# Google Cloud project used by the gcloud configuration, the BigQuery client,
# the dataset, and the Vertex AI embedding model in this notebook.
PROJECT_ID = "ml-session" 
# Replace with your own project: PROJECT_ID = "[your-project-id]"
# Passed as `location=` to the BigQuery client and to the vector store.
REGION = "US"

# Make PROJECT_ID the active project of the local gcloud configuration.
!gcloud config set project {PROJECT_ID}

In [None]:
# Set PROJECT_ID as the quota project of the Application Default Credentials.
!gcloud auth application-default set-quota-project {PROJECT_ID}

In [None]:
# BigQuery client bound to the configured project and location; it is reused
# for the test query and for creating the dataset.
client = bigquery.Client(project=PROJECT_ID, location=REGION)

In [None]:
# Display the client object to confirm that it was created.
client

# 動作確認のためのテストクエリ

In [None]:
# BigQuery console: https://console.cloud.google.com/bigquery

In [None]:

# Smoke test: read 100 rows from a public table to confirm that the client,
# the credentials, and the project configuration all work.
query = """
SELECT
  vendor_id,
  passenger_count,
  trip_distance,
  rate_code,
  payment_type,
  total_amount,
  tip_amount
FROM
  `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE tip_amount >= 0
LIMIT 100
"""

# Submit the query job; the job location is given explicitly as "US".
query_job = client.query(query, location="US")

# Load the result into a pandas DataFrame and preview the first five rows.
df = query_job.to_dataframe()
df.head(5)

# まずはデータセットを作成

In [None]:
# Create the BigQuery dataset that will hold the vector-store table.
DATASET_ID = "session37"
dataset = bigquery.Dataset(f"{PROJECT_ID}.{DATASET_ID}")
# Use the same location as the client and the vector store (both use REGION),
# so that changing REGION cannot leave the dataset in a different location.
dataset.location = REGION

# exists_ok=True makes this cell safe to re-run: when the dataset already
# exists it is returned instead of raising a Conflict error.
dataset = client.create_dataset(dataset, exists_ok=True)  # Makes an API request.
print(f"Dataset {client.project}.{dataset.dataset_id} is ready")

# Store フェーズ (Vector Store を作成し、外部情報をベクトルストアに保存)

In [None]:
from langchain_google_vertexai import VertexAIEmbeddings

# API reference:
# https://api.python.langchain.com/en/latest/embeddings/langchain_google_vertexai.embeddings.VertexAIEmbeddings.html
# Available embedding model versions:
# https://cloud.google.com/vertex-ai/generative-ai/docs/learn/model-versions?hl=ja#embeddings_stable_model_versions

# Embedding model handed to the vector store below. The multilingual variant
# is selected (noted by the author as producing 768-dimensional vectors);
# the alternative is model_name="textembedding-gecko@latest".
embedding_model = VertexAIEmbeddings(
    project=PROJECT_ID,
    model_name="textembedding-gecko-multilingual@latest",
)

In [None]:
# Display the embedding model object to inspect its configuration.
embedding_model

In [None]:
from langchain_google_community import BigQueryVectorStore

# API reference:
# https://api.python.langchain.com/en/latest/bq_storage_vectorstores/langchain_google_community.bq_storage_vectorstores.bigquery.BigQueryVectorStore.html
#
# Summary of the class documentation: BigQueryVectorStore is a vector store
# built on BigQuery and BigQuery Vector Search. It stores and retrieves
# documents together with their vector embeddings, and supports similarity
# search, filtering, and batch operations (batch_search). Because BigQuery is
# serverless it is well suited to prototyping and batch retrieval. For online
# serving it can optionally be converted to a Vertex AI Feature Store backed
# store via to_vertex_fs_vector_store.

# Name of the table inside DATASET_ID used by the vector store.
TABLE = "internal_info"

bq_vector_store = BigQueryVectorStore(
    embedding=embedding_model,
    project_id=PROJECT_ID,
    location=REGION,
    dataset_name=DATASET_ID,
    table_name=TABLE,
)

In [None]:
# Sample "internal information" snippets to store in the vector store.
all_texts = [
    "6月23日は創立記念日",
    "開発部の内線番号は57",
    "法務部の内線番号は55",
    "有給休暇は年間20日",
    "大阪支社の住所は...",
    "東京本社の住所は...",
]

# One metadata dict per text, recording its character count under "len".
metadatas = [{"len": len(text)} for text in all_texts]

# Run the texts through the embedding model and add them, together with their
# metadata, to the vector store.
# API reference:
# https://api.python.langchain.com/en/latest/bq_storage_vectorstores/langchain_google_community.bq_storage_vectorstores.bigquery.BigQueryVectorStore.html#langchain_google_community.bq_storage_vectorstores.bigquery.BigQueryVectorStore.add_texts
bq_vector_store.add_texts(all_texts, metadatas=metadatas)

In [None]:
# Sanity check: fetch the k=1 stored document most similar to a sample question.
bq_vector_store.similarity_search("有給休暇は何日ですか？", k=1)

# Retrieval フェーズ (質問文に類似した文章をベクトルストアから取得し、LLMに質問する)

In [None]:
# Wrap the vector store in a VectorStoreRetriever configured with k=1, i.e.
# one document is requested per query.
retriever = bq_vector_store.as_retriever(search_kwargs={"k": 1})

In [None]:
# Display the retriever object to inspect its configuration.
retriever

## 回答

`chain.invoke({"input": question})` を実行すると、以下の流れで処理される。

1. 検索クエリが retriever に渡される。
1. vector store で検索が実行される。
1. 関連するドキュメントのチャンクが返される。
1. 得られたチャンクは、LLM に渡すプロンプトのコンテキストとして使用される。
1. LLM が回答を出力する。


In [None]:
# Build a retrieval chain so that retrieval, prompt construction, and the LLM
# call are handled together as a single unit.
# API reference:
# https://api.python.langchain.com/en/latest/chains/langchain.chains.retrieval.create_retrieval_chain.html
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_vertexai import VertexAI

# System message: answer from the supplied reference information, or reply
# "わからない" ("I don't know") when the answer is not known.
# The adjacent string literals are concatenated by Python, so each one must
# carry its own separator. The second literal ends with "。\n"; without it the
# instruction runs straight into the "参考情報:" label.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            (
                "与えられた参考情報をもとに回答してください. "
                "わからなければ「わからない」と答えてください。\n"
                "参考情報: {context}"
            ),
        ),
        # The user's question is injected here by the chain.
        ("human", "{input}"),
    ]
)

llm = VertexAI(model_name="gemini-pro")

# Chain that fills {context} in the prompt with the retrieved documents and
# sends the result to the LLM.
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Full chain: run the retriever on the input, then the question-answer chain.
chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# Ask a question through the retrieval chain and show the question, the
# LLM's answer, and the reference text that was retrieved for it.
question = "有給休暇は年間何日ですか？"

result = chain.invoke({"input": question})

# Join every retrieved document instead of indexing [0]: this does not raise
# IndexError when nothing was retrieved, and it shows all documents if the
# retriever's k is raised above 1. With one document the output is unchanged.
context_text = "\n".join(doc.page_content for doc in result["context"])

print(f'質問:{result["input"]}')
print(f'LLMからの回答:{result["answer"]}')
print(f'使用した参考情報:{context_text}')


In [None]:
# Show the text of the first retrieved document on its own.
# NOTE(review): assumes at least one document was retrieved; [0] raises
# IndexError otherwise.
result["context"][0].page_content