### Download data from Hugging Face
https://huggingface.co/datasets/matsuxr/JaGovFaqs-22k

In [None]:
import polars as pl

# Load the JaGovFaqs-22k JSON Lines file from its Hugging Face dataset path
# into a Polars DataFrame.
df = pl.read_ndjson(source="hf://datasets/matsuxr/JaGovFaqs-22k/data.jsonl")

In [None]:
# Preview the first rows of the loaded FAQ data.
df.head()

In [None]:
# Show summary statistics for the loaded columns.
df.describe()

In [None]:
# Report the estimated in-memory size of the DataFrame, converted from bytes
# to mebibytes.
size_mb = df.estimated_size() / (1024 ** 2)
print(f"DataFrameのメモリ使用量: {size_mb:.2f} MB")


In [None]:
# Build the text that gets embedded: each question and its answer joined into
# one labelled string, stored in a new "combined" column.
question_part = pl.lit("question : ") + pl.col("Question")
answer_part = pl.lit("answer : ") + pl.col("Answer")
combined_expr = question_part + pl.lit(" | ") + answer_part

df = df.with_columns(combined_expr.alias("combined"))

In [None]:
# Inspect the first combined question/answer string.
df["combined"][0]

### Embeddings

In [None]:
from openai import OpenAI
from sentence_transformers import SentenceTransformer

# Embedding backend to use; switch the active line to change it.
# model = "text-embedding-3-small"
model = "pkshatech/GLuCoSE-base-ja"


def _write_embeddings_csv(frame, path):
    """Write *frame* to *path* as CSV with the vectors flattened to text.

    Polars cannot write nested (list) columns to CSV, so each
    "ada_embedding" vector is stored as one comma-separated string.
    """
    flattened = frame.with_columns(
        pl.col("ada_embedding").cast(pl.List(pl.Utf8)).list.join(",")
    )
    flattened.write_csv(path)


if model == "text-embedding-3-small":
    # Created only on this path so the local-model path does not require
    # OpenAI credentials.
    client = OpenAI()

    # Takes about 48 minutes (one API request per row).
    def get_embedding(text, model="text-embedding-3-small") -> list[float]:
        """Return the OpenAI embedding of *text* computed with *model*."""
        return client.embeddings.create(input=[text], model=model).data[0].embedding

    df = df.with_columns(
        pl.col("combined")
        .map_elements(
            lambda x: get_embedding(x, model),
            return_dtype=pl.List(pl.Float64),
        )
        .alias("ada_embedding")
    )
    _write_embeddings_csv(df, '/app/output/embedded_faq.csv')

if model == "pkshatech/GLuCoSE-base-ja":
    # Keep `model` bound to the model name; the loaded encoder gets its own name.
    encoder = SentenceTransformer(model)
    vectors = encoder.encode(df["combined"].to_list(), show_progress_bar=True)
    # A Polars DataFrame does not support item assignment, so the new column
    # is attached with with_columns; tolist() turns the encoder output into
    # one Python list per row.
    df = df.with_columns(pl.Series("ada_embedding", vectors.tolist()))
    _write_embeddings_csv(df, '/app/output/embedded_faq_GLuCoSE-base-ja.csv')

In [None]:
# Preview the first rows again after the embedding cell has run.
df.head()

In [None]:
# Save the DataFrame as Parquet; the next cell reads this file back.
df.write_parquet('/app/output/embedded_faq.parquet')

In [None]:
import polars as pl
# Reload the saved Parquet file and preview its first rows.
embeded_df = pl.read_parquet('/app/output/embedded_faq.parquet')
embeded_df.head()

In [None]:
# Inspect the first stored embedding vector.
embeded_df["ada_embedding"][0]

### Build Qdrant Index

In [None]:
from qdrant_client import QdrantClient

# Create a client for the Qdrant server at http://qdrant:6333.
client = QdrantClient(url="http://qdrant:6333")

In [None]:
from qdrant_client.models import Distance, VectorParams

# Size the collection from the vectors that will actually be stored instead of
# hard-coding one model's dimension: the collection's vector size has to equal
# the length of every upserted vector, and that length depends on which
# embedding model produced the "ada_embedding" column.
embedding_dim = len(embeded_df["ada_embedding"][0])

client.create_collection(
    collection_name="kankocho_faq",
    vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
)

In [None]:
from qdrant_client.models import PointStruct
from tqdm import tqdm

# Turn every DataFrame row into a Qdrant point: the row index is the point id,
# the embedding is the vector, and the FAQ fields become the payload.
points = embeded_df.iter_rows(named=True)
point_structs = [
    PointStruct(
        id=point_id,
        vector=row["ada_embedding"],
        payload={
            "question": row["Question"],
            "answer": row["Answer"],
            "copyright": row["copyright"],
            "url": row["url"],
            "combined": row["combined"],
        },
    )
    # tqdm wraps the row iterator (not enumerate) and is given the row count,
    # because a bare iterator has no length for it to show progress against.
    for point_id, row in enumerate(tqdm(points, total=embeded_df.height))
]


In [None]:

# Upload the points to Qdrant in fixed-size batches.
chunk_size = 100

# Pre-set so the final print works even when there are no points to upload.
operation_info = None

# Iterating the range directly gives tqdm a length, so the bar shows a total.
for start in tqdm(range(0, len(point_structs), chunk_size)):
    operation_info = client.upsert(
        collection_name="kankocho_faq",
        wait=True,
        points=point_structs[start:start + chunk_size],
    )

# Result of the last batch (None if nothing was uploaded).
print(operation_info)

### Retrieve related documents

In [None]:
from openai import OpenAI
from qdrant_client import QdrantClient
import polars as pl

# Question to answer. The commented-out lines are alternative sample queries;
# enable exactly one assignment.
# input = "商品を販売する時に注意するべきことは何ですか"
# input = "インサイダー取引について教えてください"
# input = "相続について教えてください"
input = "マイナンバーについて教えてください"

openai_client = OpenAI()


def get_embedding(text, model="text-embedding-3-small"):
    """Return the OpenAI embedding vector for *text*.

    Newlines in *text* are replaced with spaces before the request is sent.
    """
    flattened = text.replace("\n", " ")
    reply = openai_client.embeddings.create(input=[flattened], model=model)
    return reply.data[0].embedding


input_embedding = get_embedding(input)

qdrant_client = QdrantClient(url="http://qdrant:6333")

# NOTE(review): the query vector presumably has to come from the same embedding
# model as the vectors stored in "kankocho_faq"; the embedding cell in this file
# currently selects "pkshatech/GLuCoSE-base-ja" while this query uses
# "text-embedding-3-small" - confirm which model built the index.
search_results = qdrant_client.search(
    collection_name="kankocho_faq",
    query_vector=input_embedding,
    limit=10,
)

print(search_results)

# Convert Qdrant search results into a Polars DataFrame.
def search_results_to_dataframe(results):
    """Return one DataFrame row per search hit.

    Each row holds the hit's ``id`` and ``score`` plus the payload fields
    ``question``, ``answer``, ``copyright``, ``url`` and ``combined``.
    A payload field that is absent becomes an empty string, and a hit whose
    payload is ``None`` is treated as having no fields.

    Args:
        results: Iterable of hits exposing ``id``, ``score`` and ``payload``.

    Returns:
        A ``pl.DataFrame`` built from the collected rows.
    """
    payload_fields = ("question", "answer", "copyright", "url", "combined")
    records = []
    for hit in results:
        # Guard against a missing payload so .get() is always available.
        payload = hit.payload or {}
        record = {"id": hit.id, "score": hit.score}
        record.update({field: payload.get(field, "") for field in payload_fields})
        records.append(record)
    return pl.DataFrame(records)

# Convert the search results into a DataFrame.
df_results = search_results_to_dataframe(search_results)

# Show the DataFrame.
# `display` is not imported here - presumably the IPython/Jupyter built-in.
display(df_results)

In [None]:
# Ask the chat model to answer the question, instructing it to quote the
# retrieved FAQ entries it relies on.
user_prompt = f"以下の検索結果を引用して、'{input}'に対する回答を生成してください。回答には必ず引用した文章を明示してください。\n\n検索結果:\n{df_results['combined'].to_list()}"

chat_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": user_prompt},
]

response = openai_client.chat.completions.create(
    model="gpt-4o",
    messages=chat_messages,
    #max_tokens=150,
    n=1,
    stop=None,
    temperature=0.7,
)

# Print the raw response first, then only the generated answer text.
print(response)
answer = response.choices[0].message.content
print("回答:\n", answer)
