# Config

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.tools.tavily_search import TavilySearchResults

from langchain_core.messages import HumanMessage
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings

from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser
from typing_extensions import TypedDict
from typing import List
from langchain.schema import Document
from langgraph.graph import END, StateGraph

import os
from dotenv import load_dotenv
# Load variables from the local ".env" file before any os.environ lookup below.
load_dotenv(".env")

# NOTE(review): duplicates the AzureChatOpenAI / AzureOpenAIEmbeddings imports
# already made earlier in this cell.
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings

# Set the LANGCHAIN_TRACING_V2 flag (presumably enables LangSmith tracing — confirm).
# The subscript lookups on os.environ raise KeyError as soon as a variable is missing.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
LANGCHAIN_API_KEY = os.environ["LANGCHAIN_API_KEY"]

# NOTE(review): this key is not passed to the clients below; presumably they
# pick up AZURE_OPENAI_API_KEY from the environment themselves — confirm.
Azure_OPENAI_API_KEY = os.environ["AZURE_OPENAI_API_KEY"]

# Chat model client; endpoint, deployment name and API version come from the environment.
llm = AzureChatOpenAI(
    azure_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME_CHAT"],
    openai_api_version = os.environ["AZURE_OPENAI_API_VERSION"],
)

# Embedding client; same endpoint and API version as the chat client, but its own deployment.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME_EMBEDDINGS"],
    openai_api_version = os.environ["AZURE_OPENAI_API_VERSION"],
)

from langchain_postgres import PGVector

# Vector store for the "civilcode" collection, built on the embedding client above
# and the connection string taken from PGVECTOR_CONNECTION_STRING.
vector_store = PGVector(
    embeddings = embeddings,
    collection_name = "civilcode",
    connection = os.environ["PGVECTOR_CONNECTION_STRING"],
)


# 民法資料存入資料庫

## RDBMS SQLite
使用 Python 內建的 `sqlite3` 模組，不用額外安裝套件。

In [2]:
# import sqlite3
# import json

# civil_code_json_path = "../data-pre-process/民法-110-01-20.json"

# with open(file=civil_code_json_path, mode='r', encoding='utf-8') as file:
#     json_data = file.read()

# data_list = json.loads(json_data)

# conn = sqlite3.connect('civilcode.db')
# cursor = conn.cursor()

# cursor.execute('''
# CREATE TABLE IF NOT EXISTS law_articles (
#     article_number TEXT,
#     article_title TEXT,
#     article_content TEXT,
#     part_number INTEGER,
#     part_title TEXT,
#     chapter_number INTEGER,
#     chapter_title TEXT,
#     section_number INTEGER,
#     section_title TEXT,
#     subsection_number INTEGER,
#     subsection_title TEXT,
#     item_number INTEGER,
#     item_title TEXT,
#     source_url TEXT
# )
# ''')

# for data in data_list:
#     cursor.execute('''
#     INSERT INTO law_articles (
#         article_number, article_title, article_content, part_number, part_title,
#         chapter_number, chapter_title, section_number, section_title,
#         subsection_number, subsection_title, item_number, item_title, source_url
#     ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
#     ''', (
#         data['article_number'],
#         data['article_title'],
#         '\n'.join(data['article_content']),  # 將列表轉換為字符串
#         data['part_number'],
#         data['part_title'],
#         data['chapter_number'],
#         data['chapter_title'],
#         data['section_number'],
#         data['section_title'],
#         data['subsection_number'],
#         data['subsection_title'],
#         data['item_number'],
#         data['item_title'],
#         data['source_url']
#     ))

# conn.commit()
# conn.close()


## RDBMS PostgreSQL

In [6]:
# import psycopg2
# import json

# civil_code_json_path = "../data-pre-process/民法-110-01-20.json"

# with open(file=civil_code_json_path, mode='r', encoding='utf-8') as file:
#     json_data = file.read()

# data_list = json.loads(json_data)

# conn = psycopg2.connect(
#     dbname="civilcode",
#     user=os.environ["POSTGRES_USER"],
#     password=os.environ["POSTGRES_PASSWORD"],
#     host="localhost",
#     port="5432"
# )
# cursor = conn.cursor()

# cursor.execute('''
# CREATE TABLE IF NOT EXISTS law_articles (
#     article_number TEXT,
#     article_title TEXT,
#     article_content TEXT,
#     part_number INTEGER,
#     part_title TEXT,
#     chapter_number INTEGER,
#     chapter_title TEXT,
#     section_number INTEGER,
#     section_title TEXT,
#     subsection_number INTEGER,
#     subsection_title TEXT,
#     item_number INTEGER,
#     item_title TEXT,
#     source_url TEXT
# )
# ''')

# for data in data_list:
#     cursor.execute('''
#     INSERT INTO law_articles (
#         article_number, article_title, article_content, part_number, part_title,
#         chapter_number, chapter_title, section_number, section_title,
#         subsection_number, subsection_title, item_number, item_title, source_url
#     ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
#     ''', (
#         data['article_number'],
#         data['article_title'],
#         "".join(data['article_content']),
#         data['part_number'],
#         data['part_title'],
#         data['chapter_number'],
#         data['chapter_title'],
#         data['section_number'],
#         data['section_title'],
#         data['subsection_number'],
#         data['subsection_title'],
#         data['item_number'],
#         data['item_title'],
#         data['source_url']
#     ))

# conn.commit()
# conn.close()


## VectorDB PGvector

### 失敗：LangChain 的 `.add_documents()` 方法

直接存會卡到 API 上限：
S0 等級的 Azure RPM 沒辦法一次呼叫那麼多次。

In [4]:
import json
from langchain_core.documents import Document

civil_code_json_path = "../data-pre-process/民法-110-01-20.json"

# Read the whole civil-code JSON file; the result is iterated record by record below.
with open(civil_code_json_path, encoding='utf-8') as json_file:
    json_data = json.load(json_file)

# Record fields copied verbatim into each Document's metadata (order preserved).
metadata_keys = (
    "article_number",
    "article_title",
    "part_number",
    "part_title",
    "chapter_number",
    "chapter_title",
    "source_url",
)

documents = []

for entry in json_data:
    paragraphs = entry['article_content']

    # Articles whose content is exactly the "deleted" marker are not indexed.
    if paragraphs == ["（刪除）"]:
        continue

    documents.append(
        Document(
            # Paragraphs are concatenated without any separator.
            page_content="".join(paragraphs),
            metadata={key: entry[key] for key in metadata_keys},
        )
    )

import time
# batch processing
# input 500 documents every 1 minute

# for i in range(0, len(documents), 500):
#     vector_store.add_documents(documents[i:i+500])
#     time.sleep(60)

# vector_store.add_documents(documents)

#### 不清楚為啥會出錯

額度也給夠了，batch 也實施了，沒用。

```plaintext
RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the Embeddings_Create Operation under Azure OpenAI API version 2024-08-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 86400 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.'}}
```

### 成功：用外面的 Embedding 然後寫SQL存入 PGvector

用 OpenAI 的 [Batch API](https://platform.openai.com/docs/api-reference/batch) 處理，比較省錢；缺點是要手動轉換資料並寫入資料庫，很要命。

需要輸入 LangChain 自行建立的 `collection_id`，才能符合外鍵（FK）約束；可以去 VectorDB 的 `langchain_pg_collection` 資料表查詢。

In [5]:
# import psycopg2
# import json
# import uuid
# import os

# batchapi_embed_file_path = "../data-pre-process/batch_67679735cb7081909fb3c8a66d1336cb_output.jsonl"
# raw_json_path = "../data-pre-process/民法-110-01-20.json"

# conn = psycopg2.connect(
#     dbname="civilcode",
#     user=os.environ["POSTGRES_USER"],
#     password=os.environ["POSTGRES_PASSWORD"],
#     host="localhost",
#     port="5433" # 連線到 vectorstore
# )

# cur = conn.cursor()

# with open(batchapi_embed_file_path, 'r', encoding='utf-8') as file:
#     batchapi_embed_data_list = [json.loads(line.strip()) for line in file]

# with open(raw_json_path, 'r', encoding='utf-8') as file:
#     raw_json_data = json.load(file)

# collection_id = input("請輸入 collection_id: ")

# for i, batchapi_embed_data in enumerate(batchapi_embed_data_list):

#     if raw_json_data[i]['article_content'] == ["（刪除）"]:
#         continue

#     id = str(uuid.uuid4())
#     embedding = batchapi_embed_data['response']['body']['data'][0]['embedding']
#     document = raw_json_data[i]['article_content']
#     cmetadata = json.dumps(raw_json_data[i])

#     cur.execute("""
#     INSERT INTO langchain_pg_embedding (id, collection_id, embedding, document, cmetadata) 
#     VALUES (%s, %s, %s, %s, %s);
#     """, (
#         id, 
#         collection_id, 
#         embedding, 
#         document, 
#         cmetadata
#     ))

# conn.commit()

# cur.close()
# conn.close()


# LangGraph