In [22]:
import os
import time
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings


# 設定 Markdown 根目錄與向量儲存資料夾
markdown_folder = r"TSpec-LLM/Rel-16"
persist_directory = 'db'
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cuda'}
embedding = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# 建立 splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=10)
markdown_files = [f for f in os.listdir(markdown_folder) if f.endswith(".md")]
###################################################################################################################################################
# 遞迴抓取所有 .md 檔案
markdown_files = []
for root, _, files in os.walk(markdown_folder):
    for file in files:
        if file.endswith(".md"):
            full_path = os.path.join(root, file)
            markdown_files.append(full_path)

print(f"📁 共發現 {len(markdown_files)} 個 Markdown 檔案\n")
###################################################################################################################################################

# 處理每個 .md 檔案
start_time = time.time()
chunk_summary = {}
persist_root = 'multi_db'

for i, full_path in enumerate(markdown_files, start=1):
    filename = os.path.basename(full_path)
    spec_name = os.path.splitext(filename)[0]

    print(f"\n🔹 處理第 {i}/{len(markdown_files)} 個：{spec_name}")

    loader = UnstructuredMarkdownLoader(full_path)
    documents = loader.load()

    if len(documents) == 0:
        print(f"⚠️  {spec_name} 沒有內容，跳過...\n")
        continue

    for doc in documents:
        doc.metadata["spec"] = spec_name
        doc.metadata["path"] = full_path

    splits = text_splitter.split_documents(documents)
    chunk_summary[spec_name] = len(splits)

    print(f"✅ 完成切割：{spec_name}，共 {len(splits)} 個 chunks")

    if len(splits) == 0:
        print(f"⚠️  {spec_name} 拆分為 0，跳過 embedding...\n")
        continue

    # 建立實體儲存路徑
    persist_path = os.path.join(persist_root, spec_name)
    os.makedirs(persist_path, exist_ok=True)

    Chroma.from_documents(
        documents=splits,
        embedding=embedding,
        persist_directory=persist_path
    ).persist()

    print(f"💾 完成 embedding 並將 {spec_name} 存入 Chroma DB\n")

end_time = time.time()

# ===== 總結所有檔案的 chunks 統計 =====
print("\n📊 檔案切割統計總結：")
total_chunks = 0
for spec, count in chunk_summary.items():
    print(f" - {spec.ljust(20)}: {count} chunks")
    total_chunks += count

print(f"\n📦 總共切割出 {total_chunks} 個 chunks")
print(f"===== Finished Processing in {end_time - start_time:.2f} seconds =====")


📁 共發現 1697 個 Markdown 檔案


🔹 處理第 1/1697 個：46081-g00
✅ 完成切割：46081-g00，共 84 個 chunks
💾 完成 embedding 並將 46081-g00 存入 Chroma DB


🔹 處理第 2/1697 個：46053-g00
✅ 完成切割：46053-g00，共 48 個 chunks
💾 完成 embedding 並將 46053-g00 存入 Chroma DB


🔹 處理第 3/1697 個：46051-g00
✅ 完成切割：46051-g00，共 85 個 chunks
💾 完成 embedding 並將 46051-g00 存入 Chroma DB


🔹 處理第 4/1697 個：46062-g00
✅ 完成切割：46062-g00，共 138 個 chunks
💾 完成 embedding 並將 46062-g00 存入 Chroma DB


🔹 處理第 5/1697 個：46020-g00
✅ 完成切割：46020-g00，共 424 個 chunks
💾 完成 embedding 並將 46020-g00 存入 Chroma DB


🔹 處理第 6/1697 個：46011-g00
✅ 完成切割：46011-g00，共 34 個 chunks
💾 完成 embedding 並將 46011-g00 存入 Chroma DB


🔹 處理第 7/1697 個：46008-g00
✅ 完成切割：46008-g00，共 230 個 chunks
💾 完成 embedding 並將 46008-g00 存入 Chroma DB


🔹 處理第 8/1697 個：46002-g00
✅ 完成切割：46002-g00，共 81 個 chunks
💾 完成 embedding 並將 46002-g00 存入 Chroma DB


🔹 處理第 9/1697 個：46042-g00
✅ 完成切割：46042-g00，共 148 個 chunks
💾 完成 embedding 並將 46042-g00 存入 Chroma DB


🔹 處理第 10/1697 個：46061-g00
✅ 完成切割：46061-g00，共 61 個 chunks
💾 完成 embedding 並將 460

In [21]:
import os
from langchain.vectorstores import Chroma

root_db = "db_testinggggggggg"
all_sub_dbs = [d for d in os.listdir(root_db) if os.path.isdir(os.path.join(root_db, d))]

all_docs = []
for sub in all_sub_dbs:
    path = os.path.join(root_db, sub)
    db = Chroma(persist_directory=path, embedding_function=embedding)
    all_docs.extend(db.get()["documents"])

print(f"📦 合計讀入 {len(all_docs)} 筆資料")
print("📄 前兩筆：", all_docs[:2])


📦 合計讀入 28510 筆資料


In [20]:
import os
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# 參數設定
root_db_path = "db_testinggggggggg"
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={"device": "cpu"})

# 查詢語句
query = "PDCCH configuration"
k = 5  # Top-K 全域搜尋數量

# 掃描所有資料夾
sub_dirs = [d for d in os.listdir(root_db_path) if os.path.isdir(os.path.join(root_db_path, d))]

all_results = []

for sub_dir in sub_dirs:
    sub_path = os.path.join(root_db_path, sub_dir)
    try:
        db = Chroma(persist_directory=sub_path, embedding_function=embedding)
        results = db.similarity_search(query, k=k)
        for r in results:
            r.metadata["source_spec"] = sub_dir  # 加上來源標籤
            all_results.append(r)
    except Exception as e:
        print(f"❌ 無法載入 {sub_dir}：{e}")

# 取前 K 筆（這裡不排序，只是合併後取前 k）
# 若你希望根據相似度排序，你需要使用 similarity_search_with_score
top_k_results = all_results[:k]

# 顯示結果
print(f"\n🔍 🔝 Top-{k} 結果（跨 DB）:")
for i, doc in enumerate(top_k_results, 1):
    print(f"\n--- Result {i} ---")
    print(f"來自 SPEC：{doc.metadata.get('source_spec', 'unknown')}")
    print(f"內容：\n{doc.page_content[:300]}...")



🔍 🔝 Top-5 結果（跨 DB）:

--- Result 1 ---
來自 SPEC：31121-gd0
內容：
1) During step b), the time of periods of inactivity on the UICC-Terminal interface shall not be longer than 30 seconds.

2) After step e), the Terminal shall terminate the call or PDP context within 5 s at the latest after having received the wrong response to the STATUS command....

--- Result 2 ---
來自 SPEC：31121-gd0
內容：
12.11.4.2 Procedure

a) The UE is powered on where the UICC is configured as defined in b) in the initial conditions.

b) User request activate a PDP context. After receipt of ACTIVATE PDP CONTEXT REQUEST from the UE, the USS sends ACTIVATE PDP CONTEXT REJECT to the UE indicating:...

--- Result 3 ---
來自 SPEC：31121-gd0
內容：
a) The UE is powered on where the UICC is configured as defined in b) in the initial conditions.

b) The user requests activation of a PDP context. After receipt of ACTIVATE PDP CONTEXT REQUEST from the UE, the USS sends ACTIVATE PDP CONTEXT REJECT to the UE indicating:...

--- Result 4 -