In [None]:
import os
from langchain.document_loaders import DirectoryLoader, UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# 設定 Markdown 根目錄（包含多個子資料夾）
markdown_folder = r"TSpec-LLM\Rel-16"

# 使用 DirectoryLoader 載入所有 .md 檔案
loader = DirectoryLoader(markdown_folder, glob="*.md", loader_cls=UnstructuredMarkdownLoader)

# 讀取所有 .md 檔案
md_documents = loader.load()
print(f"===== Loaded {len(md_documents)} Markdown Files =====")

# 拆分 Markdown 內容
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
all_splits = text_splitter.split_documents(md_documents)
print(f"===== Split into {len(all_splits)} Chunks =====")

# # 設定向量儲存資料夾
# persist_directory = 'db'
# model_name = "sentence-transformers/all-MiniLM-L6-v2"
# model_kwargs = {'device': 'cpu'}

# # 使用 HuggingFace 進行嵌入
# embedding = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# # 存入 Chroma DB
# vectordb = Chroma.from_documents(documents=all_splits, embedding=embedding, persist_directory=persist_directory)

# print("===== Successfully stored embeddings in Chroma DB =====")


In [1]:
import os
import time
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# 設定 Markdown 根目錄（包含多個子資料夾）
markdown_folder = r"TSpec-LLM"

# 遞迴獲取所有子資料夾內的 Markdown 檔案
def get_markdown_files(directory):
    md_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".md"):
                md_files.append(os.path.join(root, file))
    return md_files

# 取得所有 Markdown 文件路徑
md_file_paths = get_markdown_files(markdown_folder)
print(f"===== Found {len(md_file_paths)} Markdown Files =====")

# 讀取 & 分割 Markdown 文件（確保每個文件獨立處理）
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
all_splits = []
doc_ids = []  # 存放對應的檔案名稱 ID

start_time = time.time()
for idx, file_path in enumerate(md_file_paths):
    print(f"📄 Processing file {idx+1}/{len(md_file_paths)}: {file_path}")  # 顯示當前檔案

    loader = UnstructuredMarkdownLoader(file_path)
    md_documents = loader.load()  # 讀取單一檔案
    file_splits = text_splitter.split_documents(md_documents)  # 針對該檔案切割



    # 取得檔案名稱作為 ID
    file_name = os.path.splitext(os.path.basename(file_path))[0]  # 去除副檔名
    file_ids = [f"{file_name}_{i}" for i in range(len(file_splits))]  # 每個 chunk 加上索引




    print(f"  ✅ {file_path} split into {len(file_splits)} chunks")  # 顯示該檔案被切割的片段數量
    all_splits.extend(file_splits)  # 合併所有拆分後的 chunk
    doc_ids.extend(file_ids)  # 加入對應的 ID



end_time = time.time()
print(f"===== Finished Processing in {end_time - start_time:.2f} seconds =====")
print(f"===== Total {len(all_splits)} Chunks Generated =====")

# 設定向量儲存資料夾
persist_directory = 'db'
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}

# 使用 HuggingFace 進行嵌入
start_time = time.time()
embedding = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
end_time = time.time()
print(f"===== Loaded embedding Model in {end_time - start_time:.2f} seconds =====")



# 存入 Chroma DB
start_time = time.time()
vectordb = Chroma.from_documents(documents=all_splits, 
                                 embedding=embedding, 
                                 persist_directory=persist_directory, 
                                 collection_name="TSpec_LLM",
                                ids=doc_ids  # 使用自訂 ID
)
end_time = time.time()
print(f"===== Stored embeddings in Chroma DB in {end_time - start_time:.2f} seconds =====")
print("===== Successfully stored embeddings in Chroma DB =====")


===== Found 9 Markdown Files =====
📄 Processing file 1/9: TSpec-LLM\Rel-16\21_series\21101-g10.md
  ✅ TSpec-LLM\Rel-16\21_series\21101-g10.md split into 33 chunks
📄 Processing file 2/9: TSpec-LLM\Rel-16\21_series\21111-g10.md
  ✅ TSpec-LLM\Rel-16\21_series\21111-g10.md split into 192 chunks
📄 Processing file 3/9: TSpec-LLM\Rel-16\21_series\21201-g10.md
  ✅ TSpec-LLM\Rel-16\21_series\21201-g10.md split into 25 chunks
📄 Processing file 4/9: TSpec-LLM\Rel-16\21_series\21202-g00.md
  ✅ TSpec-LLM\Rel-16\21_series\21202-g00.md split into 27 chunks
📄 Processing file 5/9: TSpec-LLM\Rel-16\21_series\21205-g10.md
  ✅ TSpec-LLM\Rel-16\21_series\21205-g10.md split into 16 chunks
📄 Processing file 6/9: TSpec-LLM\Rel-16\21_series\21801-g30.md
  ✅ TSpec-LLM\Rel-16\21_series\21801-g30.md split into 60 chunks
📄 Processing file 7/9: TSpec-LLM\Rel-16\21_series\21900-g40.md
  ✅ TSpec-LLM\Rel-16\21_series\21900-g40.md split into 38 chunks
📄 Processing file 8/9: TSpec-LLM\Rel-16\21_series\21905-g00.md
  ✅ T

In [3]:
print(vectordb.get()["ids"][:10])  # 列出前 10 筆 ID


# 檢查資料庫內的 collections
print(vectordb._collection.count())  # 查看嵌入數量

# 隨機查詢一筆資料
print("Radom data")
sample_query = vectordb._collection.peek(1)
print(sample_query)
print(vectordb._collection.get())


['21101-g10_0', '21101-g10_1', '21101-g10_2', '21101-g10_3', '21101-g10_4', '21101-g10_5', '21101-g10_6', '21101-g10_7', '21101-g10_8', '21101-g10_9']
3993
Radom data
{'ids': ['21101-g10_0'], 'embeddings': array([[-1.34319831e-02,  8.41255262e-02, -2.23526899e-02,
         4.80792783e-02,  4.59755175e-02,  4.36062105e-02,
        -4.14330401e-02,  1.05146259e-01, -3.40081491e-02,
         1.82777867e-02, -6.42170683e-02,  8.40873271e-02,
        -1.60770714e-02,  1.16245635e-02, -5.76250814e-02,
        -2.83166077e-02, -5.38022593e-02, -2.67851036e-02,
        -2.92162467e-02,  2.12104735e-03,  7.61894286e-02,
         1.15002923e-01,  7.62923434e-02,  2.88915206e-02,
         3.49732698e-03,  6.09606728e-02, -7.41655380e-02,
         2.13568378e-02, -2.33695889e-03, -6.37650564e-02,
        -2.33936217e-02,  4.26998325e-02,  6.85030371e-02,
         3.21901366e-02, -1.00619737e-02, -2.56405864e-02,
         6.72741085e-02, -2.17199996e-02,  9.97474864e-02,
        -1.73896439e-02,  2

In [4]:
import chromadb

# 建立 Chroma Client 並指定資料夾
client = chromadb.PersistentClient(path="db")  # 需要改成 `path` 參數
collection = client.get_or_create_collection(name="langchain")

# 查看 collection 數量
print(f"Total collections: {len(client.list_collections())}")

# 查詢已存入的 document IDs
print(collection.get()["ids"])

# 隨機查看部分嵌入內容
print(collection.get(limit=5))


Total collections: 2
[]
{'ids': [], 'embeddings': None, 'documents': [], 'uris': None, 'data': None, 'metadatas': [], 'included': [<IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}
