In [2]:
# Import 設定
import sys
from pathlib import Path
# Project root: the parent of the directory the kernel was started in.
PROJECT_DIR = Path.cwd().parent
# sys.path entries must be plain strings (a pathlib.PosixPath cannot be passed in),
# so convert explicitly with str() rather than calling the dunder method directly.
SRC_DIR = str(PROJECT_DIR / "src")
# Guard the insert so that re-running this cell does not stack duplicate entries.
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

# Step 01: Ingestion

In [2]:
from ingestion.file_loaders.goodnotes import DoclingGoodnotesLoader
# Create the Goodnotes loader and run it on the exported PDF.
# NOTE(review): the file name is relative; presumably it is resolved against the
# kernel's working directory — confirm against the loader implementation.
loader = DoclingGoodnotesLoader()
loader_results = loader.load(
    "早安日語-孫寅華part01-page25.pdf",
)

Downloading RapidOCR PPv5 models


Fetching 21 files:   0%|          | 0/21 [00:00<?, ?it/s]



In [3]:
# Show the loader output as this cell's result (the full repr is very large).
loader_results

[LoaderResult(content="<!-- image -->\n\n<!-- image -->\n\n<!-- image -->\n\n<!-- image -->\n\n<!-- image -->\n\n<!-- image -->\n\n<!-- image -->\n\n早安\n\n早安日語\n\n<!-- image -->\n\n日語\n\n<!-- image -->\n\n<!-- image -->\n\n<!-- image -->\n\n## 孫寅華\n\n<!-- image -->\n\n<!-- image -->\n\n<!-- image -->\n\n'-'\n\n<!-- image -->", metadata=GoodnotesMetadata(file_type=<FileType.PDF: 'pdf'>, file_name='早安日語-孫寅華part01-page25.pdf', title=None, author=None, subject=None, created_at=datetime.datetime(2025, 8, 22, 4, 33, 17, tzinfo=TzInfo(UTC)), modified_at=datetime.datetime(2025, 8, 22, 4, 33, 17, tzinfo=TzInfo(UTC)), is_chunk=False, chunk_info=None, extra=None, source=<PDFSourceType.UNKNOWN: 'unknown'>, producer='iOS Version 18.5 (Build 22F76) Quartz PDFContext', page=1, outlines=[]), doc=DoclingDocument(schema_name='DoclingDocument', version='1.5.0', name='p1-r0-c0 + p1-r0-c1 + p1-r0-c2 + p1-r0-c3 + p1-r1-c0 + p1-r1-c1 + p1-r1-c2 + p1-r1-c3 + p1-r2-c0 + p1-r2-c1 + p1-r2-c2 + p1-r2-c3 + p1-r3-c

# Step 02: Chunk

In [4]:
from chunking.docling import DoclingChunkProcessor
# One chunk processor is reused for every loaded document.
chunker = DoclingChunkProcessor()

# Flatten the per-document chunk sequences into a single list.
chunks = []
for loader_result in loader_results:
    chunks.extend(
        chunker.process(doc=loader_result.doc, metadata=loader_result.metadata)
    )

In [5]:
# Number of chunks collected from all loader results.
len(chunks)

47

In [8]:
# Print the underlying raw chunk object of the first chunk for inspection.
# NOTE(review): `_raw_chunk` is underscore-prefixed, i.e. private by convention —
# confirm whether a public accessor exists. The recorded output also ends with an
# echo of this line, which looks like the tail of a Python warning — verify.
print(chunks[0]._raw_chunk)

text='早安\n早安日語\n日語' meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[DocItem(self_ref='#/texts/0', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=10, bbox=BoundingBox(l=860.39111328125, t=460.5, r=1515.23583984375, b=7.5, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 2))]), DocItem(self_ref='#/texts/1', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=11, bbox=BoundingBox(l=81.76007588704427, t=505.5, r=1515.23583984375, b=0.75, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 4))]), DocItem(self_ref='#/texts/3', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=12, bbox=BoundingBox(l=0.0, t=480.5333

  print(chunks[0]._raw_chunk)


# Step 03: Embedding

In [9]:
import os
from cache.redis import RedisCacheHandler
from embedding.openai_embed import OpenAIEmbeddingModel


# Embedding model configured from environment variables, so no secret is written
# into the notebook. A RedisCacheHandler is passed as the model's `memory_cache`.
# NOTE(review): os.getenv returns None for unset variables and a string otherwise
# (including the port) — confirm both constructors accept those values.
embedder = OpenAIEmbeddingModel(
    api_key=os.getenv("OPEN_AI_API"),
    model_name="text-embedding-3-small",
    memory_cache=RedisCacheHandler(
        host=os.getenv("MY_REDIS_HOST"),
        port=os.getenv("MY_REDIS_PORT"),
        password=os.getenv("MY_REDIS_PASSWORD"),
    ),
)

# Encode each chunk's content and store the vector on the chunk itself.
# Iterating the chunks directly replaces the index-based loop and its temporary.
for chunk in chunks:
    chunk.embedding = embedder.encode(chunk.content)

# Step 04: Insert

## Vector-based

In [12]:
from infra.stores.pgvector import PGVectorStore

# Vector store handle; connection settings are read from the environment and the
# target schema is fixed to "Japanese-Learning".
vec_store = PGVectorStore(
    host=os.getenv("MY_POSTGRE_HOST"),
    port=os.getenv("MY_POSTGRE_PORT"),
    dbname=os.getenv("MY_POSTGRE_DB_NAME"),
    schema="Japanese-Learning",
    user=os.getenv("MY_POSTGRE_USERNAME"),
    password=os.getenv("MY_POSTGRE_PASSWORD"),
)

# Insert the chunks one at a time, in list order.
for chunk in chunks:
    vec_store.insert(chunk)

## Lexical-based

In [14]:
from infra.stores.elasticsearch import ElasticsearchBM25Store

# Lexical store handle; connection settings are read from the environment and the
# target index is fixed to "japanese-learning".
lex_store = ElasticsearchBM25Store(
    host=os.getenv("MY_ELASTIC_HOST"),
    port=os.getenv("MY_ELASTIC_PORT"),
    index_name="japanese-learning",
    username=os.getenv("MY_ELASTIC_USERNAME"),
    password=os.getenv("MY_ELASTIC_PASSWORD"),
)

# Insert the chunks one at a time, in list order.
for chunk in chunks:
    lex_store.insert(chunk)

# Step 05: Retrieve

## Vector-based

In [7]:
import os

from cache.redis import RedisCacheHandler
from embedding.openai_embed import OpenAIEmbeddingModel
from infra.stores.pgvector import PGVectorStore

# Embedder for the query side, built with the same model name and cache settings
# as the embedder used when the chunks were embedded.
embedder = OpenAIEmbeddingModel(
    api_key=os.getenv("OPEN_AI_API"),
    model_name="text-embedding-3-small",
    memory_cache=RedisCacheHandler(
        host=os.getenv("MY_REDIS_HOST"),
        port=os.getenv("MY_REDIS_PORT"),
        password=os.getenv("MY_REDIS_PASSWORD"),
    ),
)
# Encode the search query into a vector.
query_vector = embedder.encode("自動詞與他動詞的判斷邏輯")

# Vector store handle pointing at the "Japanese-Learning" schema; connection
# settings are read from the environment.
vec_store = PGVectorStore(
    host=os.getenv("MY_POSTGRE_HOST"),
    port=os.getenv("MY_POSTGRE_PORT"),
    dbname=os.getenv("MY_POSTGRE_DB_NAME"),
    schema="Japanese-Learning",
    user=os.getenv("MY_POSTGRE_USERNAME"),
    password=os.getenv("MY_POSTGRE_PASSWORD"),
)
# Search the store with the encoded query vector.
results = vec_store.search(query_vector)

In [9]:
# Text content of each chunk returned by the vector search.
[hit.chunk.content for hit in results]

["詞性解釋\n他⇒他動詞(似及物動詞).例如':'授舆\n颜理：(他五'>'：五五段变化('6'弥音化动詞)",
 '形動詞是並列，關係，可調換\nEx\':\'日本は便利で綺震国です。\n結構同\'5\'\n並列關係，可調換\nは便利て綺震国です。\nにほ人 べんり くに \'6\'.日本はきれいで便利な 国\n結構同\'5\'.\nほ人 べんり くに 本はきれいで便利 国\n\'-\'\nTable Part 001\n{\n  "col000":{\n    "column_name":"row_name"\n  }\n}\nTable Part 001\n{\n  "col000":{\n    "column_name":"row_name"\n  }\n}\nは人\n本はきれいで便利な\nまんご\n本語の\n習\n(lesson \'2\'\'1\'\'~\'\'2\'\'2\')\n中は\nさんは\nしせつ\nひと\n親切な\n人です\n連体形修飾後方名詞\nじようす\n日本語が(上手です)\n国\nべんきょう\nむずか\nたいへん\n形容詞+形動詞\n勉強は\n難しくて\n大変です。\n由於「難」和「大変」有因果關係，所以不能調換\nすうがく\n若(形動)無法直接形容到主詞\n則會用が做為助詞連接\nす。\n人不能調換\n到主詞\nTable Part 001\n{\n  "col000":{\n    "column_name":"row_name"\n  }\n}\nTable Part 001\n{\n  "col000":{\n    "column_name":"row_name"\n  }\n}\n到主詞\n好き，嫌い」\n不差）\n贬，並列。\n形動\'>\'+<形\'>\'\nTable Part 001\n{\n  "col000":{\n    "column_name":"row_name"\n  }\n}\nTable Part 001\n{\n  "col000":{\n    "column_name":"row_name"\n  }\n}\n人\nやさいいです<形動\'>\'<形)\nゆうめい)\nは親切で\nたんこうだいがく\n有名な学校は）淡江大学です\nす\nはない\nさくら\n好きな、花は）\n桜です。\nら\

## Lexical-based

In [3]:
import os

from infra.stores.elasticsearch import ElasticsearchBM25Store

# Lexical store handle pointing at the "japanese-learning" index; connection
# settings are read from the environment.
lex_store = ElasticsearchBM25Store(
    host=os.getenv("MY_ELASTIC_HOST"),
    port=os.getenv("MY_ELASTIC_PORT"),
    index_name="japanese-learning",
    username=os.getenv("MY_ELASTIC_USERNAME"),
    password=os.getenv("MY_ELASTIC_PASSWORD"),
)
# Search the store with the raw query text.
results = lex_store.search("自動詞與他動詞的判斷邏輯")

In [4]:
# Text content of each chunk returned by the lexical search.
[hit.chunk.content for hit in results]

["詞性解釋\n他⇒他動詞(似及物動詞).例如':'授舆\n颜理：(他五'>'：五五段变化('6'弥音化动詞)",
 '(lesson\'2\'\'4\'\'~\'\'2\'\'5\')\n他⇒他動詞(似及物動詞).例如\':\'授舆 五五段变化(c強变化动詞)\nlesson \'2\'\'4\'\'~\'\'2\'\'9\')\n语\n(lesson\'2\'\'4\'\'~\'\'2\'\'5\')\nTable Part 001\n{\n  "col000":{\n    "column_name":"row_name"\n  }\n}\nTable Part 001\n{\n  "col000":{\n    "column_name":"row_name"\n  }\n}\n语\n(lesson\'2\'\'4\'\'~\'\'2\'\'5\')\nTable Part 001\n{\n  "col000":{\n    "column_name":"row_name"\n  }\n}\nTable Part 001\n{\n  "col000":{\n    "column_name":"row_name"\n  }\n}\nレポ一卜<名\',\'他サ\'>\'報告\nく\n〈自力\'>\'來\n来る\n()つも(副)經常;常常\nけさ<名〉今天早晨\nメモ<名，他サ\'>\'筆記;便忘錄;便條\nよく<副)經常;常常\nぜ人ぶ\n全部<名.副)全部全體\n(名)下個月\nまいにち 每日\nらいげつ 来月\nバス<名\'>\'bus\n空港(名)和場\n<名)每天\nタクシー（名) taxi\nゆうべ名\'>\'昨夜\nバイク<名\'>\'bike\nでんしゃ 電車\nどの<らし)名副\'>\'多少多久\nいちにちじゅう\nじガん \'~\'時間(接尾)小時\n一日中(名)一整天\n<名\'>\'電車\n\'7\'γ\'2\'\'7\' 自転車\n(名)脚踏車，自行車\nくうこう\nから\n(日本人創造的複合名詞)空+orchestra\n同ゼロ\n詞交通\n名詞\n名詞\'1\'片假名\nバス<名\'>\'bus\n空港(名)机场\nくうこう\nあさ はん\n早晨饭\n朝二飯\n(名)早餐\nえいが\n映画\n<名\'>\'電影\nコピユーター\nタ