In [24]:
import os

from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter
from dotenv import load_dotenv
from openai import OpenAI
from utils.tokenizer import OpenAITokenizerWrapper

# Load environment variables (e.g. OPENAI_API_KEY) from the `.env` file in the
# current user's home directory. The home directory is resolved at runtime
# instead of being hard-coded, so the notebook also runs on other machines.
env_path = os.path.join(os.path.expanduser("~"), ".env")
load_dotenv(env_path)

# Initialize OpenAI client (make sure you have OPENAI_API_KEY in your environment variables)
client = OpenAI()


tokenizer = OpenAITokenizerWrapper()  # Load our custom tokenizer for OpenAI
# Token budget per chunk handed to the HybridChunker below. This is a
# chunk-size choice, NOT the embedding model's context limit (OpenAI documents
# a far larger maximum input for the text-embedding-3 models).
MAX_TOKENS = 256


# --------------------------------------------------------------
# Extract the data
# --------------------------------------------------------------


# Convert the source file into a docling document; `result.document` is what
# gets chunked below.
converter = DocumentConverter()
result = converter.convert("test.md")


# --------------------------------------------------------------
# Apply hybrid chunking
# --------------------------------------------------------------


# NOTE(review): merge_peers=False presumably leaves small neighbouring chunks
# un-merged, which would explain very short rows in the table -- confirm this
# is intended.
chunker = HybridChunker(
    tokenizer=tokenizer,
    max_tokens=MAX_TOKENS,
    merge_peers=False
)


# Materialise the chunks into a list. `chunker.chunk()` yields lazily, so a
# bare iterator would be exhausted after the first pass and re-running a
# downstream cell would silently see zero chunks. The name `chunk_iter` is kept
# so later cells keep working unchanged.
chunk_iter = list(chunker.chunk(dl_doc=result.document))

In [25]:
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

# Where the LanceDB files live and the name of the table holding the chunks.
LANCEDB_DIR = "../data/lancedb"
TABLE_NAME = "climatepolicyradar"

# --------------------------------------------------------------
# Create a LanceDB database and table
# --------------------------------------------------------------

# Connect to the LanceDB database located at LANCEDB_DIR
db = lancedb.connect(LANCEDB_DIR)


# Build the embedding function step by step: fetch the registry, pick the
# "openai" entry, then instantiate it for the chosen embedding model.
embedding_registry = get_registry()
openai_embeddings = embedding_registry.get("openai")
func = openai_embeddings.create(name="text-embedding-3-small")


# Define a simplified metadata schema
class ChunkMetadata(LanceModel):
    """
    Metadata stored alongside every chunk row.

    Both fields are nullable: `filename` is filled from the chunk's origin
    and `title` from the chunk's first heading (None when it has no headings).

    You must order the fields in alphabetical order.
    The original author attributes this to the Pydantic implementation.
    NOTE(review): confirm whether the ordering constraint comes from Pydantic
    itself or from how the model is converted into the table schema.
    """

    # Name of the source file the chunk was extracted from.
    filename: str | None
    # First heading attached to the chunk, if any.
    title: str | None


# Define the main Schema
class Chunks(LanceModel):
    """
    Row schema of the chunks table.

    `text` is declared as the embedding function's source field and `vector`
    as its vector field, so rows can be added with only `text` and `metadata`
    supplied; the table is expected to compute `vector` from `text` on insert.
    """

    # Raw chunk text; the input that `func` embeds.
    text: str = func.SourceField()
    # Pass the embeddings-function to the database. The vector width is taken
    # from the embedding function itself via `func.ndims()`. The annotation is
    # a call expression, hence the type-checker suppression.
    vector: Vector(func.ndims()) = func.VectorField()  # type: ignore
    # Nested per-chunk metadata (see ChunkMetadata).
    metadata: ChunkMetadata


# Create the table from the Chunks schema, using "overwrite" mode.
table = db.create_table(TABLE_NAME, schema=Chunks, mode="overwrite")

# --------------------------------------------------------------
# Prepare the chunks for the table
# --------------------------------------------------------------


def _chunk_to_row(chunk):
    """Turn one chunk into a row dict holding its text and nested metadata."""
    row_text = chunk.text
    chunk_meta = chunk.meta
    source_file = chunk_meta.origin.filename
    headings = chunk_meta.headings
    # Use the first heading as the title; chunks without headings get None.
    first_heading = headings[0] if headings else None
    return {
        "text": row_text,
        "metadata": {
            "filename": source_file,
            "title": first_heading,
        },
    }


# One row per chunk; the vector column is intentionally left out here.
processed_chunks = [_chunk_to_row(piece) for piece in chunk_iter]

# --------------------------------------------------------------
# Add the chunks to the table (automatically embeds the text)
# --------------------------------------------------------------

table.add(processed_chunks)



In [26]:
# --------------------------------------------------------------
# Load the table
# --------------------------------------------------------------

# Read the stored rows back into a pandas DataFrame.
df = table.to_pandas()

# Number of rows in the table, i.e. how many chunks were stored.
print(len(df))

# Preview only the first rows: rendering the whole frame (including the
# embedding vector column) bloats the notebook output. `df` itself still
# holds every row for further analysis.
df.head(10)

58


Unnamed: 0,text,vector,metadata
0,"EUROPEAN COMMISSION Brussels, 18.5.2022 COM(20...","[-0.004901573, 0.015297584, 0.029615412, 0.011...","{'filename': 'test.md', 'title': None}"
1,9 bcm of natural gas annually by 2027. Solar e...,"[-0.04308129, 0.018255888, 0.06313675, 0.01090...","{'filename': 'test.md', 'title': None}"
2,"Solar energy, combined with energy efficiency,...","[-0.022207702, 0.015508301, 0.05563516, 0.0382...","{'filename': 'test.md', 'title': None}"
3,EU large-scale skills partnership for,"[-0.0014660229, 0.033820212, 0.05075594, 8.296...","{'filename': 'test.md', 'title': None}"
4,onshore renewable energy under the Pact for Sk...,"[-0.03702904, 0.029328406, 0.03522489, 0.00318...","{'filename': 'test.md', 'title': None}"
5,18 GW that year. It delivered around 5% of tot...,"[-0.044564977, 0.012687859, 0.07369756, 0.0048...","{'filename': 'test.md', 'title': None}"
6,"They can be deployed very rapidly, as they uti...","[-0.026288439, 0.01813808, 0.07533193, 0.01630...","{'filename': 'test.md', 'title': None}"
7,by 2026; oall existing public and commercial b...,"[1.1175928e-06, 0.029705035, 0.07075471, 0.014...","{'filename': 'test.md', 'title': None}"
8,000 by 2025. -Ensure that energy poor and vuln...,"[-0.0029145256, 0.0012499227, 0.070254475, 0.0...","{'filename': 'test.md', 'title': None}"
9,buildings for quick interventions (Energy Perf...,"[-0.013426594, 0.03918594, 0.07120783, 0.02743...","{'filename': 'test.md', 'title': None}"
