In [1]:
# Load the raw GDPR text, keeping one list entry per line (line endings included)
with open("gdpr.txt", "r", encoding="utf-8") as f:
    gdpr = list(f)

# Preview the first 15 untouched lines, numbered from 01; only trailing
# whitespace is stripped for display
for line_no, raw_line in zip(range(1, 16), gdpr):
    print(f"{line_no:02d}: {raw_line.rstrip()}")


01: CHAPTER I
02: 
03: General provisions
04: 
05: Article 1
06: 
07: Subject-matter and objectives
08: 
09: 1.   This Regulation lays down rules relating to the protection of natural persons with regard to the processing of personal data and rules relating to the free movement of personal data.
10: 
11: 2.   This Regulation protects fundamental rights and freedoms of natural persons and in particular their right to the protection of personal data.
12: 
13: 3.   The free movement of personal data within the Union shall be neither restricted nor prohibited for reasons connected with the protection of natural persons with regard to the processing of personal data.
14: 
15: Article 2


In [2]:
from process_text import clean_text

# Rebuild the document as a single string and pass it through the project's cleaner
raw_text = "".join(gdpr)
cleaned_gdpr = clean_text(raw_text)

# Break the cleaned text into lines again so a numbered preview can be printed
cleaned_lines = cleaned_gdpr.splitlines()

# First 10 cleaned lines, numbered from 01
for line_no, cleaned_line in zip(range(1, 11), cleaned_lines):
    print(f"{line_no:02d}: {cleaned_line}")

01: CHAPTER I
02: General provisions
03: Article 1
04: Subject-matter and objectives
05: 1. This Regulation lays down rules relating to the protection of natural persons with regard to the processing of personal data and rules relating to the free movement of personal data.
06: 2. This Regulation protects fundamental rights and freedoms of natural persons and in particular their right to the protection of personal data.
07: 3. The free movement of personal data within the Union shall be neither restricted nor prohibited for reasons connected with the protection of natural persons with regard to the processing of personal data.
08: Article 2
09: Material scope
10: 1. This Regulation applies to the processing of personal data wholly or partly by automated means and to the processing other than by automated means of personal data which form part of a filing system or are intended to form part of a filing system.


In [3]:
import json
from chunks import chunk_path

# chunk_path's return value is opened as a file just below, i.e. it is used as
# the path of the chunk JSON file
chunked_gdpr = chunk_path("gdpr.txt")

# Parse the chunk file into Python objects
with open(chunked_gdpr, "r", encoding="utf-8") as f:
    chunks = json.loads(f.read())

print(f"Total chunks: {len(chunks)}\n")

Wrote 15 chunks to gdpr_chunks.json
Total chunks: 15



In [4]:
# Show the id and text of the first five chunks.
# `chunks` is the object loaded from the chunk JSON file in an earlier cell;
# each entry is read through its "id" and "chunk" keys.
# The loop index was never used, so the list slice is iterated directly.
for ch in chunks[:5]:
    print(f"id: {ch['id']}")
    print(ch["chunk"])

id: gdpr_1
CHAPTER I General provisions Article 1 Subject-matter and objectives 1. This Regulation lays down rules relating to the protection of natural persons with regard to the processing of personal data and rules relating to the free movement of personal data. 2. This Regulation protects fundamental rights and freedoms of natural persons and in particular their right to the protection of personal data. 3. The free movement of personal data within the Union shall be neither restricted nor prohibited for reasons connected with the protection of natural persons with regard to the processing of personal data. Article 2 Material scope 1. This Regulation applies to the processing of personal data wholly or partly by automated means and to the processing other than by automated means of personal data which form part of a filing system or are intended to form part of a filing system. 2. This Regulation does not apply to the processing of personal data: (a) in the course of an activity whi

In [5]:
from embedding import embed_path
# embed_path returns two values. The first is later passed to sqlite3.connect(),
# so it is used as the SQLite database path.
# NOTE(review): this rebinds `chunks`, which an earlier cell set to the data
# loaded from the chunk JSON file. Presumably the second return value is the
# chunks JSON path (the log prints one) — confirm, and consider a distinct name
# so that re-running the chunk preview cell after this one still works.
embeddings, chunks = embed_path("gdpr.txt")

SQLite DB will be written to: gdpr_embeddings.sqlite
Chunks JSON will be written to: gdpr_chunks.json
Processing document: gdpr.txt  (doc id: gdpr)
  Chunks: 15
  Inserted 15 rows into SQLite for doc gdpr
Wrote 15 chunks to JSON → gdpr_chunks.json
✅ Done.


In [6]:
import sqlite3
from contextlib import closing

# Sanity-check the embeddings database: total row count plus the id and stored
# embedding length of up to 15 rows.
# `embeddings` comes from the embed_path(...) call in an earlier cell and is
# used here as the SQLite database path.
# closing() guarantees conn.close() runs even if a query raises. A bare
# `with sqlite3.connect(...)` only manages the transaction and does not close.
with closing(sqlite3.connect(embeddings)) as conn:
    count = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
    print(f"Total rows (embedded chunks): {count}")
    # NOTE(review): LENGTH() on a BLOB is its size in bytes, on TEXT its
    # character count — confirm how `emb` is stored before reading this as a
    # vector dimension.
    example = conn.execute("SELECT id, LENGTH(emb) FROM chunks LIMIT 15").fetchall()
    print(example)

Total rows (embedded chunks): 15
[('gdpr_1', 1536), ('gdpr_2', 1536), ('gdpr_3', 1536), ('gdpr_4', 1536), ('gdpr_5', 1536), ('gdpr_6', 1536), ('gdpr_7', 1536), ('gdpr_8', 1536), ('gdpr_9', 1536), ('gdpr_10', 1536), ('gdpr_11', 1536), ('gdpr_12', 1536), ('gdpr_13', 1536), ('gdpr_14', 1536), ('gdpr_15', 1536)]


In [15]:
from contextualize import contextualize_file
# Run the full contextualization pipeline on the chunk JSON file.
# The return value is kept because a later cell opens it as a file path.
# NOTE(review): the input path is a hard-coded literal — presumably the same
# file the embedding step reported writing; confirm the two cannot drift apart.
contextualized_gdpr = contextualize_file("gdpr_chunks.json")

  Document 'gdpr': 15 chunks
    [1/15] contextualized for 'gdpr'
    [2/15] contextualized for 'gdpr'
    [3/15] contextualized for 'gdpr'
    [4/15] contextualized for 'gdpr'
    [5/15] contextualized for 'gdpr'
    [6/15] contextualized for 'gdpr'
    [7/15] contextualized for 'gdpr'
    [8/15] contextualized for 'gdpr'
    [9/15] contextualized for 'gdpr'
    [10/15] contextualized for 'gdpr'
    [11/15] contextualized for 'gdpr'
    [12/15] contextualized for 'gdpr'
    [13/15] contextualized for 'gdpr'
    [14/15] contextualized for 'gdpr'
    [15/15] contextualized for 'gdpr'
✓ Done. Contextualized 15 chunks → gdpr_contextualized.json (78.1s)


In [19]:
# Load the contextualized output. `contextualized_gdpr` is the value returned by
# contextualize_file(...) in an earlier cell, and `json` is imported in an
# earlier cell as well — both must have been run first.
with open(contextualized_gdpr, "r", encoding="utf-8") as f:
    contextualized = json.load(f)

print(f"Total contextualized chunks: {len(contextualized)}")

# Show the first three results: id, the first 500 characters of the original
# chunk, and the full contextualized text.
# The loop index was never used, so the slice is iterated directly.
for item in contextualized[:3]:
    print(f"id: {item['id']}")
    print(f"chunk:\n{item['chunk'][:500]}...\n")
    print(f"contextualized_chunk:\n{item['contextualized_chunk']}\n")

Total contextualized chunks: 15
id: gdpr_1
chunk:
CHAPTER I General provisions Article 1 Subject-matter and objectives 1. This Regulation lays down rules relating to the protection of natural persons with regard to the processing of personal data and rules relating to the free movement of personal data. 2. This Regulation protects fundamental rights and freedoms of natural persons and in particular their right to the protection of personal data. 3. The free movement of personal data within the Union shall be neither restricted nor prohibited fo...

contextualized_chunk:
This passage defines the subject-matter and objectives of the regulation, which is to protect natural persons with regard to the processing of personal data and ensure the free movement of such data within the EU. It also outlines the material scope of the regulation, specifying that it applies to automated and non-automated processing of personal data, but excludes certain activities such as purely personal or househol

In [22]:
from extract_claims import extract_claims_from_file
# Extract claims from the contextualized chunk file. The return value is kept
# because the following cell opens it as a file path.
# NOTE(review): the input is a hard-coded literal rather than the
# `contextualized_gdpr` value returned earlier — presumably the same file; confirm.
claims = extract_claims_from_file("gdpr_contextualized.json")

Total chunks: 15
[1/15] claims (written=7, empty_chunks=0)
[2/15] claims (written=10, empty_chunks=0)
[3/15] claims (written=13, empty_chunks=0)
[4/15] claims (written=18, empty_chunks=0)
[5/15] claims (written=23, empty_chunks=0)
[6/15] claims (written=28, empty_chunks=0)
[7/15] claims (written=32, empty_chunks=0)
[8/15] claims (written=37, empty_chunks=0)
[9/15] claims (written=42, empty_chunks=0)
[10/15] claims (written=48, empty_chunks=0)
[11/15] claims (written=55, empty_chunks=0)
[12/15] claims (written=60, empty_chunks=0)
[13/15] claims (written=70, empty_chunks=0)
[14/15] claims (written=76, empty_chunks=0)
[15/15] claims (written=83, empty_chunks=0)
✓ Done. Wrote 83 claims to: gdpr_claims.jsonl in 132.6s


In [21]:
from itertools import islice

# Preview the first 10 lines of the claims file, one record per line.
# `claims` is the value returned by extract_claims_from_file(...) in the cell
# above and is used here as a file path.
with open(claims, "r", encoding="utf-8") as f:
    # islice stops after 10 lines: no manual counter/break, and no extra line
    # is read just to discover the limit was reached
    for line in islice(f, 10):
        print(line.strip())

{"doc": "gdpr", "chunk_id": "gdpr_1", "claim_id": "gdpr_1#1", "claim_text": "This Regulation protects fundamental rights and freedoms of natural persons.", "source_quote": "2. This Regulation protects fundamental rights and freedoms of natural persons and in particular their right to the protection of personal data."}
{"doc": "gdpr", "chunk_id": "gdpr_1", "claim_id": "gdpr_1#2", "claim_text": "The free movement of personal data within the Union shall not be restricted or prohibited for reasons connected with the protection of natural persons.", "source_quote": "3. The free movement of personal data within the Union shall be neither restricted nor prohibited for reasons connected with the protection of natural persons with regard to the processing of personal data."}
{"doc": "gdpr", "chunk_id": "gdpr_1", "claim_id": "gdpr_1#3", "claim_text": "This Regulation applies to the processing of personal data wholly or partly by automated means.", "source_quote": "1. This Regulation applies to t

In [23]:
from extract_triples import extract_triplets_from_claims
# Extract triples from the claims file. The return value is kept because a later
# cell opens it as a file path.
# NOTE(review): the input is a hard-coded literal rather than the `claims` value
# returned by the claim-extraction cell — presumably the same file; confirm.
triples = extract_triplets_from_claims("gdpr_claims.jsonl")

Input:  gdpr_claims.jsonl
Output: gdpr_triples.jsonl
Server: http://localhost:11434 | Model: llama3:8b
Total claims: 83
[1/83] triples (written=2, empty=0)
[2/83] triples (written=4, empty=0)
[3/83] triples (written=6, empty=0)
[4/83] triples (written=8, empty=0)
[5/83] triples (written=10, empty=0)
[6/83] triples (written=10, empty=0)
[7/83] triples (written=17, empty=0)
[8/83] triples (written=19, empty=0)
[9/83] triples (written=21, empty=0)
[10/83] triples (written=24, empty=0)
[11/83] triples (written=26, empty=0)
[12/83] triples (written=28, empty=0)
[13/83] triples (written=31, empty=0)
[14/83] triples (written=32, empty=0)
[15/83] triples (written=34, empty=0)
[16/83] triples (written=37, empty=0)
[17/83] triples (written=39, empty=0)
[18/83] triples (written=41, empty=0)
[19/83] triples (written=43, empty=0)
[20/83] triples (written=45, empty=0)
[21/83] triples (written=51, empty=0)
[22/83] triples (written=53, empty=0)
[23/83] triples (written=55, empty=0)
[24/83] triples (wr

In [24]:
from itertools import islice

# Preview the first 20 lines of the triples file, one record per line.
# `triples` is the value returned by extract_triplets_from_claims(...) in an
# earlier cell and is used here as a file path.
with open(triples, "r", encoding="utf-8") as f:
    # islice stops after 20 lines: no manual counter/break, and no extra line
    # is read just to discover the limit was reached
    for line in islice(f, 20):
        print(line.strip())

{"doc": "gdpr", "chunk_id": "gdpr_1", "claim_id": "gdpr_1#1", "triple_id": "gdpr_1#1@1", "subject": "This Regulation", "predicate": "protects", "object": "fundamental rights and freedoms"}
{"doc": "gdpr", "chunk_id": "gdpr_1", "claim_id": "gdpr_1#1", "triple_id": "gdpr_1#1@2", "subject": "This Regulation", "predicate": "protects", "object": "natural persons"}
{"doc": "gdpr", "chunk_id": "gdpr_1", "claim_id": "gdpr_1#2", "triple_id": "gdpr_1#2@1", "subject": "free movement", "predicate": "shall not be restricted", "object": "personal data"}
{"doc": "gdpr", "chunk_id": "gdpr_1", "claim_id": "gdpr_1#2", "triple_id": "gdpr_1#2@2", "subject": "free movement", "predicate": "shall not be prohibited", "object": "personal data"}
{"doc": "gdpr", "chunk_id": "gdpr_1", "claim_id": "gdpr_1#3", "triple_id": "gdpr_1#3@1", "subject": "This Regulation", "predicate": "applies to", "object": "the processing of personal data"}
{"doc": "gdpr", "chunk_id": "gdpr_1", "claim_id": "gdpr_1#3", "triple_id": "gdp

In [25]:
# Repeat the embedding step for the second document. `embed_path` is imported in
# an earlier cell, which must have been run first.
# Neither returned value is referenced again in the cells visible here.
embeddings_plato, chunks_plato = embed_path("plato.txt")

SQLite DB will be written to: plato_embeddings.sqlite
Chunks JSON will be written to: plato_chunks.json
Processing document: plato.txt  (doc id: plato)
  Chunks: 14
  Inserted 14 rows into SQLite for doc plato
Wrote 14 chunks to JSON → plato_chunks.json
✅ Done.


In [26]:
# Contextualize the second document's chunks. `contextualize_file` is imported
# in an earlier cell. The return value is opened as a file path by the next cell.
# NOTE(review): hard-coded input path — presumably the file the embedding step
# reported writing; confirm.
contextualized_plato = contextualize_file("plato_chunks.json")

  Document 'plato': 14 chunks
    [1/14] contextualized for 'plato'
    [2/14] contextualized for 'plato'
    [3/14] contextualized for 'plato'
    [4/14] contextualized for 'plato'
    [5/14] contextualized for 'plato'
    [6/14] contextualized for 'plato'
    [7/14] contextualized for 'plato'
    [8/14] contextualized for 'plato'
    [9/14] contextualized for 'plato'
    [10/14] contextualized for 'plato'
    [11/14] contextualized for 'plato'
    [12/14] contextualized for 'plato'
    [13/14] contextualized for 'plato'
    [14/14] contextualized for 'plato'
✓ Done. Contextualized 14 chunks → plato_contextualized.json (67.7s)


In [27]:
# Load the contextualized output for the second document.
# `contextualized_plato` is the value returned by contextualize_file(...) in the
# previous cell, and `json` is imported in an earlier cell — both must have been
# run first.
with open(contextualized_plato, "r", encoding="utf-8") as f:
    contextualized_pl = json.load(f)

print(f"Total contextualized chunks: {len(contextualized_pl)}")

# Show the first three results: id, the first 500 characters of the original
# chunk, and the full contextualized text.
# The loop index was never used, so the slice is iterated directly.
for item in contextualized_pl[:3]:
    print(f"id: {item['id']}")
    print(f"chunk:\n{item['chunk'][:500]}...\n")
    print(f"contextualized_chunk:\n{item['contextualized_chunk']}\n")

Total contextualized chunks: 14
id: plato_1
chunk:
BOOK I PERSONS OF THE DIALOGUE. Socrates, who is the narrator. Glaucon. Adeimantus. Polemarchus. Cephalus. Thrasymachus. Cleitophon. And others who are mute auditors. The scene is laid in the house of Cephalus at the Piraeus; and the whole dialogue is narrated by Socrates the day after it actually took place to Timaeus, Hermocrates, Critias, and a nameless person, who are introduced in the Timaeus. I went down yesterday to the Piraeus with Glaucon the son of Ariston, that I might offer up my pra...

contextualized_chunk:
This passage introduces the setting and characters of a dialogue between Socrates and several other individuals, including Glaucon, Adeimantus, Polemarchus, Cephalus, Thrasymachus, Cleitophon, and others. The scene is set in the house of Cephalus at the Piraeus, where Socrates has gone to offer prayers to the goddess Bendis. The passage also establishes the narrator's role as Socrates, who recounts the events that took

In [28]:
# Extract claims for the second document. `extract_claims_from_file` is imported
# in an earlier cell. The return value is opened as a file path by the next cell.
# NOTE(review): hard-coded input rather than `contextualized_plato` — presumably
# the same file; confirm.
claims_plato = extract_claims_from_file("plato_contextualized.json")

Total chunks: 14
[1/14] claims (written=4, empty_chunks=0)
[2/14] claims (written=10, empty_chunks=0)
[3/14] claims (written=12, empty_chunks=0)
[4/14] claims (written=16, empty_chunks=0)
[5/14] claims (written=19, empty_chunks=0)
[6/14] claims (written=23, empty_chunks=0)
[7/14] claims (written=27, empty_chunks=0)
[8/14] claims (written=30, empty_chunks=0)
[9/14] claims (written=33, empty_chunks=0)
[10/14] claims (written=36, empty_chunks=0)
[11/14] claims (written=39, empty_chunks=0)
[12/14] claims (written=43, empty_chunks=0)
[13/14] claims (written=49, empty_chunks=0)
[14/14] claims (written=51, empty_chunks=0)
✓ Done. Wrote 51 claims to: plato_claims.jsonl in 78.2s


In [29]:
from itertools import islice

# Preview the first 10 lines of the second document's claims file, one record
# per line. `claims_plato` is the value returned by extract_claims_from_file(...)
# in the previous cell and is used here as a file path.
with open(claims_plato, "r", encoding="utf-8") as f:
    # islice stops after 10 lines: no manual counter/break, and no extra line
    # is read just to discover the limit was reached
    for line in islice(f, 10):
        print(line.strip())

{"doc": "plato", "chunk_id": "plato_1", "claim_id": "plato_1#1", "claim_text": "Socrates is the narrator of the dialogue.", "source_quote": "BOOK I PERSONS OF THE DIALOGUE. Socrates, who is the narrator."}
{"doc": "plato", "chunk_id": "plato_1", "claim_id": "plato_1#2", "claim_text": "The scene is set in the house of Cephalus at the Piraeus.", "source_quote": "The scene is laid in the house of Cephalus at the Piraeus;"}
{"doc": "plato", "chunk_id": "plato_1", "claim_id": "plato_1#3", "claim_text": "Socrates went to the Piraeus with Glaucon to offer prayers to the goddess Bendis.", "source_quote": "I went down yesterday to the Piraeus with Glaucon the son of Ariston, that I might offer up my prayers to the goddess (Bendis, the Thracian Artemis.);"}
{"doc": "plato", "chunk_id": "plato_1", "claim_id": "plato_1#4", "claim_text": "Polemarchus is the son of Cephalus.", "source_quote": "Polemarchus the son of Cephalus chanced to catch sight of us from a distance as we were starting on our way

In [30]:
# Extract triples for the second document. `extract_triplets_from_claims` is
# imported in an earlier cell. The return value is opened as a file path by the
# next cell.
# NOTE(review): hard-coded input rather than `claims_plato` — presumably the
# same file; confirm.
triples_plato = extract_triplets_from_claims("plato_claims.jsonl")

Input:  plato_claims.jsonl
Output: plato_triples.jsonl
Server: http://localhost:11434 | Model: llama3:8b
Total claims: 51
[1/51] triples (written=1, empty=0)
[2/51] triples (written=3, empty=0)
[3/51] triples (written=5, empty=0)
[4/51] triples (written=6, empty=0)
[5/51] triples (written=7, empty=0)
[6/51] triples (written=9, empty=0)
[7/51] triples (written=10, empty=0)
[8/51] triples (written=11, empty=0)
[9/51] triples (written=12, empty=0)
[10/51] triples (written=13, empty=0)
[11/51] triples (written=14, empty=0)
[12/51] triples (written=15, empty=0)
[13/51] triples (written=17, empty=0)
[14/51] triples (written=19, empty=0)
[15/51] triples (written=20, empty=0)
[16/51] triples (written=21, empty=0)
[17/51] triples (written=22, empty=0)
[18/51] triples (written=24, empty=0)
[19/51] triples (written=25, empty=0)
[20/51] triples (written=26, empty=0)
[21/51] triples (written=28, empty=0)
[22/51] triples (written=30, empty=0)
[23/51] triples (written=32, empty=0)
[24/51] triples (wr

In [33]:
from itertools import islice

# Preview the first 20 lines of the second document's triples file, one record
# per line. `triples_plato` is the value returned by
# extract_triplets_from_claims(...) in the previous cell and is used here as a
# file path.
with open(triples_plato, "r", encoding="utf-8") as f:
    # islice stops after 20 lines: no manual counter/break, and no extra line
    # is read just to discover the limit was reached
    for line in islice(f, 20):
        print(line.strip())

{"doc": "plato", "chunk_id": "plato_1", "claim_id": "plato_1#1", "triple_id": "plato_1#1@1", "subject": "Socrates", "predicate": "is the narrator of", "object": "the dialogue"}
{"doc": "plato", "chunk_id": "plato_1", "claim_id": "plato_1#2", "triple_id": "plato_1#2@1", "subject": "the scene", "predicate": "is set", "object": "in the house of Cephalus"}
{"doc": "plato", "chunk_id": "plato_1", "claim_id": "plato_1#2", "triple_id": "plato_1#2@2", "subject": "the scene", "predicate": "is set", "object": "at the Piraeus"}
{"doc": "plato", "chunk_id": "plato_1", "claim_id": "plato_1#3", "triple_id": "plato_1#3@1", "subject": "Socrates", "predicate": "went", "object": "the Piraeus"}
{"doc": "plato", "chunk_id": "plato_1", "claim_id": "plato_1#3", "triple_id": "plato_1#3@2", "subject": "Socrates", "predicate": "offer prayers to", "object": "the goddess Bendis"}
{"doc": "plato", "chunk_id": "plato_1", "claim_id": "plato_1#4", "triple_id": "plato_1#4@1", "subject": "Polemarchus", "predicate": "i