In [25]:
from PyPDF2 import PdfReader
import re
import json
from pathlib import Path

# 1. Open the source PDF.
# BASE_PATH is kept as a plain string (with its trailing slash) because the
# export step builds its output path by string concatenation on it.
BASE_PATH = "../../data/cgi/"
PDF_FILENAME = "LEGITEXT000006069577.pdf"
pdf_path = BASE_PATH + PDF_FILENAME
reader = PdfReader(pdf_path)

In [26]:
# 2. Extract the raw text of the whole document.
# Pages for which extract_text() returns nothing (None or "") are skipped;
# every kept page is terminated with a newline so pages do not run together.
page_texts = [page.extract_text() for page in reader.pages]
raw_text = "".join(page_text + "\n" for page_text in page_texts if page_text)

In [27]:
# 3. Regular expressions identifying each structural level.
# Keys are listed from the outermost heading level to the innermost, followed
# by the article marker. Heading patterns are case-insensitive and are meant
# to be applied with .match() on a single line; the article pattern is
# case-sensitive.
patterns = {
    "livre": re.compile(r"(?i)^\s*LIVRE\s+[^\n]+"),
    # The ordinals are grouped so that the leading `^\s*` and the trailing
    # ` PARTIE...` apply to every alternative. Without the group, `|` split the
    # whole expression in two and the second ordinal could not match a line
    # with leading whitespace.
    # NOTE(review): "TROISIÈME" was added on the assumption that the document
    # contains a third part heading — confirm against the PDF.
    "partie": re.compile(
        r"(?i)^\s*(?:PREMI[\u00c8E]RE|DEUXI[\u00c8E]ME|TROISI[\u00c8E]ME) PARTIE[^\n]*"
    ),
    "titre": re.compile(r"(?i)^\s*TITRE\s+[^\n]+"),
    "chapitre": re.compile(r"(?i)^\s*CHAPITRE\s+[^\n]+"),
    "section": re.compile(r"(?i)^\s*SECTION\s+[^\n]+"),
    # "Article", whitespace, an optional capital letter, digits, then any run
    # of digits / capitals / hyphens, optionally followed by a colon.
    "article": re.compile(r"(Article\s+[A-Z]?[0-9]+[-0-9A-Z]*\s*:?)"),
}

In [28]:
# 4. Structural context: one slot per heading level, all initially None.
# The parsing loop overwrites a slot each time it meets a heading of that level.
context = dict.fromkeys(("livre", "partie", "titre", "chapitre", "section"))

# 5. Parser state for the line-by-line pass over the extracted text.
articles, buffer = [], []   # finished records / lines of the article in progress
current_article = None      # header line of the article in progress, if any
lines = raw_text.splitlines()

In [29]:
# Heading levels from outermost to innermost. A new heading at one level
# invalidates whatever was recorded for the levels nested below it.
HIERARCHY = ("livre", "partie", "titre", "chapitre", "section")

# (Re)initialise the parser state here so that re-running this cell starts
# from scratch instead of appending a second copy of every article to a list
# left over from a previous execution.
context = dict.fromkeys(HIERARCHY)
articles = []
current_article = None
current_context = dict(context)  # headings in force when the current article began
buffer = []


def save_current_article():
    """Append the article being accumulated to ``articles``.

    Reads the module-level ``current_article`` (header line), ``buffer``
    (lines of the article, header included) and ``current_context`` (snapshot
    of the headings taken when the article started). Does nothing when no
    article has been started or the buffer is empty.
    """
    if current_article and buffer:
        articles.append({
            "page_content": "\n".join(buffer).strip(),
            "metadata": {
                "title": current_article,
                "source": "Code Général des Impôts",
                "type": "code",
                # Use the snapshot, not the live context: headings met after
                # this article's text belong to the articles that follow.
                **current_context,
            }
        })


for line in lines:
    line = line.strip()
    if not line:
        continue

    # Track the heading hierarchy. Deeper levels are cleared so an article is
    # never tagged with a chapter/section inherited from a previous branch.
    for depth, level in enumerate(HIERARCHY):
        if patterns[level].match(line):
            context[level] = line.title()
            for deeper in HIERARCHY[depth + 1:]:
                context[deeper] = None

    if patterns["article"].match(line):
        # A new article header closes the previous article.
        save_current_article()
        current_article = line
        current_context = dict(context)
        buffer = [line]
    else:
        # Body line (heading lines are kept in the text as well, so nothing
        # is lost if a body line happens to look like a heading).
        buffer.append(line)

# Flush the last article, which has no following header to trigger the save.
save_current_article()


In [31]:
# 6. Export as JSON Lines: one JSON object per article, one per line.
# ensure_ascii=False keeps accented characters readable in the UTF-8 output.
output_path = Path(BASE_PATH + "cgi_articles_structured.jsonl")
with output_path.open("w", encoding="utf-8") as f_out:
    f_out.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in articles
    )

print(f"✅ {len(articles)} articles sauvegardés dans {output_path}")

✅ 2500 articles sauvegardés dans ../../data/cgi/cgi_articles_structured.jsonl
