# Extraction Pipeline Prototype

Purpose:
- Prototype LLM extraction logic
- Validate schema outputs
- Test on small sample of documents

Status:
- Exploratory / non-production

In [1]:
import os
# Secrets: load API keys from a .env file instead of hard-coding them
from dotenv import load_dotenv

from pathlib import Path 
from dataclasses import dataclass
from typing import List, Optional 

# OpenAI 
from openai import OpenAI 
from pydantic import BaseModel

In [2]:
@dataclass
class Document:
    """One input file held in memory: an identifier, its source path, and its text."""

    doc_id: str  # identifier for the document (load_documents passes the file stem)
    path: Path  # location the text was read from
    text: str  # document body (load_documents stores the normalized text here)

In [3]:
@dataclass
class ExtractionResults:
    """
    Plain record of the four string fields extracted for one executive event.

    NOTE(review): a pydantic model with the same name is defined in a later
    cell of this notebook and replaces this binding once that cell runs.
    """

    executive_name: str
    role: str
    effective_date: str  # kept as text; no date parsing is done here
    evidence_quote: str

In [4]:
def read_text_file(path: Path) -> str:
    """
    Read a text file and return its contents as a string.

    input:
    path: Path object (pathlib) pointing at the file to read

    output:
    The decoded file contents. UTF-8 is tried first, and a leading
    byte-order mark, if present, is dropped. Files that are not valid
    UTF-8 are decoded as latin-1 instead.
    """
    try:
        # "utf-8-sig" decodes BOM-less UTF-8 exactly like "utf-8" but strips a
        # leading BOM, so it does not leak into the text as a stray U+FEFF.
        return path.read_text(encoding="utf-8-sig")
    except UnicodeDecodeError:
        # latin-1 assigns a character to every byte value, so this fallback
        # never raises a decode error (the text may still be mis-decoded).
        return path.read_text(encoding="latin-1")


In [5]:
def normalize_text(text: str) -> str:
    """
    Clean up line endings and spacing in a block of text.

    Steps:
    - convert Windows ("\\r\\n") and old-Mac ("\\r") line endings to "\\n"
    - drop trailing whitespace from every line
    - allow at most one empty line between two lines of content
    - strip leading and trailing whitespace from the whole text
    """
    unified = text.replace("\r\n", "\n").replace("\r", "\n")

    kept_lines = []
    previous_was_blank = False
    for raw_line in unified.split("\n"):
        trimmed = raw_line.rstrip()
        is_blank = not trimmed
        # A second (third, ...) blank line in a row would produce three or
        # more consecutive "\n" once joined, so it is skipped.
        if is_blank and previous_was_blank:
            continue
        kept_lines.append(trimmed)
        previous_was_blank = is_blank

    return "\n".join(kept_lines).strip()


In [6]:
def load_documents(input_dir: Path, limit: Optional[int] = None) -> List[Document]:
    """
    Load .txt files from input_dir in sorted order.
    Convert each file into a Document(doc_id, path, text).

    input:
    input_dir: directory to scan (non-recursively) for "*.txt" entries
    limit: optional cap on the number of files to load, applied after
           sorting; None loads every matching file

    output:
    List of Document objects in sorted path order. doc_id is the file name
    without its ".txt" suffix and text is the normalized file contents.

    raises:
    FileNotFoundError if input_dir does not exist or is not a directory.
    """
    # is_dir() is already False for a path that does not exist, so one check
    # covers both the "missing" and the "not a directory" case.
    if not input_dir.is_dir():
        raise FileNotFoundError(f"Input directory not found: {input_dir.resolve()}")

    # glob() also matches directories (and broken symlinks) whose name ends in
    # ".txt"; keep only real files so reading cannot fail partway through.
    paths = sorted(p for p in input_dir.glob("*.txt") if p.is_file())

    if limit is not None:
        paths = paths[:limit]

    return [
        Document(
            doc_id=path.stem,  # filename without ".txt"
            path=path,
            text=normalize_text(read_text_file(path)),
        )
        for path in paths
    ]

In [7]:
# Pilot run of the loader against the local project checkout.
# NOTE: root_path is an absolute, machine-specific location.
root_path = '/Users/tahan/Developer/001 Areas/exec-extraction'

# Documents are read from the "input" folder directly under the project root.
data_path = Path(root_path) / 'input'

# Only the first 10 files (sorted order) are loaded for this pilot.
docs = load_documents(data_path, limit=10)

# Bare expression so the notebook displays the loaded documents.
docs

[Document(doc_id='001_acme_cfo_appointment', path=PosixPath('/Users/tahan/Developer/001 Areas/exec-extraction/input/001_acme_cfo_appointment.txt'), text='ACME INDUSTRIAL SOLUTIONS ANNOUNCES APPOINTMENT OF NEW CHIEF FINANCIAL OFFICER\nJanuary 12, 2024 — Chicago, IL\n\nAcme Industrial Solutions, Inc. (“Acme” or the “Company”) today announced that it has appointed Maria L. Chen as Executive Vice President and Chief Financial Officer, effective February 1, 2024. Ms. Chen succeeds Robert J. Meade, who will retire from the Company on January 31, 2024 and will serve as an advisor through the end of the first quarter.\n\n“Maria is a proven financial leader with deep experience in manufacturing and global operations,” said Thomas R. Kellogg, Acme’s President and Chief Executive Officer. “Her expertise in capital allocation and disciplined growth will be instrumental as we execute our multi-year strategy.”\n\nMs. Chen, 46, most recently served as Senior Vice President, Finance at Northline Compo

LLM Testing

In [None]:
class ExtractionResults(BaseModel): 
    # Target schema for the structured LLM output: it is passed as
    # `text_format` to client.responses.parse further down in this cell.
    # Rebinds the name used by the dataclass in an earlier cell, this time as
    # a pydantic model. Comments are used instead of a docstring on purpose:
    # pydantic would copy a class docstring into the generated JSON schema.
    # All four fields are declared as plain `str` with no default, including
    # effective_date (no date parsing or validation happens here).
    executive_name: str
    role:           str
    effective_date: str 
    evidence_quote: str

# Smoke test: send one hard-coded sentence to the model and print the parsed result.

# Working directory of the notebook; not referenced again in this cell.
current_dir = Path.cwd() 
# .env is expected one level above the working directory (relative path).
env_path = Path("..") / ".env"

# Load variables from the .env file into the process environment.
load_dotenv(env_path)
key = os.getenv("OPENAI_API_KEY")
# Print only whether a key was found, never the key itself.
print(f"Key Loaded: {bool(key)}")

# No key is passed explicitly; presumably the client reads OPENAI_API_KEY
# from the environment -- confirm against the OpenAI SDK docs.
client = OpenAI() 

# Request output shaped like ExtractionResults (passed via text_format).
response = client.responses.parse(
    model="gpt-4.1-mini",
    input=[
        {"role": "system", "content": "Extract the event information and return strictly valid JSON."},
        {
            "role": "user",
            "content": "Acme Corp announced that Jane Doe was appointed Chief Financial Officer effective March 1, 2024.",
        },
    ],
    text_format=ExtractionResults,
)

# NOTE(review): output_parsed is expected to be an ExtractionResults instance
# (model_dump() is called on it below); it may be None if parsing failed -- confirm.
result = response.output_parsed

print(result)
# Same data as a plain dict.
print(result.model_dump())

Key Loaded: True
