# Extraction Pipeline Prototype

Purpose:
- Prototype LLM extraction logic
- Validate schema outputs
- Test on small sample of documents

Status:
- Exploratory / non-production

In [13]:
import os

from pathlib import Path 
from dataclasses import dataclass
from typing import List, Optional 

In [None]:
@dataclass
class Document:
    """Record bundling one document's identifier, file path, and text."""

    # Identifier for the document (presumably unique per input file -- confirm).
    doc_id: str
    # Filesystem location associated with the document.
    path: Path
    # Text content of the document (raw vs. normalized is not fixed here).
    text: str

In [4]:
def read_text_file(path: Path) -> str:
    """
    Read a text file and return its contents as a string.

    input:
    path -- Path object (pathlib) pointing at the file to read

    output:
    The decoded file contents. Decoding is attempted as UTF-8 first; a
    leading UTF-8 byte-order mark, if present, is stripped. If the bytes are
    not valid UTF-8, the file is re-read as Latin-1, which accepts any byte
    sequence.

    raises:
    OSError (e.g. FileNotFoundError) if the file cannot be opened.
    """
    try:
        # "utf-8-sig" decodes exactly like "utf-8" but drops a leading BOM,
        # so the returned text never starts with a stray U+FEFF character.
        return path.read_text(encoding="utf-8-sig")
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a code point, so this fallback cannot fail
        # on decoding (though the result may be wrong for other encodings).
        return path.read_text(encoding="latin-1")


In [None]:
def normalize_text(text: str) -> str:
    """
    Normalize line endings and whitespace in a block of text.

    Steps, in order:
    1. Convert Windows ("\\r\\n") and old-Mac ("\\r") line endings to "\\n".
    2. Strip trailing whitespace from every line.
    3. Collapse runs of three or more line breaks down to two, so that
       at most one blank line separates paragraphs.

    input:
    text -- the raw text to normalize

    output:
    The normalized text.
    """
    # "\r\n" must be replaced before the lone "\r", otherwise each Windows
    # line ending would turn into two "\n" characters.
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Remove trailing whitespace per line.
    lines = [line.rstrip() for line in text.split("\n")]
    text = "\n".join(lines)

    # Each pass shortens a run of line breaks by one; repeat until no run
    # of three or more remains.
    while "\n\n\n" in text:
        text = text.replace("\n\n\n", "\n\n")

    return text


In [18]:
# Project root and its 'input' sub-folder (both kept as plain strings).
root_path = '/Users/tahan/Developer/001 Areas/exec-extraction'
data_path = os.path.join(root_path, 'input')

# Read one sample document from the input folder.
output = read_text_file(Path(data_path) / '001_acme_cfo_appointment.txt')

# Bare name as the last expression so the notebook displays the raw string.
output

'ACME INDUSTRIAL SOLUTIONS ANNOUNCES APPOINTMENT OF NEW CHIEF FINANCIAL OFFICER\nJanuary 12, 2024 — Chicago, IL\n\nAcme Industrial Solutions, Inc. (“Acme” or the “Company”) today announced that it has appointed Maria L. Chen as Executive Vice President and Chief Financial Officer, effective February 1, 2024. Ms. Chen succeeds Robert J. Meade, who will retire from the Company on January 31, 2024 and will serve as an advisor through the end of the first quarter.\n\n“Maria is a proven financial leader with deep experience in manufacturing and global operations,” said Thomas R. Kellogg, Acme’s President and Chief Executive Officer. “Her expertise in capital allocation and disciplined growth will be instrumental as we execute our multi-year strategy.”\n\nMs. Chen, 46, most recently served as Senior Vice President, Finance at Northline Components, where she led FP&A, treasury, investor relations, and corporate development. Prior to Northline, she held finance leadership roles at Barton Logis

In [27]:
# Normalize the loaded document text step by step.
# Step 1: unify line endings -- Windows "\r\n" first, then old-Mac "\r" -- to "\n".
text = output.replace("\r\n", "\n")
text = text.replace("\r", "\n")

# Step 2: drop trailing whitespace from each line, then stitch the lines back together.
lines = list(map(str.rstrip, text.split("\n")))
text = "\n".join(lines)


print(text)

ACME INDUSTRIAL SOLUTIONS ANNOUNCES APPOINTMENT OF NEW CHIEF FINANCIAL OFFICER
January 12, 2024 — Chicago, IL

Acme Industrial Solutions, Inc. (“Acme” or the “Company”) today announced that it has appointed Maria L. Chen as Executive Vice President and Chief Financial Officer, effective February 1, 2024. Ms. Chen succeeds Robert J. Meade, who will retire from the Company on January 31, 2024 and will serve as an advisor through the end of the first quarter.

“Maria is a proven financial leader with deep experience in manufacturing and global operations,” said Thomas R. Kellogg, Acme’s President and Chief Executive Officer. “Her expertise in capital allocation and disciplined growth will be instrumental as we execute our multi-year strategy.”

Ms. Chen, 46, most recently served as Senior Vice President, Finance at Northline Components, where she led FP&A, treasury, investor relations, and corporate development. Prior to Northline, she held finance leadership roles at Barton Logistics and