In [1]:
%load_ext autoreload
%autoreload 2

In [63]:
import spacy
import pathlib
from collections import namedtuple

In [5]:
nlp = spacy.load("en_core_web_sm")

In [65]:
data_path = pathlib.Path("data/raw")
output_path = pathlib.Path("data/processed")
DocPair = namedtuple('DocPair', ['infile','outfile'])

In [69]:
doclist = [
    DocPair(outfile="us-eo-lines.txt", 
            infile="US Executive Order on the Safe, Secure, and Trustworthy Development and Use of Artificial Intelligence.md"),
    DocPair(outfile="tbs-automated-decisions.txt",
            infile="TBS Directive on Automated Decision-Making.md"),
]

In [70]:
# Load the spaCy model with the statistical sentencizer
nlp_sentencizer = spacy.lang.en.English()
# Add the sentencizer component explicitly (if not already added)
if "sentencizer" not in nlp_sentencizer.pipe_names:
    nlp_sentencizer.create_pipe("sentencizer")
    nlp_sentencizer.add_pipe("sentencizer")

# Load the full spaCy English model
nlp_full_model = spacy.load("en_core_web_sm")



In [71]:
for docs in doclist:
    input_file = data_path / docs.infile
    output_file = output_path / docs.outfile
    
    # Read the text from the file
    with open(input_file, 'r', encoding='utf-8') as file:
        text = file.read()
    print(f"Splitting {docs.infile} -> {docs.outfile}")

    # Tokenize using the statistical sentencizer
    doc_sentencizer = nlp_sentencizer(text)
    sentences_sentencizer = [sent.text for sent in doc_sentencizer.sents]
    # Tokenize using the full spaCy English model
    #doc_full_model = nlp_full_model(text)
    #sentences_full_model = [sent.text for sent in doc_full_model.sents]
    
    with open (output_file, "w") as fw:
        for sentence in sentences_sentencizer:
            if sentence.rstrip():
                fw.write(sentence) 
    

Splitting US Executive Order on the Safe, Secure, and Trustworthy Development and Use of Artificial Intelligence.md -> us-eo-lines.txt
Splitting TBS Directive on Automated Decision-Making.md -> tbs-automated-decisions.txt


In [62]:
!head $output_path/"us-eo-lines.txt"

# Executive Order on the Safe, Secure, and Trustworthy Development and Use of Artificial Intelligence

     By the authority vested in me as President by the Constitution and the laws of the United States of America, it is hereby ordered as follows:

     Section 1.  Purpose.  Artificial intelligence (AI) holds extraordinary potential for both promise and peril.  Responsible AI use has the potential to help solve urgent challenges while making our world more prosperous, productive, innovative, and secure.  At the same time, irresponsible use could exacerbate societal harms such as fraud, discrimination, bias, and disinformation; displace and disempower workers; stifle competition; and pose risks to national security.  Harnessing AI for good and realizing its myriad benefits requires mitigating its substantial risks.  This endeavor demands a society-wide effort that includes government, the private sector, academia, and civil society.

     My Administration places the highest urg