This program iterates through the PDF files in the input_pdfs directory, extracts the text of each one using the PyPDF2 library, splits that text into sentences using a regular expression, and writes the sentences to a CSV file with the same base name in the output_csvs directory. One CSV file is produced per PDF; it has a single column, with one row for each sentence in that PDF.

In [3]:
import os
import csv
import re
import PyPDF2

In [None]:
# Directory names: where the source PDFs are read from and where the CSVs go
input_dir, output_dir = "input_pdfs", "output_csvs"

In [None]:
# Ensure the output directory exists. exist_ok=True makes this one call that
# succeeds whether or not the directory is already there, which avoids the
# race between checking os.path.exists() and then creating the directory.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Convert every PDF in the input directory into a one-column CSV of sentences.

# Sentence boundary: a single whitespace character that directly follows "."
# or "?", unless the text just before it looks like an abbreviation such as
# "e.g." or "Mr.". Compiled once here rather than on every loop iteration.
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')


def _extract_pdf_text(stream):
    """Return the concatenated text of all pages of the PDF open on *stream*.

    *stream* must be a binary file object positioned at the start of the PDF.
    """
    # Newer PyPDF2 releases expose PdfReader / extract_text and PyPDF2 3.0
    # removed the legacy PdfFileReader / extractText names, so prefer the new
    # names and fall back to the old ones only when the new ones are missing.
    reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
    reader = reader_cls(stream)

    parts = []
    for page in reader.pages:
        extract = getattr(page, "extract_text", None) or page.extractText
        # Guard against a page yielding no text object at all.
        parts.append(extract() or "")
    # join() avoids the quadratic cost of repeated string concatenation.
    return "".join(parts)


for file in os.listdir(input_dir):
    # Match the extension case-insensitively so "REPORT.PDF" is not skipped.
    if not file.lower().endswith(".pdf"):
        continue

    # Read the whole document while the file handle is open.
    with open(os.path.join(input_dir, file), "rb") as f:
        text = _extract_pdf_text(f)

    # Split the text into sentences
    sentences = _SENTENCE_BOUNDARY.split(text)

    # Swap only the trailing extension; str.replace would also rewrite any
    # ".pdf" that happens to appear earlier in the file name.
    output_file = file[:-len(".pdf")] + ".csv"

    # Explicit UTF-8 so text extracted from the PDF never fails to encode
    # under a narrower platform default; newline="" is what csv requires.
    with open(
        os.path.join(output_dir, output_file), "w", newline="", encoding="utf-8"
    ) as csvfile:
        writer = csv.writer(csvfile)
        # One row per sentence; fragments that are empty or whitespace-only
        # are split artifacts, not sentences, so they are left out.
        writer.writerows([sentence] for sentence in sentences if sentence.strip())