# This notebook extracts sentences from PDF files and saves them to CSV files (one CSV per PDF).

In [None]:
!pip install pymupdf

import os
import re
import csv
import fitz  # PyMuPDF

In [53]:
def splitParagraphIntoSentences(paragraph):
    """Split a block of text into sentences.

    A split happens at a run of whitespace that directly follows ``.``, ``!``
    or ``?`` (optionally followed by a single or double quote), unless the
    text immediately before the whitespace ends with one of the titles
    "Mr.", "Mrs.", "Jr.", "Dr.", "Prof." or "Sr." (matched
    case-insensitively). The separating whitespace is discarded.

    Returns the list produced by the regex split; text without any split
    point comes back as a single-element list.
    """
    boundary_flags = re.VERBOSE | re.IGNORECASE
    boundary_pattern = r"""
        # Split sentences on whitespace between them.
        (?:               # Group for two positive lookbehinds.
          (?<=[.!?])      # Either an end of sentence punct,
        | (?<=[.!?]['"])  # or end of sentence punct and quote.
        )                 # End group of two positive lookbehinds.
        (?<!  Mr\.   )    # Don't end sentence on "Mr."
        (?<!  Mrs\.  )    # Don't end sentence on "Mrs."
        (?<!  Jr\.   )    # Don't end sentence on "Jr."
        (?<!  Dr\.   )    # Don't end sentence on "Dr."
        (?<!  Prof\. )    # Don't end sentence on "Prof."
        (?<!  Sr\.   )    # Don't end sentence on "Sr."
        \s+               # Split on whitespace between sentences.
        """
    return re.split(boundary_pattern, paragraph, flags=boundary_flags)


def extract_sentences_from_pdf(file_path):
    """Extract sentences from a PDF and save them to a CSV file.

    The text of every page is concatenated, split with
    ``splitParagraphIntoSentences``, stripped of surrounding whitespace and
    filtered down to sentences that end with a period and are between 25 and
    300 characters long (inclusive). The kept sentences are handed to
    ``save_list_to_csv`` together with ``file_path``.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The list of kept sentences, or ``None`` if any step raised; in that
        case the error is printed instead of propagated.
    """
    try:
        # The context manager closes the document (and its file handle)
        # even when text extraction fails part-way through.
        with fitz.open(file_path) as document:
            # Join once instead of growing a string page by page.
            text = "".join(page.get_text() for page in document)

        # Split into sentences and drop leading/trailing whitespace.
        stripped = (piece.strip() for piece in splitParagraphIntoSentences(text))

        # Keep only sentences ending with a dot and 25-300 characters long.
        sentences = [
            sentence for sentence in stripped
            if sentence.endswith('.') and 25 <= len(sentence) <= 300
        ]

        # Persist the sentences next to the other converted reports.
        save_list_to_csv(file_path, sentences)

        return sentences
    except Exception as e:
        # Deliberately broad: a failure is reported and signalled with None
        # rather than raised to the caller.
        print(f"Error processing {file_path}: {e}")
        return None


def save_list_to_csv(file_path, sentences_list):
    """Write each sentence as a one-column row of a CSV file.

    The output location is derived from ``file_path`` by
    ``change_path_to_csv``; missing parent directories are created. An
    existing file at that location is overwritten.

    Args:
        file_path: Path of the source file the sentences came from.
        sentences_list: Iterable of strings, one per CSV row.
    """
    new_file_path = change_path_to_csv(file_path)

    # dirname is '' when the path has no directory component, and
    # os.makedirs('') raises FileNotFoundError, so only create real paths.
    target_directory = os.path.dirname(new_file_path)
    if target_directory:
        os.makedirs(target_directory, exist_ok=True)

    # Explicit UTF-8 so non-ASCII characters from the PDF text are written
    # the same way regardless of the platform's default encoding.
    with open(new_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Wrap each string in a list so it becomes a single-cell row.
        writer.writerows([line] for line in sentences_list)
    print(f"File was saved in directory: {new_file_path}")


def change_path_to_csv(original_path):
    """Map a report path to the path of its CSV counterpart.

    Every occurrence of ``'Reports'`` in the directory part is replaced by
    ``'Reports_csv_2'``, and a trailing ``.pdf`` extension (in any letter
    case) is replaced by ``.csv``. File names with another extension are
    left unchanged.

    Args:
        original_path: Path of the source file.

    Returns:
        The rewritten path as a string.
    """
    # Split the original path into directory and filename
    directory, filename = os.path.split(original_path)
    # Change the directory to the new target directory
    new_directory = directory.replace('Reports', 'Reports_csv_2')
    # Swap only the real extension: a plain str.replace would also rewrite
    # '.pdf' occurring in the middle of the name and would miss '.PDF'.
    stem, extension = os.path.splitext(filename)
    if extension.lower() == '.pdf':
        filename = stem + '.csv'
    # Construct the new path
    return os.path.join(new_directory, filename)


def print_sentences(sentences_list):
    """Print every sentence on its own line, numbered starting at 1."""
    for position, text in enumerate(sentences_list, start=1):
        print(f"Sentence {position}: {text}")


Collecting pymupdf
  Downloading PyMuPDF-1.24.4-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.3 pymupdf-1.24.4
/content/Reports/Brenntag_2023.pdf
File was saved in directory: /content/Reports_csv_2/Brenntag_2023.csv
/content/Reports/Bayer_2023.pdf
File was saved in directory: /content/Reports_csv_2/Bayer_2023.csv
/content/Reports/SAP_2023.pdf
File was saved in directory: /content/Reports_csv_2/SAP_2023.csv
/content/Reports/Heidelberg_Materials_2023.pdf
File was saved in directory: /content/Reports_csv_2/Heidelberg_Mate

In [None]:
# Directory containing the reports
directory = '/content/Reports'

# Loop through the entries in the directory. os.listdir also returns
# sub-directories and non-PDF files, so those are skipped instead of being
# handed to the PDF extractor.
for filename in os.listdir(directory):
    # Construct the full file path
    file_path = os.path.join(directory, filename)
    is_pdf_file = filename.lower().endswith('.pdf') and os.path.isfile(file_path)
    if not is_pdf_file:
        continue
    print(file_path)
    # Extract the sentences and write them to the corresponding CSV file
    extract_sentences_from_pdf(file_path)