# In [2]:
import os
from tika import parser
import pandas as pd
import re

# In [4]:
# Directory that holds the PDF letters to be processed
folder_path = "ctr_format"

# Function to extract text from each PDF file in the folder
def extract_text_from_pdfs(folder_path):
    """Extract text and letter metadata from every PDF directly inside a folder.

    Each PDF is parsed with tika, the fixed letterhead block and the
    "Page X of Y" footers are removed from the text, and the letter number,
    date, title and body are pulled out with regular expressions.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.

    Returns:
        A list with one dict per PDF file, in file-name order, keyed by
        'File Name', 'Letter Number', 'Date', 'Title', 'letter_content '
        and 'Raw Text'. A field whose pattern does not match is None;
        'Raw Text' is derived from "" when tika returned no content.

    Raises:
        OSError: If ``folder_path`` cannot be listed (e.g. it does not exist).
    """
    data = []
    # os.listdir() order is arbitrary; sort so the output rows are reproducible.
    for file in sorted(os.listdir(folder_path)):
        # Accept ".PDF" / ".Pdf" as well as ".pdf".
        if not file.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, file)
        # Skip directories (or other non-files) that merely end in ".pdf".
        if not os.path.isfile(file_path):
            continue

        parsed_pdf = parser.from_file(file_path)
        # NOTE(review): tika appears able to return a dict without a
        # 'content' key when nothing came back; .get() avoids a KeyError
        # in that case -- confirm against the tika-python version in use.
        content = (parsed_pdf or {}).get('content')
        text = _clean_letter_text(content.strip() if content else "")

        row = {'File Name': file}
        row.update(_parse_letter_fields(text))
        data.append(row)
    return data


# Letterhead block that is removed from the extracted text.
_LETTERHEAD_RE = re.compile(
    r"RJJ Consortium for JTB Project\nc/o PT Rekayasa Industri\nJl\. Kalibata Timur I No\. 36\nKalibata, Jakarta 12740 – INDONESIA\n"
)
# Page footer; matches any page count rather than only "1 of 2" / "2 of 2".
_PAGE_FOOTER_RE = re.compile(r"Page \d+ of \d+")

# Field patterns, kept exactly as the letters' extracted layout requires.
# NOTE(review): the double space in "Perihal  :" looks deliberate (it mirrors
# the extracted text) -- confirm before loosening it.
_LETTER_NUMBER_RE = re.compile(r'No\. :\s*(.*?)\s*\n')
_DATE_RE = re.compile(r'Tanggal : \s*(.*?)\s*\n')
_TITLE_RE = re.compile(r'Perihal  : \s*(.*?)\s*\n')
_CONTENT_RE = re.compile(
    r'Dengan Hormat,(.*?)Hormat kami,\n\nBudi Prianto\nProject Manager',
    re.DOTALL,
)


def _clean_letter_text(text):
    """Return ``text`` with the letterhead block and page footers removed."""
    text = _LETTERHEAD_RE.sub("", text)
    return _PAGE_FOOTER_RE.sub("", text)


def _first_group(pattern, text):
    """Return group 1 of the first match of ``pattern`` in ``text``, or None."""
    match = pattern.search(text)
    return match.group(1) if match else None


def _parse_letter_fields(text):
    """Build the metadata columns (everything except 'File Name') from ``text``."""
    letter_content = _first_group(_CONTENT_RE, text)
    if letter_content is not None:
        letter_content = letter_content.strip()

    return {
        'Letter Number': _first_group(_LETTER_NUMBER_RE, text),
        'Date': _first_group(_DATE_RE, text),
        'Title': _first_group(_TITLE_RE, text),
        # NOTE(review): the trailing space in this key looks like a typo, but
        # it is the existing output column name, so it is kept unchanged.
        'letter_content ': letter_content,
        'Raw Text': text,
    }
    
# Run the extraction over every PDF found in the configured folder
extracted_data = extract_text_from_pdfs(folder_path)

# Build a table with one row per extracted letter
df = pd.DataFrame(data=extracted_data)

# Write the table to an Excel workbook, leaving out the index column
df.to_excel(excel_writer='surat_ctr_format.xlsx', index=False)
