In [8]:
import os

# Read the OpenAI key from the environment so the secret never has to be saved in the
# notebook; the placeholder is kept as a fallback for anyone who still edits it by hand.
GPTKey = os.environ.get("OPENAI_API_KEY") or "your-api-key-here"

In [11]:
from pathlib import Path
import pytesseract
import openai
import pdf2image
from pdf2image import convert_from_path
import pandas as pd
import re
from tqdm import tqdm
import cv2
import numpy as np

# Set the path to your PDF file and the desired output CSV file
pdf_file_path = "PDFs/testsoc2.pdf"
output_csv = "Output/summary2.csv"

# Configure pytesseract
# Only override the executable location when the Windows install is actually present;
# otherwise keep pytesseract's default, which resolves `tesseract` from PATH.
tesseract_exe = Path("C:\\Program Files\\Tesseract-OCR\\tesseract.exe")  # Update with your tesseract path
if tesseract_exe.is_file():
    pytesseract.pytesseract.tesseract_cmd = str(tesseract_exe)

# Configure OpenAI
openai.api_key = GPTKey

# Function to convert PDF to text using OCR
def ocr_pdf_to_text(pdf_file_path):
    """Render every page of a PDF to an image, straighten it, and OCR it.

    Args:
        pdf_file_path: Path of the PDF to read (passed to pdf2image).

    Returns:
        One string holding the OCR text of all pages in order, where every
        page (including the last) is terminated by a form feed ("\\f").
    """
    # OpenCV rotation that brings a page back upright for each orientation reported by
    # Tesseract's OSD. Orientation 0 (or anything unexpected) means "leave as is".
    upright_rotation = {
        90: cv2.ROTATE_90_COUNTERCLOCKWISE,
        180: cv2.ROTATE_180,
        270: cv2.ROTATE_90_CLOCKWISE,
    }

    images = pdf2image.convert_from_path(pdf_file_path)
    page_texts = []
    for image in tqdm(images, total=len(images), desc="Extracting text from PDF pages"):
        # Convert image to numpy array
        img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

        # Detect orientation and correct if needed
        try:
            orientation = pytesseract.image_to_osd(img, output_type=pytesseract.Output.DICT)["orientation"]
        except pytesseract.TesseractError:
            # OSD fails on pages with too little text (e.g. blank pages); OCR them unrotated
            # instead of aborting the whole document.
            orientation = 0
        rotation = upright_rotation.get(orientation)
        if rotation is not None:
            img = cv2.rotate(img, rotation)

        # OCR the image
        page_text = pytesseract.image_to_string(img, lang="eng", config='--psm 1', output_type=pytesseract.Output.STRING)
        # Some Tesseract versions append their own form feed; remove any so that "\f"
        # appears exactly once per page and stays a reliable page delimiter.
        page_texts.append(page_text.replace("\f", "") + "\f")
    return "".join(page_texts)

# Function to summarize text using GPT-3.5 Turbo
def summarize_text(text):
    """Summarize one page of text with GPT-3.5 Turbo and return it as a single ASCII line.

    Args:
        text: Raw text of one page.

    Returns:
        The model's summary with non-ASCII characters dropped and every run of
        whitespace (newlines, tabs, repeated spaces) collapsed to one space.
        Returns "" without calling the API when `text` is empty or only whitespace.
    """
    if not text or not text.strip():
        # Nothing to summarize (e.g. a blank page): skip the API request entirely
        return ""

    messages = [
        {"role": "system", "content": "You are a helpful assistant that removes formatting and summarizes legal texts."},
        {"role": "user", "content": "Please summarize the following: " + text}
        ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        n=1,
        stop=None,
        temperature=0.7,
    )
    summary = str(response.choices[0].message.content)
    # Drop non-ASCII characters first so any gap they leave is collapsed below
    summary = summary.encode("ascii", "ignore").decode("ascii")
    # split()/join turns newlines and tabs into single spaces; deleting them outright
    # would glue the words on either side together ("following:(a)")
    return " ".join(summary.split())

# Function to summarize each page of a PDF and save the result as a CSV file
def summarize_pdf(pdf_file_path, output_csv):
    """OCR a PDF, summarize each page, and write the summaries to a CSV file.

    Args:
        pdf_file_path: Path of the PDF to process.
        output_csv: Destination of the CSV. It gets the columns "Page"
            (1-based page number) and "Summary", one row per page.
    """
    # Create the output folder before the slow OCR/API work so a missing directory
    # cannot throw the results away at the very end.
    if isinstance(output_csv, (str, Path)):
        Path(output_csv).parent.mkdir(parents=True, exist_ok=True)

    text = ocr_pdf_to_text(pdf_file_path)
    pages = text.split("\f")[:-1]  # Every page ends with "\f", so the last piece is empty

    # enumerate() numbers pages by position; looking the text up with list.index() would
    # give identical pages (e.g. two blank ones) the same page number.
    summaries = [
        {'Page': page_number, 'Summary': summarize_text(page)}
        for page_number, page in tqdm(enumerate(pages, start=1), total=len(pages), desc='Summarizing PDF')
    ]

    # Explicit columns keep the CSV header intact even when the PDF has no pages
    df = pd.DataFrame(summaries, columns=['Page', 'Summary'])
    df.to_csv(output_csv, index=False, encoding="utf-8")

# Summarize the PDF and save the result as a CSV file
# (OCRs the whole document, then requests one summary per page)
summarize_pdf(pdf_file_path, output_csv)

# Display the summary
# Re-read the CSV from disk so the table shown is what was actually written
summary_df = pd.read_csv(output_csv, encoding="utf-8")
display(summary_df)

Extracting text from PDF pages: 100%|██████████| 43/43 [01:45<00:00,  2.44s/it]
Summarizing PDF: 100%|██████████| 43/43 [04:47<00:00,  6.69s/it]


Unnamed: 0,Page,Summary
0,1,This is a legal proceeding under the Class Pro...
1,2,The document advises Ralph Rowe and the Synod ...
2,3,The plaintiff is claiming the following:(a) Ce...
3,4,Ralph Rowe was a priest and Scout leader who f...
4,5,"The plaintiff, Alvin McKay, is a member of Kit..."
5,6,The Synod of the Diocese of Keewatin is an inc...
6,7,The Synod is responsible for administering to ...
7,8,Rowe was a priest in the Synod who was trained...
8,9,"Between 1971 and 1987, the Synod encouraged an..."
9,10,"The text describes how Rowe, a priest of the S..."
