In [2]:
import os
from PyPDF2 import PdfReader


def read_pdfs_in_folder(folder_path):
    """Extract the text of every PDF in ``folder_path`` into ``<folder_path>/txt``.

    Each file whose name ends in ``.pdf`` (matched case-insensitively) is read
    with ``PdfReader``; the text of all its pages is concatenated and written,
    UTF-8 encoded, to a ``.txt`` file with the same base name inside a ``txt``
    sub-directory. The sub-directory is created when missing. Sub-folders are
    not scanned.

    A success or failure line is printed per PDF. A PDF that cannot be
    processed is reported and skipped so the remaining files are still handled.

    Args:
        folder_path: Path of the directory containing the PDF files.

    Raises:
        ValueError: If ``folder_path`` is not an existing directory.
    """
    if not os.path.isdir(folder_path):
        raise ValueError(f"The folder path {folder_path} is not valid.")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    txt_folder_path = os.path.join(folder_path, "txt")
    os.makedirs(txt_folder_path, exist_ok=True)

    for filename in os.listdir(folder_path):
        # Case-insensitive match so "REPORT.PDF" is not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, "rb") as file:
                reader = PdfReader(file)
                # Extract each page exactly once (extraction is the expensive
                # step) while the underlying file is still open. Pages that
                # yield no text contribute an empty string.
                page_texts = [page.extract_text() or "" for page in reader.pages]

            pdf_text = "".join(page_texts)

            txt_filename = os.path.splitext(filename)[0] + ".txt"
            txt_file_path = os.path.join(txt_folder_path, txt_filename)
            with open(txt_file_path, "w", encoding="utf-8") as txt_file:
                txt_file.write(pdf_text)

            print(f"Extracted text from {filename} and wrote to {txt_file_path}")

        except Exception as e:
            # Deliberate best-effort: report the failure and continue with the
            # next PDF instead of aborting the whole batch.
            print(f"Failed to read {filename} due to {e}")


# Example usage: convert every PDF found in the relative "files" directory;
# read_pdfs_in_folder writes the resulting .txt files into its "txt" sub-folder.
folder_path = "files"
read_pdfs_in_folder(folder_path)

Extracted text from neighbourhood_plan-Cossington.pdf and wrote to files/txt/neighbourhood_plan-Cossington.txt
Extracted text from neighbourhood_plan-Addingham.pdf and wrote to files/txt/neighbourhood_plan-Addingham.txt
Extracted text from neighbourhood_plan-Ab Kettleby Parish.pdf and wrote to files/txt/neighbourhood_plan-Ab Kettleby Parish.txt
Extracted text from neighbourhood_plan-Sawtry.pdf and wrote to files/txt/neighbourhood_plan-Sawtry.txt
Extracted text from neighbourhood_plan-Totnes.pdf and wrote to files/txt/neighbourhood_plan-Totnes.txt
Extracted text from neighbourhood_plan-Aberford.pdf and wrote to files/txt/neighbourhood_plan-Aberford.txt
Extracted text from neighbourhood_plan-Cleobury Mortimer.pdf and wrote to files/txt/neighbourhood_plan-Cleobury Mortimer.txt
Extracted text from neighbourhood_plan-Walsham le Willows.pdf and wrote to files/txt/neighbourhood_plan-Walsham le Willows.txt
Extracted text from neighbourhood_plan-Nether Whitacre.pdf and wrote to files/txt/neighb

In [15]:
import ast
import json
import os
from datetime import datetime

# NOTE(review): `genai` is not imported in any visible cell; presumably
# `import google.generativeai as genai` was run in an earlier cell — confirm,
# otherwise this cell fails on a fresh kernel.

# Directory holding the text files to summarise.
TXT_DIR = "files/txt"

# Instruction sent ahead of each document's text.
TOPIC_PROMPT = "Please return a list of 20 main topics discussed below. Return the list as a machine readable JSON list (only valid json returned). Please do not include ```json or ``` or newline characters.\n\n"


def _parse_topic_list(raw_reply):
    """Turn the model's reply text into a Python object (expected: a list).

    A surrounding markdown code fence (with or without a ``json`` tag) is
    removed first. The text is then parsed as strict JSON; if that fails it is
    parsed with ``ast.literal_eval`` so that Python-literal style replies
    (e.g. single-quoted strings) are still accepted.

    Raises:
        ValueError or SyntaxError: If the reply is neither valid JSON nor a
            valid Python literal.
    """
    cleaned = raw_reply.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:].strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        return ast.literal_eval(cleaned)


# Client configuration and the model handle do not depend on the file being
# processed, so they are created once instead of once per file.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
model = genai.GenerativeModel("gemini-1.5-flash")

all_responses = []
for filename in os.listdir(TXT_DIR):
    # Only the extracted text files are sent to the model.
    if not filename.endswith(".txt"):
        continue

    file_path = os.path.join(TXT_DIR, filename)
    try:
        # Read explicitly as UTF-8 rather than relying on the platform's
        # default encoding.
        with open(file_path, "r", encoding="utf-8") as file:
            print(f"Reading {filename}")
            text = file.read()

        prompt = TOPIC_PROMPT + text

        print(f"Gemini query starts at {datetime.now()}")
        response = model.generate_content(prompt)
        print(f"Gemini query complete at {datetime.now()}")

        # Show the raw reply before parsing so it is visible even when the
        # parse below fails.
        print(response.text)

        all_responses.append(_parse_topic_list(response.text))

    except Exception as e:
        # Best-effort batch: report the failure and continue with the next file.
        print(f"Failed to read {filename} due to {e}")

Reading neighbourhood_plan-West Wittering.txt
Gemini query starts at 2024-06-11 01:20:43.454221
Gemini query complete at 2024-06-11 01:20:49.447115
["Introduction", "National and Local Planning Context", "The Parish of West Wittering", "Vision & Objectives", "Policies & Proposals", "Non-Statutory Community Aspirations", "Delivery Plan", "Heritage", "Community Facilities", "Public Rights of Way and Quiet Lanes", "Housing", "Coastal Enhancements", "Biodiversity, Geodiversity and Mitigating the Impacts of Climate Change", "Economic Development", "Visitor Accommodation and Facilities", "Lighting", "Utility Infrastructure", "Design", "Preventing Coalescence", "Retail Facilities"]
Reading neighbourhood_plan-Walsham le Willows.txt
Gemini query starts at 2024-06-11 01:20:49.447599
Gemini query complete at 2024-06-11 01:20:57.434265
["Introduction", "National and local strategic planning policy context", "Walsham le Willows parish", "How the Plan was prepared", "Vision, objectives and policies"

In [16]:
# Show the collected results: all_responses holds one parsed topic list per
# file whose model reply was parsed successfully.
print(all_responses)

[['Introduction', 'National and Local Planning Context', 'The Parish of West Wittering', 'Vision & Objectives', 'Policies & Proposals', 'Non-Statutory Community Aspirations', 'Delivery Plan', 'Heritage', 'Community Facilities', 'Public Rights of Way and Quiet Lanes', 'Housing', 'Coastal Enhancements', 'Biodiversity, Geodiversity and Mitigating the Impacts of Climate Change', 'Economic Development', 'Visitor Accommodation and Facilities', 'Lighting', 'Utility Infrastructure', 'Design', 'Preventing Coalescence', 'Retail Facilities'], ['Introduction', 'National and local strategic planning policy context', 'Walsham le Willows parish', 'How the Plan was prepared', 'Vision, objectives and policies', 'Built Environment and Design', 'Housing', 'Community and Infrastructure', 'Transport and Accessibility', 'Natural and Historic Environment', 'Economy', 'Projects and Community Infrastructure Levy (CIL) funding', 'Implementation and monitoring', 'Challenges for Walsham le Willows', 'Pre-Submissi