In [None]:
!pip install PyPDF2



In [None]:
import json
import os
import time
from pathlib import Path
import vertexai
from vertexai.generative_models import GenerativeModel, Part
from google.cloud import storage
from google.cloud.storage import transfer_manager

# Configuration
# Google Cloud project and region passed to vertexai.init().
project_id = "kiaraerica"
location = "us-central1"
# Cloud Storage bucket that is listed for source PDFs and receives the uploads.
bucket_name = "us_climate"
# Object-name prefix under which the source PDFs are listed.
raw_folder = "initial-loads/state_climate_policies/raw/"
# Output location for the NDJSON files: used both as the local directory that
# extract() writes into and as the destination prefix inside the bucket.
llm_folder = "initial-loads/state_climate_policies/llm_text/"
# Model identifier handed to GenerativeModel.
model_name = "gemini-1.5-flash-001"

# Instruction sent to the model together with each PDF. It asks for one JSON
# object per line (NDJSON) rather than a JSON array, which is what lets
# extract() parse the reply line by line and drop individual bad lines.
prompt = """Extract structured policy data from this document.
Identify and return the following fields:
1. Policy Title (only the title, without descriptions)
2. Policy Area
3. Category
4. Status (Enacted/Not Enacted)
5. Year Enacted (if available)

Return each policy as a separate JSON object, one per line (NDJSON format), like:
{"policy": "string", "policy_area": "string", "category": "string", "status": "string", "year_enacted": "string"}

Do not return an array. Do not add extra text. Return only NDJSON.
"""

def _parse_ndjson_lines(resp_text, state_name):
    """Return the JSON-object lines found in `resp_text`, each tagged with `state_name`.

    Markdown code fences are removed first. Every non-empty line is parsed on
    its own; lines that are not valid JSON, or that are valid JSON but not an
    object, are reported and skipped. Each kept object gets a ``"state"`` key
    and is re-serialized to a single-line JSON string.
    """
    cleaned = resp_text.strip().replace("```json", "").replace("```", "")

    records = []
    for line in cleaned.split("\n"):
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        try:
            json_obj = json.loads(line)
        except json.JSONDecodeError:
            print(f"⚠️ Skipping malformed JSON line: {line}")
            continue

        # A line such as `[]` or `123` is valid JSON but cannot carry a
        # "state" key; assigning into it would raise TypeError.
        if not isinstance(json_obj, dict):
            print(f"⚠️ Skipping non-object JSON line: {line}")
            continue

        json_obj["state"] = state_name
        records.append(json.dumps(json_obj))

    return records


def extract():
    """Extract structured policy data from every PDF under `raw_folder` as NDJSON.

    Lists the objects in `bucket_name` below `raw_folder`, sends each one to
    the Gemini model together with `prompt`, and writes the valid JSON objects
    of the reply (one per line, with a ``"state"`` key taken from the file
    stem) to ``<llm_folder>/<state>.jsonl`` on the local filesystem. An
    existing output file is overwritten.

    A file is skipped, without aborting the remaining files, when the model
    returns no usable text or when no line of the reply is a JSON object.
    """
    vertexai.init(project=project_id, location=location)
    model = GenerativeModel(model_name)

    storage_client = storage.Client()
    blobs = storage_client.list_blobs(bucket_name, prefix=raw_folder)

    for blob in blobs:
        # Names ending in "/" are folder placeholder objects (the prefix
        # itself or a nested one), not documents.
        if blob.name == raw_folder or blob.name.endswith("/"):
            continue

        # Extract the state name from the filename: "Alabama.pdf" -> "Alabama"
        state_name = Path(blob.name).stem
        json_filename = Path(llm_folder) / f"{state_name}.jsonl"

        print(f"Processing {blob.name}...")
        file_content = Part.from_uri(f"gs://{bucket_name}/{blob.name}", "application/pdf")
        resp = model.generate_content([file_content, prompt])

        # `.text` raises ValueError when the response carries no text
        # (e.g. it was blocked); treat that as a per-file failure.
        try:
            resp_text = resp.text
        except ValueError as err:
            print(f"❌ Error: No usable response text for {blob.name} ({err}). Skipping file.")
            continue

        valid_json_lines = _parse_ndjson_lines(resp_text, state_name)
        if not valid_json_lines:
            print(f"❌ Error: No valid JSON extracted for {blob.name}. Skipping file.")
            continue  # Move to next file without saving invalid data

        # Only announce and create the destination once there is data to write.
        print(f"Saving NDJSON file to: {json_filename}")
        json_filename.parent.mkdir(parents=True, exist_ok=True)

        # Overwrite any existing NDJSON file for this state.
        with open(json_filename, "w", encoding="utf-8") as f:
            f.write("\n".join(valid_json_lines) + "\n")

        print(f"✅ Successfully saved NDJSON: {json_filename}")


def copy_to_GCS(local_folder, gcs_folder, file_extension):
    """Upload every file under `local_folder` matching `file_extension` to the bucket.

    Args:
        local_folder: Local directory that is searched recursively.
        gcs_folder: Destination prefix inside `bucket_name`. Trailing slashes
            are normalized to exactly one; an empty (or slash-only) value
            uploads to the bucket root.
        file_extension: Glob pattern handed to ``Path.rglob``, e.g. ``"*.jsonl"``.

    Each file keeps its path relative to `local_folder` as the object name
    suffix. The outcome of every upload is printed; failed uploads are
    reported but do not raise.
    """
    directory_as_path_obj = Path(local_folder).resolve()
    print(f"Checking for JSONL files in: {directory_as_path_obj}")  # Debugging step

    # rglob can also return directories whose name matches the pattern.
    file_paths = [p for p in directory_as_path_obj.rglob(file_extension) if p.is_file()]
    print(f"Found JSONL files: {file_paths}")  # Debugging step

    if not file_paths:
        print(f"⚠️ No JSONL files found in {local_folder}, skipping upload.")
        return

    # Create the client only once there is something to upload.
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # The prefix is prepended verbatim to each name, so it needs exactly one
    # trailing slash, or must be empty to avoid object names starting with "/".
    stripped_prefix = gcs_folder.rstrip("/")
    correct_gcs_folder = f"{stripped_prefix}/" if stripped_prefix else ""

    # Object names always use "/" regardless of the local OS separator.
    relative_names = [f.relative_to(directory_as_path_obj).as_posix() for f in file_paths]

    results = transfer_manager.upload_many_from_filenames(
        bucket,
        relative_names,
        source_directory=str(directory_as_path_obj),
        blob_name_prefix=correct_gcs_folder,
        max_workers=5,
    )

    # Results come back in the same order as the submitted filenames.
    for name, result in zip(file_paths, results):
        if isinstance(result, Exception):
            print(f"⚠️ Failed to upload {name} due to {result}")
        else:
            print(f"✅ Uploaded {name} to {bucket.name}")

if __name__ == "__main__":
    # Step 1: run every source PDF through the model and write NDJSON locally.
    extract()
    # Step 2: push the generated files to the bucket; the local directory and
    # the destination prefix deliberately share the same relative path.
    copy_to_GCS(
        local_folder=llm_folder,
        gcs_folder=llm_folder,
        file_extension="*.jsonl",
    )

Saving NDJSON file to: initial-loads/state_climate_policies/llm_text/Alabama.jsonl
Processing initial-loads/state_climate_policies/raw/Alabama.pdf...
✅ Successfully saved NDJSON: initial-loads/state_climate_policies/llm_text/Alabama.jsonl
Saving NDJSON file to: initial-loads/state_climate_policies/llm_text/Alaska.jsonl
Processing initial-loads/state_climate_policies/raw/Alaska.pdf...
✅ Successfully saved NDJSON: initial-loads/state_climate_policies/llm_text/Alaska.jsonl
Saving NDJSON file to: initial-loads/state_climate_policies/llm_text/Arizona.jsonl
Processing initial-loads/state_climate_policies/raw/Arizona.pdf...
⚠️ Skipping malformed JSON line: ==End of OCR for page 11==
✅ Successfully saved NDJSON: initial-loads/state_climate_policies/llm_text/Arizona.jsonl
Saving NDJSON file to: initial-loads/state_climate_policies/llm_text/Arkansas.jsonl
Processing initial-loads/state_climate_policies/raw/Arkansas.pdf...
✅ Successfully saved NDJSON: initial-loads/state_climate_policies/llm_text