In [0]:
import requests
import os
import json


In [0]:

# Root endpoint of the besluiten API; per-besluit URLs are built on top of it
besluiten_base_url = "https://besluiten.onroerenderfgoed.be/besluiten"

# Folder on DBFS that receives the downloaded PDFs and the metadata JSON files
dbfs_save_dir = "/FileStore/ABB_pdf/"

# Request headers asking the API for a JSON representation
headers_json = dict(Accept="application/json")



In [0]:
# Function to get detailed information from the provided 'aanduidingsobjecten' URL
def get_aanduidingsobjecten_details(aanduidingsobject_url, timeout=30):
    """Fetch the JSON description of an aanduidingsobject.

    Args:
        aanduidingsobject_url: Full URL of the aanduidingsobject resource.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook cell forever.

    Returns:
        The decoded JSON body when the server answers with HTTP 200, otherwise
        None (an explanatory message is printed in that case).
    """
    try:
        response = requests.get(aanduidingsobject_url, headers=headers_json, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like any other failure
        print(f"Error fetching details for {aanduidingsobject_url}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching details for {aanduidingsobject_url}: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # A 200 response whose body is not valid JSON (e.g. an HTML error page)
        print(f"Error fetching details for {aanduidingsobject_url}: invalid JSON response ({exc})")
        return None

# Function to fetch the besluiten (decisions) for a specific besluit ID
def fetch_besluit_files(besluit_id, timeout=30):
    """Fetch the file metadata attached to one besluit.

    Args:
        besluit_id: Identifier of the besluit; it is inserted into the
            ``<besluiten_base_url>/<besluit_id>/bestanden/`` URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook cell forever.

    Returns:
        The decoded JSON body when the server answers with HTTP 200, otherwise
        None (an explanatory message is printed in that case).
    """
    url = f"{besluiten_base_url}/{besluit_id}/bestanden/"

    try:
        response = requests.get(url, headers=headers_json, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like any other failure
        print(f"Error fetching files for besluit {besluit_id}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching files for besluit {besluit_id}: {response.status_code}")
        return None

    try:
        return response.json()  # Assuming JSON response with file metadata
    except ValueError as exc:
        # A 200 response whose body is not valid JSON (e.g. an HTML error page)
        print(f"Error fetching files for besluit {besluit_id}: invalid JSON response ({exc})")
        return None



In [0]:
# Function to download PDF files to DBFS
def download_pdf_to_dbfs(file_id, besluit_id, save_dir, timeout=30, chunk_size=1024):
    """Download one besluit file and store it as ``<file_id>.pdf`` under ``save_dir``.

    The file is written through the ``/dbfs`` prefix, so ``save_dir`` is
    expected to be a DBFS path starting with ``/`` (e.g. ``/FileStore/ABB_pdf/``).

    Args:
        file_id: Identifier of the file; also used as the PDF file name.
        besluit_id: Identifier of the besluit the file belongs to.
        save_dir: DBFS directory that receives the file; created if missing.
        timeout: Seconds to wait for the server before giving up.
        chunk_size: Number of bytes requested per streamed chunk.

    Returns:
        The DBFS path of the saved file on success, otherwise None (an
        explanatory message is printed in that case).
    """
    # Construct the final URL for downloading the file
    file_url = f"{besluiten_base_url}/{besluit_id}/bestanden/{file_id}"
    print(f"Attempting to download from URL: {file_url}")

    pdf_filename = os.path.join(save_dir, f"{file_id}.pdf")

    try:
        # Using the response as a context manager releases the streamed
        # connection even when the status is an error or writing fails.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Error downloading PDF file {file_id}: {response.status_code} - {response.text}")
                return None

            # Ensure DBFS save directory exists
            dbutils.fs.mkdirs(save_dir)

            # Save the file in DBFS
            with open(f"/dbfs{pdf_filename}", 'wb') as pdf_file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # Filter out keep-alive chunks
                        pdf_file.write(chunk)
    except requests.RequestException as exc:
        # Connection errors, timeouts and interrupted streams
        print(f"Error downloading PDF file {file_id}: {exc}")
        return None

    print(f"Downloaded {pdf_filename} to DBFS")
    return pdf_filename



In [0]:
# Main function to process a given aanduidingsobject URL and download besluiten PDFs
def process_aanduidingsobject_url(aanduidingsobject_url, save_dir):
    """Download the "Besluit" PDFs linked to an aanduidingsobject and write a metadata JSON.

    Steps:
      1. Fetch the object details from ``aanduidingsobject_url``.
      2. For every besluit listed in the details, fetch its file list.
      3. Download each file whose ``bestandssoort.soort`` equals "Besluit".
      4. Write ``metadata_<object id>.json`` into ``save_dir``.

    Args:
        aanduidingsobject_url: Full URL of the aanduidingsobject resource.
        save_dir: DBFS directory (starting with ``/``) for PDFs and metadata.

    Returns:
        The metadata dict that was written, or None when the object details
        could not be fetched.
    """
    # Step 1: Get detailed info from the given URL
    obj_details = get_aanduidingsobjecten_details(aanduidingsobject_url)
    if not obj_details:
        print("No valid object details found.")
        return None

    # Extract location and relevant metadata
    object_id = obj_details.get("id")
    object_metadata = {
        "aanduidingsobject_url": aanduidingsobject_url,
        "object_id": object_id,
        "location": obj_details.get("locatie_samenvatting", "N/A"),  # Location summary
        "besluiten": [],
    }

    # Step 2: Extract 'besluiten' (decisions) from the object details.
    # `or []` also covers a key that is present but null.
    for besluit in obj_details.get("besluiten") or []:
        besluit_id = besluit.get("id")
        if besluit_id is None:
            # Without an id no file URL can be built for this besluit
            print("Skipping besluit without an id")
            continue
        besluit_date = besluit.get("datum_ondertekening", "N/A")  # Extract the signing date

        print(f"Fetching besluit ID {besluit_id}")

        # Step 3: Fetch the files for each besluit (None on error -> nothing to do)
        for besluit_file in fetch_besluit_files(besluit_id) or []:
            # `or {}` tolerates a file record whose 'bestandssoort' is null
            file_type = (besluit_file.get("bestandssoort") or {}).get("soort", "")
            if file_type != "Besluit":  # Only download files of type "Besluit"
                continue

            file_id = besluit_file.get("id")
            print(f"Downloading PDF file ID {file_id} for besluit {besluit_id}")
            download_pdf_to_dbfs(file_id, besluit_id, save_dir)

            # Add decision metadata for JSON
            object_metadata["besluiten"].append({
                "besluit_url": f"{besluiten_base_url}/{besluit_id}",
                "besluit_pdf_url": f"{besluiten_base_url}/{besluit_id}/bestanden/{file_id}",
                "besluit_id": besluit_id,
                "pdf_file_id": file_id,
                "besluit_date": besluit_date,
            })

    # Step 4: Save metadata as a JSON file.
    # The directory is created here as well: when no PDF was downloaded nothing
    # else has created it yet and opening the file would fail.
    dbutils.fs.mkdirs(save_dir)
    json_output_file = os.path.join(save_dir, f"metadata_{object_id}.json")

    with open(f"/dbfs{json_output_file}", 'w', encoding='utf-8') as json_file:
        json.dump(object_metadata, json_file, ensure_ascii=False, indent=4)
    print(object_metadata)

    print(f"Metadata saved to {json_output_file}")
    return object_metadata



In [0]:
# Demo run: process one aanduidingsobject and store its besluit PDFs and metadata on DBFS
aanduidingsobject_url = "https://inventaris.onroerenderfgoed.be/aanduidingsobjecten/113448"
process_aanduidingsobject_url(
    aanduidingsobject_url=aanduidingsobject_url,
    save_dir=dbfs_save_dir,
)


Fetching besluit ID 14743
Downloading PDF file ID 21145 for besluit 14743
Attempting to download from URL: https://besluiten.onroerenderfgoed.be/besluiten/14743/bestanden/21145
Downloaded /FileStore/ABB_pdf/21145.pdf to DBFS
{'aanduidingsobject_url': 'https://inventaris.onroerenderfgoed.be/aanduidingsobjecten/113448', 'object_id': 113448, 'location': 'Gérard Willemotlaan 85 (Gent)', 'besluiten': [{'besluit_url': 'https://besluiten.onroerenderfgoed.be/besluiten/14743', 'besluit_pdf_url': 'https://besluiten.onroerenderfgoed.be/besluiten/14743/bestanden/21145', 'besluit_id': 14743, 'pdf_file_id': 21145, 'besluit_date': '2018-12-20'}]}
Metadata saved to /FileStore/ABB_pdf/metadata_113448.json
