In [0]:
import requests
import os
import json

In [0]:
# Base URLs for API requests
base_url = "https://inventaris.onroerenderfgoed.be"
besluiten_base_url = "https://besluiten.onroerenderfgoed.be/besluiten"
besluit_types_base_url = "https://id.erfgoed.net/thesauri/besluittypes"


In [0]:
# DBFS directory where files will be stored
dbfs_save_dir = "/dbfs/FileStore/ABB_pdf"

In [0]:
# Headers for JSON and CSV requests
headers_json = {
    "Accept": "application/json"
}
headers_csv = {
    "Accept": "text/csv"
}

In [0]:
# Function to search 'aanduidingsobjecten' based on text query
def search_aanduidingsobjecten(tekst, limit=12):
    url = f"{base_url}/aanduidingsobjecten"
    params = {
        "tekst": tekst
    }
    response = requests.get(url, headers=headers_json, params=params)
    
    if response.status_code == 200:
        return response.json()  # Return JSON response
    else:
        print(f"Error searching for aanduidingsobjecten: {response.status_code}")
        return None

# Function to get detailed information from 'self' link of 'aanduidingsobjecten'
def get_aanduidingsobjecten_details(obj_id):
    url = f"{base_url}/aanduidingsobjecten/{obj_id}"
    response = requests.get(url, headers=headers_json)
    
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching details for {obj_id}: {response.status_code}")
        return None

# Function to fetch the besluiten (decisions) for a specific besluit ID
def fetch_besluit_files(besluit_id):
    url = f"{besluiten_base_url}/{besluit_id}/bestanden/"
    response = requests.get(url, headers=headers_json)
    
    if response.status_code == 200:
        return response.json()  # Assuming JSON response with file metadata
    else:
        print(f"Error fetching files for besluit {besluit_id}: {response.status_code}")
        return None




In [0]:


'''
# Function to download PDF files to DBFS
def download_pdf_to_dbfs(file_id, besluit_id, save_dir):
    # Construct the final URL for downloading the file
    file_url = f"{besluiten_base_url}/{besluit_id}/bestanden/{file_id}"
    print(f"Attempting to download from URL: {file_url}")
    
    response = requests.get(file_url, stream=True)
    
    if response.status_code == 200:
        # Ensure DBFS save directory exists
        dbutils.fs.mkdirs(save_dir)  # No need for '/dbfs' prefix
        
        # Save the file in DBFS
        pdf_filename = os.path.join(save_dir, f"{file_id}.pdf")
        
        with open(f"/dbfs{pdf_filename}", 'wb') as pdf_file:  # '/dbfs' prefix here is needed
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # Filter out keep-alive chunks
                    pdf_file.write(chunk)
        
        print(f"Downloaded {pdf_filename} to DBFS")
    else:
        print(f"Error downloading PDF file {file_id}: {response.status_code} - {response.text}")
'''

'\n# Function to download PDF files to DBFS\ndef download_pdf_to_dbfs(file_id, besluit_id, save_dir):\n    # Construct the final URL for downloading the file\n    file_url = f"{besluiten_base_url}/{besluit_id}/bestanden/{file_id}"\n    print(f"Attempting to download from URL: {file_url}")\n    \n    response = requests.get(file_url, stream=True)\n    \n    if response.status_code == 200:\n        # Ensure DBFS save directory exists\n        dbutils.fs.mkdirs(save_dir)  # No need for \'/dbfs\' prefix\n        \n        # Save the file in DBFS\n        pdf_filename = os.path.join(save_dir, f"{file_id}.pdf")\n        \n        with open(f"/dbfs{pdf_filename}", \'wb\') as pdf_file:  # \'/dbfs\' prefix here is needed\n            for chunk in response.iter_content(chunk_size=1024):\n                if chunk:  # Filter out keep-alive chunks\n                    pdf_file.write(chunk)\n        \n        print(f"Downloaded {pdf_filename} to DBFS")\n    else:\n        print(f"Error downloading P

In [0]:
# Function to download PDF files to DBFS
def download_pdf_to_dbfs(file_id, besluit_id, save_dir):
    # Construct the final URL for downloading the file
    file_url = f"{besluiten_base_url}/{besluit_id}/bestanden/{file_id}"
    print(f"Attempting to download from URL: {file_url}")
    
    response = requests.get(file_url, stream=True)
    
    # Check if the response contains a PDF based on the headers
    content_type = response.headers.get('Content-Type')
    if content_type != 'application/pdf':
        print(f"Error: The file at {file_url} is not a PDF. Content-Type: {content_type}")
        return
    
    # Ensure that the directory exists
    dbutils.fs.mkdirs(save_dir)  # No need for '/dbfs' prefix
    
    # Save the file in DBFS
    pdf_filename = os.path.join(save_dir, f"{file_id}.pdf")
    
    with open(f"/dbfs{pdf_filename}", 'wb') as pdf_file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:  # Filter out keep-alive chunks
                pdf_file.write(chunk)
    
    # Verify if the file size is greater than 0 to ensure it was saved correctly
    file_size = os.path.getsize(f"/dbfs{pdf_filename}")
    if file_size > 0:
        print(f"Downloaded {pdf_filename} to DBFS ({file_size} bytes)")
    else:
        print(f"Error: {pdf_filename} was saved but the file size is 0 bytes.")



In [0]:
# Function to process besluit metadata and return relevant URLs and metadata
def process_besluit_metadata(besluit_id):
    besluit_metadata = {}
    
    # Fetch besluit files (PDFs)
    besluit_files = fetch_besluit_files(besluit_id)
    if besluit_files:
        for file in besluit_files:
            file_id = file.get("id")
            file_type = file.get("bestandssoort", {}).get("soort", "")
            file_url = f"{besluiten_base_url}/{besluit_id}/bestanden/{file_id}"
            
            if file_type == "Besluit":  # Only consider files of type 'Besluit'
                besluit_metadata['besluit_pdf_url'] = file_url
                besluit_metadata['pdf_file_id'] = file_id

    # Add besluit URL and ID
    besluit_metadata['besluit_url'] = f"{besluiten_base_url}/{besluit_id}"
    besluit_metadata['besluit_id'] = besluit_id
    
    return besluit_metadata


In [0]:
def process_aanduidingsobjecten(tekst_search, num_items, save_dir):
    # Step 1: Search for objects
    search_results = search_aanduidingsobjecten(tekst_search, limit=num_items)
    
    all_metadata = []  # List to store all metadata
    
    if search_results:
        # Handle both list and dict types for search_results
        objects = search_results if isinstance(search_results, list) else search_results.get('items', [])
        
        # Iterate through the results and fetch details for each object
        for obj in objects:
            obj_id = obj.get('id')
            print(f"Processing object {obj_id}")
            obj_url = f"{base_url}/aanduidingsobjecten/{obj_id}"
            
            # Step 2: Get detailed info using the object ID
            obj_details = get_aanduidingsobjecten_details(obj_id)
            
            if obj_details:
                # Extract location and relevant metadata
                object_metadata = {
                    "aanduidingsobject_url": obj_url,
                    "object_id": obj_id,
                    "location": obj_details.get("locatie_samenvatting", "N/A"),  # Location summary
                    "besluiten": []
                }
                
                # Step 3: Extract 'besluiten' (decisions) from the object details
                relevant_besluiten = obj_details.get("besluiten", [])
                
                for besluit in relevant_besluiten:
                    besluit_id = besluit.get("id")
                    besluit_date = besluit.get("datum_ondertekening", "N/A")  # Extract the signing date
                    
                    print(f"Fetching besluit ID {besluit_id}")
                    
                    # Step 4: Process besluit metadata
                    besluit_metadata = process_besluit_metadata(besluit_id)
                    besluit_metadata['besluit_date'] = besluit_date
                    object_metadata["besluiten"].append(besluit_metadata)
                    
                    # Step 5: Fetch the files for each besluit
                    besluit_files = fetch_besluit_files(besluit_id)
                    
                    if besluit_files:
                        for file in besluit_files:
                            file_id = file.get("id")
                            file_type = file.get("bestandssoort", {}).get("soort", "")
                            
                            if file_type == "Besluit":  # Only download files of type "Besluit"
                                print(f"Downloading PDF file ID {file_id} for besluit {besluit_id}")
                                download_pdf_to_dbfs(file_id, besluit_id, save_dir)
                
                # Add object metadata to all_metadata
                all_metadata.append(object_metadata)
    
    else:
        print("No search results found.")
    
    # Step 6: Save all metadata as a JSON file
    if all_metadata:
        json_output_file = os.path.join(save_dir, "metadata.json")
        with open(f"/dbfs{json_output_file}", 'w', encoding='utf-8') as json_file:
            json.dump(all_metadata, json_file, ensure_ascii=False, indent=4)
            print(all_metadata)
        
        print(f"Metadata saved to {json_output_file}")
    else:
        print("No metadata to save.")


In [0]:
process_aanduidingsobjecten(tekst_search="Architectenwoning", num_items=5, save_dir=dbfs_save_dir)


Processing object 169894
Fetching besluit ID 15115
Downloading PDF file ID 35765 for besluit 15115
Attempting to download from URL: https://besluiten.onroerenderfgoed.be/besluiten/15115/bestanden/35765
Downloaded /dbfs/FileStore/ABB_pdf/35765.pdf to DBFS (227628 bytes)
Processing object 113499
Fetching besluit ID 14789
Downloading PDF file ID 26258 for besluit 14789
Attempting to download from URL: https://besluiten.onroerenderfgoed.be/besluiten/14789/bestanden/26258
Downloaded /dbfs/FileStore/ABB_pdf/26258.pdf to DBFS (320999 bytes)
Processing object 171222
Fetching besluit ID 15115
Downloading PDF file ID 35765 for besluit 15115
Attempting to download from URL: https://besluiten.onroerenderfgoed.be/besluiten/15115/bestanden/35765
Downloaded /dbfs/FileStore/ABB_pdf/35765.pdf to DBFS (227628 bytes)
Processing object 27977
Fetching besluit ID 5825
Downloading PDF file ID 16110 for besluit 5825
Attempting to download from URL: https://besluiten.onroerenderfgoed.be/besluiten/5825/bestande

In [0]:
'''
import requests

url = "https://inventaris.onroerenderfgoed.be/aanduidingsobjecten/113448"
headers = {
    "Accept": "application/json"
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    data = response.json()  # Convert the response to JSON
    print(data)  # Print or inspect the JSON data
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")
'''

{'id': 113448, 'naam': 'Architectenwoning van Ferdinand Schlich', 'status': {'actief': True, 'verantwoordelijken': [], 'id': 665574, 'status': {'id': 75, 'naam': 'Actief'}, 'opmerkingen': None, 'datum': '2020-06-22T08:08:32.093182+02:00', 'aanpasser': {'uri': 'https://id.erfgoed.net/actoren/1203', 'omschrijving': 'Verhelst, Julie'}}, 'geldigheid_opmerkingen': None, 'type': {'id': 1, 'naam': 'Beschermd monument', 'uri': 'https://id.erfgoed.net/thesauri/aanduidingstypes/1'}, 'korte_beschrijving': 'Deze bescherming betreft de architectenwoning van Ferdinand Schlich.', 'besluiten': [{'uri': 'https://id.erfgoed.net/besluiten/14743', 'rechtsgevolgen': [{'uri': 'https://id.erfgoed.net/thesauri/besluittypes/13', 'label': 'definitieve beschermingsbesluiten'}], 'onderwerp': 'Architectenwoning Ferdinand Schlich', 'datum_ondertekening': '2018-12-20', 'status': {'id': 75, 'status': 'Actief'}, 'id': 14743}], 'waarden': [{'id': 108757, 'algemeen': False, 'uiteenzetting': '<p>De woning wordt beschouwd