In [1]:
import requests
import os
import json
from zipfile import ZipFile
import pandas as pd
import geopandas as gpd
from datetime import datetime, timezone, date

In [None]:
# Run-specific settings -- check both values before each run.
# Lookup CSV listing the datasets to fetch; swap in "ItemURL_ArcGIS_YearlyUpdates.csv" for yearly updates.
item_url_file = "ItemURL_ArcGIS_6monthUpdates.csv"
# Which machine the notebook is running on; selects the root data folder further down.
location = "home"

In [None]:
# Root data folder for each machine this notebook is run on.
_folder_by_location = {
    "home": "C:/Users/Lara/Work/DataDownloads/",
    "work": "O:/Data_team/GIS_data_downloads/",
}

# Fail here with a clear message: previously an unrecognised `location` left
# `folder_location` undefined (or stale from an earlier run of this cell).
if location not in _folder_by_location:
    raise ValueError(
        f"Unknown location {location!r}; expected one of {sorted(_folder_by_location)}"
    )
folder_location = _folder_by_location[location]

# Paths derived from the root folder (all folder paths end with a trailing "/").
download_location = f"{folder_location}testData/"          # permanent outputs (Original/ and Wiltshire/)
temp_download_location = f"{folder_location}testDataTemp/"  # intermediate GeoJSON downloads
lookup_location = f"{folder_location}Lookups/"              # lookup CSVs of dataset URLs
boundary_file = f"{folder_location}CountyBoundary.shp"      # boundary used to clip datasets


In [None]:
# Wiltshire boundary for creating Wilts datasets.
# Read once here; the download loop passes it to gpd.clip() as the clipping mask.
# NOTE(review): the layers being clipped are reprojected to EPSG:27700 first, so this
# boundary presumably needs to be in EPSG:27700 as well -- confirm.
boundary_gdf = gpd.read_file(boundary_file)

In [18]:
# Load the dataset lookup (Dataset, Owner, URL columns) and preview its last three rows
item_url_df = pd.read_csv(f"{lookup_location}{item_url_file}")
item_url_df.tail(3)

Unnamed: 0,Dataset,Owner,URL
7,Special Protection Areas (England),NE,https://services.arcgis.com/JJzESW51TqeY9uat/a...
8,SSSI Impact Risk Zones (England),NE,https://services.arcgis.com/JJzESW51TqeY9uat/a...
9,Ancient Tree Inventory (ATI),Woodland Trust,https://services-eu1.arcgis.com/WIfgdJeDbrZU1c...


In [None]:
# Loop through each dataset in the lookup: download it, save it as a GeoPackage,
# clip it to the Wiltshire boundary and record the download metadata.


def _fetch_all_features(query_url, batch_size=2000, timeout=120):
    """Download every feature from an ArcGIS FeatureServer ``/query`` endpoint.

    Pages through the layer with ``resultOffset`` / ``resultRecordCount`` until
    the server stops returning features.

    Parameters
    ----------
    query_url : str
        Query endpoint of the layer (ends in ``/query?`` in the lookup file).
    batch_size : int
        Number of records requested per page.
    timeout : float
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    list
        GeoJSON feature dicts from all pages, in the order returned.

    Raises
    ------
    requests.RequestException
        If a request fails, times out or returns an HTTP error status.
    RuntimeError
        If the response body contains an ``error`` object instead of features.
    """
    collected = []
    offset = 0
    while True:
        params = {
            "where": "1=1",
            "outFields": "*",
            "f": "geojson",
            "resultOffset": offset,
            "resultRecordCount": batch_size,
        }
        response = requests.get(query_url, params=params, timeout=timeout)
        # Check if response is successful
        response.raise_for_status()
        data = response.json()

        # Failures can arrive as HTTP 200 with an "error" object in the body; without
        # this check they look like "no more features" and truncate the dataset.
        if "error" in data:
            raise RuntimeError(f"ArcGIS query error: {data['error']}")

        features = data.get("features")
        if not features:
            # No more features returned
            break
        collected.extend(features)

        # The server may cap a page below batch_size; it then flags that more records
        # remain. The flag is looked for both at the top level and under "properties",
        # since its position can differ between response formats.
        properties = data.get("properties") or {}
        more_flagged = bool(
            data.get("exceededTransferLimit") or properties.get("exceededTransferLimit")
        )
        if len(features) < batch_size and not more_flagged:
            # Short page with no "more data" flag: all features read
            break

        # Advance by what was actually received so a capped page does not skip records
        offset += len(features)
    return collected


# Parameters
batch_size = 2000  # Max size listed in details page, under "Max record count". E.g. https://services.arcgis.com/JJzESW51TqeY9uat/arcgis/rest/services/Priority_Habitats_Inventory_England/FeatureServer/0
request_timeout = 120  # seconds per HTTP request

# Temp folder must exist before the intermediate GeoJSON can be written
os.makedirs(temp_download_location, exist_ok=True)

# NOTE(review): tail(3) limits the run to the last three lookup rows -- confirm whether
# the full lookup should be processed.
for row in item_url_df.tail(3).itertuples(index=False):
    # Dataset name, source and URL all come from the same lookup row
    url = row.URL
    dataset = row.Dataset
    source = row.Owner
    print(url)

    # Get url that contains details for dataset
    url_details = url.replace("/query?", "?f=json")

    original_dir = f"{download_location}Original/{source}/{dataset}"
    wilts_dir = f"{download_location}Wiltshire/{source}/{dataset}"
    original_gpkg = f"{original_dir}/{dataset}.gpkg"
    wilts_gpkg = f"{wilts_dir}/{dataset}.gpkg"
    temp_geojson = f"{temp_download_location}{dataset}.geojson"

    # Download all pages. On any failure skip this dataset so the files from the
    # previous successful download are left in place.
    try:
        all_features = _fetch_all_features(url, batch_size=batch_size, timeout=request_timeout)
    except (requests.RequestException, ValueError, RuntimeError) as error:
        print(f"Request error occured for {dataset}:", error)
        continue

    if not all_features:
        print(f"No features returned for {dataset}; existing files left unchanged")
        continue

    # Write to file as one GeoJSON FeatureCollection
    output = {
        'type': 'FeatureCollection',
        'features': all_features
    }
    with open(temp_geojson, "w", encoding='utf-8') as f:
        json.dump(output, f)

    print("Complete. Total features:", len(all_features))

    # Create folders if not already there for permanent data storage
    os.makedirs(original_dir, exist_ok=True)
    os.makedirs(wilts_dir, exist_ok=True)

    # Need to read geojson back in then write out the gpkg, as arcgis does not read geojson
    gdf = gpd.read_file(temp_geojson)

    # Convert to BNG as geojson defaults to EPSG: 4326
    gdf_27700 = gdf.to_crs(epsg=27700)

    # Remove previous file
    if os.path.exists(original_gpkg):
        os.remove(original_gpkg)
    # Write out original file
    # NOTE(review): this writes `gdf` (as read from the GeoJSON), not the BNG copy
    # `gdf_27700` -- confirm which CRS the "Original" file is meant to be in.
    gdf.to_file(original_gpkg)

    # Clip to Wilts and write out
    if os.path.exists(wilts_gpkg):
        os.remove(wilts_gpkg)
    gdf_wilts = gpd.clip(gdf_27700, boundary_gdf)
    if not gdf_wilts.empty:
        gdf_wilts.to_file(wilts_gpkg)

    # Download dataset details
    try:
        response_details = requests.get(url_details, timeout=request_timeout)
        # Check the details response itself (previously the last data response was checked)
        response_details.raise_for_status()
        details = response_details.json()
        # dataLastEditDate is divided by 1000 to convert milliseconds to seconds
        date_last_edit = details['editingInfo']['dataLastEditDate']
        dt = datetime.fromtimestamp(date_last_edit / 1000, tz=timezone.utc)
        metadata = {"Date last edited": dt.strftime("%d/%m/%Y"),
                    "Date downloaded": date.today().strftime('%d/%m/%Y')}
        # Same metadata goes alongside both the original and the Wiltshire outputs
        for target_dir in (original_dir, wilts_dir):
            with open(f"{target_dir}/metadata.json", mode="w") as file:
                json.dump(metadata, file)
    except Exception as error:
        print(f"Request failed for details of {dataset}: {error}")



https://services.arcgis.com/JJzESW51TqeY9uat/arcgis/rest/services/Special_Protection_Areas_England/FeatureServer/0/query?
Complete. Total features: 250
https://services.arcgis.com/JJzESW51TqeY9uat/arcgis/rest/services/SSSI_Impact_Risk_Zones_England/FeatureServer/0/query?
Complete. Total features: 103922
https://services-eu1.arcgis.com/WIfgdJeDbrZU1cnA/arcgis/rest/services/Ancient%20Tree%20Inventory%20(ATI)/FeatureServer/0/query?
No more features returned
Complete. Total features: 72000
