In [1]:
import requests
import os
import json
from zipfile import ZipFile
import pandas as pd
import geopandas as gpd
import pathlib
from datetime import date
import shutil

In [2]:
# Set location we're working in
location = "work"

In [3]:
wilts_clip = {
    "BoundaryLine":{
     "polling_districts_england":"intersect",
     "parish":"intersect",
     "westminster_const":"intersect",
     "unitary_electoral_division":"intersect"
     },

    "Zoomstack":{
        "airports":"clip",
    "boundaries":False,
    "contours": "clip"}
}

In [4]:
# Sets location to work from
if location == "home":
    folder_location = "C:/Users/Lara/Work/DataDownloads/"
    
if location == "work":
    folder_location = "O:/Data_team/GIS_data_downloads/"

download_location = "G:/GIS_Data/External/Open_Source/"
temp_download_location = f"{folder_location}testDataTemp/"
lookup_location = f"{folder_location}Lookups/"
item_url_file = "OS-API_URL.csv"
boundary_file = f"{folder_location}CountyBoundary.shp"

In [5]:
# Read in Wiltshire boundary file
boundary_gdf = gpd.read_file(boundary_file)

In [11]:
# Read the csv file
item_url_df = pd.read_csv(lookup_location + item_url_file)
item_url_df.tail(2)

Unnamed: 0,Dataset,DatasetProductName,URL,Format,Source,Wiltshire
10,OS Terrain 50,Terrain50,https://api.os.uk/downloads/v1/products/Terrai...,GeoPackage,OS,Clip
11,OS Open Built Up Areas,BuiltUpAreas,https://api.os.uk/downloads/v1/products/BuiltU...,GeoPackage,OS,Clip


In [7]:
# get list of OS open data products available to download https://docs.os.uk/os-apis/accessing-os-apis/os-downloads-api/technical-specification/opendata-products
url = 'https://api.os.uk/downloads/v1/products'

response = requests.get(url)
product_list = response.json()

In [8]:

def remove_previous_spatial_files(source, dataset):
    # Get list of spatial files downloaded for specific datatset
    data_loc = pathlib.Path(f"{download_location}Original/{source}/{dataset}")
    shp_list = list(data_loc.rglob("*.gpkg"))

def process_250kScaleColourRaster_wilts():
    """
    Processes 250kScaleColourRaster data for wilts by selecting relevant Tiff files

    Args:
        
    Returns:
        
    """
    source = "OS"
    dataset = "250kScaleColourRaster"
    # Data location of tif files
    data_loc = pathlib.Path(f"{download_location}Original/{source}/{dataset}")
    # Location for wilts file
    wilts_file_location = pathlib.Path(str(data_loc).replace("Original", "Wiltshire"))
    # List all tif files in original location
    tif_list = list(data_loc.rglob("*.tif"))
    # Extract only tif files which are relecant area and copy to wilts folder
    for tif in tif_list:
        if tif.stem in ["SU", "ST", "SP", "SO"]:
            print(tif.stem)
            shutil.copy(tif,wilts_file_location )

def process_boundaryline_wilts(gpd_layers, file, wilts_file_location):
    """
    Processes boundary line data for Wilts

    Args:
        gpd_layers (DataFrame): Layers in geopackage
        file (str): Geospatial file to read
        wilts_file_location (str): Output location for wilts dataset

    Returns:
        
    """
    count = 0
    for i, layer in enumerate(gpd_layers.name):
        # Layers not listed in dict should not have a wiltshire version
        if layer not in wilts_clip['BoundaryLine']:
            continue
        gdf = gpd.read_file(file, layer = layer)
        if wilts_clip['BoundaryLine'][layer]== "clip":
            gdf_wilts = gpd.clip(gdf, boundary_gdf)
        elif wilts_clip['BoundaryLine'][layer]== "intersect":
            target_geom = boundary_gdf.geometry.iloc[0]
            gdf_wilts = gdf[gdf.intersects(target_geom)]
            # gdf_wilts = gdf.intersection(boundary_gdf)
        # Check that geodataframe contains data
        if gdf_wilts.empty == False:
            # Write or append geopackage layer
            gdf_wilts.to_file(wilts_file_location,layer = layer, driver = "GPKG", mode = "w" if count==0 else "a")
            count=+1
    

def process_geopackage_wilts(folder_name_wilts, source , dataset , metadata ):
    """
    Processes original datasets into Wilts, geopackages only

    Args:
        folder_name_wilts (str): Output location for wilts dataset
        source (str): Source of dataset
        dataset (str): Dataset name
        metadata (dict): Metadata information including version

    Returns:
        
    """
    # Get list of spatial files downloaded for specific datatset
    data_loc = pathlib.Path(f"{download_location}Original/{source}/{dataset}")
    shp_list = list(data_loc.rglob("*.gpkg"))
    # Loop through each spatial file
    for file in shp_list:
        # Check folder destination exists, and if not create it
        wilts_file_location = pathlib.Path(str(file).replace("Original", "Wiltshire"))
        if not os.path.exists(wilts_file_location.parent):
            os.makedirs(wilts_file_location.parent)
        # Remove file if it already exists as geopackage writes onto existing file
        if os.path.exists(wilts_file_location):
            os.remove(wilts_file_location)
        # Loop through each layer in file, clip to wilts and write to new location
        gpd_layers = gpd.list_layers(file)
        # Need separate code for boundary line, as handle each layer in geopackage differently (clip or interesect)
        if dataset == "BoundaryLine":
            process_boundaryline_wilts(gpd_layers, file, wilts_file_location)
        else:
            count = 0
            for i, layer in enumerate(gpd_layers.name):
                gdf = gpd.read_file(file, layer = layer)
                gdf_wilts = gpd.clip(gdf, boundary_gdf)
                # Check that geodataframe contains data
                if gdf_wilts.empty == False:
                    # Write or append geopackage layer
                    gdf_wilts.to_file(wilts_file_location,layer = layer, driver = "GPKG", mode = "w" if count==0 else "a")
                    count=+1
        
        with open(folder_name_wilts+"metadata.json", mode="w") as file:
            json.dump(metadata,file)

In [12]:
#Loop through each dataset name in lookup
for dataset in item_url_df.DatasetProductName.tail(2):
    print(dataset)
    # Get metadata for dataset
    dataset_details_list = [d for d in product_list if d['id'] in [dataset]]
    metadata = {"Version":dataset_details_list[0]['version'],
                "Date downloaded":date.today().strftime('%m/%d/%Y')} 

    # Product download https://docs.os.uk/os-apis/accessing-os-apis/os-downloads-api/technical-specification/download-an-opendata-product
    url_product = f"https://api.os.uk/downloads/v1//products/{dataset}/downloads"
    response_product = requests.get(url_product)
    dataset_product_list = response_product.json()
    print(dataset_product_list)
    # Set format required from external csv lookup (some datasets have multiple)
    format = item_url_df[item_url_df.DatasetProductName==dataset]['Format'].item()
    # Set source for creating folder structure
    source = item_url_df[item_url_df.DatasetProductName==dataset]['Source'].item()

    # Loop through each product in dataset list
    for value in dataset_product_list:
        #Only interested in specified formats/ areas
        if value['format'] == format and value['area'] in ["GB"]:
            # Set variables 
            dataset_url = value['url']
            temp_folder_name = f"{temp_download_location}"
            folder_name_original = f"{download_location}Original/{source}/{dataset}/"
            folder_name_wilts = f"{download_location}Wiltshire/{source}/{dataset}/"
            file_name = f"{dataset}_{value['area']}"
            
            print(dataset_url)
            try:
                response = requests.get(dataset_url)
                # Check if response is successful
                if response.status_code != 200:
                    print(f"Request failed for {dataset} with status code: {response.status_code}")
                    break

                #Create folder if not already there
                if not os.path.exists(folder_name_original):
                    os.makedirs(folder_name_original)
                # # Remove file if it already exists
                # if os.path.exists(wilts_file_location):
                #     os.remove(wilts_file_location)
                # Create zip file in temp location
                with open(temp_folder_name+file_name+".zip", mode="wb") as file:
                    file.write(response.content)
                # Extract zip and move to permanent location
                with ZipFile(temp_folder_name+file_name+".zip", 'r') as z_object:
                    z_object.extractall(path=folder_name_original)
                # Write out metadata
                with open(folder_name_original+"metadata.json", mode="w") as file:
                    json.dump(metadata,file)
                
                ## Export wiltshire data 
                # 250kScaleColourRaster
                if dataset == "250kScaleColourRaster":
                    process_250kScaleColourRaster_wilts()
                # Geopackages which are clipped
                if value['format'] == "GeoPackage":
                    process_geopackage_wilts(folder_name_wilts, source , dataset , metadata)
                    

            except Exception as error:
                print(f"Error occured for {dataset}:", error)
                #assert 

           

Terrain50
[{'md5': '9d4597fcfd059253a1002388766b5c88', 'size': 161701445, 'url': 'https://api.os.uk/downloads/v1/products/Terrain50/downloads?area=GB&format=ASCII+Grid+and+GML+%28Grid%29&redirect', 'format': 'ASCII Grid and GML (Grid)', 'area': 'GB', 'fileName': 'terr50_gagg_gb.zip'}, {'md5': '03f4d4f9082a3730600f156df532d256', 'size': 1043961427, 'url': 'https://api.os.uk/downloads/v1/products/Terrain50/downloads?area=GB&format=ESRI%C2%AE+Shapefile&subformat=%28Contours%29&redirect', 'format': 'ESRIÂ® Shapefile', 'subformat': '(Contours)', 'area': 'GB', 'fileName': 'terr50_cesh_gb.zip'}, {'md5': '60fe32672499b05f912cb476307665c0', 'size': 1122162594, 'url': 'https://api.os.uk/downloads/v1/products/Terrain50/downloads?area=GB&format=GML&subformat=%28Contours%29&redirect', 'format': 'GML', 'subformat': '(Contours)', 'area': 'GB', 'fileName': 'terr50_cgml_gb.zip'}, {'md5': '670f964d3a48afc1c66758baec7ef6f5', 'size': 1229391491, 'url': 'https://api.os.uk/downloads/v1/products/Terrain50/do

ConnectionError: HTTPSConnectionPool(host='api.os.uk', port=443): Max retries exceeded with url: /downloads/v1//products/BuiltUpAreas/downloads (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001A536BF0910>: Failed to resolve 'api.os.uk' ([Errno 11004] getaddrinfo failed)"))