In [None]:
import requests
import os
import json
from zipfile import ZipFile
import pandas as pd
import geopandas as gpd
import pathlib
from datetime import date
import shutil

ModuleNotFoundError: No module named 'rasterio'

In [12]:
# Select which machine profile we're working on. The path-setup cell
# recognises "home" and "work" and derives folder_location from this value.
location = "work"

In [56]:
# Per-dataset, per-layer processing rules used when cutting data down to the
# county boundary. Keys of the outer dict are dataset names; keys of the inner
# dicts are GeoPackage layer names. Values seen here:
#   "clip"      - handled by process_boundaryline_wilts via gpd.clip
#   "intersect" - handled by process_boundaryline_wilts as the intersect branch
#   False       - no rule for this layer (not read by any code in this notebook yet)
wilts_clip = {
    "BoundaryLine": {
        "polling_districts_england": "clip",
        "parish": "clip",
        "westminster_const": "intersect",
        "unitary_electoral_division": "intersect",
    },
    "Zoomstack": {
        "airports": "clip",
        "boundaries": False,
        "contours": "clip",
    },
}

In [13]:
# Sets the base folder to work from, based on the `location` switch.
# An unrecognised value is rejected explicitly: otherwise folder_location
# would be undefined (NameError below) or, worse, silently keep a stale value
# left in the kernel by an earlier run.
if location == "home":
    folder_location = "C:/Users/Lara/Work/DataDownloads/"
elif location == "work":
    folder_location = "O:/Data_team/GIS_data_downloads/"
else:
    raise ValueError(f"Unknown location {location!r}; expected 'home' or 'work'")

# Folders and files derived from the base folder
download_location = folder_location + "testData/"           # extracted downloads
temp_download_location = folder_location + "testDataTemp/"  # downloaded zip files
lookup_location = folder_location + "Lookups/"              # lookup tables
item_url_file = "OS-API_URL.csv"                            # dataset lookup csv (inside lookup_location)
boundary_file = folder_location + "CountyBoundary.shp"      # boundary used to clip data

In [14]:
# Read in Wiltshire boundary file (boundary_file) as a GeoDataFrame.
# boundary_gdf is the mask used by the process_*_wilts functions when
# clipping / intersecting downloaded layers.
boundary_gdf = gpd.read_file(boundary_file)

In [40]:
# Read the dataset lookup csv. The download loop uses its DatasetProductName,
# Format and Source columns to decide which product format to fetch and where
# to store it.
item_url_df = pd.read_csv(lookup_location + item_url_file)
# Preview the last rows of the lookup
item_url_df.tail(12)

Unnamed: 0,Dataset,DatasetProductName,URL,Format,Source,Wiltshire
0,MiniScale,MiniScale,https://api.os.uk/downloads/v1/products/MiniSc...,"Zip file (containing EPS, Illustrator and TIFF...",OS,Clip
1,1:250 000 Scale Colour Raster,250kScaleColourRaster,https://api.os.uk/downloads/v1/products/250kSc...,TIFF-LZW,OS,Select
2,Boundary-Line,BoundaryLine,https://api.os.uk/downloads/v1/products/Bounda...,GeoPackage,OS,No
3,Code-Point,CodePointOpen,https://api.os.uk/downloads/v1/products/CodePo...,GeoPackage,OS,Clip
4,OS Open Zoomstack,OpenZoomstack,https://api.os.uk/downloads/v1/products/OpenZo...,GeoPackage,OS,Some
5,OS Open Greenspace,OpenGreenspace,https://api.os.uk/downloads/v1/products/OpenGr...,GeoPackage,OS,Yes
6,OS OpenMap - Local,OpenMapLocal,https://api.os.uk/downloads/v1/products/OpenMa...,GeoPackage,OS,Yes
7,OS Open Names,OpenNames,https://api.os.uk/downloads/v1/products/OpenNa...,GeoPackage,OS,Yes
8,OS Open Rivers,OpenRivers,https://api.os.uk/downloads/v1/products/OpenRi...,GeoPackage,OS,Yes
9,OS Open Roads,OpenRoads,https://api.os.uk/downloads/v1/products/OpenRo...,GeoPackage,OS,Yes


In [16]:
# get list of OS open data products available to download https://docs.os.uk/os-apis/accessing-os-apis/os-downloads-api/technical-specification/opendata-products
url = 'https://api.os.uk/downloads/v1/products'

# A timeout stops the cell hanging forever if the API is unreachable
response = requests.get(url, timeout=30)
# Fail here on an HTTP error rather than later, when an error payload would be
# mistaken for the product list
response.raise_for_status()
product_list = response.json()

In [68]:

def process_250kScaleColourRaster_wilts():
    """Copy the 250k colour raster tiles for the area of interest to the Wiltshire folder.

    Searches ``{download_location}Original/OS/250kScaleColourRaster`` recursively
    for ``*.tif`` files and copies those whose file stem is one of the tile
    names SU, ST, SP or SO into
    ``{download_location}Wiltshire/OS/250kScaleColourRaster``. The stem of each
    copied tile is printed.

    Relies on the notebook-level ``download_location`` variable.
    """
    source = "OS"
    dataset = "250kScaleColourRaster"
    # Data location of tif files
    data_loc = pathlib.Path(f"{download_location}Original/{source}/{dataset}")
    # Location for wilts files. Built directly rather than by replacing
    # "Original" in the source path, so an unrelated "Original" elsewhere in
    # the path can never be rewritten.
    wilts_file_location = pathlib.Path(f"{download_location}Wiltshire/{source}/{dataset}")
    # The destination must exist as a directory; otherwise shutil.copy either
    # fails or writes a single file named after the dataset.
    wilts_file_location.mkdir(parents=True, exist_ok=True)
    # List all tif files in original location
    tif_list = list(data_loc.rglob("*.tif"))
    # Extract only tif files which are in the relevant area and copy to wilts folder
    wanted_tiles = {"SU", "ST", "SP", "SO"}
    for tif in tif_list:
        if tif.stem in wanted_tiles:
            print(tif.stem)
            shutil.copy(tif, wilts_file_location)

def process_boundaryline_wilts(gpd_layers, file, wilts_file_location):
    """Write the configured BoundaryLine layers, cut to the county boundary, to a GeoPackage.

    Only layers listed in ``wilts_clip['BoundaryLine']`` are processed; every
    other layer is skipped. The rule for each layer decides how it is reduced:

    * ``"clip"``      - geometries are cut to the boundary with ``gpd.clip``.
    * ``"intersect"`` - whole features that intersect the boundary are kept,
      with their geometry and attributes unchanged.

    Layers with no remaining features are not written.

    Parameters
    ----------
    gpd_layers : DataFrame with a ``name`` column, as returned by ``gpd.list_layers``.
    file : path of the source GeoPackage.
    wilts_file_location : path of the GeoPackage to write. The first layer is
        written with mode "w" and later ones are appended.

    Relies on the notebook-level ``wilts_clip`` and ``boundary_gdf`` variables.
    """
    layer_rules = wilts_clip['BoundaryLine']
    # Single geometry covering the whole boundary, used by the "intersect" rule.
    boundary_geom = boundary_gdf.union_all()
    # Number of layers written so far; decides between write and append mode.
    count = 0
    for layer in gpd_layers.name:
        rule = layer_rules.get(layer)
        # Skip (rather than stop at) layers with no usable rule, so layers
        # later in the file are still processed.
        if rule not in ("clip", "intersect"):
            continue
        gdf = gpd.read_file(file, layer=layer)
        if rule == "clip":
            gdf_wilts = gpd.clip(gdf, boundary_gdf)
        else:
            # NOTE(review): "intersect" is taken to mean "keep the complete
            # feature if it intersects the boundary". Features that only touch
            # the boundary edge are included too - confirm this is intended.
            gdf_wilts = gdf[gdf.intersects(boundary_geom)]
        # Check that geodataframe contains data
        if not gdf_wilts.empty:
            # Write or append geopackage layer
            gdf_wilts.to_file(
                wilts_file_location,
                layer=layer,
                driver="GPKG",
                mode="w" if count == 0 else "a",
            )
            count += 1
    

def process_geopackage_wilts(folder_name_wilts, source , dataset , metadata ):
    """Clip every downloaded GeoPackage of a dataset to the county boundary.

    Finds all ``*.gpkg`` files under
    ``{download_location}Original/{source}/{dataset}`` and writes a reduced
    copy of each to the same path with "Original" replaced by "Wiltshire".
    An existing output file is deleted first. For the ``"BoundaryLine"``
    dataset the per-layer rules in ``process_boundaryline_wilts`` are used;
    for every other dataset each layer is clipped with ``gpd.clip`` and
    written if it still contains features.

    When at least one GeoPackage was found, ``metadata`` is written as JSON to
    ``folder_name_wilts + "metadata.json"``.

    Parameters
    ----------
    folder_name_wilts : str, output folder for the dataset, ending in a path separator.
    source : str, source folder name (e.g. "OS").
    dataset : str, dataset product name.
    metadata : JSON-serialisable object describing the download.

    Relies on the notebook-level ``download_location`` and ``boundary_gdf`` variables.
    """
    # Get list of spatial files downloaded
    data_loc = pathlib.Path(f"{download_location}Original/{source}/{dataset}")
    gpkg_files = list(data_loc.rglob("*.gpkg"))
    # Loop through each spatial file
    for gpkg_file in gpkg_files:
        # Check folder exists, and if not create it
        wilts_file_location = pathlib.Path(str(gpkg_file).replace("Original", "Wiltshire"))
        os.makedirs(wilts_file_location.parent, exist_ok=True)
        # Remove file if it already exists as geopackage writes onto existing file
        if os.path.exists(wilts_file_location):
            os.remove(wilts_file_location)
        # Loop through each layer in file, clip to wilts and write to new location
        gpd_layers = gpd.list_layers(gpkg_file)
        if dataset == "BoundaryLine":
            process_boundaryline_wilts(gpd_layers, gpkg_file, wilts_file_location)
        else:
            # Number of layers written so far; decides between write and append mode
            count = 0
            for layer in gpd_layers.name:
                gdf = gpd.read_file(gpkg_file, layer=layer)
                gdf_wilts = gpd.clip(gdf, boundary_gdf)
                # Check that geodataframe contains data
                if not gdf_wilts.empty:
                    # Write or append geopackage layer
                    gdf_wilts.to_file(
                        wilts_file_location,
                        layer=layer,
                        driver="GPKG",
                        mode="w" if count == 0 else "a",
                    )
                    count += 1

    # The metadata is identical for every file, so write it once after the
    # loop, and only when there was something to process.
    if gpkg_files:
        os.makedirs(folder_name_wilts, exist_ok=True)
        with open(folder_name_wilts+"metadata.json", mode="w") as metadata_file:
            json.dump(metadata, metadata_file)

In [69]:
# Loop through each dataset name in lookup. For each one:
#   1. look up its version in product_list and build the metadata record,
#   2. ask the OS Downloads API which files exist for it,
#   3. download the GB file in the format named in the lookup csv,
#   4. unzip it under Original/ and write metadata.json,
#   5. produce the Wiltshire subset.
for dataset in ["BoundaryLine"]:#item_url_df.DatasetProductName.tail(5):
    print(dataset)
    # Get metadata for dataset
    dataset_details_list = [d for d in product_list if d['id'] == dataset]
    if not dataset_details_list:
        # Unknown product id: report it instead of failing with an IndexError
        print(f"No product details found for {dataset}; skipping")
        continue
    metadata = {"Version":dataset_details_list[0]['version'],
                "Date downloaded":date.today().strftime('%m/%d/%Y')}

    # Product download https://docs.os.uk/os-apis/accessing-os-apis/os-downloads-api/technical-specification/download-an-opendata-product
    # productId = dataset
    url_product = f"https://api.os.uk/downloads/v1/products/{dataset}/downloads"
    response_product = requests.get(url_product, timeout=30)
    # Stop on an HTTP error rather than iterating over an error payload below
    response_product.raise_for_status()
    dataset_product_list = response_product.json()
    print(dataset_product_list)
    lookup_row = item_url_df[item_url_df.DatasetProductName==dataset]
    # Set format required from external csv lookup (some datasets have multiple).
    # Named required_format so the builtin format() is not shadowed.
    required_format = lookup_row['Format'].item()
    # Set source for creating folder structure
    source = lookup_row['Source'].item()

    # Loop through each product in dataset list
    for value in dataset_product_list:
        #Only interested in specified formats/ areas
        if value['format'] == required_format and value['area'] in ["GB"]:
            # Set variables
            dataset_url = value['url']
            temp_folder_name = f"{temp_download_location}"
            folder_name_original = f"{download_location}Original/{source}/{dataset}/"
            folder_name_wilts = f"{download_location}Wiltshire/{source}/{dataset}/"
            file_name = f"{dataset}_{value['area']}"
            zip_path = temp_folder_name+file_name+".zip"

            print(dataset_url)
            try:
                # Stream the archive to disk in chunks: these downloads are
                # hundreds of MB, too large to hold in memory in one piece.
                with requests.get(dataset_url, stream=True, timeout=60) as response:
                    # Check if response is successful
                    if response.status_code != 200:
                        print(f"Request failed for {dataset} with status code: {response.status_code}")
                        break

                    #Create folders if not already there
                    os.makedirs(folder_name_original, exist_ok=True)
                    os.makedirs(temp_folder_name, exist_ok=True)

                    # Create zip file in temp location
                    with open(zip_path, mode="wb") as zip_file:
                        for chunk in response.iter_content(chunk_size=1024 * 1024):
                            zip_file.write(chunk)

                # Extract zip and move to permanent location
                with ZipFile(zip_path, 'r') as z_object:
                    z_object.extractall(path=folder_name_original)
                # Write out metadata
                with open(folder_name_original+"metadata.json", mode="w") as metadata_file:
                    json.dump(metadata, metadata_file)

                ## Export wiltshire data
                # 250kScaleColourRaster
                if dataset == "250kScaleColourRaster":
                    process_250kScaleColourRaster_wilts()
                # Geopackages which are clipped
                if value['format'] == "GeoPackage":
                    process_geopackage_wilts(folder_name_wilts, source , dataset , metadata)

            except Exception as error:
                # Best effort: report the failure and carry on with the next product
                print(f"Error occured for {dataset}:", error)

           

BoundaryLine
[{'md5': 'eec886908ae65048d6ff140e2fc4e47d', 'size': 742238914, 'url': 'https://api.os.uk/downloads/v1/products/BoundaryLine/downloads?area=GB&format=ESRI%C2%AE+Shapefile&redirect', 'format': 'ESRI® Shapefile', 'area': 'GB', 'fileName': 'bdline_essh_gb.zip'}, {'md5': '2d45b6c0562bc5c1b2b334e571b5f26d', 'size': 167493941, 'url': 'https://api.os.uk/downloads/v1/products/BoundaryLine/downloads?area=GB&format=GML&subformat=3&redirect', 'format': 'GML', 'subformat': '3', 'area': 'GB', 'fileName': 'bdline_gml3_gb.zip'}, {'md5': '8dcf87ea29a22e547a78d02b3691d035', 'size': 813118010, 'url': 'https://api.os.uk/downloads/v1/products/BoundaryLine/downloads?area=GB&format=GeoPackage&redirect', 'format': 'GeoPackage', 'area': 'GB', 'fileName': 'bdline_gpkg_gb.zip'}, {'md5': '62dc7b34a6bf8fb8842cdb344b065db3', 'size': 706025531, 'url': 'https://api.os.uk/downloads/v1/products/BoundaryLine/downloads?area=GB&format=MapInfo%C2%AE+TAB&redirect', 'format': 'MapInfo® TAB', 'area': 'GB', 'file

In [53]:
# Run the raster tile copy on its own; it prints the stem of each tile it copies
process_250kScaleColourRaster_wilts()

SO
SP
ST
SU


In [38]:
# Scratch check: Path.stem is the file name without its extension, which is
# the value process_250kScaleColourRaster_wilts compares against its tile list
path = pathlib.Path(r"O:\Data_team\GIS_data_downloads\testData\Original\OS\250kScaleColourRaster\ras250_gb\data\HX.tif")
path.stem

'HX'