In [2]:
## Purpose: Clip listed datasets to input area 

In [3]:
import pandas as pd
import geopandas as gpd
from pathlib import Path
import os
from datetime import date

In [4]:
#### Make changes
# What boundary are you clipping too?
# Is it a parish council? (answer yes or no)
parish_council = "yes"
# If yes, which parish council
boundary_name = "Colerne"

# If no, what is the filepath to the boundary?
boundary_area_filepath = "O:/Data_team/GIS_data_downloads/CountyBoundary.shp"
if parish_council != "yes":
    # And what is the boundary name?
    boundary_name = "Wiltshire"

# Do you want a buffer around the boundary? (answer yes or no)
buffer = "yes"
# If yes, what size buffer? (in metres)
buffer_size = 2000

In [5]:
### Do not change
# Set boundary
if parish_council == "yes":
    pc_boundaries_filepath = "G:/OS_OpenData/OS_BoundaryLine/OS_BL_Parish_Wilts.shp"
    pc_boundaries_gdf = gpd.read_file(pc_boundaries_filepath)
    boundary_area = pc_boundaries_gdf[pc_boundaries_gdf.NAME == boundary_name]
    # Check if boundary name valid, if not throw error
    if boundary_area.empty == True:
        raise ValueError(f"Parish council name was incorrect. Full list of Parish council names: {pc_boundaries_gdf.NAME.unique()}")

else:
    try:
        boundary_area = gpd.read_file(boundary_area_filepath)
    except:
        raise ValueError("The boundary file path was incorrect, check it is a valid file path")


if buffer == "yes":
    boundary_area.geometry = boundary_area.geometry.buffer(buffer_size)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [None]:
# location to write clipped data to
download_location = "O:/Data_team/GIS_data_downloads/dataPackage/"

# Find all geopackages in folder
dataset_area = Path("G:/GIS_Data/External/Open_Source/Original")
dataset_folder_list = list(dataset_area.glob("*/*"))

# Set date for file name
today = date.today().strftime("%Y%m%d")

In [None]:
data_list = []
# Loop through each dataset folder
for dataset_path_folder in dataset_folder_list:
    # Get list of geopackages in folder
    dataset_path_list = list(Path(dataset_path_folder).rglob("*.gpkg"))
    dataset_name = dataset_path_folder.stem.replace(" ", "_")
    
    try:
        # Check if there is a geopackage present
        dataset_path = dataset_path_list[0]
        filename = Path(dataset_path).stem
    except:
        # Skip loop if no geopackage
        continue

    # Create metadata list 
    dataset_metadata_name = dataset_path_folder.stem.replace("_", " ")
    source_metadata = dataset_path_folder.parent.stem.replace("_", " ")
    row_metadata = [ f"{dataset_name}_{today}.gpkg", dataset_metadata_name, source_metadata]
    data_list.append(row_metadata)

    # Create folder if not already there for data storage
    if not os.path.exists(f"{download_location}{boundary_name}/data/"):
        os.makedirs(f"{download_location}{boundary_name}/data/")



    # List layers in geopackage
    gpd_layers = gpd.list_layers(dataset_path)
    # Need separate code for boundary line, as handle each layer in geopackage differently (clip or interesect)
    if filename in ["BoundaryLine", "OS_Open_Zoomstack"]:
        pass
    else:
        # Loop through each layer in file, clip to boundary and write to new location
        count = 0
        for i, layer in enumerate(gpd_layers.name):
            print(filename, layer)
            # Read in each layer of dataset
            gdf = gpd.read_file(dataset_path, layer = layer)
            # Clip dataset
            gdf_clipped = gpd.clip(gdf, boundary_area)
            # Check that geodataframe contains data
            if gdf_clipped.empty == False:
                # Write or append geopackage layer
                gdf_clipped.to_file(f"{download_location}{boundary_name}/data/{dataset_name}_{today}.gpkg",layer = layer, driver = "GPKG", mode = "w" if count==0 else "a")
                count=+1
# Write out metadata file
metadata_df = pd.DataFrame(columns = ["File name", "Dataset Title", "Source"], data = data_list)
metadata_df.to_csv(f"{download_location}{boundary_name}/File information.csv", index = False)

# Write out boundary area used
boundary_area.to_file(f"{download_location}{boundary_name}/boundary_area.gpkg", layer='boundary', driver="GPKG")


Conservation_Areas Conservation_Areas
NHLE_Building_Preservation_Notices_polygons NHLE_Building_Preservation_Notices_polygons
NHLE_Building_Preservation_Notice_points NHLE_Building_Preservation_Notice_points
NHLE_Certificate_of_Immunity_points NHLE_Certificate_of_Immunity_points
NHLE_Certificate_of_Immunity_polygons NHLE_Certificate_of_Immunity_polygons
NHLE_Listed_Building_points NHLE_Listed_Building_points
NHLE_Listed_Building_polygons NHLE_Listed_Building_polygons
NHLE_Protected_Wreck_sites NHLE_Protected_Wreck_sites
NHLE_Registered_Battlefields NHLE_Registered_Battlefields
NHLE_Registered_Parks_and_Gardens NHLE_Registered_Parks_and_Gardens
NHLE_Scheduled_Monuments NHLE_Scheduled_Monuments
NHLE_World_Heritage_Sites NHLE_World_Heritage_Sites
National_Trust_Open_Data_Land_Always_Open National_Trust_Open_Data_Land_Always_Open
National_Trust_Open_Data_Land_Limited_Access National_Trust_Open_Data_Land_Limited_Access
Access_Network_Mapping Access_Network_Mapping
Agricultural_Land_Classifi