### Clean CityPy data
- mainly 'buildings' layer
- and 'grid' layer

In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
import fiona
from pathlib import Path

In [3]:
citypy_path = "../data/citypy/groningen_NL.gpkg"
groningen_citypy = gpd.read_file(citypy_path)

# get gpkg layer names as a list
layers = fiona.listlayers(citypy_path)
print(layers)


['extents', 'landuse', 'city_center', 'water', 'terrain', 'road_nodes', 'tesselation', 'enclosures', 'road_edges', 'buildings', 'grid']


  result = read_func(


### 'buildings' layer

In [4]:
# read bldgs layer
bldgs = gpd.read_file(citypy_path, layer = "buildings")
bldgs.columns

# shape
print(f"Rows and columns of bldgs: {bldgs.shape}\n")

# count each columns' unique values
for col in bldgs.columns:
    print(f"{col}: {len(bldgs[col].unique())}")


Rows and columns of bldgs: (100863, 79)

id::building: 100863
real_levels: 26
real_height: 58
roof_shape: 15
subtype: 11
class: 56
name: 552
ghsl_height: 7390
building_group_id: 43239
closest_road_edge_id: 26132
approximate_height: 7266
area: 100863
squareness: 100674
enclosure_id: 3931
orientation: 100706
alignment: 100837
covered_area: 93628
elongation: 100862
equivalent_rectangular_index: 100863
form_factor: 100416
building_group_area_std: 15201
building_group_courtyard_index: 324
building_group_elongation_std: 15201
building_group_enclosure_perimeter_coverage: 12595
land_use_category: 5
perimeter: 100863
shape_index: 100863
shared_walls_ratio: 82761
enclosurue_building_area_ratio: 3931
area::nb_enclosure::mean: 1
area::nb_enclosure::std: 1
building_group_enclosure_perimeter_coverage::nb_enclosure::mean: 1
building_group_enclosure_perimeter_coverage::nb_enclosure::std: 1
shared_walls_ratio::nb_enclosure::mean: 1
shared_walls_ratio::nb_enclosure::std: 1
shape_index::nb_enclosure::mea

In [5]:
# count each columns' null values
for col in bldgs.columns:
    print(f"{col}: {bldgs[col].isnull().sum()}")

id::building: 0
real_levels: 97492
real_height: 100700
roof_shape: 0
subtype: 0
class: 0
name: 100295
ghsl_height: 0
building_group_id: 0
closest_road_edge_id: 51
approximate_height: 0
area: 0
squareness: 8
enclosure_id: 1818
orientation: 0
alignment: 0
covered_area: 0
elongation: 0
equivalent_rectangular_index: 0
form_factor: 448
building_group_area_std: 28039
building_group_courtyard_index: 0
building_group_elongation_std: 28039
building_group_enclosure_perimeter_coverage: 2196
land_use_category: 0
perimeter: 0
shape_index: 0
shared_walls_ratio: 0
enclosurue_building_area_ratio: 1818
area::nb_enclosure::mean: 100863
area::nb_enclosure::std: 100863
building_group_enclosure_perimeter_coverage::nb_enclosure::mean: 100863
building_group_enclosure_perimeter_coverage::nb_enclosure::std: 100863
shared_walls_ratio::nb_enclosure::mean: 100863
shared_walls_ratio::nb_enclosure::std: 100863
shape_index::nb_enclosure::mean: 100863
shape_index::nb_enclosure::std: 100863
building_group_elongation_s

In [6]:
# count for columns that have exactly the same null values as the number of rows
for col in bldgs.columns:
    if bldgs[col].isnull().sum() == len(bldgs):
        print(f"{col}: {bldgs[col].isnull().sum()}")

area::nb_enclosure::mean: 100863
area::nb_enclosure::std: 100863
building_group_enclosure_perimeter_coverage::nb_enclosure::mean: 100863
building_group_enclosure_perimeter_coverage::nb_enclosure::std: 100863
shared_walls_ratio::nb_enclosure::mean: 100863
shared_walls_ratio::nb_enclosure::std: 100863
shape_index::nb_enclosure::mean: 100863
shape_index::nb_enclosure::std: 100863
building_group_elongation_std::nb_enclosure::mean: 100863
building_group_area_std::nb_enclosure::mean: 100863
equivalent_rectangular_index::nb_enclosure::mean: 100863
equivalent_rectangular_index::nb_enclosure::std: 100863
elongation::nb_enclosure::mean: 100863
elongation::nb_enclosure::std: 100863
covered_area::nb_enclosure::mean: 100863
covered_area::nb_enclosure::std: 100863
alignment::nb_enclosure::mean: 100863
alignment::nb_enclosure::std: 100863
squareness::nb_enclosure::mean: 100863
squareness::nb_enclosure::std: 100863
approximate_height::nb_enclosure::mean: 100863
approximate_height::nb_enclosure::std: 1

In [7]:
# delete columns that have exactly the same null values as the number of rows
bldgs_nonull = bldgs.drop(columns = [col for col in bldgs.columns if bldgs[col].isnull().sum() == len(bldgs)])
len(bldgs_nonull.columns)


55

In [8]:
# count each columns' unique values and number ofnull values as a tuple for each column
for col in bldgs_nonull.columns:
    print(f"{col}: {len(bldgs_nonull[col].unique())}, {bldgs_nonull[col].isnull().sum()}")


id::building: 100863, 0
real_levels: 26, 97492
real_height: 58, 100700
roof_shape: 15, 0
subtype: 11, 0
class: 56, 0
name: 552, 100295
ghsl_height: 7390, 0
building_group_id: 43239, 0
closest_road_edge_id: 26132, 51
approximate_height: 7266, 0
area: 100863, 0
squareness: 100674, 8
enclosure_id: 3931, 1818
orientation: 100706, 0
alignment: 100837, 0
covered_area: 93628, 0
elongation: 100862, 0
equivalent_rectangular_index: 100863, 0
form_factor: 100416, 448
building_group_area_std: 15201, 28039
building_group_courtyard_index: 324, 0
building_group_elongation_std: 15201, 28039
building_group_enclosure_perimeter_coverage: 12595, 2196
land_use_category: 5, 0
perimeter: 100863, 0
shape_index: 100863, 0
shared_walls_ratio: 82761, 0
enclosurue_building_area_ratio: 3931, 1818
area::nb_radius_300::mean: 98769, 0
area::nb_radius_300::std: 98762, 8
building_group_enclosure_perimeter_coverage::nb_radius_300::mean: 97326, 518
building_group_enclosure_perimeter_coverage::nb_radius_300::std: 97604, 5

In [9]:
# subtype, class, enclosure_id?, land_use_category, building_class
# only keep these columns
bldgs_nonull_filtered = bldgs_nonull[["subtype", "class", "enclosure_id", "land_use_category", "building_class", "geometry"]]
bldgs_nonull_filtered.head()

# print each column's unique values
for col in bldgs_nonull_filtered.columns:
    print(f"{col}: {bldgs_nonull_filtered[col].unique()}")


subtype: ['commercial' 'education' 'religious' 'industrial' 'UNKNOWN' 'residential'
 'civic' 'outbuilding' 'medical' 'entertainment' 'agricultural']
class: ['retail' 'university' 'church' 'industrial' 'yes' 'house' 'storage_tank'
 'warehouse' 'apartments' 'school' 'commercial' 'civic' 'water_tower'
 'museum' 'cathedral' 'office' 'shed' 'train_station' 'roof' 'public'
 'service' 'tower' 'hospital' 'sports_centre' 'greenhouse' 'college'
 'garage' 'static_caravan' 'toilets' 'houseboat' 'chapel' 'guardhouse'
 'sports_hall' 'carport' 'garages' 'farm_auxiliary' 'hotel' 'stable'
 'dormitory' 'bungalow' 'residential' 'kiosk' 'synagogue' 'farm'
 'grandstand' 'detached' 'barn' 'prison' 'construction' 'ruins' 'hut'
 'healthcare' 'allotment_house' 'parking' 'data_center' 'stadium']
enclosure_id: [ 3612.  2416.  6715. ...  2522.  4868. 10411.]
land_use_category: ['retail' 'commercial' 'UNKNOWN' 'residential' 'industrial']
building_class: ['big_commercial' 'complex' 'apartments' 'detached' 'industri

In [10]:
# for subtype and land_use_category, replace 'UNKNOWN' with np.nan
bldgs_nonull_filtered["subtype"] = bldgs_nonull_filtered["subtype"].replace("UNKNOWN", np.nan)
bldgs_nonull_filtered["land_use_category"] = bldgs_nonull_filtered["land_use_category"].replace("UNKNOWN", np.nan)

bldgs_nonull_filtered.isnull().sum()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


subtype              36913
class                    0
enclosure_id          1818
land_use_category     6316
building_class           0
geometry                 0
dtype: int64

In [11]:
# save bldgs_nonull_filtered as a gpkg
bldgs_nonull_filtered.to_file('../data/citypy/Groningen_NL_bldgs_filtered.gpkg', layer = "buildings", driver = "GPKG")

In [None]:
# unique values of 'class' column
bldgs_nonull_filtered["class"].unique()

array(['retail', 'university', 'church', 'industrial', 'yes', 'house',
       'storage_tank', 'warehouse', 'apartments', 'school', 'commercial',
       'civic', 'water_tower', 'museum', 'cathedral', 'office', 'shed',
       'train_station', 'roof', 'public', 'service', 'tower', 'hospital',
       'sports_centre', 'greenhouse', 'college', 'garage',
       'static_caravan', 'toilets', 'houseboat', 'chapel', 'guardhouse',
       'sports_hall', 'carport', 'garages', 'farm_auxiliary', 'hotel',
       'stable', 'dormitory', 'bungalow', 'residential', 'kiosk',
       'synagogue', 'farm', 'grandstand', 'detached', 'barn', 'prison',
       'construction', 'ruins', 'hut', 'healthcare', 'allotment_house',
       'parking', 'data_center', 'stadium'], dtype=object)

In [None]:
# count values of 'class' column
bldgs_nonull_filtered["class"].value_counts()

# house, apartments, houseboat ... what's 'yes'?

class
house              40887
yes                33733
apartments         10617
shed                8136
garage              2527
industrial          1978
retail               639
commercial           634
houseboat            472
construction         389
static_caravan       184
roof                 134
school                89
service               80
office                78
university            50
church                28
hospital              27
storage_tank          18
sports_centre         16
college               15
garages               14
farm_auxiliary        13
hotel                 12
greenhouse            10
stable                10
warehouse              9
dormitory              6
guardhouse             6
civic                  6
residential            5
grandstand             5
carport                5
allotment_house        5
public                 4
barn                   2
prison                 1
hut                    1
ruins                  1
healthcare         

In [18]:
# count values of 'building_class' column
bldgs_nonull_filtered["building_class"].value_counts()

building_class
terraced           69036
detached           16690
perimeter_block     4664
filled_block        3914
apartments          2378
industrial          2126
big_commercial      1141
irregular_block      532
complex              382
Name: count, dtype: int64

### 'grid' layer

In [13]:
grid = gpd.read_file(citypy_path, layer = "grid")
grid.columns



Index(['distance_to_city_center',
       'road_edges::test_clusters::most_intersecting',
       'road_edges::test_clusters::most_intersecting::mode_kernel3',
       'buildings::building_class::most_intersecting',
       'buildings::building_class::most_intersecting::mode_kernel3',
       'city_center::name::count', 'city_center::name::count::mode_kernel3',
       'geometry'],
      dtype='object')

In [14]:
# list all unique values of each column
for col in grid.columns:
    print(f"{col}: {grid[col].unique()}")

grid.isnull().sum()


distance_to_city_center: [8299.57828427 8247.12921585 8195.56470959 ... 8051.30993733 8111.45685516
 8172.38477553]
road_edges::test_clusters::most_intersecting: [None '3' '11' '10' '2' '1' '6' '7' '0' '5' '12' '4' '8']
road_edges::test_clusters::most_intersecting::mode_kernel3: ['99' '3' '11' '10' '6' '1' '7' '0' '5' '2' '12']
buildings::building_class::most_intersecting: [None 'detached' 'industrial' 'big_commercial' 'terraced' 'apartments'
 'complex' 'irregular_block' 'filled_block' 'perimeter_block']
buildings::building_class::most_intersecting::mode_kernel3: ['99' 'industrial' 'detached' 'big_commercial' 'terraced' 'apartments'
 'complex' 'perimeter_block' 'filled_block' 'irregular_block']
city_center::name::count: [0. 1.]
city_center::name::count::mode_kernel3: [0.]
geometry: <GeometryArray>
[<POLYGON ((330535.221 5894961.615, 330635.221 5894961.615, 330635.221 589506...>,
 <POLYGON ((330535.221 5895061.615, 330635.221 5895061.615, 330635.221 589516...>,
 <POLYGON ((330535.221 58

distance_to_city_center                                          0
road_edges::test_clusters::most_intersecting                  7060
road_edges::test_clusters::most_intersecting::mode_kernel3       0
buildings::building_class::most_intersecting                  7018
buildings::building_class::most_intersecting::mode_kernel3       0
city_center::name::count                                         0
city_center::name::count::mode_kernel3                           0
geometry                                                         0
dtype: int64

In [15]:
# only keep 'buildings::building_class::most_intersecting::mode_kernel3' column and 'geometry' column
grid_filtered = grid[["buildings::building_class::most_intersecting::mode_kernel3", "geometry"]]

# change column name to 'building_class'
grid_filtered.columns = ["building_class", "geometry"]

grid_filtered["building_class"].unique()

# change "99" to "other"
grid_filtered["building_class"] = grid_filtered["building_class"].replace("99", "other")

# save grid_filtered as a gpkg
grid_filtered.to_file('../data/citypy/Groningen_NL_grid_filtered.gpkg', layer = "grid", driver = "GPKG")

grid_filtered.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Unnamed: 0,building_class,geometry
0,other,"POLYGON ((330535.221 5894961.615, 330635.221 5..."
1,other,"POLYGON ((330535.221 5895061.615, 330635.221 5..."
2,other,"POLYGON ((330535.221 5895161.615, 330635.221 5..."
3,other,"POLYGON ((330535.221 5895261.615, 330635.221 5..."
4,other,"POLYGON ((330535.221 5895361.615, 330635.221 5..."
