In [76]:
import warnings
warnings.filterwarnings('ignore') # hide warnings
import sys
sys.path.append("..")
import geopandas as gpd
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

In [77]:
# OVERVIEW:
# Step 1: Relabel the zonenumbers to our actual numbers (such that zone 3 has id 3, not id 7 for example)
# Step 2: Split the shapefile into the different zones
# Step 3: Clustering + retrieving for each original zone to which cluster it belongs
# Step 4: Update the centroids and the OD-matrix

In [78]:
# STEP 1
city = "BRUSSEL"
radius = "40"

path_orig_shapefile = f"STA_prep/shapefile_data/{city}_{radius}_10/{city}_{radius}_10.shp"
qgis_path = f"QGIS/{city}_{radius}_10.shp"
shapefile = gpd.read_file(path_orig_shapefile)
shapefile["ZONENUMMER"] = list(range(1,len(shapefile)+1))
shapefile.to_file(qgis_path)

In [79]:
# STEP 2: QGIS
# Find centroid of whole area --> Split circle into bands with inner and outer radiuses 0-10, 10-20, 20-40 
# For radius 0-10: Vector --> Research tools --> Select by location: Select features from Brussel_40_10 by comparing to radius_0-10 based on intersect
#                  Right click Brussel_40_10 layer and export the selected features to a new layer brussel_40_10_0-10
# For radius 10-20: Vector --> Research tools --> Select by location: 3 operations:
#                       1) "creating new selection" from brussel_40_10 intersect with radius 10-20
#                       2) "removing from current selection" from brussel_40-10 equal with brussel_40_10_0-10 (to make sure that no zones is included in multiple bands)
#                       3) Export selected features to brussel_40_10_10-20
# For radius 20-40: Vector --> Research tools --> Select by location: 4 operations: 
#                       1) Select all features (such that some small zones from outside the 40 radius are also included)
#                       3) "Remove from current selection" from brussel_40-10 equal to brussel_40_10_10-20
#                       4) "Remove from current selection" from brussel_40-10 equal to brussel_40_10_0-10
#                       5) Export selected features to brussel_40_10_20-40


# TODO: this code is not error proof... 1 zone was included in both 10-20 and 20-40
# I removed it manually. Karls' way of working might be easier to use! 

In [80]:
# STEP 3: clustering

# settings
radius1 = "0-10"
radius2 = "10-20"
radius3 = "20-40"
# groupSize1 = 1 not needed
groupSize2 = 4
groupSize3 = 16

# TODO make code more waterproof (naming of input and outputfile)
def aggregate_zones(city = city, radius = radius2, idealNbZonesPerCluster = groupSize2):
    '''
    original shapefile will change: 'cluster' column containing the cluster to which each zone belongs
    new shapefill will be created: shapefile consisting of aggregated zones

    Note:
    code not waterproof: always looks in folder "QGIS"
    '''
    inputFile = f"QGIS/{city}_40_10_{radius}.shp"
    outputFile = f"QGIS/{city}_40_10_{radius}_knn.shp"
    shapefile = gpd.read_file(inputFile)

    points = np.array(shapefile.centroid.apply(lambda p: [p.x, p.y]).tolist())

    k =  int(np.ceil(len(shapefile) / idealNbZonesPerCluster))  # number of clusters to create
    print(f'Aggregation of {len(shapefile)} zones into {k} clusters.')
    kmeans = KMeans(n_clusters=k, random_state=0).fit(points) 

    shapefile["cluster"] = kmeans.labels_
    aggregated = shapefile.dissolve(by="cluster").reset_index()
    basicStatisticsZoneSizes = shapefile.groupby('cluster').size().describe() # contains
    print(f"Basic statistics of zone aggregation:\n{basicStatisticsZoneSizes}\n")

    # Create a new attribute column to store which elements each cluster contains
    aggregated["elements_1"] = ""
    aggregated["elements_2"] = ""

    # Retrieve which zones are clustered into which cluster
    cluster_groups = shapefile.groupby("cluster")
    cluster_elements = {}
    for cluster_id, group in cluster_groups:
        element_ids = group["ZONENUMMER"].tolist()
        cluster_elements[cluster_id] = element_ids

    # Update the attribute column with the zonenummers for each cluster. Since the attribute columns are limited in size, 
    # a second column is used if more than 40 zones are clustered into one aggregated zone. 
    for index, row in aggregated.iterrows():
        cluster_id = row["cluster"]
        element_ids = cluster_elements.get(cluster_id, [])
        if len(element_ids) > 40:
            element_ids1 = element_ids[0:40]
            element_ids2 = element_ids[40:]
            aggregated.at[index, "elements_1"] = ", ".join(str(e) for e in element_ids1)
            aggregated.at[index, "elements_2"] = ", ".join(str(e) for e in element_ids2)
        else: 
            aggregated.at[index, "elements_1"] = ", ".join(str(e) for e in element_ids)
    
    # Save the new shapefile to a file & update the original file
    shapefile.to_file(inputFile)
    aggregated.to_file(outputFile)

    # Return aggregated shapefile
    return aggregated

agg_shapefile1 = aggregate_zones()
agg_shapefile2 = aggregate_zones(radius = radius3, idealNbZonesPerCluster = groupSize3)

Aggregation of 219 zones into 55 clusters.
Basic statistics of zone aggregation:
count    55.000000
mean      3.981818
std       1.768981
min       1.000000
25%       3.000000
50%       4.000000
75%       5.000000
max       9.000000
dtype: float64

Aggregation of 957 zones into 60 clusters.
Basic statistics of zone aggregation:
count    60.000000
mean     15.950000
std       9.701345
min       6.000000
25%      10.000000
50%      13.000000
75%      19.000000
max      66.000000
dtype: float64



#### Try OD matrix

In [81]:
# STEP 4: Update centroids & OD. First we do some processing to make things easier. 

# After-market fix: add 'cluster' column to unclustered 0-10 radius such that later we have a cluster value for all zones in radius 0-40
shapefile0 = gpd.read_file(f'QGIS/{city}_40_10_{radius1}.shp')

# no k-nearest neighbour is ran on the smallest radius, but just for naming convention some processing is done 
shapefile0['cluster'] = range(shapefile0.shape[0])
shapefile0['elements_1'] = shapefile0['ZONENUMMER']
shapefile0.to_file(f'QGIS/{city}_40_10_{radius1}_knn.shp') 

# Concatenate shapefiles
shapefile1 = gpd.read_file(f'QGIS/{city}_40_10_0-10_knn.shp')
shapefile2 = gpd.read_file(f'QGIS/{city}_40_10_{radius2}_knn.shp')
shapefile3 = gpd.read_file(f'QGIS/{city}_40_10_{radius3}_knn.shp')

# add scalar to cluster numbers before concatenating shapefiles into combined shapefile
shapefile2['cluster'] = shapefile2['cluster'] + (shapefile1.cluster.max()+1)
shapefile3['cluster'] = shapefile3['cluster'] + (shapefile2.cluster.max()+1)

# Save combined shapefile
combined_shapefile = gpd.GeoDataFrame(pd.concat([shapefile1, shapefile2, shapefile3]))
combined_shapefile.to_file(f'QGIS/{city}_40_10_aggr_comb.shp')

In [82]:
# Retrieve original OD matrix, original centroids and aggregated shapefile. 
original_od_matrix = pd.read_excel("STA_prep/od_matrix_data/BRUSSEL_40_9_.xlsx") # TODO define dynamically
combined_shapefile = gpd.read_file(f'QGIS/{city}_40_10_aggr_comb.shp')
original_shapefile = gpd.read_file(f'QGIS/{city}_40_10.shp')
x_centroids = original_shapefile["centroid_x"]
y_centroids = original_shapefile["centroid_y"]

In [83]:
# Initialize new OD
nbAggZones = len(combined_shapefile)
aggregated_od_matrix = np.zeros((nbAggZones, nbAggZones))

# Retrieve combined zones of each cluster 
zones_per_cluster = []
for index, row in combined_shapefile.iterrows():
    zones_cluster_str = row['elements_1']
    zones_cluster = [int(e) for e in zones_cluster_str.split(",")]
    if len(zones_cluster) >= 40:
        zones_cluster_str = row['elements_2']
        more_zones = [int(e) for e in zones_cluster_str.split(",")]
        for zone in more_zones:
            zones_cluster.append(zone)
    zones_per_cluster.append(zones_cluster) 

In [84]:
# Now new OD and centroids. 
# Outer loop: fix centroid for every cluster
# Inner loop: fix OD for every cluster combination

# Outer loop
for index_i, row_i in combined_shapefile.iterrows():
    # Retrieve all the zones belonging to cluster i    
    zones_cluster_i = zones_per_cluster[index_i]

    # Calculate cluster centroid: mean of centroids of all the zones in the cluster
    centr_x = 0
    centr_y = 0
    for zone_i in zones_cluster_i:
        centr_x += x_centroids[zone_i-1]
        centr_y += y_centroids[zone_i-1]
    centr_x = centr_x / len(zones_cluster_i)
    centr_y = centr_y / len(zones_cluster_i)
    combined_shapefile.at[index_i, "centroid_x"] = centr_x
    combined_shapefile.at[index_i, "centroid_y"] = centr_y
    
    # Inner loop
    for index_j, row_j in combined_shapefile.iterrows():
        # Retrieve all the zones belonging to cluster j 
        zones_cluster_j = zones_per_cluster[index_j]
        
        # Calculate OD flows: sum up the individual flows
        aggr_od_flow = 0
        for zone_i in zones_cluster_i:
            for zone_j in zones_cluster_j:
                # VERY IMPORTANT: PANDAS INDEXING SHOULD FIRST SPECIFY COLUMN, AND THEN THE ROW
                # Otherwise we are flipping the direction of the summed flows... 
                aggr_od_flow += original_od_matrix[zone_j-1][zone_i-1]
        aggregated_od_matrix[index_i][index_j] = aggr_od_flow

# Change to pandas dataframe
aggregated_od_matrix = pd.DataFrame(aggregated_od_matrix)


KeyError: 3419

In [89]:
# split shapefile into pie slices:
slice_0 = gpd.read_file(f'QGIS/slices/slice_0.shp')
slice_1 = gpd.read_file(f'QGIS/slices/slice_1.shp')
slice_2 = gpd.read_file(f'QGIS/slices/slice_2.shp')
slice_3 = gpd.read_file(f'QGIS/slices/slice_3.shp')
slice_4 = gpd.read_file(f'QGIS/slices/slice_4.shp')
slice_5 = gpd.read_file(f'QGIS/slices/slice_5.shp')
slice_6 = gpd.read_file(f'QGIS/slices/slice_6.shp')
slices = [slice_0, slice_1, slice_2, slice_3, slice_4, slice_5, slice_6]

# initialize slice list
slice_list = [0 for i in range(len(combined_shapefile))]
i = 1

for slice in slices[1:]:
    clusters = slice['cluster'].values
    for cluster in combined_shapefile['cluster']:
        if cluster in clusters:
            slice_list[cluster] = i
    i += 1

combined_shapefile['slice'] = slice_list


In [90]:
combined_shapefile

Unnamed: 0,ZONENUMMER,STGB,STGB_L,STEDELIJK,PARKEERTAR,GEWEST,GEWEST_L,PROV,PROV_L,ARRO,...,STDGR_L,Zonenumber,centroid_x,centroid_y,Dist,cluster,elements_1,elements_2,geometry,slice
0,716,1,STUDIEGEBIED,6,1,2,BHG,6,BHG,21,...,grootstedelijk gebied,716,154003.682940,168873.301826,4293.652672,0,716,,"POLYGON ((145665.745 169392.406, 145665.358 16...",0
1,717,1,STUDIEGEBIED,6,0.8333,2,BHG,6,BHG,21,...,grootstedelijk gebied,717,146179.436560,170028.259042,3802.563299,1,717,,"POLYGON ((146408.405 170257.532, 146375.017 17...",0
2,718,1,STUDIEGEBIED,6,1.25,2,BHG,6,BHG,21,...,grootstedelijk gebied,718,146121.131852,169364.642155,4202.046934,2,718,,"POLYGON ((146183.583 169069.998, 146046.936 16...",0
3,719,1,STUDIEGEBIED,6,1.5,2,BHG,6,BHG,21,...,grootstedelijk gebied,719,145693.963884,169039.165557,4738.860133,3,719,,"POLYGON ((146046.936 169167.971, 146183.583 16...",0
4,720,1,STUDIEGEBIED,6,2.125,2,BHG,6,BHG,21,...,grootstedelijk gebied,720,145031.524096,168790.656583,5424.832862,4,720,,"POLYGON ((145267.150 168574.333, 145190.189 16...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,1382,1,STUDIEGEBIED,1,0,1,VLAANDEREN,2,VLAAMS-BRABANT,24,...,landelijk gebied,1382,176774.867437,167202.722896,27621.658220,597,"1382, 1383, 1385, 1386, 1390, 1391, 1392, 1393",,"POLYGON ((181196.248 172068.483, 181195.861 17...",4
598,2232,1,STUDIEGEBIED,1,0,1,VLAANDEREN,4,OOST-VLAANDEREN,41,...,landelijk gebied,2232,116838.011182,183674.562000,34788.282977,598,"2232, 2253, 2254, 2255, 2346, 2626, 2627, 2631",,"POLYGON ((110508.054 176226.658, 110463.192 17...",1
599,2233,1,STUDIEGEBIED,3,0.75,1,VLAANDEREN,4,OOST-VLAANDEREN,41,...,kleinstedelijk gebied,2233,125296.084232,169529.279731,24350.586212,599,"2233, 2234, 2235, 2236, 2237, 2238, 2241, 2242...",,"POLYGON ((125536.833 166885.461, 125515.434 16...",6
600,413,1,STUDIEGEBIED,1,0,1,VLAANDEREN,1,ANTWERPEN,12,...,landelijk gebied,413,137820.175903,199031.374063,29638.903874,600,"413, 2315, 2819, 2820, 2822, 2823, 2844, 2845,...",,"POLYGON ((135367.121 198275.193, 135244.784 19...",2


In [91]:
# Save new centroids to shapefile
combined_shapefile.to_file(f'QGIS/{city}_40_10_aggr_comb.shp')
# Save new OD flows to excel
aggregated_od_matrix.to_excel("QGIS/BRUSSEL_40_9_aggregated.xlsx", index=False) # Index is False removes the index column (which matches the format of the original OD matrix)

AttributeError: 'numpy.ndarray' object has no attribute 'to_excel'

In [None]:
# Some checks for validity of new OD-matrix. Centroids were manually inspected

from_0_10 = [953] # Corresponds to cluster 498 
to_0_10 = [705] # Corresponds to cluster 369
from_10_20 = [857, 860, 864] # Corresponds to cluster 537
to_10_20 = [1029, 1030, 1031, 1095] # Corresponds to cluster 520
from_20_40 = [1496, 1572, 1573, 1574, 1602, 1603, 1663] # Corresponds to cluster 623
to_20_40 = [999, 1001, 1006, 1007, 1008, 1009, 1089] # Corresponds to cluster 629

from_cluster_0_10 = 498
from_cluster_10_20 = 537
from_cluster_20_40 = 623
to_cluster_0_10 = 369
to_cluster_10_20 = 520
to_cluster_20_40 = 629

In [None]:
# I checked 9 combinations of the above clusters, they were all correct :)
sum = 0
for i in from_20_40:
    for j in to_20_40:
        sum += original_od_matrix[j-1][i-1]
print(sum)

print(aggregated_od_matrix[to_cluster_20_40][from_cluster_20_40])

**** 

## Elasticity matrix

Moet eigenlijk in andere file, maar de merge gaat lastig worden als ik dit ga coderen waar het moet staan

In [6]:
standard_elasticity = 0.384

standard_elasticity_matrix = [[standard_elasticity for i in range(7)] for j in range(7)]
standard_elasticity_matrix

[[0.384, 0.384, 0.384, 0.384, 0.384, 0.384, 0.384],
 [0.384, 0.384, 0.384, 0.384, 0.384, 0.384, 0.384],
 [0.384, 0.384, 0.384, 0.384, 0.384, 0.384, 0.384],
 [0.384, 0.384, 0.384, 0.384, 0.384, 0.384, 0.384],
 [0.384, 0.384, 0.384, 0.384, 0.384, 0.384, 0.384],
 [0.384, 0.384, 0.384, 0.384, 0.384, 0.384, 0.384],
 [0.384, 0.384, 0.384, 0.384, 0.384, 0.384, 0.384]]

In [11]:
# indexing: standard_elasticity_matrix[row][column]
# e.g. standard_elasticity_matrix[0][4] means row 0, column 4. Interpretation: from slice 0 to slice 4

to_own = 1.5
to_neighbour = 1.4
to_others = 1.3
to_opposite = 1.2
to_Brussels = 1.1

# from 1 to 1
standard_elasticity_matrix[1][1] *= to_own
# from 1 to Brussels 
standard_elasticity_matrix[1][0] *= to_Brussels
# from 1 to neighbouring slices
standard_elasticity_matrix[1][2] *= to_neighbour
standard_elasticity_matrix[1][6] *= to_neighbour
# from 1 to opposite slice
standard_elasticity_matrix[1][4] *= to_opposite
# from 1 to other slices
standard_elasticity_matrix[1][3] *= to_others
standard_elasticity_matrix[1][5] *= to_others

# from 2 to 2
standard_elasticity_matrix[2][2] *= to_own
# from 2 to Brussels 
standard_elasticity_matrix[2][0] *= to_Brussels
# from 2 to neighbouring slices
standard_elasticity_matrix[2][1] *= to_neighbour
standard_elasticity_matrix[2][3] *= to_neighbour
# from 2 to opposite slice
standard_elasticity_matrix[2][5] *= to_opposite
# from 2 to other slices
standard_elasticity_matrix[2][4] *= to_others
standard_elasticity_matrix[2][6] *= to_others

# from 3 to 3
standard_elasticity_matrix[3][3] *= to_own
# from 3 to Brussels 
standard_elasticity_matrix[2][0] *= to_Brussels
# from 2 to neighbouring slices
standard_elasticity_matrix[2][1] *= to_neighbour
standard_elasticity_matrix[2][3] *= to_neighbour
# from 2 to opposite slice
standard_elasticity_matrix[2][5] *= to_opposite
# from 2 to other slices
standard_elasticity_matrix[2][4] *= to_others
standard_elasticity_matrix[2][6] *= to_others

[0.384, 0.384, 0.384, 0.384, 0.384, 0.384]