In [3]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import psycopg2
import ipywidgets as widgets
import contextily as cx
from dotenv import load_dotenv

load_dotenv()

sv_path="../data/raw/straatvinken/SV2020_DataAll_20220211.csv"

def get_straatvinken_data(sv_path = sv_path):
    sv = pd.read_csv(sv_path, encoding = "ISO-8859-1")
    sv.columns = ["ID", "truck", "bus", "van", "car", "bike", "walk", "streetname", "municipality", "lat", "long"]
    sv_gpd = gpd.GeoDataFrame(sv, geometry=gpd.points_from_xy(sv.long, sv.lat), crs=4326)
    print("columns:", sv.columns)
    print("rows:", sv.shape[0])
    return sv_gpd


# Straatvinken source data

The straatvinken source data consists of a CSV file, which was previously already transformed and enriched.
The transformed version contains traffic intensity (road user counts) for a number of road user types - car, pedestrian, bike, truck, publicTransport. The start and end date are indicated with seconds since epoch, and should all be the same hour in 2020 (to be checked). The objectId is an unique identifier linking to the wegenregister.

The enriched version has been linked with the wegenregister and also  contains the objectId but this time renamed to refRoadSegment. It contains some additional data such as municipality, street, latlon, geometry, and some administratory data columns (we will drop those, as they're all practically useless).

In [65]:
sv_gpd = get_straatvinken_data()
sv_gpd.head()

columns: Index(['ID', 'truck', 'bus', 'van', 'car', 'bike', 'walk', 'streetname',
       'municipality', 'lat', 'long', 'geometry'],
      dtype='object')
rows: 3550


Unnamed: 0,ID,truck,bus,van,car,bike,walk,streetname,municipality,lat,long,geometry
0,SV20P1984,0,4,0,31,2,10,Kleemstraat,Halle,50.719737,4.258857,POINT (4.25886 50.71974)
1,SV2020A1676,4,2,25,272,53,51,Nijvelsesteenweg,Halle,50.728366,4.243156,POINT (4.24316 50.72837)
2,SV20P0210,3,0,12,115,14,24,Poststraat,Halle,50.734949,4.23223,POINT (4.23223 50.73495)
3,SV20P1555,4,0,29,205,10,22,Jules Bordetlaan,Ronse,50.739619,3.605092,POINT (3.60509 50.73962)
4,SV20P1839,0,2,21,84,3,9,Gomar Vandewielelaan,Ronse,50.741474,3.594312,POINT (3.59431 50.74147)


# Bebouwingsdichtheid data

Building density data distributed as an open data dataset by Departement omgeving in the GeoTiff format.

In [None]:
bedi_path = "../data/raw/lu_bebdicht_ha_vlaa_2013_v2/lu_bebdicht_ha_vlaa_2013_v2.tif"

Reading and plotting the data

In [None]:
import georasters as gr
data = gr.from_file(bedi_path)

In [None]:
from matplotlib.cm import ScalarMappable
# Plot data
fig, ax = plt.subplots(figsize=(20,7))
sm=ScalarMappable()
sm.set_clim((data.min(), data.max()))

data.plot(ax=ax)
fig.colorbar(sm, ax=ax)
plt.title("Bebouwingsdichtheid Vlaanderen")
plt.savefig("bedi.png")
plt.show()
# Get some stats
print(f"mean {data.mean()} \nmin {data.min()} \nmax {data.max()} \nstd {data.std()}")

# Convert to Pandas DataFrame
#df = data.to_pandas()

Appending the data to the original dataframe (requires a coordinate transformation)

In [None]:
sv_gpd = sv_gpd.to_crs(31370)
sv_gpd["bedi"] = data.map_pixel(sv_gpd.geometry.x, sv_gpd.geometry.y)
sv_gpd = sv_gpd.to_crs(4326)

sv_gpd.plot(column="bedi", markersize=4, figsize=(14, 8))
#sv_gpd

In [None]:
# all in one function
import georasters as gr
from pyproj.crs import CRS
import osgeo
from osgeo.osr import SpatialReference

bedi_path="../data/raw/lu_bebdicht_ha_vlaa_2019_v2/lu_bebdicht_ha_vlaa_2019_v2.tif"
#bedi_path="../data/raw/lu_bebdicht_ha_vlaa_2013_v2/lu_bebdicht_ha_vlaa_2013_v2.tif"

def add_bebouwingsdichtheid(gdf, tif_file=bedi_path):
    raster_data=gr.from_file(tif_file)
    original_crs = gdf.crs
    if osgeo.version_info.major < 3:
        raster_crs = CRS.from_wkt(raster_data.projection.ExportToWkt())
    else:
        raster_crs = CRS.from_wkt(raster_data.projection.ExportToWkt(["FORMAT=WKT2_2018"]))
    gdf = gdf.to_crs(raster_crs)
    gdf["bedi"] = raster_data.map_pixel(gdf.geometry.x, gdf.geometry.y)
    #restore original geometry
    gdf = gdf.to_crs(original_crs)
    return gdf

sv_gpd = get_straatvinken_data()
print("adding bedi")
sv_gpd = add_bebouwingsdichtheid(sv_gpd, bedi_path)
sv_gpd.head()

### Future options

* add building density of immediate vicinity (3x3 raster, 6x6 raster)

# Population density data

Statbel provides population density data (number of inhabitants) per statistical sector. These statistical sectors have a code, and need to be mapped to geospatial features before we can add the data.

In [None]:
pode_path = "../data/raw/population_density_statbel/OPENDATA_SECTOREN_2021.xlsx"
pode_df = pd.read_excel(pode_path)
pode_df

### getting the statistical sectors

You can download the statistical sectors from Digitaal Vlaanderen by registering to their portal and placing an "order" for this data. It will be available for free but you have to go throught the process.

In the end you will get a (29MB) zipfile with a shapefile in it:

In [None]:
statsect_path = "../data/raw/sh_statbel_statistical_sectors_20210101.shp/sh_statbel_statistical_sectors_20210101.shp"
statsect_gpd = gpd.read_file(statsect_path)
statsect_gpd.plot()
print(statsect_gpd.crs)
statsect_gpd = statsect_gpd.to_crs(sv_gpd.crs)
statsect_gpd.head()

In [None]:
pode_merged = pd.merge(statsect_gpd, pode_df, how="left", left_on="CS01012021", right_on="CD_SECTOR")
pode_merged = pode_merged[["Shape_Leng", "Shape_Area", "TOTAL", "geometry"]]
pode_merged.columns = ['ss_lengte', "ss_oppervl", "pode", "geometry"]
pode_merged

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

pode_merged.plot(column='pode', scheme='QUANTILES', k=10, ax=ax,\
             cmap='BuPu', legend=True,
             legend_kwds={'loc': 'center left', 'bbox_to_anchor':(1,0.5)})
plt.title("Bevolkingsdichtheid")
plt.savefig("pode.png", dpi=300)
plt.show()


In [None]:
def drop_col_if_exists(df, col = ("index_left", "index_right")):
    if type(col) == str:
        cols=[col]
    else:
        cols=col
    for col in cols:
        if col in df.columns:
            df = df.drop(columns=col)
    return df

def add_populationdensity(
    gdf, 
    pode_path = "../data/raw/population_density_statbel/OPENDATA_SECTOREN_2021.xlsx", 
    statsect_path="../data/raw/sh_statbel_statistical_sectors_20210101.shp/sh_statbel_statistical_sectors_20210101.shp"
    
):
    pode_df = pd.read_excel(pode_path)
    statsect_gpd = gpd.read_file(statsect_path)
    statsect_gpd = statsect_gpd.to_crs(gdf.crs)
    pode_merged = pd.merge(statsect_gpd, pode_df, how="left", left_on="CS01012021", right_on="CD_SECTOR")
    pode_merged = pode_merged[["Shape_Leng", "Shape_Area", "TOTAL", "geometry"]]
    pode_merged.columns = ['ss_lengte', "ss_oppervl", "pode", "geometry"]
    gdf = drop_col_if_exists(gdf, col=("index_left", "index_right"))
    gdf = gdf.sjoin(pode_merged, how="left")
    gdf = drop_col_if_exists(gdf, col="index_right")
    return gdf

#sv_gpd = get_straatvinken_data()
sv_gpd = add_populationdensity(sv_gpd)
sv_gpd

In [None]:
sv_gpd.pode.hist(bins=25)

# Number of cars

Per statistical sector, released by StatBel

In [None]:
ncars_path = "../data/raw/number_of_cars_statbel/TF_CAR_HH_SECTOR.xlsx"
pd.read_excel(ncars_path).head()

In [None]:
def add_numberofcars(
    gdf, 
    ncar_path = "../data/raw/number_of_cars_statbel/TF_CAR_HH_SECTOR.xlsx", 
    statsect_path="../data/raw/sh_statbel_statistical_sectors_20210101.shp/sh_statbel_statistical_sectors_20210101.shp"
):
    """
    add_numberofcars:
        add the number of cars (ncars), number of households (nhh) and cars per household (ncars_hh)
        to a geopandas dataset, based on the statistical sector data from Flanders
    """
    ncars_df = pd.read_excel(ncar_path)
    statsect_gpd = gpd.read_file(statsect_path)
    statsect_gpd = statsect_gpd.to_crs(gdf.crs)
    ncars_merged = pd.merge(statsect_gpd, ncars_df, how="left", left_on="CS01012021", right_on="CD_STAT_SECTOR")
    ncars_merged = ncars_merged[["MS_NUM_HH", "MS_NUM_CAR", "geometry"]]
    ncars_merged.columns = ['nhh', "ncars", "geometry"]
    ncars_merged["ncars_hh"] = ncars_merged["ncars"] / ncars_merged["nhh"]
    gdf = drop_col_if_exists(gdf, col=("index_left", "index_right"))
    gdf = gdf.sjoin(ncars_merged, how="left")
    gdf = drop_col_if_exists(gdf, col="index_right")
    return gdf

#sv_gpd = get_straatvinken_data()
sv_gpd = add_numberofcars(sv_gpd)
sv_gpd.head()

In [None]:
ncar_path = "../data/raw/number_of_cars_statbel/TF_CAR_HH_SECTOR.xlsx"
statsect_path="../data/raw/sh_statbel_statistical_sectors_20210101.shp/sh_statbel_statistical_sectors_20210101.shp"

ncars_df = pd.read_excel(ncar_path)
statsect_gpd = gpd.read_file(statsect_path)
ncars_merged = pd.merge(statsect_gpd, ncars_df, how="left", left_on="CS01012021", right_on="CD_STAT_SECTOR")
ncars_merged = ncars_merged[["MS_NUM_HH", "MS_NUM_CAR", "geometry"]]
ncars_merged.columns = ['nhh', "ncars", "geometry"]
ncars_merged["ncars_hh"] = ncars_merged["ncars"] / ncars_merged["nhh"]
ncars_merged.to_crs(4326)


In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

ncars_merged.plot(column='ncars_hh', scheme='QUANTILES', k=5, ax=ax,\
             cmap='Blues', legend=True,
             legend_kwds={'loc': 'center left', 'bbox_to_anchor':(1,0.5)})
plt.title("Aantal wagens per huishouden")
plt.savefig("ncars.png", dpi=300)
plt.show()

# Wegenregister data

The wegenregister data contains lots of interesting information on Flemish roads of all kinds. It is available on Geopunt: https://www.geopunt.be/catalogus/datasetfolder/74b2534d-a8e6-4e4b-99d7-987b4829ff8d

It contains shapefiles and dBase IV and Layer files. of interest are the shapefiles:
* Wegknoop.shp
* Wegsegment.shp
and referentiepunten data (??)

The aim is to append the following data for each coördinate:
* number of lanes per direction
* width of road
* morfologische wegklasse (wegsegment) (DONE)
    - autosnelweg
    - weg met gescheiden rijbanen die geen autosnelweg is
    - weg bestaande uit 1 rijbaan
    - rotonde
    - speciale verkeerssituatie (rotonde achtig maar toch niet rotonde)
    - verkeersplein
    - op- of afrit, behorende tot een niet-gelijkgrondse verbinding
    - op- of afrit, behorende tot een gelijkgrondse verbinding
    - parallelweg
    - ventweg
    - in- of uitrit parking
    - in- of uitrit van dienst
    - voetgangerszone
    - wandel of fietsweg
    - tramweg
    - dienstweg
    - aardeweg
    - veer
    - niet gekend
* wegcategorie (wegsegment) (DONE)
    - hoofdweg (H)
    - primaire weg I (PI)
    - primaire weg II (PII)
    - primaire weg II type 1 (PII-1)
    - primaire weg II type 2 (PII-2)
    - primaire weg II type 3 (PII-3)
    - primaire weg II type 4 (PII-4)
    - secundaire weg (S)
    - secundaire weg type 1 (SI)
    - secundaire weg type 2 (SI)
    - secundaire weg type 3 (SI)
    - secundaire weg type 4 (SI)
    - locale weg (L)
* toegangsbeperking (websegment)
* ondergrondse kruising (type)
* wegverharding (type)
    - weg met vaste verharding
    - weg met losse verharding
    - niet gekend
    - nvt
* speed limit


Nice to have:
* number of national roads within 1km
* number of primary roads within 1km
* number of secundary roads within 1km
* number of local roads within 1km


In [None]:
wrsegm_path = "../data/raw/Wegenregister_SHAPE_20211216/Shapefile/Wegsegment.shp"
wrsegm_gdf = gpd.read_file(wrsegm_path)
wrsegm_gdf.head()

In [None]:
wrsegm_gdf.columns

In [None]:
wrsegm_gdf["MORF"]

We couple the AWV data to the nearest road segment. To achieve this, we must change both datasets (SV and AWV wegenregister) to a projection where "distance" makes sense. As in: distance in meters. The World Mercator (EGSG:3395) projection works for this. 

In [None]:
wrsegm_mini_gdf = wrsegm_gdf[["MORF", "WEGCAT", "geometry"]]
wrsegm_mini_gdf.columns = ["morf", "wegcat", "geometry"]
wrsegm_mini_gdf

In [None]:
wrsegm_mini_gdf.head()
offs = 7000
l = 200000.0
u = 200000.0
xmin, ymin, xmax, ymax = l, u ,l+offs, u + offs
frag_gdf = wrsegm_mini_gdf.cx[xmin:xmax, ymin:ymax]
frag_gdf["morf"] = frag_gdf["morf"].astype(str)
print(frag_gdf.dtypes)
frag_gdf.plot("morf", figsize=(10,10), legend=True)
plt.title("Weg per morfologie")
plt.savefig("morf_vic.png", dpi=300)
plt.show()

In [None]:
sv_gpd_wr = sv_gpd.to_crs(3395).sjoin_nearest(wrsegm_mini_gdf.to_crs(3395), how="left", max_distance=10, distance_col="wegsegment_distance")
sv_gpd_wr = sv_gpd_wr.to_crs(4326)
sv_gpd_wr[["lat", "long", "wegsegment_distance", 'morf', 'wegcat']]



In [None]:
print(sv_gpd_wr.shape)
print(sv_gpd_wr.wegsegment_distance.isna().value_counts())

In [None]:
fig, ax=plt.subplots(figsize=(10,5))
(sv_gpd_wr.morf.value_counts()/sv_gpd_wr.shape[0]*100).plot.barh(ax=ax)
plt.title("Measurement locations per morfological category")
ax.set_xlabel("%")
ax.set_xlim((0, 100))
plt.savefig("morf.png", dpi=100)
plt.show()

In [None]:
fig, ax=plt.subplots(figsize=(10,5))
(sv_gpd_wr.wegcat.value_counts()/sv_gpd_wr.shape[0]*100).plot.barh(ax=ax)
plt.title("Measurement locations per road category")
ax.set_xlabel("%")
ax.set_xlim((0, 100))
plt.savefig("wegcat.png", dpi=100)
plt.show()

#### Adding # of 'feature' within distance

We want to add the number of occurence of road types within a given distance
    
Using: https://gis.stackexchange.com/questions/222315/finding-nearest-point-in-other-geodataframe-using-geopandas

We perform a little test, changing it to our needs:

In [None]:
import itertools
from operator import itemgetter

import geopandas as gpd
import numpy as np
import pandas as pd

from scipy.spatial import cKDTree
from shapely.geometry import Point, LineString

gdfA = gpd.GeoDataFrame([['John', 1, Point(1, 1)],
                         ['Smith', 1, Point(2, 2)],
                         ['Soap', 1, Point(0, 2)]],
                        columns=['Name', 'ID', 'geometry'])
gdfB = gpd.GeoDataFrame([['Work', LineString([Point(100, 0), Point(100, 1)])],
                         ['Shops', LineString([Point(101, 0), Point(101, 1), Point(102, 3)])],
                         ['Home',  LineString([Point(101, 0), Point(102, 1)])]],
                        columns=['Place', 'geometry'])

feature_col = "Place"
max_distance = 100
max_neighbours = 1000

zero_features = {key: 0 for key in set(gdfB[feature_col])}

A = np.concatenate(
    [np.array(geom.coords) for geom in gdfA.geometry.to_list()])
B = [np.array(geom.coords) for geom in gdfB.geometry.to_list()]
B_ix = tuple(itertools.chain.from_iterable(
    [itertools.repeat(i, x) for i, x in enumerate(list(map(len, B)))]))
B = np.concatenate(B)
max_neighbours = min(max_neighbours, B.shape[0])
ckd_tree = cKDTree(B)
dist, idx = ckd_tree.query(A, k=max_neighbours)
feature_counts = []
num_neighbours = []
for dist, idx in zip(dist, idx):
    idx = itemgetter(*idx)(B_ix)
    closest_gdf = pd.DataFrame({"idx": list(idx), feature_col: gdfB.iloc[list(idx)][feature_col].values, "dist": dist})
    closest_gdf = closest_gdf[eval(f"dist < {max_distance}")]
    cleanidx = list( dict.fromkeys(closest_gdf["idx"].values))
    feature_count = zero_features.copy()
    feature_count.update(dict(gdfB.iloc[cleanidx][feature_col].value_counts()))
    #print(feature_count)
    feature_counts.append(feature_count)
    num_neighbours.append(len(cleanidx))

features_df = pd.DataFrame(feature_counts)
features_df.columns = [f"{feature_col}_{col}" for col in features_df.columns]
gdf = pd.concat(
    [gdfA, features_df, pd.Series(num_neighbours, name=f"{feature_col}_num_neighbors")], axis=1)
gdf

In [None]:
from scipy.spatial import cKDTree
import time
from typing import Tuple

def get_featurecount_within_distance(
    src_gdf: gpd.GeoDataFrame, 
    featuredb_gdf: gpd.GeoDataFrame, 
    feature_col: str,
    max_distance: float = 2000,
    max_neighbours: int = 1000, 
    result_type: str = "feature_columns",
    column_suffix: str = "",
    total_buckets_prefix: Tuple[str, ...] = (),
    feature_filters: Tuple[str, ...] = ("-8"," -9"),
    verbose=False
):
    """
    Returns the number of a given 'feature column' occurence within a given distance

            Parameters:
                    src_gdf (GeoDataFrame): dataframe containing the coordinates 
                        to be enriched with the featurecounts
                    featuredb_gdf (GeoDataFrame): feature dataframe with linestrings for geometry objects
                    feature_col (str): the column of which occurence counts will be made
                    result_type (str): 
                        feature_columns: explode the categories into separate columns 
                            using the "feature_col"_"category" naming convention
                        map: add result as a dictionary object to a single column with the name "feature_col"

            Returns:
                    enriched_gdf (GeoDataFrame): the original dataframe with feature count data added
    
    """
    
    if result_type not in ["feature_columns", "map"]:
        raise ValueError(f"Invalid result_type parameter value '{result_type}'")
        
    start = time.time()
    src_gdf=src_gdf.to_crs(3395)
    featuredb_gdf=featuredb_gdf.to_crs(3395)
    if verbose:
        print(f"crs adaptation performed in {time.time() - start:.2f}s")
    
    if len(feature_filters) > 0:
        start=time.time()
        featuredb_gdf = featuredb_gdf[(~featuredb_gdf[feature_col].isin(feature_filters))]
        if verbose:
            print(f"feature filter performed in {time.time() - start:.2f}s")
    
    zero_features = {key: 0 for key in set(featuredb_gdf[feature_col])}
    
    start = time.time()
    A = np.concatenate(
        [np.array(geom.coords) for geom in src_gdf.geometry.to_list()])
    B = [np.array(geom.coords) for geom in featuredb_gdf.geometry.to_list()]
    B_ix = tuple(itertools.chain.from_iterable(
        [itertools.repeat(i, x) for i, x in enumerate(list(map(len, B)))]))
    B = np.concatenate(B)
    if verbose:
        print(f"geometries created in {time.time() - start:.2f}s")
    
    max_neighbours = min(max_neighbours, B.shape[0])
    
    start = time.time()
    ckd_tree = cKDTree(B)
    if verbose:
        print(f"ckd tree created in {time.time() - start:.2f}s")
    
    start = time.time()
    dist, idx = ckd_tree.query(A, k=max_neighbours)
    if verbose:
        print(f"query performed in {time.time() - start:.2f}s")
    
    start = time.time()
    feature_counts = []
    num_neighbours = []
    for dist, idx in zip(dist, idx):
        idx = itemgetter(*idx)(B_ix)
        closest_gdf = pd.DataFrame(
            {
                "idx": list(idx), 
                feature_col: featuredb_gdf.iloc[list(idx)][feature_col].values, 
                "dist": dist
            }
        )
        closest_gdf = closest_gdf[eval(f"dist < {max_distance}")]
        cleanidx = list(dict.fromkeys(closest_gdf["idx"].values))
        feature_count = zero_features.copy()
        feature_count.update(dict(featuredb_gdf.iloc[cleanidx][feature_col].value_counts()))
        
        #add totals
        for total_bucket in total_buckets_prefix:
            feature_count[f"{total_bucket}tot"] = sum([feature_count[key] for key in feature_count.keys() if key.startswith(total_bucket)])
        
        feature_counts.append(feature_count)
        num_neighbours.append(len(cleanidx))
    
    if verbose:
        print(f"data adaptation performed in {time.time() - start:.2f}s")
    if result_type == "feature_columns":
        features_df = pd.DataFrame(feature_counts)
        features_df.columns = [f"{feature_col}_{col}{column_suffix}" for col in features_df.columns]
        gdf = pd.concat(
            [src_gdf, features_df, pd.Series(num_neighbours, name=f"{feature_col}{column_suffix}_num_neighbors")], axis=1)
        return gdf.to_crs(4326)
    else:
        gdf = pd.concat(
            [src_gdf, 
             pd.Series(feature_counts, name=f"{feature_col}{column_suffix}_vicinity_counts"),
             pd.Series(num_neighbours, name=f"{feature_col}{column_suffix}_num_neighbors")], axis=1)
        return gdf.to_crs(4326)


In [None]:
#sv_gpd = get_straatvinken_data()
sv_gpd = get_featurecount_within_distance(
    src_gdf = sv_gpd,
    featuredb_gdf = wrsegm_mini_gdf,
    feature_col = "wegcat",
    max_distance = 1000,
    result_type = "feature_columns",
    column_suffix = "_1000",
    #total_buckets_prefix = ("H", "P", "S", "L"),
    feature_filters = ("-8", "-9")
)
sv_gpd

In [None]:
sv_gpd.loc[8, sorted([col for col in sv_gpd.columns if col.startswith("wegcat")])].T

In [None]:
sv_gpd[[col for col in sv_gpd.columns if col.startswith("wegcat") and not col.endswith("bors")]].max()

In [None]:
col_labels = ["H", "L1", "L2", "L3", "S", "S1", "S2", "S3"]#, "PI", "PII"]
col_names = [f"wegcat_{col}_1000" for col in col_labels]
fig, ax = plt.subplots(figsize=(15, 10))
sns.histplot(sv_gpd[col_names], bins=25, fill=True, alpha=0.3, multiple="stack")
plt.title("Aantal wegen per wegcategorie in straal van 1000m")
plt.savefig("wegcat_vicinity.png", dpi=300)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(15, 10))
#sv_gpd.columns if col.startswith("wegcat") and not col.endswith("bors")
for col in col_names:
    sv_gpd[col].hist(alpha=0.5, bins=25)
#plt.hist([sv_gpd[col] for col in col_names],
#          bins=50, range=(0,267), stacked=True) #, color = ['r','g'])
ax.set_yscale('log')
#ax.set_xscale('log')
ax.set_xlabel("number of neighbors")
ax.set_xlabel("count")

ax.legend(col_labels)
plt.title("Stacked histogram of road categories in vicinity of 1000 meters")
plt.show()

In [None]:
sv_gpd.loc[8, sorted([col for col in sv_gpd.columns if col.startswith("morf")])].T

In [None]:
font = {'family' : 'normal',
        'weight' : 'normal',
        'size'   : 12}

matplotlib.rc('font', **font)

sv_gpd.wegcat_Ltot_1000.hist()

In [None]:
import time
#sv_gpd = get_straatvinken_data()
for max_distance in [50, 250, 1000]:
    start=time.time()
    sv_gpd = get_featurecount_within_distance(
        src_gdf=sv_gpd,
        featuredb_gdf=wrsegm_mini_gdf,
        feature_col="wegcat",
        max_distance=max_distance,
        result_type="feature_columns",
        column_suffix=f"_{max_distance}",
        total_buckets_prefix = ("H", "P", "S", "L"),
        feature_filters = ("-8", "-9"),
        verbose=False
    )
    print(f" - wegcat features within {max_distance} done in {time.time()-start:.2f}s")
    
    start=time.time()
    sv_gpd = get_featurecount_within_distance(
        src_gdf=sv_gpd,
        featuredb_gdf=wrsegm_mini_gdf,
        feature_col="morf",
        max_distance=max_distance,
        result_type="feature_columns",
        column_suffix=f"_{max_distance}",
        feature_filters = ("-8", "-9"),
        verbose=False
    )
    print(f" - morf features within {max_distance} done in {time.time()-start:.2f}s")
    
#sv_gpd = add_bebouwingsdichtheid(sv_gpd)
#sv_gpd = add_numberofcars(sv_gpd)
#sv_gpd = add_populationdensity(sv_gpd)

In [None]:
# maybe the parameter max_neighbours needs tweaking?
sv_gpd["wegcat_1000_num_neighbors"].hist()

In [None]:
morf_codetabel = wrsegm_gdf[["MORF", "LBLMORF"]].drop_duplicates().reset_index(drop=True)
morf_codetabel.columns = ["morf", "description"]
morf_codetabel = morf_codetabel.sort_values(by=["morf"]).reset_index(drop=True)
wegcat_codetabel = wrsegm_gdf[["WEGCAT", "LBLWEGCAT"]].drop_duplicates().reset_index(drop=True)
wegcat_codetabel.columns = ["wegcat", "description"]
wegcat_codetabel = wegcat_codetabel.sort_values(by=["wegcat"]).reset_index(drop=True)
wegcat_codetabel

In [None]:
morf_codetabel

## Correlation plots

In [None]:
import matplotlib
import seaborn as sns

font = {'family' : 'normal',
        'weight' : 'normal',
        'size'   : 22}

matplotlib.rc('font', **font)

cols=["bike", "walk", "bus","car", "truck", "van", "bedi", "pode", "ncars"]
corr = sv_gpd[cols].corr()
#
# Set up the matplotlib plot configuration
#
f, ax = plt.subplots(figsize=(15, 15))
#
# Generate a mask for upper traingle
#
mask = np.triu(np.ones_like(corr, dtype=bool))
#
# Configure a custom diverging colormap
#
cmap = sns.color_palette("Spectral", as_cmap=True)
#
# Draw the heatmap
#
sns.heatmap(corr, annot=True, mask = mask, cmap=cmap, center=0, vmin=-1, vmax=1)

In [None]:
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

htype = widgets.SelectMultiple(
    value=["L"],
    options=["L", "S", "P", "H"]
)

dist_options=[50, 250, 1000]
print(dist_options)
dist = widgets.SelectMultiple(
    value=[1000],
    options=dist_options
)


def show_corr_htype(h_type, max_distance):
    general_cols=["bike", "walk", "bus","car", "truck", "van"]
    wegcat_cols = general_cols
    for _h_type in h_type:
        wegcat_cols.extend(
            [col for col in sv_gpd.columns 
             if col.startswith(f"wegcat_{_h_type}") 
             and int(col.split("_")[-1]) in max_distance])
    corr = sv_gpd[wegcat_cols].corr()

    fig, ax = plt.subplots(figsize=(15, 15))

    mask = np.triu(np.ones_like(corr, dtype=bool))

    cmap = sns.color_palette("Spectral", as_cmap=True)
    sns.heatmap(corr, annot=True, mask = mask, cmap=cmap, center=0, vmin=-1, vmax=1)
    plt.title(f"Traffic intensity - correlation with roads of type {','.join(h_type)}")
    plt.show()
    
display(wegcat_codetabel)
    
interact(show_corr_htype, h_type=htype, max_distance=dist)

#### Findings for wegcat

There is a non-trivial correlation between:
* wegcat S2 max dist 100 and trucks of 0.39
* wegwat L3 max dist 1000 and walk of 0.48
* wegcat PII max dist 100 and bus of 0.23
* wegcat L1 max dist 100 and car (0.36) and van (0.38)


In [None]:
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

morf_options = list(morf_codetabel.morf.values)

morf = widgets.SelectMultiple(
    value=morf_options[:5],
    options=morf_options
)

dist_options=[50, 250, 1000]
dist = widgets.SelectMultiple(
    value=[1000],
    options=dist_options
)


def show_corr_morf(morf, max_distance):
    general_cols=["bike", "walk", "bus","car", "truck", "van"]
    morf_cols = general_cols
    if len(morf) > 5:
        print("More than 5 morfology smurfs selected!")
    for _morf in morf:
        morf_cols.extend(
            [col for col in sv_gpd.columns 
             if col.startswith(f"morf_{_morf}") 
             and int(col.split("_")[-1]) in max_distance])
    corr = sv_gpd[morf_cols[:len(general_cols) + 5]].corr()

    fig, ax = plt.subplots(figsize=(15, 15))

    mask = np.triu(np.ones_like(corr, dtype=bool))

    cmap = sns.color_palette("Spectral", as_cmap=True)
    sns.heatmap(corr, annot=True, mask = mask, cmap=cmap, center=0, vmin=-1, vmax=1)
    plt.title(f"Traffic intensity - correlation with morfology types")
    plt.show()
    
display(morf_codetabel)
    
interact(show_corr_morf, morf=morf, max_distance=dist)

#### Findings from morfology data

Non-trivial correlation between 
* morf 103 (weg bestaande uit één rijbaan) within max distance 500 and walk of 0.4
* morf 102 (weg met gescheiden rijbaan geen autosnelweg) within max dist 50 and bike (.24), bus (0.31), car (0.29), van (0.22)
* morf 113 (voetgangerszone) within max distance 1000 or 2000 with walk 0.27
* morf 116 (tramweg niet toegankelijk voor andere voertuigen) within max dist 1000 or 2000 with walk .26
* morf 116 within max dist 50 with bike 0.26, bus 0.25, car 0.21
* morf 116 within max dist 150 with bike 0.3
* morf 125 (aardeweg) within distance 2000 with walk (-0.19)

### Reading the DBF files of AWV's WR

Using simpledbf

In [None]:
import time

start=time.time()

wrsegm_path = "../data/raw/Wegenregister_SHAPE_20211216/Shapefile/Wegsegment.shp"
wrsegm_gdf = gpd.read_file(wrsegm_path)
print(f" - wr segments loaded in {time.time() - start:.2f}s")

In [None]:
from simpledbf import Dbf5
import time

start=time.time()
dbf_basepath = "../data/raw/Wegenregister_SHAPE_20211216/Shapefile/"
# route is E-weg
atteuropweg_df = Dbf5(os.path.join(dbf_basepath, "AttEuropweg.dbf")).to_dataframe()
atteuropweg_df["is_eurw"] = 1
atteuropweg_df = atteuropweg_df[["WS_OIDN", "is_eurw"]]
atteuropweg_df
# route is N-weg
attnationweg_df = Dbf5(os.path.join(dbf_basepath, "AttNationweg.dbf")).to_dataframe()
attnationweg_df["is_natw"] = 1
attnationweg_df = attnationweg_df[["WS_OIDN", "is_natw"]]
attnationweg_df
# aantal rijstroken
attrijstroken_df = Dbf5(os.path.join(dbf_basepath, "AttRijstroken.dbf")).to_dataframe()
attrijstroken_df = attrijstroken_df[(attrijstroken_df["AANTAL"] > 0)].groupby("WS_OIDN").agg({"AANTAL": sum}).reset_index()
attrijstroken_df.columns=["WS_OIDN", "nrijstr"]
attrijstroken_df
# weg verharding
attwegverhard_df = Dbf5(os.path.join(dbf_basepath, "AttWegverharding.dbf")).to_dataframe()
attwegverhard_df = attwegverhard_df[(attwegverhard_df.TYPE > 0)]
attwegverhard_df["LEN"] = np.abs(attwegverhard_df["TOTPOS"] - attwegverhard_df["VANPOS"])
attwegverhard_df = attwegverhard_df.sort_values(by=["WS_OIDN", "LEN"], ascending=[True, False])
attwegverhard_df = attwegverhard_df.groupby("WS_OIDN").nth(0).reset_index()[["WS_OIDN", "TYPE", "LBLTYPE"]]
attwegverhard_df["TYPE"] = attwegverhard_df["TYPE"].astype(str)
attwegverhard_df["LBLTYPE"] = attwegverhard_df["LBLTYPE"].astype(str)
attwegverhard_df.columns = ["WS_OIDN", "verh", "verhlbl"]
attwegverhard_df
# wegbreedte
attwegbreedte_df = Dbf5(os.path.join(dbf_basepath, "AttWegbreedte.dbf")).to_dataframe()
attwegbreedte_df = attwegbreedte_df[
    (attwegbreedte_df["BREEDTE"] > 0)
].groupby("WS_OIDN").agg({"BREEDTE": np.mean}).reset_index()
attwegbreedte_df.columns = ["WS_OIDN", "wb"]
attwegbreedte_df
# genummerde weg ... not usefull?
#attgenumweg_df = Dbf5(os.path.join(dbf_basepath, "AttGenumweg.dbf")).to_dataframe()
# ongelijkgrondse kruising
#attrltogkruising_df = Dbf5(os.path.join(dbf_basepath, "RltOgkruising.dbf")).to_dataframe()
print(f" - wr attributes loaded in {time.time() - start:.2f}s")

In [None]:
# Join to wrsegm_df
start = time.time()
wrsegm_gdf_att=wrsegm_gdf.copy()
for attdf in [atteuropweg_df, attnationweg_df, attrijstroken_df, attwegverhard_df, attwegbreedte_df]:
    wrsegm_gdf_att = pd.merge(wrsegm_gdf_att, attdf, how="left", on="WS_OIDN")
wrsegm_gdf_att[["is_eurw", "is_natw", "wb", "nrijstr"]] = wrsegm_gdf_att[["is_eurw", "is_natw", "wb", "nrijstr"]].fillna(0)
print(f" - wr attributes joined to segments in {time.time() - start:.2f}s")

In [None]:
wrsegm_gdf_att.loc[0].T

# Population grid data

In [None]:
popu_path = "../data/raw/TF_POPULATION_GRID_3035_20200101.shp/TF_POPULATION_GRID_3035_20200101.shp"
popu_gpd = gpd.read_file(popu_path)
print(popu_gpd.columns)
#popu_gpd.plot()
popu_gpd.head()

In [None]:
popu_gpd

fig, ax = plt.subplots(figsize=(12, 8))

popu_gpd.plot(column='ms_pop', scheme='QUANTILES', k=8, ax=ax,\
             cmap='Oranges', legend=True,
             legend_kwds={'loc': 'center left', 'bbox_to_anchor':(1,0.5)})
plt.title("Bevolking per km^2")
plt.savefig("mspop.png", dpi=300)
plt.show()

In [None]:
popu_gpd = popu_gpd.to_crs(sv_gpd.crs)
popu_gpd = popu_gpd[["ms_pop", "geometry"]]
sv_gpd = sv_gpd.sjoin(popu_gpd, how="left")
sv_gpd

In [None]:
def add_pop_sm(
    gdf, 
    popu_path = "../data/raw/TF_POPULATION_GRID_3035_20200101.shp/TF_POPULATION_GRID_3035_20200101.shp", 
):
    popu_df = gpd.read_file(pode_path)
    popu_gpd = popu_gpd.to_crs(sv_gpd.crs)
    popu_gpd = popu_gpd[["ms_pop", "geometry"]]
    gdf = gdf.sjoin(popu_gpd, how="left")
    gdf = drop_col_if_exists(gdf, col="index_right")
    return gdf

sv_gpd = add_pop_sm(sv_gpd, popu_path)

# Traffic accidents

Some explanation of the variables is in a separate XLSX:

In [None]:
acc_expl_path = "../data/raw/traffic_accidents_statbel/TF_ACCIDENTS.xlsx"
pd.read_excel(acc_expl_path)

In [None]:
acc_path = "../data/raw/traffic_accidents_statbel/TF_ACCIDENTS_2020.xlsx"
acc_df = pd.read_excel(acc_path)
acc_df = acc_df.rename(columns={
    "MS_ACCT": "acc",
    "MS_ACCT_WITH_DEAD": "acc_death",
    "MS_ACCT_WITH_DEAD_30_DAYS": "acc_death30",
    "MS_ACCT_WITH_MORY_INJ": "acc_mort",
    "MS_ACCT_WITH_SERLY_INJ": "acc_ser",
    "MS_ACCT_WITH_SLY_INJ": "acc_sly"
})
counter_cols = ["acc", "acc_death", "acc_death30", "acc_mort", "acc_ser", "acc_sly"]
acc_df["CD_MUNTY_REFNIS"] = acc_df["CD_MUNTY_REFNIS"].astype(str)
print(acc_df.columns)
acc_df.loc[0].T
counter_cols.append("CD_MUNTY_REFNIS")
acc_df = acc_df[counter_cols].groupby("CD_MUNTY_REFNIS").agg("sum").reset_index()

In [None]:
True in list((acc_df.groupby("CD_MUNTY_REFNIS").size()>1).value_counts().keys())

In [None]:
import warnings

def add_attributes(
    df: pd.DataFrame,
    attdf: pd.DataFrame,
    on,
    how: str,
    filtr: str=None,
    retain_cols: Tuple[str, ...]=None
):
    if how not in ["left", "right", "inner", "outer"]:
        raise ValueError("'how' parameter doesn't have one of the accepted values: left, right, outer, inner")
    
    if filtr is not None:
        attdf = attdf[attdf.eval(filtr)]
    if retain_cols is not None:
        retc = list(retain_cols)
        try:
            attdf[retc]
        except KeyError as ke:
            raise ValueError(f"one of required columns does not exist: {ke}")
    if type(on) == str:
        if not(on in df.columns and on in attdf.columns):
            raise ValueError("'on' column doesn't exist in one of the provided dataframes")    
        if retain_cols is not None:
            retc = list(retain_cols)
            retc.append(on)
            attdf = attdf[retc]
        # check attdf by grouping on "on"
        if True in list((attdf.groupby(on).size()>1).value_counts().keys()):
            warnings.warn("attribute dataframe has multiple rows when grouping on the 'on' column")
        df = pd.merge(df, attdf, how=how, on=on)
    elif type(on) in [tuple, list]:
        if len(on) != 2:
            raise ValueError("'on' tuple/list doesn't have exactly 2 items")
        left_on, right_on = on
        # check attdf by grouping on "right_on"
        if True in list((attdf.groupby(right_on).size()>1).value_counts().keys()):
            warnings.warn("attribute dataframe has multiple rows when grouping on the rightmost 'on' column")
        if retain_cols is not None:
            retc = list(retain_cols)
            retc.append(right_on)
            attdf = attdf[retc]
        df = pd.merge(df, attdf, how=how, left_on=left_on, right_on=right_on)
    else:
        raise ValueError(f"'on' has unacceptable type {type(on)} (should be in tuple, list or str)")
    return df

add_attributes(
    statsect_gpd, 
    acc_df, 
    how="left", 
    on=["CNIS5_2021", "CD_MUNTY_REFNIS"], 
    retain_cols=["acc", "acc_death", "acc_death30", "acc_mort", "acc_ser", "acc_sly"]
)

In [None]:
def add_trafficaccidents(
    gdf, 
    acc_path = "../data/raw/traffic_accidents_statbel/TF_ACCIDENTS_2020.xlsx",
    statsect_path="../data/raw/sh_statbel_statistical_sectors_20210101.shp/sh_statbel_statistical_sectors_20210101.shp"
):
    """
    add_trafficaccidents:
        add the number of traffic accidents to a geopandas dataset
        based on the statistical sector data from Flanders
    """
    acc_df = pd.read_excel(acc_path)
    acc_df = acc_df.rename(columns={
        "MS_ACCT": "acc",
        "MS_ACCT_WITH_DEAD": "acc_death",
        "MS_ACCT_WITH_DEAD_30_DAYS": "acc_death30",
        "MS_ACCT_WITH_MORY_INJ": "acc_mort",
        "MS_ACCT_WITH_SERLY_INJ": "acc_ser",
        "MS_ACCT_WITH_SLY_INJ": "acc_sly"
    })
    counter_cols = ["acc", "acc_death", "acc_death30", "acc_mort", "acc_ser", "acc_sly"]
    acc_df["CD_MUNTY_REFNIS"] = acc_df["CD_MUNTY_REFNIS"].astype(str)
    counter_cols.append("CD_MUNTY_REFNIS")
    acc_df = acc_df[counter_cols].groupby("CD_MUNTY_REFNIS").agg("sum").reset_index()
    statsect_gpd = gpd.read_file(statsect_path)
    statsect_gpd = statsect_gpd.to_crs(gdf.crs)
    statsect_gpd = add_attributes(
        statsect_gpd, 
        acc_df, 
        how="left", 
        on=["CNIS5_2021", "CD_MUNTY_REFNIS"], 
        retain_cols=["acc", "acc_death", "acc_death30", "acc_mort", "acc_ser", "acc_sly"]
    )
    statsect_gpd = statsect_gpd[["geometry", "acc", "acc_death", "acc_death30", "acc_mort", "acc_ser", "acc_sly"]]
    gdf = drop_col_if_exists(gdf, col=["index_lef", "index_right"])
    gdf = gdf.sjoin(statsect_gpd, how="left")
    gdf = drop_col_if_exists(gdf, col="index_right")
    return gdf

sv_gpd = add_trafficaccidents(sv_gpd)

# Ruimterapport: Kernen, linten, verspreide bebouwing in Vlaanderen

Deze kaart geeft een typologische indeling van de bebouwing in Vlaanderen weer, waarbij deze wordt opgedeeld in kernen, linten en verspreide bebouwing. Bij deze indeling wordt maximaal uitgegaan van een morfologische benadering.

In [None]:
rura_paths = [
    "../data/raw/lu_klv_vlaa_2019/bedrijfmilitaircamping2019_v2.shp",
    "../data/raw/lu_klv_vlaa_2019/kernclustergrenzen2019_v2.shp",
    "../data/raw/lu_klv_vlaa_2019/kernclusters_kernen2019_v2.shp",
    "../data/raw/lu_klv_vlaa_2019/kernen2019_v2.shp",
    "../data/raw/lu_klv_vlaa_2019/kernclusters_linten2019_v2.shp",    
    "../data/raw/lu_klv_vlaa_2019/verspreide_gebouwen2019_v2.shp",
    "../data/raw/lu_klv_vlaa_2019/linten2019_v2.shp"
]
colors = ["blue", "orange", "red", "green", "yellow", "purple", "cyan"]

fig, ax=plt.subplots(figsize=(18, 8))
for rura_path, color in zip(rura_paths,colors):
    rura_gdf = gpd.read_file(rura_path)
    label=rura_path.replace("../data/raw/lu_klv_vlaa_2019/", "").replace("_v2.shp", "")
    print(f"{label} ({rura_gdf.shape[0]}) - {rura_gdf.crs} - {rura_gdf.columns}")
    print(pd.unique(rura_gdf.geometry.type))
    display(rura_gdf.head())
    rura_gdf.plot(
        ax=ax,
        color=color,
        label=label
    )
plt.title("Ruimterapport 2021")
plt.legend()
plt.show()

In [None]:
rura_gdf.head()

In [7]:
#!/opt/anaconda3/envs/straatvinken/bin/pip install geog

[31mERROR: Could not find a version that satisfies the requirement pytables (from versions: none)[0m
[31mERROR: No matching distribution found for pytables[0m


In [None]:
import numpy as np
import json
import geog
import shapely.geometry

#def get_shape_within_radius()
_center = shapely.geometry.Point([3.75563, 51.05195])
n_points = 30
radius = 250 # meters
_angles = np.linspace(0, 360, n_points)
polygon = geog.propagate(_center, _angles, radius)
print(json.dumps(shapely.geometry.mapping(shapely.geometry.Polygon(polygon))))

In [None]:
sv_gpd["circle"].plot()

In [None]:
refdb

In [None]:
rura_path = "../data/raw/lu_klv_vlaa_2019/kernclusters_kernen2019_v2.shp"
refdb = gpd.read_file(rura_path)
print("KERNCAT\n", refdb.KERNCAT.value_counts())
print("ClusType\n", refdb.ClusType.value_counts())
refdb = refdb.to_crs(sv_gpd.crs)
refdb.head()

### Kernen zonder onderscheid van types (1 grote multipolygon)

In [None]:
sv_gpd = sv_gpd.to_crs(3395)
refdb = refdb.to_crs(3395)
kernen = refdb.unary_union
circles = sv_gpd.buffer(250)
circles.intersection(kernen)

In [None]:
sv_gpd["kernen"] = circles.intersection(kernen).area

In [None]:
sv_gpd["kern_prop"] = sv_gpd["kernen"].area / circles.area

In [None]:
sv_gpd = sv_gpd.to_crs(4326)

In [None]:
sv_gpd[["geometry", "circle", "kernen", "kern_prop"]].head()

In [None]:
#!/opt/anaconda3/envs/straatvinken/bin/pip install branca folium mapclassify

In [None]:
sv_gpd.explore(column="kern_prop")

In [None]:
sv_gpd["kern_prop"].hist(bins=25)

### totale lengte van linten

In [None]:
rura_path = "../data/raw/lu_klv_vlaa_2019/kernclusters_linten2019_v2.shp"
lintdb = gpd.read_file(rura_path)
display(lintdb.head())
print("ClusType\n", refdb.ClusType.value_counts())


In [None]:
sv_gpd = sv_gpd.to_crs(3395)
lintdb = lintdb.to_crs(3395)
linten = lintdb.unary_union
circles = sv_gpd.buffer(250)

In [None]:
sv_gpd["linten"] = circles.intersection(linten)

In [None]:
sv_gpd["linten"].length

In [None]:
sv_gpd["lint_len"] = sv_gpd["linten"].length

In [None]:


sv_gpd["lint_len"].hist(bins=25)

In [None]:
#m= lintdb.explore(color="red")
m=sv_gpd["linten"].explore(m=m)
sv_gpd.explore(m=m, column="lint_len", vmin=0, vmax=250, tiles='Stamen Watercolor')

In [None]:
rura_path = ""

# Google streetview - data gathering

We add google streetview data, on which we can retrieve 
* landscape elements through image segmentation
* latent representation using a CNN architecture or a Autoencoder architecture, to obtain a feature vector for every coördinate

In [None]:
#pip install google_streetview

You need to get a google developer API key to use the API. Place this API key in the .env file in the root of the solution.

In [None]:
from dotenv import load_dotenv
import os
load_dotenv() 
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
STREETVIEW_DOWNLOAD_DIR = "../data/raw/google_streetview/"
#GOOGLE_API_KEY

In [None]:
#sv_gpd = get_straatvinken_data()

In [None]:
sv_gpd.iloc[1900:]

In [None]:
import google_streetview.api
import google_streetview.helpers
import os
import time

heading_map = {
    0: "N_0",
    1: "E_90",
    2: "S_180",
    3: "W_270",
}

status_list = []
actual_locations = []

for i, row in sv_gpd.iterrows():
    coord = f"{row['lat']}, {row['long']}"
    # Fill in coordinates
    apiargs = {
        'size': '640x640',
        'heading': '0;90;180;270',
        #'fov': '90',
        #'pitch': '0',
        "location": coord,
        'key': GOOGLE_API_KEY
    }
    #print(apiargs)

    # Get a list of all possible queries from multiple parameters
    api_list = google_streetview.helpers.api_list(apiargs)

    # Create a results object for all possible queries
    results = google_streetview.api.results(api_list)
   
    if results.metadata[0] not in [{'status': 'NOT_FOUND'}, {'status': 'ZERO_RESULTS'}]:
        
        # Preview results
        #results.preview()

        # Download images to directory 'downloads'
        #results.download_links('../data/raw/google_streetview/tmp')

        # Save metadata
        # results.save_metadata('../data/raw/google_streetview/tmp/metadata.json')
        _all_pano_ids = ['_'.join(x.split("_")[:-3]) for x in os.listdir(STREETVIEW_DOWNLOAD_DIR)]
        if "pano_id" not in results.metadata[0]:
            print(f"pano_id not found in metadata! {results.metadata}")
            continue
        _pano_id = results.metadata[0]["pano_id"]

        if _pano_id in _all_pano_ids:
            print(f"{i:04d}/{sv_gpd.shape[0]} already downloaded!")
            _status = {
                    "i": i, 
                    "coord": coord, 
                    "location": results.metadata[0]["location"],
                    "status": "ok", 
                    "metadata": results.metadata,
                    "pano_id": results.metadata[0]["pano_id"],
                    "date": results.metadata[0]["date"],
                    "duplicate": False
                }
            status_list.append(_status)
        else:
            _status = {
                    "i": i, 
                    "coord": coord, 
                    "location": results.metadata[0]["location"],
                    "status": "ok", 
                    "metadata": results.metadata,
                    "pano_id": results.metadata[0]["pano_id"],
                    "date": results.metadata[0]["date"],
                }
            _actual_loc = results.metadata[0]["location"]
            if _actual_loc in actual_locations:
                print(f"{i:04d}/{sv_gpd.shape[0]} found a duplicate for this coördinate: {coord}")
                _status.update({"duplicate": True})
                status_list.append(_status)
            else:
                print(f"{i:04d}/{sv_gpd.shape[0]} found a new panorama at this coördinate: {coord} {_pano_id}")
                actual_locations.append(_actual_loc)
                _status.update({"duplicate": False})
                status_list.append(_status)
                results.download_links(os.path.join(STREETVIEW_DOWNLOAD_DIR, 'tmp'))
                for heading_key in heading_map.keys():
                    _src = os.path.join(STREETVIEW_DOWNLOAD_DIR, "tmp", f"gsv_{heading_key}.jpg")
                    _dst = os.path.join(STREETVIEW_DOWNLOAD_DIR, f"{_pano_id}_{heading_key}_{heading_map[heading_key]}.jpg")
                    os.rename(_src, _dst)
                # let's not overload google's API
                #time.sleep(0.2)
    else:
        print(f"{i:04d}/{sv_gpd.shape[0]} didn't find a panorama at this coordinate!")
        status_list.append({"i": i, "coord": coord, "status": "nok", "metadata": results.metadata})

In [None]:
#status_df = pd.DataFrame(status_list)

In [None]:
def join_coords(rowset):
    rowset["coord"] = rowset["lat"].astype(str).str.cat(rowset["long"].astype(str), sep=",")
    return "; ".join(rowset["coord"].values)
sv_gpd.groupby(by="municipality").apply(lambda x: join_coords(x))

In [None]:
sv_gpd.groupby(by="municipality").apply(lambda x: join_coords(x)).reset_index().iloc[:2]

In [None]:
_all_pano_ids = ['_'.join(x.split("_")[:-3]) for x in os.listdir(STREETVIEW_DOWNLOAD_DIR)]

actual_locations = []
status_list = []
j=-1

for i, (muni, locations) in sv_gpd.groupby(by="municipality").apply(lambda x: join_coords(x)).reset_index().iterrows():
    print(f"{muni} - {len(locations.split('; '))}")
    apiargs= {
            'size': '640x640',
            'heading': '0;90;180;270',
            "location": locations,
            'key': GOOGLE_API_KEY
    }
    # Get a list of all possible queries from multiple parameters
    api_list = google_streetview.helpers.api_list(apiargs)
    # Create a results object for all possible queries
    results = google_streetview.api.results(api_list)
    print(results.metadata[::4])
    
    for ix, item in enumerate(results.metadata[::4]):
        
        if item in [{'status': 'NOT_FOUND'}, {'status': 'ZERO_RESULTS'}]:
            print("error: ", item)
        else:
            j+=1
            if "pano_id" not in item:
                print(f"{j:04d} pano_id not found in metadata! {results.metadata}")
                continue
            _pano_id = item["pano_id"]

            if _pano_id in _all_pano_ids:
                print(f"{j:04d} already downloaded!")
                _status = {
                        "i": j,
                        "coord": locations.split("; ")[ix], 
                        "location": item["location"],
                        "status": "ok", 
                        "metadata": item,
                        "pano_id": _pano_id,
                        "date": item["date"],
                        "duplicate": False
                    }
                status_list.append(_status)
            else:
                _status = {
                        "i": i, 
                        "coord": locations.split("; ")[ix], 
                        "location": item["location"],
                        "status": "ok", 
                        "metadata": item,
                        "pano_id": _pano_id,
                        "date": item["date"],
                        "duplicate": False
                    }
                _actual_loc = item["location"]
                if _actual_loc in actual_locations:
                    print(f"{j:04d} found a duplicate for this coördinate: {_actual_loc}")
                    _status.update({"duplicate": True})
                    status_list.append(_status)
                else:
                    print(f"{j:04d} found a new panorama at this coördinate: {_actual_loc} {_pano_id}")
                    actual_locations.append(_actual_loc)
                    _status.update({"duplicate": False})
                    status_list.append(_status)
                    results.download_links(os.path.join(STREETVIEW_DOWNLOAD_DIR, 'tmp'))
                    for heading_key in heading_map.keys():
                        _src = os.path.join(STREETVIEW_DOWNLOAD_DIR, "tmp", f"gsv_{heading_key}.jpg")
                        _dst = os.path.join(STREETVIEW_DOWNLOAD_DIR, f"{_pano_id}_{heading_key}_{heading_map[heading_key]}.jpg")
                        os.rename(_src, _dst)
        
    

In [None]:
status_df = pd.DataFrame(status_list)

In [None]:
status_df.shape

In [None]:
# remove NA columns
display(status_df[status_df.isna().sum(axis=1) > 0])
#status_df = status_df[status_df.isna().sum(axis=1) == 0]

In [None]:
status_df.head()

In [None]:
#status_df = pd.DataFrame(status_list)
status_df.to_pickle("../data/processed/20220222_streetview_location_panorama_df.pkl")

In [68]:
import pickle
status_df = pickle.load(open("../data/processed/20220222_streetview_location_panorama_df.pkl", "rb"))
status_df

Unnamed: 0,i,coord,location,status,metadata,pano_id,date,duplicate
0,0,"50.940104,4.062487","{'lat': 50.94009849499412, 'lng': 4.0624448891...",ok,"{'copyright': '© Google', 'date': '2019-05', '...",OlhTCaEsakPsBnnrQ33b1Q,2019-05,False
1,1,"50.944063,4.028181","{'lat': 50.94009849499412, 'lng': 4.0624448891...",ok,"{'copyright': '© Google', 'date': '2019-05', '...",OlhTCaEsakPsBnnrQ33b1Q,2019-05,False
2,2,"51.084607,3.436877","{'lat': 51.08462265868495, 'lng': 3.4368991355...",ok,"{'copyright': '© Google', 'date': '2019-05', '...",JNVwpgpEKp5AZIYKxZm9ew,2019-05,False
3,3,"51.087108,3.458313","{'lat': 51.08462265868495, 'lng': 3.4368991355...",ok,"{'copyright': '© Google', 'date': '2019-05', '...",JNVwpgpEKp5AZIYKxZm9ew,2019-05,False
4,4,"50.983138,4.828583","{'lat': 50.9831431950842, 'lng': 4.82860213679...",ok,"{'copyright': '© Google', 'date': '2021-03', '...",TFSmQVL_3EteK0mI2BNVdw,2021-03,False
...,...,...,...,...,...,...,...,...
3513,3513,"51.267956,4.651661","{'lat': 51.26518856480902, 'lng': 4.7124199367...",ok,"{'copyright': '© Google', 'date': '2021-08', '...",HdYh-DPxLrZhE0ujdkI-Ew,2021-08,False
3514,3514,"50.886564,3.842151","{'lat': 50.88665674301171, 'lng': 3.8422360247...",ok,"{'copyright': '© Google', 'date': '2009-04', '...",c6XGTy6gp8FZVdYPG8CLNw,2009-04,False
3515,3515,"51.21378,4.325091","{'lat': 51.21380597844729, 'lng': 4.3250617700...",ok,"{'copyright': '© Google', 'date': '2020-08', '...",dNkksX4eaTJWRWpwzRuwNg,2020-08,False
3516,3516,"51.216422,4.331289","{'lat': 51.21641620372846, 'lng': 4.3312549338...",ok,"{'copyright': '© Google', 'date': '2021-08', '...",Wq6dUL99Zbo9PpHYJ5502w,2021-08,False


# Google streetview - landscape segmentation processing

This happens in a separate notebook: 002_verpla38_vision_analysis.ipynb

Results can be demoed using the streamlit app 003_verpla...

# Google streetview - landscape segmentation results analysis

The segmentation results can be joined with the original dataset (via its coordinates) for further exploration

In [3]:
import pickle
streetview_segmentation_df = pickle.load(open("../data/processed/20220222_streetview_segmentation_results_14148_images_flanders_df.pkl", "rb"))


In [4]:
streetview_segmentation_df

Unnamed: 0,img,img_full,segmentation_data
0,--QkJckaqobMhxpzoXUsMg_0_N_0.jpg,../data/raw/google_streetview/--QkJckaqobMhxpz...,"{'wall': 0.0, 'building': 62.846435546875, 'sk..."
1,--QkJckaqobMhxpzoXUsMg_1_E_90.jpg,../data/raw/google_streetview/--QkJckaqobMhxpz...,"{'wall': 0.110595703125, 'building': 38.410400..."
2,--QkJckaqobMhxpzoXUsMg_2_S_180.jpg,../data/raw/google_streetview/--QkJckaqobMhxpz...,"{'wall': 14.39404296875, 'building': 1.6340332..."
3,--QkJckaqobMhxpzoXUsMg_3_W_270.jpg,../data/raw/google_streetview/--QkJckaqobMhxpz...,"{'wall': 2.97705078125, 'building': 5.20581054..."
4,--RMkDzVN_XEAJSzRt9ZOA_0_N_0.jpg,../data/raw/google_streetview/--RMkDzVN_XEAJSz...,"{'wall': 0.0, 'building': 2.644287109375, 'sky..."
...,...,...,...
14143,zxbFN4N6IBfzq3qVFIzkSA_3_W_270.jpg,../data/raw/google_streetview/zxbFN4N6IBfzq3qV...,"{'wall': 0.0, 'building': 0.0, 'sky': 0.583740..."
14144,zzFWrWYXy9YWqEqjPtGtbw_0_N_0.jpg,../data/raw/google_streetview/zzFWrWYXy9YWqEqj...,"{'wall': 0.0, 'building': 69.44970703125, 'sky..."
14145,zzFWrWYXy9YWqEqjPtGtbw_1_E_90.jpg,../data/raw/google_streetview/zzFWrWYXy9YWqEqj...,"{'wall': 0.0, 'building': 95.3720703125, 'sky'..."
14146,zzFWrWYXy9YWqEqjPtGtbw_2_S_180.jpg,../data/raw/google_streetview/zzFWrWYXy9YWqEqj...,"{'wall': 0.0, 'building': 74.572265625, 'sky':..."


In [63]:
sv_gpd

Unnamed: 0,ID,truck,bus,van,car,bike,walk,streetname,municipality,lat,long,geometry
0,SV20P1984,0,4,0,31,2,10,Kleemstraat,Halle,50.719737,4.258857,POINT (4.25886 50.71974)
1,SV2020A1676,4,2,25,272,53,51,Nijvelsesteenweg,Halle,50.728366,4.243156,POINT (4.24316 50.72837)
2,SV20P0210,3,0,12,115,14,24,Poststraat,Halle,50.734949,4.232230,POINT (4.23223 50.73495)
3,SV20P1555,4,0,29,205,10,22,Jules Bordetlaan,Ronse,50.739619,3.605092,POINT (3.60509 50.73962)
4,SV20P1839,0,2,21,84,3,9,Gomar Vandewielelaan,Ronse,50.741474,3.594312,POINT (3.59431 50.74147)
...,...,...,...,...,...,...,...,...,...,...,...,...
3545,SV2020A1226,0,0,4,17,8,16,Kammenstraat,Essen,51.468886,4.454877,POINT (4.45488 51.46889)
3546,SV2020A1936SV20P1311avg,4,0,7,85,40,12,Nieuwstraat,Essen,51.469910,4.465937,POINT (4.46594 51.46991)
3547,SV2020A1903,0,0,4,75,15,7,Essendonk,Essen,51.470587,4.475584,POINT (4.47558 51.47059)
3548,SV20P1436,0,0,5,78,16,4,Essendonk,Essen,51.471278,4.476050,POINT (4.47605 51.47128)


In [69]:
import pickle
streetview_segmentation_df = pickle.load(open("../data/processed/20220302_streetview_coordinates_w_labels.pkl", "rb"))
#streetview_segmentation_df.head()
landscape_rename = {
    col: f"segm_{col}" for col in streetview_segmentation_df.select_dtypes(float).columns 
    if col not in ["lat", "long"]
}
streetview_segmentation_df = streetview_segmentation_df.rename(columns=landscape_rename)
segm_gpd = gpd.GeoDataFrame(streetview_segmentation_df, geometry=gpd.points_from_xy(streetview_segmentation_df.long, streetview_segmentation_df.lat), crs=4326)
sv_segm_gpd = sv_gpd.to_crs(3395).sjoin_nearest(segm_gpd.to_crs(3395), how="inner", max_distance=50, distance_col="segm_distance")
#sv_segm_gpd = drop_col_if_exists(sv_segm_gpd, col=("index_left", "index_right", 'lat_right', 'long_right'))
sv_segm_gpd = sv_segm_gpd.rename(columns={'lat_left': "lat", 'long_left': "long"})
sv_segm_gpd.head()

Unnamed: 0,ID,truck,bus,van,car,bike,walk,streetname,municipality,lat,...,img_north_masked,img_east,img_east_masked,img_south,img_south_masked,img_west,img_west_masked,lat_right,long_right,segm_distance
0,SV20P1984,0,4,0,31,2,10,Kleemstraat,Halle,50.719737,...,../data/processed/google_streetview_masked/HJ2...,../data/raw/google_streetview/HJ2yCX61mI4HzXf3...,../data/processed/google_streetview_masked/HJ2...,../data/raw/google_streetview/HJ2yCX61mI4HzXf3...,../data/processed/google_streetview_masked/HJ2...,../data/raw/google_streetview/HJ2yCX61mI4HzXf3...,../data/processed/google_streetview_masked/HJ2...,50.719737,4.258857,0.0
1,SV2020A1676,4,2,25,272,53,51,Nijvelsesteenweg,Halle,50.728366,...,../data/processed/google_streetview_masked/HJ2...,../data/raw/google_streetview/HJ2yCX61mI4HzXf3...,../data/processed/google_streetview_masked/HJ2...,../data/raw/google_streetview/HJ2yCX61mI4HzXf3...,../data/processed/google_streetview_masked/HJ2...,../data/raw/google_streetview/HJ2yCX61mI4HzXf3...,../data/processed/google_streetview_masked/HJ2...,50.728366,4.243156,0.0
2,SV20P0210,3,0,12,115,14,24,Poststraat,Halle,50.734949,...,../data/processed/google_streetview_masked/HJ2...,../data/raw/google_streetview/HJ2yCX61mI4HzXf3...,../data/processed/google_streetview_masked/HJ2...,../data/raw/google_streetview/HJ2yCX61mI4HzXf3...,../data/processed/google_streetview_masked/HJ2...,../data/raw/google_streetview/HJ2yCX61mI4HzXf3...,../data/processed/google_streetview_masked/HJ2...,50.734949,4.23223,0.0
3,SV20P1555,4,0,29,205,10,22,Jules Bordetlaan,Ronse,50.739619,...,../data/processed/google_streetview_masked/2au...,../data/raw/google_streetview/2au70LYaarz60M7Q...,../data/processed/google_streetview_masked/2au...,../data/raw/google_streetview/2au70LYaarz60M7Q...,../data/processed/google_streetview_masked/2au...,../data/raw/google_streetview/2au70LYaarz60M7Q...,../data/processed/google_streetview_masked/2au...,50.739619,3.605092,0.0
4,SV20P1839,0,2,21,84,3,9,Gomar Vandewielelaan,Ronse,50.741474,...,../data/processed/google_streetview_masked/CXS...,../data/raw/google_streetview/CXS1Ac1pRwjeFBBD...,../data/processed/google_streetview_masked/CXS...,../data/raw/google_streetview/CXS1Ac1pRwjeFBBD...,../data/processed/google_streetview_masked/CXS...,../data/raw/google_streetview/CXS1Ac1pRwjeFBBD...,../data/processed/google_streetview_masked/CXS...,50.741474,3.594312,0.0


In [80]:
sv_gpd.shape

(3550, 12)

In [79]:
sv_segm_gpd.shape

(3518, 87)

In [70]:
sv_segm_gpd.columns

Index(['ID', 'truck', 'bus', 'van', 'car', 'bike', 'walk', 'streetname',
       'municipality', 'lat', 'long', 'geometry', 'index_right', 'pano_id',
       'segm_wall', 'segm_building', 'segm_sky', 'segm_tree', 'segm_road',
       'segm_sidewalk', 'segm_earth', 'segm_plant', 'segm_car', 'segm_fence',
       'segm_signboard', 'segm_ashcan', 'segm_grass', 'segm_floor',
       'segm_house', 'segm_path', 'segm_streetlight', 'segm_stairs',
       'segm_person', 'segm_bicycle', 'segm_bench', 'segm_pot', 'segm_pole',
       'segm_minibike', 'segm_van', 'segm_rock', 'segm_railing', 'segm_box',
       'segm_water', 'segm_sea', 'segm_swimming', 'segm_table', 'segm_rug',
       'segm_bannister', 'segm_conveyer', 'segm_column', 'segm_flower',
       'segm_truck', 'segm_awning', 'segm_dirt', 'segm_mountain', 'segm_field',
       'segm_ceiling', 'segm_seat', 'segm_windowpane', 'segm_stairway',
       'segm_bridge', 'segm_sand', 'segm_grandstand', 'segm_door', 'segm_bus',
       'segm_bag', 'segm_tra

In [78]:
(sv_segm_gpd.isnull().sum()>0).value_counts()

False    87
dtype: int64

In [72]:
streetview_segmentation_df.drop_duplicates().shape

(3518, 74)

In [62]:
3550-2338

1212

In [None]:
sv_segm_gpd.columns

### How far away is streetview data from straatvinken data?

In [None]:
sv_segm_gpd.segm_distance.hist(bins=25)

Most of the google streetview data is within a distance of 10 meters. A few are up to a distance of > 40m.

In [None]:
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interact, fixed
#font = {'family' : 'normal', 'weight' : 'normal', 'size'   : 12}
#matplotlib.rc('font', **font)
modality_cols=["bike", "walk", "bus","car", "truck", "van"]

mod_col = widgets.Select(
    options=modality_cols, 
    value=modality_cols[0],
    description="modal sort",
    rows=len(modality_cols)
)

landscape_cols = [
    col for col in streetview_segmentation_df.columns 
    if col not in ["pano_id", "lat", "long", "geometry"]
    and not col.startswith("img_")
]

cols = modality_cols.copy()
cols.extend(landscape_cols)
corr = sv_segm_gpd[cols].corr()

corr = corr.loc[landscape_cols, ["bike", "walk", "bus","car", "truck", "van"]]

@interact(mod_col=mod_col, modality_cols=fixed(modality_cols), landscape_cols=fixed(landscape_cols), corr=fixed(corr))
def show_matrix(mod_col, modality_cols=modality_cols, landscape_cols=landscape_cols, corr=corr):
    corr = corr.sort_values(by=[mod_col], key=lambda col: abs(col), ascending=False)
    f, ax = plt.subplots(figsize=(10, 20))
    cmap = sns.color_palette("Spectral", as_cmap=True)
    sns.heatmap(corr, annot=True, cmap=cmap, center=0, vmin=-1, vmax=1)
    ax.xaxis.tick_top()
    plt.show()

# Number of schools in vicinity

In [None]:
school_path = "../data/raw/Onderwijsaanbod_in_Vlaanderen_en_Brussel_via_POI_service_Shapefile/Onderwijsaanbod_in_Vlaanderen_en_Brussel_via_POI_service/Shapefile/POI_Onderwijs.shp"
school_df = gpd.read_file(school_path)
print(school_df.crs)
print(school_df.dtypes)
school_df.head(2).T

In [None]:
for col in ["THEMA", "CATEGORIE"]:
    print(col)
    print(school_df[col].value_counts())

In [None]:
school_df.to_crs(4326).plot("CATEGORIE", markersize=3, figsize=(14,8), legend=True)
plt.show()

In [None]:
school_df["school"]=school_df["CATEGORIE"].apply(lambda x: x[0])

In [None]:
def get_avg_distance_of_k_neighbors(
    src_gdf: gpd.GeoDataFrame, 
    featuredb_gdf: gpd.GeoDataFrame, 
    neighbors: int = 25, 
    feature_col: str="CATEGORIE",
    column_suffix: str = "",
    total_buckets_prefix: Tuple[str, ...] = (),
    feature_filters: Tuple[str, ...] = (),
    verbose=False
):
    """
    Returns the average distance of the k neareest neightbors subdivided to a given 'feature column'

            Parameters:
                    src_gdf (GeoDataFrame): dataframe containing the coordinates 
                        to be enriched with the featurecounts
                    featuredb_gdf (GeoDataFrame): feature dataframe with POI's for geometry objects
                    feature_col (str): the column of which occurence counts will be made
                    result_type (str): 
                        feature_columns: explode the categories into separate columns 
                            using the "feature_col"_"category" naming convention
                        map: add result as a dictionary object to a single column with the name "feature_col"

            Returns:
                    enriched_gdf (GeoDataFrame): the original dataframe with feature count data added
    
    """
    
    start = time.time()
    # this CRS allows for measuring distance in meters
    src_gdf=src_gdf.to_crs(3395)
    featuredb_gdf=featuredb_gdf.to_crs(3395)
    if verbose:
        print(f"crs adaptation performed in {time.time() - start:.2f}s")
    
    if len(feature_filters) > 0:
        start=time.time()
        featuredb_gdf = featuredb_gdf[(~featuredb_gdf[feature_col].isin(feature_filters))]
        if verbose:
            print(f"feature filter performed in {time.time() - start:.2f}s")
    
    zero_features = {key: 0 for key in set(featuredb_gdf[feature_col])}
    
    start = time.time()
    
    nA = np.array(list(src_gdf.geometry.apply(lambda x: (x.x, x.y))))
    
    
    if verbose:
        print(f"source geometries created in {time.time() - start:.2f}s")
    
    
    feature_counts = {}
    
    for feature in pd.unique(featuredb_gdf[feature_col]):
        start = time.time()
        frag = featuredb_gdf[featuredb_gdf.eval(f"{feature_col} == '{feature}'")]
        
        nB = np.array(list(frag.geometry.apply(lambda x: (x.x, x.y))))
        btree = cKDTree(nB)
        
        if verbose:
            print(f"ckd tree created for feature {feature} in {time.time() - start:.2f}s")
    
        start = time.time()
        max_neighbors = min(neighbors, nB.shape[0])
        dist, idx = btree.query(nA, k=max_neighbors)
        if verbose:
            print(f"query for feature {feature} performed in {time.time() - start:.2f}s")
    
        start = time.time()
        #gdB_nearest = frag.iloc[idx].drop(columns="geometry").reset_index(drop=True)
        feature_avg_dist=[]
        for dist, idx in zip(dist, idx):
            if neighbors == 1:
                feature_avg_dist.append(dist)
            else:
                feature_avg_dist.append(np.mean(dist))
        #feature_counts[f"{feature}_num_neighbors"] = nn
        feature_counts[f"{feature}_avg_dist"] = feature_avg_dist
        
    
    if verbose:
        print(f"data adaptation performed in {time.time() - start:.2f}s")
    features_df = pd.DataFrame(feature_counts)
    features_df.columns = [f"{feature_col}_{col}_{neighbors}" for col in features_df.columns]
    gdf = pd.concat([src_gdf, features_df], axis=1)
    return gdf.to_crs(4326)

for n in [1, 2, 3, 5, 10, 15]:
    sv_gdf = get_avg_distance_of_k_neighbors(sv_gdf, school_df, feature_col="school", neighbors=n)


In [None]:
for schooltype_col in [col for col in sv_gdf.columns if col.startswith("school")]:
    sv_gdf[schooltype_col].hist(bins=25)
    plt.title(schooltype_col)
    plt.show()

In [None]:
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interact, fixed
#font = {'family' : 'normal', 'weight' : 'normal', 'size'   : 12}
#matplotlib.rc('font', **font)
modality_cols=["bike", "walk", "bus","car", "truck", "van"]

mod_col = widgets.Select(
    options=modality_cols, 
    value=modality_cols[0],
    description="modal sort",
    rows=len(modality_cols)
)

schooltype_cols = [col for col in sv_gdf.columns if col.startswith("school")]

cols = modality_cols.copy()
cols.extend(schooltype_cols)
corr = sv_gdf[cols].corr()

corr = corr.loc[schooltype_cols, ["bike", "walk", "bus","car", "truck", "van"]]

@interact(mod_col=mod_col, modality_cols=fixed(modality_cols), schooltype_cols=fixed(schooltype_cols), corr=fixed(corr))
def show_matrix(mod_col, modality_cols=modality_cols, schooltype_cols=schooltype_cols, corr=corr):
    corr = corr.sort_values(by=[mod_col], key=lambda col: abs(col), ascending=False)
    f, ax = plt.subplots(figsize=(5, 25))
    cmap = sns.color_palette("Spectral", as_cmap=True)
    sns.heatmap(corr, annot=True, cmap=cmap, center=0, vmin=-1, vmax=1)
    ax.xaxis.tick_top()
    plt.yticks(rotation=45)
    plt.show()

### Dealing with missing data (playing ground)

In [33]:
df= pd.DataFrame({"a": [1, 2, 3, 4, 2], "b": ["a", "b", np.nan, "c", "a"]})
df


from sklearn.impute import MissingIndicator, SimpleImputer
import yaml

from sklearn.pipeline import Pipeline, FeatureUnion

transformer = SimpleImputer(strategy='most_frequent')
transformer.fit_transform(df)

array([[1, 'a'],
       [2, 'b'],
       [3, 'a'],
       [4, 'c'],
       [2, 'a']], dtype=object)

In [45]:
df

null_mask = df.isnull().sum()
df_report=null_mask[null_mask.index[(null_mask>0)]]

In [35]:
df.isnull().sum()

a    0
b    1
dtype: int64

In [43]:
print(df.isnull().sum()[df.isnull().sum().index[(df.isnull().sum()>0)]].to_string())

b    1


In [49]:
nl = '\n'
print(f"missing data:{nl.join(df_report)}")

TypeError: sequence item 0: expected str instance, int found

In [51]:
df_report.to_string()

'b    1'

In [84]:
df = pickle.load(open("../data/processed/20220302_straatvinken_abt_complete_df.pkl", "rb"))
df["morf"] = df["morf"].astype(int).astype(str)

In [85]:
pickle.dump(df, open("../data/processed/20220302_straatvinken_abt_complete_df.pkl", "wb"))

In [2]:
import geopandas as gpd
wrsegm_path = "../data/raw/Wegenregister_SHAPE_20211216/Shapefile/Wegsegment.shp"
wrsegm_gdf = gpd.read_file(wrsegm_path)
wrsegm_gdf.head()



Unnamed: 0,WS_OIDN,WS_UIDN,WS_GIDN,B_WK_OIDN,E_WK_OIDN,STATUS,LBLSTATUS,MORF,LBLMORF,WEGCAT,...,LBLBEHEER,METHODE,LBLMETHOD,OPNDATUM,BEGINTIJD,BEGINORG,LBLBGNORG,TGBEP,LBLTGBEP,geometry
0,1,1_1,1_1,126722,41353,4,in gebruik,114,"wandel- of fietsweg, niet toegankelijk voor an...",-9,...,Stad Hasselt,2,ingemeten,20140220T143532,20140220T143532,AGIV,Agentschap voor Geografische Informatie Vlaand...,1,openbare weg,"LINESTRING (217368.750 181577.016, 217400.110 ..."
1,2,2_1,2_1,3,4,4,in gebruik,120,dienstweg,-9,...,niet gekend,2,ingemeten,20140220T143532,20140220T143532,AGIV,Agentschap voor Geografische Informatie Vlaand...,1,openbare weg,"LINESTRING (243234.893 160239.383, 243245.995 ..."
2,4,4_3,4_3,7,8,4,in gebruik,120,dienstweg,-9,...,Agentschap Wegen en Verkeer - District Centraa...,2,ingemeten,20170315T154411,20170315T154448,AWV,Agentschap Wegen en Verkeer,1,openbare weg,"LINESTRING (232327.054 165044.681, 232319.001 ..."
3,6,6_3,6_3,650425,12,4,in gebruik,120,dienstweg,-9,...,Agentschap Wegen en Verkeer - District Centraa...,2,ingemeten,20170309T101322,20170309T101533,AWV,Agentschap Wegen en Verkeer,1,openbare weg,"LINESTRING (219742.688 177266.625, 219748.501 ..."
4,7,7_1,7_1,41353,146626,4,in gebruik,114,"wandel- of fietsweg, niet toegankelijk voor an...",-9,...,Stad Hasselt,2,ingemeten,20140220T143532,20140220T143532,AGIV,Agentschap voor Geografische Informatie Vlaand...,1,openbare weg,"LINESTRING (217400.110 181499.516, 217403.479 ..."


In [4]:
sv_gpd = get_straatvinken_data()

columns: Index(['ID', 'truck', 'bus', 'van', 'car', 'bike', 'walk', 'streetname',
       'municipality', 'lat', 'long', 'geometry'],
      dtype='object')
rows: 3550


In [69]:
sv_gpd = sv_gpd.to_crs(3395)
wrsegm_gdf = wrsegm_gdf.to_crs(3395)
circles = sv_gpd.buffer(1000)
wrsegm_frag = wrsegm_gdf[wrsegm_gdf.intersects(circles.iloc[1])]
wrsegm_frag = wrsegm_frag.reset_index(drop=True)
wrsegm_frag.loc[:, ["truck","bus","van","car","bike","walk"]] = sv_gpd.reset_index(drop=True).iloc[:len(wrsegm_frag)][["truck","bus","van","car","bike","walk"]]
wrsegm_frag[["geometry", "truck","bus","van","car","bike","walk"]]

Unnamed: 0,geometry,truck,bus,van,car,bike,walk
0,"LINESTRING (472126.674 6539785.826, 472125.706...",0,4,0,31,2,10
1,"LINESTRING (472850.735 6540054.835, 472882.646...",4,2,25,272,53,51
2,"LINESTRING (473073.332 6540239.294, 473037.846...",3,0,12,115,14,24
3,"LINESTRING (471996.197 6540174.708, 471986.143...",4,0,29,205,10,22
4,"LINESTRING (472126.674 6539785.826, 472099.440...",0,2,21,84,3,9
...,...,...,...,...,...,...,...
311,"LINESTRING (473318.918 6540163.095, 473307.672...",0,0,0,2,3,4
312,"LINESTRING (472362.851 6539262.104, 472368.903...",12,2,73,386,36,4
313,"LINESTRING (473318.918 6540163.095, 473322.404...",1,0,1,32,13,46
314,"LINESTRING (473329.744 6540179.587, 473318.918...",0,0,1,28,27,67


In [84]:
tooltip_cols=["LSTRNM", "LBLBEHEER", "truck","bus","van","car","bike","walk", "geometry"]
to_show="bike"
vmax =  np.percentile(wrsegm_frag[to_show].values, 95)
wrsegm_frag[tooltip_cols].explore(
    column=to_show, 
    tiles="CartoDB positron", 
    vmin=0, 
    vmax=vmax, 
    k=10, 
    scheme="naturalbreaks", 
    cmap="YlOrRd",
    marker_type="marker"
)
