# Detect Trucks Sentinel-2 - Validation

This is a script for validating the Sentinel-2 truck detection. It uses data from traffic count stations, currently only in Germany. The validation is based on a recursive shortest-path finder that works on a rasterized road mask. In the future this should become much more efficient by working directly on the road linestrings.

In [1]:
# Load credentials from a local .env file through the dotenv IPython extension.
# NOTE(review): presumably these are the credentials needed by open_cube
# further down - confirm.
%load_ext dotenv
%dotenv

In [2]:
import os, sys, subprocess
import pandas as pd
import geopandas as gpd
import numpy as np
import xarray as xr

# installations
def install_package(pkg):
    """Install `pkg` with pip, using the interpreter that runs this notebook.

    Raises subprocess.CalledProcessError if pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", pkg]
    subprocess.check_call(pip_command)
# obspy provides the degree <-> kilometre conversions imported below
install_package("obspy")

from xcube_sh.cube import open_cube
from xcube_sh.config import CubeConfig

from rasterio import features
from affine import Affine
from pyproj import Proj, Transformer
from obspy.geodetics import degrees2kilometers, kilometers2degrees
from shapely.geometry import Point, LineString

from glob import glob

_______________
## 1 | Setup

In [3]:
# Assumed truck speed. TrueCounts.max_traveled_dist combines it with a number
# of minutes into a distance commented there as km, so presumably km/h.
speed = 90

#### General

In [4]:
def VLD():
    """Return the name of the validation directory (also its key in `dirs`)."""
    return "validation"


def CSV_SEP():
    """Return the column separator used when reading the count CSV files."""
    return ";"

#### Directories

In [5]:
# All directories are resolved relative to the current working directory.
dir_main = os.getcwd()
dir_not_commit = os.path.join(dir_main, "not_commit")
dirs = {
    VLD(): os.path.join(dir_main, VLD()),
    "processing": os.path.join(dir_main, "processing"),
    "truck_points": os.path.join(
        dir_not_commit, "processed", "overall", "acquisitions_trucks"
    ),
}

#### True count data related

In [6]:
# `hour` selects the hourly record of the count table; `minutes` is the length
# of the time window, used both as share of that hour and as travel time.
hour = 10
minutes = 20


def NAME_DATE():
    """Return the name of the date column of the count table."""
    return "Datum"


def NAME_HOUR():
    """Return the name of the hour column of the count table."""
    return "Stunde"


def NAME_TR1():
    """Return the name of the first truck count column."""
    return "Lkw_R1"


def NAME_TR2():
    """Return the name of the second truck count column."""
    return "Lkw_R2"

#### Paths

In [7]:
# Processing grid; the code below reads its `bbox_id` column and its geometries.
grid_gadm = gpd.read_file(os.path.join(dirs["processing"], "processing_grid_gadm_eu27.geojson"))

In [43]:
# Count file of each station mapped to the station position. The tuples are
# later passed to utm32N_to_4326, which reads them as (index 0, index 1) =
# (second, first) argument of the EPSG:25832 transform.
suffix = "_2018.csv"
true_counts_paths = {
    os.path.join(dirs[VLD()], "zst" + station_id + suffix): station_position
    for station_id, station_position in (
        ("7781", (5599337, 400384)),
        ("7121", (5605914, 390060)),
        ("7955", (5589735, 374175)),
        ("7443", (5557659, 399612)),
    )
}

_______________
## 2 | Utils

In [44]:
def get_bbox_id(point, grid_gadm):
    """Return the ids of all grid boxes whose geometry contains `point`.

    Parameters
    ----------
    point : object accepted by the geometries' ``contains`` method
    grid_gadm : table with a ``geometry`` and a ``bbox_id`` column

    Returns
    -------
    list
        The matching ``bbox_id`` values in table order; empty if no box
        contains the point.
    """
    # Pair the two columns directly instead of looking up bbox_id by the
    # enumerate position, which is only valid for a default 0..n-1 index.
    return [
        bbox_id
        for bbox_id, geom in zip(grid_gadm.bbox_id, grid_gadm.geometry)
        if geom.contains(point)
    ]
            
def utm32N_to_4326(utm32n_point):
    """Convert an EPSG:25832 position to a shapely Point in EPSG:4326.

    `utm32n_point` is an indexable pair; its second element is handed to the
    transformer first and its first element second. The two transformed values
    are swapped again when building the Point.
    """
    second, first = utm32n_point[0], utm32n_point[1]
    to_wgs84 = Transformer.from_crs("EPSG:25832", "EPSG:4326")
    out_first, out_second = to_wgs84.transform(first, second)
    return Point(out_second, out_first)

_______________
## 3 | Prep points

In [45]:
# Replace every station position in true_counts_paths by its EPSG:4326 Point
# (in place) and collect the id of the first grid box containing it.
# NOTE: because the dict is overwritten, this cell cannot be run twice without
# re-running the cell that defines true_counts_paths.
bbox_ids = []
for path, point in true_counts_paths.items():
    true_counts_paths[path] = utm32N_to_4326(point)
    # [0] raises IndexError if the station lies in no grid box
    bbox_ids.append(get_bbox_id(true_counts_paths[path], grid_gadm)[0])
bbox_ids

[789, 789, 789, 789]

In [46]:
def get_points(bbox_id, year="2018"):
    """Pick the truck detection file of a grid box that holds the most points.

    Parameters
    ----------
    bbox_id : id of the grid box
    year : prefix the file name has to start with (default "2018")

    Returns
    -------
    tuple
        ``(path, number_of_points)`` of the file with the most points, or
        ``(None, None)`` if no file matches.
    """
    # Anchor the id on the "_id" prefix that the file names carry (the same
    # prefix is stripped when the id is parsed back from the file name), so
    # that e.g. id 89 does not also match "..._id789.gpkg".
    pattern = os.path.join(dirs["truck_points"], "%s*_id%s.gpkg" % (year, bbox_id))
    files = sorted(glob(pattern))  # sorted: ties are resolved reproducibly
    if not files:
        return None, None
    counts = [len(gpd.read_file(file)) for file in files]
    best = counts.index(max(counts))  # first file among those with the maximum
    return files[best], counts[best]

# Select one detection file per station. Stations that share a grid box get
# the same file, so the (slow) file scan is done only once per bbox_id.
truck_points_paths = []
number = []
points_by_bbox = {}
for bbox_id in bbox_ids:
    if bbox_id not in points_by_bbox:
        points_by_bbox[bbox_id] = get_points(bbox_id)
    file, n = points_by_bbox[bbox_id]
    truck_points_paths.append(file)
    number.append(n)
print(truck_points_paths)
print(number)

['/home/jovyan/Detect_Trucks_Sentinel2_Upscaling/not_commit/processed/overall/acquisitions_trucks/2018-05-08T10_40_25_acquisitions_trucks_bbox_id789.gpkg', '/home/jovyan/Detect_Trucks_Sentinel2_Upscaling/not_commit/processed/overall/acquisitions_trucks/2018-05-08T10_40_25_acquisitions_trucks_bbox_id789.gpkg', '/home/jovyan/Detect_Trucks_Sentinel2_Upscaling/not_commit/processed/overall/acquisitions_trucks/2018-05-08T10_40_25_acquisitions_trucks_bbox_id789.gpkg', '/home/jovyan/Detect_Trucks_Sentinel2_Upscaling/not_commit/processed/overall/acquisitions_trucks/2018-05-08T10_40_25_acquisitions_trucks_bbox_id789.gpkg']
[1985, 1985, 1985, 1985]


_______________
## 4 | Utils for OSM rasterization

In [20]:
def rasterize_osm(osm, reference_raster):
    """Burn OSM road polygons into one road-type raster.

    Parameters
    ----------
    osm : GeoDataFrame of polygons with the columns ``osm_value`` and
        ``osm_value_int``
    reference_raster : xarray object with ``lat`` and ``lon`` coordinates

    Returns
    -------
    numpy.ndarray
        Float array of shape (lat, lon): 0 = no road, otherwise the lowest
        ``osm_value_int`` covering the cell (1 = motorway, 2 = trunk, ...).
    """
    # Stand-in for "no road" so that min() can merge the layers; it has to be
    # larger than every osm_value_int.
    nan_placeholder = 100
    road_rasters = []
    for osm_value in set(osm["osm_value"]):
        osm_subset = osm[osm["osm_value"] == osm_value]
        raster = rasterize(osm_subset, reference_raster.lat, reference_raster.lon)
        road_class = list(osm_subset.osm_value_int)[0]
        raster_osm = np.where(np.isfinite(raster), road_class, nan_placeholder)
        # builtin float: the np.float alias no longer exists in current NumPy
        road_rasters.append(raster_osm.astype(float))
    if not road_rasters:
        # no road polygons at all -> everything is "no road"
        return np.zeros((len(reference_raster.lat), len(reference_raster.lon)))
    # Merge road types into one layer. Some polygons intersect; the lowest
    # value (highest road level) wins.
    road_raster_np = np.array(road_rasters).min(axis=0)
    road_raster_np[road_raster_np == nan_placeholder] = 0
    return road_raster_np

def transform_lat_lon(lat, lon):
    """Build the affine transform of a regular lat/lon grid.

    The translation is taken from the first lon/lat value and the scale from
    the difference between the first two values of each axis, so both axes
    need at least two entries.
    """
    lat_values, lon_values = np.asarray(lat), np.asarray(lon)
    lon_step = lon_values[1] - lon_values[0]
    lat_step = lat_values[1] - lat_values[0]
    origin = Affine.translation(lon_values[0], lat_values[0])
    return origin * Affine.scale(lon_step, lat_step)

def rasterize(polygons, lat, lon, fill=np.nan):
    """Rasterize the geometries of `polygons` onto the grid given by lat/lon.

    Cells not covered by any geometry hold `fill`. The result is an
    xarray.DataArray with the dimensions ("lat", "lon").
    """
    grid_transform = transform_lat_lon(lat, lon)
    burned = features.rasterize(
        polygons.geometry,
        out_shape=(len(lat), len(lon)),
        fill=fill,
        transform=grid_transform,
        dtype=float,
    )
    return xr.DataArray(burned, coords={"lat": lat, "lon": lon}, dims=("lat", "lon"))

def get_osm_raster(osm, grid_gadm, date, points_path):
    """Rasterize the OSM roads onto the Sentinel-2 grid of one grid box.

    The grid box is identified from the file name of `points_path`. Returns an
    xarray.Dataset with the single variable "roadmask" on (lat, lon).
    """
    # the last "_"-separated token of the file name is "id<bbox_id>.<ext>"
    id_token = os.path.basename(points_path).split("_")[-1].split(".")[0]
    bbox_id = int(id_token[2:])
    row = list(grid_gadm.bbox_id).index(bbox_id)
    bbox = list(grid_gadm.geometry)[row].bounds
    # the cube is opened to obtain the lat/lon grid the roads are burned onto
    cube = open_cube(
        CubeConfig(
            dataset_name="S2L2A",
            band_names=["B04"],
            tile_size=[512, 512],
            geometry=bbox,
            spatial_res=0.00009,
            time_range=[date, date],
        )
    )
    roadmask = xr.DataArray(
        rasterize_osm(osm, cube.B04),
        coords={"lat": cube.lat, "lon": cube.lon},
        dims=["lat", "lon"],
    )
    return xr.Dataset({"roadmask": roadmask})

_______________
## 5 | Classes

In [21]:
class Validator:
    """Compare the detected trucks of one acquisition with a station count.

    Parameters
    ----------
    truck_points_path : path of the detection file; the grid box id is parsed
        from the "..._id<bbox_id>.<ext>" ending of its name
    true_counts_path : path of the count CSV of the station
    station_xy : shapely Point of the station in EPSG:4326

    The result of `validate` is stored in `eo_vs_truth`.
    """

    def __init__(self, truck_points_path, true_counts_path, station_xy):
        self.eo_counts = gpd.read_file(truck_points_path)  # gpd points
        self.truck_points_path = truck_points_path
        self.tc = TrueCounts(true_counts_path, station_xy)
        self.osm_mask = None
        self.bbox_id = int(os.path.basename(truck_points_path).split("_")[-1].split(".")[0][2:])
        self.eo_counts.crs = "EPSG:4326"
        self.eo_counts_subset = None
        self.truck_dist = None
        self.within_dist = None
        self.eo_vs_truth = None

    def subset_to_buffer(self, buffer):
        """Keep only the detections lying within the `buffer` polygons."""
        try:
            self.eo_counts_subset = gpd.sjoin(self.eo_counts, buffer, predicate="within")
        except TypeError:
            # older GeoPandas releases only know the keyword under its old name
            self.eo_counts_subset = gpd.sjoin(self.eo_counts, buffer, op="within")

    def within_distance(self, osm_mask, station, km_max_distance):
        """Count the subset detections at most `km_max_distance` from `station`.

        Detections for which no path is found (distance None) are not counted.
        The count is stored in `within_dist`.
        """
        self.truck_dist = TruckDistance(osm_mask)
        self.within_dist = 0
        for point in self.eo_counts_subset.geometry:
            self.truck_dist.calc_travel_dist(point, station)
            dist = self.truck_dist.dist
            if dist is not None and dist <= km_max_distance:
                self.within_dist += 1

    @staticmethod
    def _percentage(eo_count, true_count):
        """Return eo_count as percentage of true_count, NaN if the latter is 0."""
        if true_count == 0:
            return float("nan")
        return (eo_count / true_count) * 100

    def validate(self, date, hour, minutes, speed, grid_gadm, dir_not_commit):
        """Run the comparison and store it as a dict in `eo_vs_truth`.

        Parameters
        ----------
        date : acquisition date as "YYYY-MM-DD"
        hour, minutes : hourly record and time window, see TrueCounts
        speed : speed used to turn `minutes` into the maximum distance
        grid_gadm : processing grid, passed on to get_osm_raster
        dir_not_commit : directory holding "ancillary_data/roads"
        """
        # the count table is matched on the date as YYMMDD ("2018-05-08" -> "180508")
        self.tc.sub_hour_count(date[2:4]+date[5:7]+date[8:], hour, minutes)
        self.tc.buffer(minutes, speed)
        print(self.bbox_id)
        osm = gpd.read_file(os.path.join(dir_not_commit, "ancillary_data", "roads", str(self.bbox_id)+"_"+"highway.gpkg"))
        self.osm_mask = get_osm_raster(osm, grid_gadm, date, self.truck_points_path)
        osm_in_buffer = gpd.overlay(self.tc.buff, osm)
        # mask osm raster to the bounding box of the buffered osm
        bounds = osm_in_buffer.total_bounds
        lat, lon = self.osm_mask.lat.values, self.osm_mask.lon.values
        lat_bounds = (lat >= bounds[1]) * (lat <= bounds[3])
        lon_bounds = (lon >= bounds[0]) * (lon <= bounds[2])
        mesh = np.meshgrid(lon_bounds, lat_bounds)
        mask = mesh[0]*mesh[1]
        self.osm_mask = self.osm_mask.where(mask)
        # cells outside the box became NaN; treat them as "no road"
        self.osm_mask.roadmask.values[np.isnan(self.osm_mask.roadmask.values)] = 0.
        self.subset_to_buffer(self.tc.buff)
        self.within_distance(self.osm_mask, self.tc.station_xy, self.tc.max_distance)
        eo_count = self.within_dist / 2 # divide by two in order to include only lanes where trucks are coming from the station
        self.eo_vs_truth = {"date":date, "eo_count":eo_count, "true_count":self.tc.counts,
                            "percentage":self._percentage(eo_count, self.tc.counts), "cars":self.tc.cars}

In [22]:
class TrueCounts:
    """Counts of one traffic count station.

    Parameters
    ----------
    true_count_csv : path of the count CSV, read with the CSV_SEP() separator
    station_xy : shapely Point of the station (buffered in degrees, so
        expected in EPSG:4326)
    """

    def __init__(self, true_count_csv, station_xy):
        self.data = pd.read_csv(true_count_csv, sep=CSV_SEP())
        self.station_xy = station_xy
        self.data_subset = None # pd
        self.minutes = None
        self.counts = None
        self.cars = None
        self.buff = None # gpd polygon
        self.max_distance = None

    def sub_hour_count(self, date, hour, minutes):
        """Scale the counts of one hourly record to a window of `minutes`.

        Parameters
        ----------
        date : value of the date column, anything `int()` accepts
        hour : value of the hour column
        minutes : window length; the hourly counts are multiplied by
            ``minutes / 60``

        Sets `counts` (sum of both truck columns) and `cars` (sum of both
        "Pkw" columns).

        Raises
        ------
        ValueError
            If not exactly one record matches `date` and `hour`.
        """
        self.minutes = minutes
        same_day = self.data[self.data[NAME_DATE()]==int(date)]
        self.data_subset = same_day[same_day[NAME_HOUR()]==hour]
        if len(self.data_subset) != 1:
            raise ValueError("Expected exactly one count record for date %s, hour %s but found %s"
                             %(str(date), str(hour), str(len(self.data_subset))))
        # take the scalar values of the single record; float() on a whole
        # Series is deprecated in pandas
        record = self.data_subset.iloc[0]
        amount = minutes / 60
        tr1, tr2 = NAME_TR1(), NAME_TR2()
        self.counts = float(record[tr1]) * amount + float(record[tr2]) * amount
        self.cars = float(record["Pkw_R1"]) * amount + float(record["Pkw_R2"]) * amount

    def max_traveled_dist(self, minutes, speed):
        """Set `max_distance` to the distance covered in `minutes` at `speed`."""
        # written as a product so that minutes == 0 gives 0 instead of a
        # division by zero
        self.max_distance = speed * minutes / 60 # km

    def buffer(self, minutes, speed):
        """Set `buff` to a circle of radius `max_distance` around the station."""
        self.max_traveled_dist(minutes, speed)
        self.buff = gpd.GeoDataFrame(geometry=[self.station_xy.buffer(kilometers2degrees(self.max_distance))])
        self.buff.crs = "EPSG:4326"

In [23]:
class PathFinder:
    """Recursive path search on a raster in which cells > 0 are traversable.

    Parameters
    ----------
    arr : 2-D numpy array
    end : (row, col) of the target cell
    shift : size of one step in cells (default 2999). From a cell the search
        jumps to the traversable cells on the square ring `shift` cells away.
        As soon as the target lies within `shift` cells in both directions it
        is connected directly, without looking at the cells in between.

    After `find_path`, `shortest_path` is ``[rows, cols]`` of the shortest
    path found, or None if the target was not reached (this includes a start
    equal to the target, for which no path is built).

    NOTE(review): the search is a recursion without a global visited set, so
    small `shift` values on large rasters can become very slow or hit the
    recursion limit.
    """

    def __init__(self, arr, end, shift=2999):
        if shift < 1:
            raise ValueError("shift has to be at least 1")
        self.arr = arr # np
        self.end = end
        self.shift = shift
        self.paths = []
        self.reached = []
        self.len = []
        self.shortest_path = None

    def find_path(self, start):
        """Search from `start` and store the shortest path that hit the end."""
        self.find(start)
        if any(self.reached):
            length = np.array(self.len)
            reached_idx = np.flatnonzero(self.reached)
            # first path among the shortest ones that reached the end
            best = int(reached_idx[np.argmin(length[reached_idx])])
            path = self.paths[best]
            self.shortest_path = [path[:,0], path[:,1]]

    def should_follow(self, y, x, point, arr):
        """Return whether the cell at `point` + (y, x) is inside `arr` and > 0."""
        y_altered = point[0] + y
        x_altered = point[1] + x
        exceeds_y = y_altered >= arr.shape[0]
        exceeds_x = x_altered >= arr.shape[1]
        below_y = y_altered < 0
        below_x = x_altered < 0
        outside = any([exceeds_y, exceeds_x, below_y, below_x])
        follow = not outside and arr[y_altered,x_altered] > 0
        return follow

    def find(self, point):
        """Extend the search from `point`, appending to paths/reached/len."""
        # continue the most recent path if there is one, else start a new one
        origin_path = np.array([point]) if len(self.paths) == 0 else self.paths[-1].copy()
        self._extend(point, origin_path)

    def _candidates(self, point):
        """Return the cells to visit next from `point`."""
        shift = self.shift
        end_in_window = (abs(self.end[0] - point[0]) <= shift
                         and abs(self.end[1] - point[1]) <= shift)
        if end_in_window:
            return [self.end]
        valid = []
        # shift + 1: the ring includes its last row and its last column
        for y in range(-shift, shift + 1):
            if abs(y) == shift:
                xs = range(-shift, shift + 1) # first and last row: all columns
            else:
                xs = (-shift, shift) # rows in between: outer columns only
            for x in xs:
                if self.should_follow(y, x, point, self.arr):
                    valid.append(np.array([point[0]+y, point[1]+x]))
        return valid

    def _extend(self, point, origin_path):
        """Recursively grow `origin_path` (which ends at `point`)."""
        if point[0] == self.end[0] and point[1] == self.end[1]:
            return
        for p in self._candidates(point):
            p = np.array(p)
            if (origin_path == p).all(axis=1).any():
                continue # never visit a cell twice within one path
            new_path = np.vstack([origin_path, p])
            self.paths.append(new_path)
            self.reached.append(bool((p == np.array(self.end)).all()))
            self.len.append(len(new_path))
            self._extend(p, new_path)

In [24]:
class TruckDistance:
    """Distance between a truck and the station along the PathFinder path.

    Parameters
    ----------
    osm_mask : xarray.Dataset with the variable ``roadmask`` and the
        coordinates ``lat`` and ``lon``
    """

    def __init__(self, osm_mask):
        self.osm_mask = osm_mask.roadmask
        self.lon_lat = {"lat":osm_mask.lat.values, "lon":osm_mask.lon.values}
        self.pf = None
        self.line = None
        self.dist = None

    @staticmethod
    def _nearest_index(axis_values, target):
        """Return the index of the first value of `axis_values` closest to `target`."""
        return int(np.argmin(np.abs(np.asarray(axis_values) - target)))

    # a shapely Point truck
    # b shapely Point station
    def calc_travel_dist(self, a, b):
        """Set `dist` to the path length from `a` to `b` in km, None without a path.

        Both points are snapped to their nearest raster cell. For coordinates
        lying exactly on the grid this is the exact match; otherwise (or when
        several cells are equally close) the first nearest cell is used.
        """
        lat, lon = self.lon_lat["lat"], self.lon_lat["lon"]
        indices_a = self._nearest_index(lat, a.y), self._nearest_index(lon, a.x)
        indices_b = self._nearest_index(lat, b.y), self._nearest_index(lon, b.x)
        self.line = None # do not keep the line of a previous call
        if indices_a == indices_b:
            # Truck and station share a cell. PathFinder builds no path for
            # this case, which would drop the truck although it is at distance 0.
            self.dist = 0.0
            return
        self.pf = PathFinder(self.osm_mask.values, indices_b)
        self.pf.find_path(indices_a)
        path = self.pf.shortest_path
        if path is None:
            self.dist = None
        else:
            self.indices_to_line(path)
            self.dist = degrees2kilometers(self.line.length)

    def indices_to_line(self, path):
        """Set `line` to the LineString through the cells of `path`.

        `path` is ``[row_indices, col_indices]``; the line vertices are
        (lon, lat).
        """
        lat = self.lon_lat["lat"][path[0]]
        lon = self.lon_lat["lon"][path[1]]
        self.line = LineString(list(zip(lon, lat)))

_______________
## 6 | Execute

In [47]:
# Validate every station against its detection file. Stations without a file
# get None so that the results stay aligned with true_counts_paths.
eo_vs_truth = []
n = len(truck_points_paths)
for i, fpath_points in enumerate(truck_points_paths):
    print("Validating %s" % i)
    if fpath_points is None:
        eo_vs_truth.append(None)
        continue
    # the file name starts with the acquisition date (first 10 characters)
    date = os.path.basename(fpath_points)[:10]
    true_counts_path, station_xy = list(true_counts_paths.items())[i]
    validator = Validator(fpath_points, true_counts_path, station_xy)
    validator.validate(date, hour, minutes, speed, grid_gadm, dir_not_commit)
    print("Done with %s" % i)
    eo_vs_truth.append(validator.eo_vs_truth)
print("Done")

Validating 0
789
Done with 0
Validating 1
789
Done with 1
Validating 2
789
Done with 2
Validating 3
789
Done with 3
Done


In [48]:
# One result dict per station, None where no detection file was available
eo_vs_truth

[{'date': '2018-05-08',
  'eo_count': 221.0,
  'true_count': 253.33333333333331,
  'percentage': 87.23684210526316,
  'cars': 933.3333333333333},
 {'date': '2018-05-08',
  'eo_count': 251.0,
  'true_count': 291.3333333333333,
  'percentage': 86.15560640732267,
  'cars': 1020.6666666666666},
 {'date': '2018-05-08',
  'eo_count': 249.5,
  'true_count': 407.3333333333333,
  'percentage': 61.252045826513914,
  'cars': 793.6666666666666},
 {'date': '2018-05-08',
  'eo_count': 192.0,
  'true_count': 270.0,
  'percentage': 71.11111111111111,
  'cars': 547.3333333333333}]

In [49]:
# Write the station positions to GeoJSON. The ids are taken from the count
# file names ("zst<id>_...") so that they always line up with the points.
points = list(true_counts_paths.values())
station_ids = [int(os.path.basename(path).split("_")[0][len("zst"):]) for path in true_counts_paths]
# the points were converted to EPSG:4326 above, so state that CRS in the file
stations = gpd.GeoDataFrame({"geometry":points, "station":station_ids}, crs="EPSG:4326")
stations.to_file(os.path.join(dirs[VLD()], "stations.geojson"), driver="GeoJSON")