# Validation 

Here we use 20% of the historical incident data to validate how the model performs. We then validate it along several dimensions: day of the week, time period of the day, and highway number.

Preparation

In [46]:
## Importing all necessary modules
import geopandas as gpd
import folium 
import json
import pandas as pd
import numpy as np

from folium.plugins import HeatMap
from folium.plugins import MarkerCluster
import branca.colormap as cm
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import pickle
import networkx as nx
from scipy import spatial


In [65]:
# Functions
def WGS84toDutchRD(wgs84East, wgs84North):
    """Convert a WGS84 position to Dutch RD x/y coordinates.

    Translated from Peter Knoppers's code; the conversion is a polynomial
    in the offsets from a fixed reference point.

    Parameters
    ----------
    wgs84East : float
        Longitude in decimal degrees.
    wgs84North : float
        Latitude in decimal degrees.

    Returns
    -------
    tuple
        ``(x, y)`` in the Dutch RD system, or the sentinel ``(-1, -1)`` when
        the position lies outside the supported bounding box
        (longitude 3.2..7.3, latitude 50.6..53.7).
    """
    # Bounding box of the Dutch RD system, expressed in WGS84 degrees.
    lon_west, lon_east = 3.2, 7.3
    lat_south, lat_north = 50.6, 53.7

    # Guard clause: anything outside the box yields the sentinel pair.
    if (wgs84North > lat_north or wgs84North < lat_south
            or wgs84East < lon_west or wgs84East > lon_east):
        return -1, -1

    # Coefficient tables: row index = power of d_north, column index = power of d_east.
    x_coeffs = (
        (155000.00, 190094.945, -0.008, -32.391, 0.0),
        (-0.705, -11832.228, 0.0, 0.608, 0.0),
        (0.0, -114.221, 0.0, 0.148, 0.0),
        (0.0, -2.340, 0.0, 0.0, 0.0),
        (0.0, 0.0, 0.0, 0.0, 0.0),
    )
    y_coeffs = (
        (463000.00, 0.433, 3638.893, 0.0, 0.092),
        (309056.544, -0.032, -157.984, 0.0, -0.054),
        (73.077, 0.0, -6.439, 0.0, 0.0),
        (59.788, 0.0, 0.0, 0.0, 0.0),
        (0.0, 0.0, 0.0, 0.0, 0.0),
    )

    # Scaled offsets from the reference point of the projection.
    d_north = 0.36 * (wgs84North - 52.15517440)
    d_east = 0.36 * (wgs84East - 5.38720621)

    x_rd = 0
    y_rd = 0
    north_power = 1
    for x_row, y_row in zip(x_coeffs, y_coeffs):
        east_power = 1
        for x_coeff, y_coeff in zip(x_row, y_row):
            x_rd += x_coeff * east_power * north_power
            y_rd += y_coeff * east_power * north_power
            east_power *= d_east
        north_power *= d_north
    return x_rd, y_rd

def DutchRDtoWGS84(rdX, rdY):
    """Convert Dutch RD x/y coordinates to a WGS84 position.

    Parameters
    ----------
    rdX : float
        RD x coordinate; supported range 11000..280000.
    rdY : float
        RD y coordinate; supported range 300000..630000.

    Returns
    -------
    tuple
        ``(north, east)``, i.e. latitude first and longitude second, in
        decimal degrees; or the sentinel ``(-1, -1)`` when the input lies
        outside the supported range.
    """
    x_low, x_high = 11000, 280000
    y_low, y_high = 300000, 630000

    # Guard clause: out-of-range input yields the sentinel pair.
    if rdX < x_low or rdX > x_high or rdY < y_low or rdY > y_high:
        return -1, -1

    # Offsets from the reference point, scaled to units of 100 km.
    d_x = (rdX - 155000.0) / 100000.0
    d_y = (rdY - 463000.0) / 100000.0

    # Coefficient tables: row index = power of d_x, column index = power of d_y.
    # Every term is divided by 3600 when it is accumulated below.
    north_coeffs = (
        (3600 * 52.15517440, 3235.65389, -0.24750, -0.06550, 0.0),
        (-0.00738, -0.00012, 0.0, 0.0, 0.0),
        (-32.58297, -0.84978, -0.01709, -0.00039, 0.0),
        (0.0, 0.0, 0.0, 0.0, 0.0),
        (0.00530, 0.00033, 0.0, 0.0, 0.0),
        (0.0, 0.0, 0.0, 0.0, 0.0),
    )
    east_coeffs = (
        (3600 * 5.38720621, 0.01199, 0.00022, 0.0, 0.0),
        (5260.52916, 105.94684, 2.45656, 0.05594, 0.00128),
        (-0.00022, 0.0, 0.0, 0.0, 0.0),
        (-0.81885, -0.05607, -0.00256, 0.0, 0.0),
        (0.0, 0.0, 0.0, 0.0, 0.0),
        (0.00026, 0.0, 0.0, 0.0, 0.0),
    )

    north = 0
    east = 0
    x_power = 1
    for north_row, east_row in zip(north_coeffs, east_coeffs):
        y_power = 1
        for north_coeff, east_coeff in zip(north_row, east_row):
            north += north_coeff * x_power * y_power / 3600.0
            east += east_coeff * x_power * y_power / 3600.0
            y_power *= d_y
        x_power *= d_x
    return north, east



# travel_time function
def travel_time_func(point1, point2, time='min'):
    """Return the fastest route and its travel time between two WGS84 points.

    Both points are converted to Dutch RD coordinates, snapped to the nearest
    node of the module-level road graph ``G`` and connected by the
    minimum-weight path. The graph nodes are expected to be ``(x, y)`` pairs
    in RD coordinates, since they are matched against the converted points.

    Parameters
    ----------
    point1, point2 : sequence
        ``(longitude, latitude)`` of the start and end point. In this notebook
        ``point1`` is an inspector location and ``point2`` an incident.
    time : str, optional
        ``'peak'`` uses the ``'Peak_travel_time_[s]'`` edge attribute; any
        other value uses ``'Min_travel_time_[s]'``.

    Returns
    -------
    tuple
        ``(route, travel_time)``: the list of graph nodes on the path and the
        summed edge weight along it.

    Raises
    ------
    ValueError
        If the graph has no nodes, or if a point lies outside the area
        supported by ``WGS84toDutchRD`` (which signals this with ``(-1, -1)``).
        Previously such a point was silently snapped to an arbitrary node.
    networkx.NetworkXNoPath
        If the two snapped nodes are not connected.
    """
    # Determine which travel times to use
    weight_key = 'Peak_travel_time_[s]' if time == 'peak' else 'Min_travel_time_[s]'

    nodes = list(G.nodes())
    if not nodes:
        raise ValueError("road graph G contains no nodes")

    # Change points to the Dutch system, rejecting the out-of-range sentinel.
    rd_points = []
    for label, point in (('point1', point1), ('point2', point2)):
        rd_x, rd_y = WGS84toDutchRD(point[0], point[1])
        if rd_x == -1 and rd_y == -1:
            raise ValueError(
                f"{label}={tuple(point)!r} is outside the supported WGS84 range "
                "(expected (longitude, latitude) within the Netherlands)"
            )
        rd_points.append((rd_x, rd_y))

    # Build the spatial index once and snap both points in a single query.
    tree = spatial.KDTree(np.array(nodes))
    _, nearest = tree.query(rd_points)

    # Index back into the node list so the graph's own node objects are used.
    node1 = nodes[nearest[0]]
    node2 = nodes[nearest[1]]

    # One search yields both the path and its length.
    travel_time, route = nx.bidirectional_dijkstra(G, node1, node2, weight=weight_key)

    return route, travel_time

def calc_distance(line_wkt):
    """Return the length of a WKT line as the sum of its segment lengths.

    The geometry is parsed with ``ogr.CreateGeometryFromWkt`` and each pair of
    consecutive points is measured with ``geodesic(...).m``.

    NOTE(review): neither ``ogr`` nor ``geodesic`` is imported in the visible
    part of this notebook; confirm they are in scope before calling this.

    Parameters
    ----------
    line_wkt : str
        Well-known-text representation of the line.

    Returns
    -------
    number
        Sum of the segment lengths; ``0`` when the line has fewer than two points.
    """
    vertices = ogr.CreateGeometryFromWkt(line_wkt).GetPoints()
    consecutive_pairs = zip(vertices, vertices[1:])
    return sum(geodesic(start, end).m for start, end in consecutive_pairs)

# Load the highway road-section shapefile with geopandas.
highway_shapefile = 'Shapefiles/Snelheid_Wegvakken.shp'
network_temp = gpd.read_file(highway_shapefile)

# Sanity check of the coordinate helpers: convert a WGS84 point
# (longitude 4.33, latitude 52.04) to Dutch RD and back, then print the result.
# DutchRDtoWGS84 returns (latitude, longitude), so the output should be close to (52.04, 4.33).
if __name__=="__main__":
    x, y = WGS84toDutchRD(4.33, 52.04)
    print(DutchRDtoWGS84(x, y))

def WGS84toDutchRD(wgs84East, wgs84North):
    # NOTE(review): this is a second definition of WGS84toDutchRD in the same cell.
    # It rebinds the name and shadows the earlier definition; one of the two should be removed.

    # Convert a WGS84 position to Dutch RD (x, y) coordinates.
    # Translated from Peter Knoppers's code.

    # wgs84East: longitude
    # wgs84North: latitude
    # Returns (x, y), or (-1, -1) when the position is outside the limits below.

    # Western boundary of the Dutch RD system (longitude).
    WGS84_WEST_LIMIT = 3.2

    # Eastern boundary of the Dutch RD system (longitude).
    WGS84_EAST_LIMIT = 7.3

    # Southern boundary of the Dutch RD system (latitude).
    WGS84_SOUTH_LIMIT = 50.6

    # Northern boundary of the Dutch RD system (latitude).
    WGS84_NORTH_LIMIT = 53.7

    # Out-of-range input is signalled with the sentinel pair (-1, -1).
    if (wgs84North > WGS84_NORTH_LIMIT) or \
        (wgs84North < WGS84_SOUTH_LIMIT) or \
        (wgs84East < WGS84_WEST_LIMIT) or \
        (wgs84East > WGS84_EAST_LIMIT):
        resultX = -1
        resultY = -1
    else:
        # Polynomial coefficients for x (r) and y (s):
        # row index p is the power of dNorth, column index q is the power of dEast.
        r = [[155000.00, 190094.945,   -0.008, -32.391, 0.0],
            [-0.705, -11832.228,    0.0  ,   0.608, 0.0],
            [0.0  ,   -114.221,    0.0  ,   0.148, 0.0],
            [0.0  ,     -2.340,    0.0  ,   0.0  , 0.0],
            [0.0  ,      0.0  ,    0.0  ,   0.0  , 0.0]]
        s = [[463000.00 ,      0.433, 3638.893,   0.0  ,  0.092],
            [309056.544,     -0.032, -157.984,   0.0  , -0.054],
            [73.077,      0.0  ,   -6.439,   0.0  ,  0.0],
            [59.788,      0.0  ,    0.0  ,   0.0  ,  0.0],
            [0.0  ,      0.0  ,    0.0  ,   0.0  ,  0.0]]
        resultX = 0
        resultY = 0
        powNorth = 1
        # Scaled offsets from the reference point (latitude 52.15517440, longitude 5.38720621).
        dNorth = 0.36 * (wgs84North - 52.15517440)
        dEast = 0.36 * (wgs84East - 5.38720621)

        # Accumulate sum over p, q of coeff[p][q] * dEast**q * dNorth**p,
        # building the powers incrementally.
        for p in range(5):
            powEast = 1
            for q in range(5):
                resultX = resultX + r[p][q] * powEast * powNorth
                resultY = resultY + s[p][q] * powEast * powNorth
                powEast = powEast * dEast
            powNorth = powNorth * dNorth
    return resultX, resultY

# Road-section speed data (semicolon-separated file).
road_section_data = pd.read_csv('speed_data', sep=';')

# Road-network graph used by travel_time_func.
# The context manager closes the file handle, which the previous bare open() left open.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
with open('NetworkX_graph_new.pickle', 'rb') as graph_file:
    G = pickle.load(graph_file)

# Read the incident CSV file, give the columns explicit names and parse the timestamps.
df = pd.read_csv('incidents19Q3Q4.csv')
df.columns = ['index', 'id', 'type', 'start_time', 'end_time', 'road_number', 'longitude', 'latitude']
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])

def data_filter(data_input):
    """Return the cleaned A-road incidents from ``data_input``.

    Steps:
      1. drop rows containing any missing value;
      2. normalise the ``'A12 hrb'``, ``'A16 hrb'`` and ``'A2 hrb'`` road
         numbers to ``'A12'``, ``'A16'`` and ``'A2'``;
      3. keep only rows whose road number starts with ``'A'``;
      4. drop the record whose ``'index'`` column equals 183130.

    The input frame is not modified. The previous implementation ignored its
    argument and read the global ``df`` instead, and it wrote into the result
    of ``dropna()``, which triggered pandas' SettingWithCopyWarning.

    Parameters
    ----------
    data_input : pandas.DataFrame
        Incident table with at least the columns ``'index'`` and ``'road_number'``.

    Returns
    -------
    pandas.DataFrame
        New frame with the filtered rows; original index labels are kept.
    """
    # Explicit copy so the column assignment below never touches the caller's frame.
    cleaned = data_input.dropna().copy()
    cleaned['road_number'] = cleaned['road_number'].replace(
        {'A12 hrb': 'A12', 'A16 hrb': 'A16', 'A2 hrb': 'A2'}
    )

    is_a_road = cleaned['road_number'].str.startswith('A')
    # Record 183130 is excluded explicitly, as in the original analysis.
    is_kept_record = cleaned['index'] != 183130

    return cleaned.loc[is_a_road & is_kept_record].copy()

# Apply the cleaning / A-road filter to the raw incident table.
incidents_df = data_filter(df)

# The dataframe for validation: incident coordinates as an array with columns (latitude, longitude).
data = incidents_df[['latitude', 'longitude']].values

(52.03999999894767, 4.330000046074026)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Here the share of the data selected for validation is set to 20% (`percentage_of_test = 0.2`) to limit the running time. Note: the validation should use the data that remains after setting aside the training data.

In [66]:
# Select 20% of the filtered incidents as the validation sample (fixed seed for reproducibility).
percentage_of_test = 0.2
# Size the sample from incidents_df itself rather than from the global `data`,
# which a later cell reuses for something else.
num_rows_to_select = int(percentage_of_test * len(incidents_df))
random_dataframe = incidents_df.sample(n=num_rows_to_select, random_state=42)

validation_dataframe = pd.DataFrame(random_dataframe)
# Coordinates of the sampled incidents, columns ordered (latitude, longitude).
validation_data = validation_dataframe[['latitude', 'longitude']].values

Prepare the result of your method and store it as a dataframe.

In [67]:
# Summary statistics of the numeric columns of the validation sample.
validation_dataframe.describe()

Unnamed: 0,index,longitude,latitude
count,14422.0,14422.0,14422.0
mean,221892.900638,5.132876,52.017099
std,109047.320793,0.611447,0.410717
min,12.0,3.596818,50.758739
25%,142329.25,4.649327,51.825677
50%,219901.0,5.009739,52.024513
75%,328425.5,5.56286,52.274388
max,374075.0,7.211078,53.20644


# Put your result here, in the same format as the dataframe below:

Here is an example of the inspector coordinates you should print:

In [68]:
# Load the proposed inspector locations; column '0' holds the latitude and column '1' the longitude.
result_df = pd.read_csv('inspector_locations.csv', sep=';')
result_df = pd.DataFrame(result_df, columns=('0', '1'))

# Convert every entry to a Python float before building the arrays.
latitude = np.array(list(map(float, result_df['0'].values)))
longitude = np.array(list(map(float, result_df['1'].values)))

# One row per inspector, ordered (longitude, latitude) as travel_time_func expects.
inspector_coordinates = np.stack((longitude, latitude), axis=1)

print(inspector_coordinates)

[[ 5.7239645  51.91524764]
 [ 4.52396981 52.14752122]
 [ 6.20289793 52.63947774]
 [ 5.18776183 52.09215873]
 [ 4.53817811 51.91010657]
 [ 5.40491152 51.47007894]
 [ 5.7975566  50.96415582]
 [ 4.69602076 52.49033978]
 [ 4.31724415 51.4890146 ]
 [ 5.69953407 53.0028806 ]
 [ 4.88231301 51.65220186]
 [ 6.59330058 53.16281055]
 [ 4.99493484 52.31657651]
 [ 5.98102422 51.98215196]
 [ 5.59189851 52.16875261]
 [ 4.36775207 51.90883476]
 [ 3.89769992 51.47991597]
 [ 6.74363123 52.28336146]
 [ 5.60434972 51.31409948]
 [ 5.23227072 51.85587892]
 [ 6.01888619 52.19444814]
 [ 5.20307828 53.01839289]
 [ 5.41940852 52.43158897]
 [ 6.12596274 51.36855153]
 [ 5.95421782 52.98485238]
 [ 4.85495271 52.40613961]
 [ 4.94238452 51.84428368]
 [ 4.68921873 52.03289537]
 [ 5.97856965 52.45435165]
 [ 5.34020214 51.71359349]
 [ 4.73204738 51.56566005]
 [ 5.20394008 52.26707061]
 [ 5.41777745 52.15672068]
 [ 6.89932974 53.12808138]
 [ 5.06749026 52.0274545 ]
 [ 5.13415271 51.56276424]
 [ 6.3082515  52.24184736]
 

## 1. Day of a week

First we need to classify all data by day of the week.

In [69]:
# Incident coordinates ordered (longitude, latitude), the order travel_time_func expects.
# Taken straight from the dataframe instead of swapping the columns of the existing array,
# so running this cell more than once can no longer flip the order back.
validation_data = validation_dataframe[['longitude', 'latitude']].values
validation_data


array([[ 6.00574398, 50.83420944],
       [ 5.22854805, 51.69330978],
       [ 5.71147871, 52.01897049],
       ...,
       [ 4.62426043, 51.8567009 ],
       [ 5.04567814, 52.32836914],
       [ 4.44758892, 51.86660004]])

Here we calculate the shortest travel time for each incident.

In [70]:
# For every incident, keep the smallest travel time over all inspector locations.
# Each entry is [incident position in validation_data, minimum travel time];
# the time stays at infinity when there are no inspectors at all.
results = [
    [
        incident_idx,
        min(
            (travel_time_func(inspector, incident)[1] for inspector in inspector_coordinates),
            default=float('inf'),
        ),
    ]
    for incident_idx, incident in enumerate(validation_data)
]

# Create a DataFrame from the results list
results_df = pd.DataFrame(results, columns=["Incident Index", "Minimum Travel Time"])


In [71]:
# Display the sampled validation incidents (the rendered table below is truncated).
validation_dataframe

Unnamed: 0,index,id,type,start_time,end_time,road_number,longitude,latitude
69456,333510,RWS03_801224_1,vehicle_obstruction,2019-10-24 07:27:56,2019-10-24 08:09:29,A76,6.005744,50.834209
68709,331758,RWS03_800217_1,vehicle_obstruction,2019-10-22 14:37:30,2019-10-22 15:13:02,A59,5.228548,51.693310
79043,354186,RWS03_821463_1,vehicle_obstruction,2019-12-02 11:36:37,2019-12-02 12:51:49,A12,5.711479,52.018970
23397,224174,RWS02_0000107275_107275,vehicle_obstruction,2019-10-16 17:12:44,2019-10-16 17:13:17,A2,5.788115,50.966351
75224,346437,RWS03_816290_1,vehicle_obstruction,2019-11-21 09:35:26,2019-11-21 10:50:32,A4,4.376596,51.893478
...,...,...,...,...,...,...,...,...
791,176512,LCM-LCM19201088-IM_1,general_obstruction,2019-11-07 07:56:07,2019-11-07 09:11:19,A1,6.786745,52.288742
19108,124982,RWS02_0000102391_102391,vehicle_obstruction,2019-09-30 12:45:37,2019-09-30 12:58:28,A12,5.623880,52.021141
86127,368623,RWS03_830076_1,vehicle_obstruction,2019-12-20 05:48:46,2019-12-20 07:04:24,A15,4.624260,51.856701
43810,139866,RWS03_759355_1,vehicle_obstruction,2019-08-12 06:11:17,2019-08-12 07:26:50,A1,5.045678,52.328369


In [72]:
# results_df holds one row per validation incident, in the same order as validation_dataframe,
# so the incident attributes are attached by position. Raw arrays are assigned (instead of
# one-column DataFrames) so that nothing depends on index alignment, which would silently
# produce NaN values if the two indexes ever differed.
assert len(results_df) == len(validation_dataframe), "results_df and validation_dataframe differ in length"
for column in ('start_time', 'longitude', 'latitude'):
    results_df[column] = validation_dataframe[column].values


# Validation in time dimension

Here we classify the data by day of the week and time of day, and then calculate the average travel time needed.

In [73]:
# Derive the calendar features from the incident start time with the vectorised .dt accessor
# (dayofweek: Monday = 0 ... Sunday = 6; hour: 0..23).
day_of_week = results_df['start_time'].dt.dayofweek
results_df['day of week'] = day_of_week
results_df['hour of day'] = results_df['start_time'].dt.hour
results_df

Unnamed: 0,Incident Index,Minimum Travel Time,start_time,longitude,latitude,day of week,hour of day
0,0,838.189187,2019-10-24 07:27:56,6.005744,50.834209,3,7
1,1,603.974491,2019-10-22 14:37:30,5.228548,51.693310,1,14
2,2,884.100000,2019-12-02 11:36:37,5.711479,52.018970,0,11
3,3,193.019233,2019-10-16 17:12:44,5.788115,50.966351,2,17
4,4,637.564092,2019-11-21 09:35:26,4.376596,51.893478,3,9
...,...,...,...,...,...,...,...
14417,14417,1919.550958,2019-11-07 07:56:07,6.786745,52.288742,3,7
14418,14418,612.200000,2019-09-30 12:45:37,5.623880,52.021141,0,12
14419,14419,477.593967,2019-12-20 05:48:46,4.624260,51.856701,4,5
14420,14420,212.080254,2019-08-12 06:11:17,5.045678,52.328369,0,6


In [74]:
# Summarise the minimum travel time per day of the week and per time range of the day.
# Loop variables are named so they no longer overwrite the data_filter() function
# or the `data` array defined in earlier cells.
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

results = []
# Inclusive hour ranges; together they cover hours 0..23.
time_ranges = [(0, 5), (6, 9), (10, 14), (15, 18), (19, 23)]
for day_number, day_name in enumerate(day_names):
    day_rows = results_df[results_df['day of week'] == day_number]

    for start_hour, end_hour in time_ranges:
        in_time_range = day_rows['hour of day'].between(start_hour, end_hour)
        travel_times = day_rows.loc[in_time_range, 'Minimum Travel Time']
        number_of_incidents = len(travel_times)

        # Share of incidents reached in less than 18 minutes (1080 seconds)
        if number_of_incidents > 0:
            count_below_18mins = (travel_times < 1080).sum()
            pro_below_18mins = "{:.2f}".format(count_below_18mins / number_of_incidents)
        else:
            pro_below_18mins = 'NAN'

        # Average travel time; the mean is NaN when there is nothing to average
        avg_travel_time = travel_times.mean()
        if pd.notna(avg_travel_time):
            avg_travel_time = "{:.2f}".format(avg_travel_time)
        else:
            avg_travel_time = 'NAN'

        results.append({
            'Day': day_name,
            'Time ranges': f'{start_hour}-{end_hour}',
            'Number of incidents': number_of_incidents,
            'Average Travel Time': avg_travel_time,
            'Probability of <18 mins': pro_below_18mins
        })

df_results = pd.DataFrame(results)
df_results


Unnamed: 0,Day,Time ranges,Number of incidents,Average Travel Time,Probability of <18 mins
0,Monday,0-5,148,669.75,0.84
1,Monday,6-9,762,607.46,0.89
2,Monday,10-14,681,631.15,0.86
3,Monday,15-18,880,618.38,0.88
4,Monday,19-23,345,606.12,0.88
5,Tuesday,0-5,121,618.16,0.86
6,Tuesday,6-9,744,596.39,0.90
7,Tuesday,10-14,685,624.95,0.88
8,Tuesday,15-18,999,586.70,0.90
9,Tuesday,19-23,367,646.14,0.87


In [75]:
# Write the per-day / per-time-range summary table to CSV, without the row index.
df_results.to_csv('results11.csv', index=False)


## Overview Visualization (Time dimension)

In [76]:
pip install plotly


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [77]:
import numpy as np
import plotly.graph_objs as go
import plotly.offline as pyo

# Prepare the data
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
time_ranges = ['0-5', '6-9', '10-14', '15-18', '19-23']

# Build the (time range x day) grids from the summary table instead of random placeholder values.
# df_results stores formatted strings ('NAN' for empty groups), so convert them back to numbers;
# anything non-numeric becomes NaN.
summary_numeric = df_results.assign(**{
    'Average Travel Time': pd.to_numeric(df_results['Average Travel Time'], errors='coerce'),
    'Probability of <18 mins': pd.to_numeric(df_results['Probability of <18 mins'], errors='coerce'),
})

# Surface height: average travel time. Rows follow time_ranges, columns follow days.
Z = (
    summary_numeric
    .pivot(index='Time ranges', columns='Day', values='Average Travel Time')
    .reindex(index=time_ranges, columns=days)
    .to_numpy()
)
# Surface colour: share of incidents reached in under 18 minutes, same layout as Z.
colors = (
    summary_numeric
    .pivot(index='Time ranges', columns='Day', values='Probability of <18 mins')
    .reindex(index=time_ranges, columns=days)
    .to_numpy()
)

# Create a surface plot; cmin/cmax apply to the colour values, which lie in [0, 1]
surface = go.Surface(z=Z, x=days, y=time_ranges, surfacecolor=colors, colorscale='Viridis', cmin=0, cmax=1, colorbar=dict(title='Probability of <18 mins'))

# Set axis labels
layout = go.Layout(
    scene=dict(
        xaxis_title='Day',
        yaxis_title='Time ranges',
        zaxis_title='Average Travel Time'
    )
)

# Set custom x-axis labels
layout.scene.xaxis.update(tickvals=days, ticktext=days)

# Set custom y-axis labels
layout.scene.yaxis.update(tickvals=time_ranges, ticktext=time_ranges)

# Create the figure
fig = go.Figure(data=[surface], layout=layout)

# Set subplot title
fig.update_layout(title='Validation Heatmap')

# Save the interactive plot as an HTML file
pyo.plot(fig, filename='Time dimension.html')


'Time dimension.html'

## Overview Visualization (Geographic dimension)

In [78]:
import folium
from folium.plugins import HeatMap
import pandas as pd


m = folium.Map(location=[52.399190, 4.893658], zoom_start=10, zoom_control=False)

# Travel time (in seconds) at which the heat weight saturates.
# 2000 s was the top of the original colour scale.
max_travel_time = 2000.0

# One [latitude, longitude, weight] entry per incident, with the weight scaled to 0..1.
heat_data = [
    [float(lat), float(lon), min(float(travel_time) / max_travel_time, 1.0)]
    for lat, lon, travel_time in zip(
        results_df['latitude'], results_df['longitude'], results_df['Minimum Travel Time']
    )
]

# Gradient stops are fractions of the maximum intensity (0..1):
# 0 s, 500 s, 1000 s and 2000 s relative to max_travel_time.
color_gradient = {
    0.0: 'blue',
    0.25: 'green',
    0.5: 'yellow',
    1.0: 'red'
}

# Add a single HeatMap layer; previously the same data was added twice,
# once with the default gradient and once with the custom one.
HeatMap(heat_data, gradient=color_gradient).add_to(m)

m.save('Geographic_dimension.html')






