# Validation 

Here we use 20% of the historical incident data to validate how the model performs. We then validate it along several dimensions: day of the week, time period of the day, and highway number.

Preparation

In [1]:
## Importing all neccesary modules
import geopandas as gpd
import folium 
import json
import pandas as pd
import numpy as np

from folium.plugins import HeatMap
from folium.plugins import MarkerCluster
import branca.colormap as cm
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import pickle
import networkx as nx
from scipy import spatial

# Functions
def WGS84toDutchRD(wgs84East, wgs84North):
    """Convert a WGS84 position to Dutch RD x/y coordinates.

    Translated from Peter Knoppers's code.

    Parameters
    ----------
    wgs84East : float
        Longitude in degrees.
    wgs84North : float
        Latitude in degrees.

    Returns
    -------
    tuple
        ``(resultX, resultY)``. When the position lies outside the bounding
        box checked below, the sentinel ``(-1, -1)`` is returned instead.
    """
    # Bounding box of the Dutch RD system, in WGS84 degrees.
    WGS84_WEST_LIMIT = 3.2    # western boundary (longitude)
    WGS84_EAST_LIMIT = 7.3    # eastern boundary (longitude)
    WGS84_SOUTH_LIMIT = 50.6  # southern boundary (latitude)
    WGS84_NORTH_LIMIT = 53.7  # northern boundary (latitude)

    out_of_bounds = (
        wgs84East < WGS84_WEST_LIMIT
        or wgs84East > WGS84_EAST_LIMIT
        or wgs84North < WGS84_SOUTH_LIMIT
        or wgs84North > WGS84_NORTH_LIMIT
    )
    if out_of_bounds:
        return -1, -1

    # Polynomial coefficients: row index = power of dNorth, column index =
    # power of dEast.
    x_coefficients = [
        [155000.00, 190094.945, -0.008, -32.391, 0.0],
        [-0.705, -11832.228, 0.0, 0.608, 0.0],
        [0.0, -114.221, 0.0, 0.148, 0.0],
        [0.0, -2.340, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0],
    ]
    y_coefficients = [
        [463000.00, 0.433, 3638.893, 0.0, 0.092],
        [309056.544, -0.032, -157.984, 0.0, -0.054],
        [73.077, 0.0, -6.439, 0.0, 0.0],
        [59.788, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0],
    ]

    # Scaled offsets from the reference point used by the polynomial.
    dNorth = 0.36 * (wgs84North - 52.15517440)
    dEast = 0.36 * (wgs84East - 5.38720621)

    resultX = 0
    resultY = 0
    north_power = 1
    for x_row, y_row in zip(x_coefficients, y_coefficients):
        east_power = 1
        for x_coef, y_coef in zip(x_row, y_row):
            resultX += x_coef * east_power * north_power
            resultY += y_coef * east_power * north_power
            east_power *= dEast
        north_power *= dNorth
    return resultX, resultY

def DutchRDtoWGS84(rdX, rdY):
    """Convert Dutch RD x/y coordinates to a WGS84 position.

    Parameters
    ----------
    rdX, rdY : float
        Dutch RD coordinates.

    Returns
    -------
    tuple
        ``(resultNorth, resultEast)`` -- north component first, i.e. the
        reverse of the argument order taken by ``WGS84toDutchRD``. The
        sentinel ``(-1, -1)`` is returned when the input lies outside the
        RD limits checked below.
    """
    RD_MINIMUM_X = 11000
    RD_MAXIMUM_X = 280000
    RD_MINIMUM_Y = 300000
    RD_MAXIMUM_Y = 630000

    outside_rd_area = (
        rdX < RD_MINIMUM_X
        or rdX > RD_MAXIMUM_X
        or rdY < RD_MINIMUM_Y
        or rdY > RD_MAXIMUM_Y
    )
    if outside_rd_area:
        return -1, -1

    # Offsets from the RD reference point, scaled by 100000.
    dX = (rdX - 155000.0) / 100000.0
    dY = (rdY - 463000.0) / 100000.0

    # Polynomial coefficients (results are divided by 3600 below): row index =
    # power of dX, column index = power of dY.
    north_coefficients = [
        [3600 * 52.15517440, 3235.65389, -0.24750, -0.06550, 0.0],
        [-0.00738, -0.00012, 0.0, 0.0, 0.0],
        [-32.58297, -0.84978, -0.01709, -0.00039, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0],
        [0.00530, 0.00033, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0],
    ]
    east_coefficients = [
        [3600 * 5.38720621, 0.01199, 0.00022, 0.0, 0.0],
        [5260.52916, 105.94684, 2.45656, 0.05594, 0.00128],
        [-0.00022, 0.0, 0.0, 0.0, 0.0],
        [-0.81885, -0.05607, -0.00256, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0],
        [0.00026, 0.0, 0.0, 0.0, 0.0],
    ]

    resultNorth = 0
    resultEast = 0
    x_power = 1
    for north_row, east_row in zip(north_coefficients, east_coefficients):
        y_power = 1
        for north_coef, east_coef in zip(north_row, east_row):
            resultNorth += north_coef * x_power * y_power / 3600.0
            resultEast += east_coef * x_power * y_power / 3600.0
            y_power *= dY
        x_power *= dX
    return resultNorth, resultEast



# travel_time function
def travel_time_func(point1, point2, time='min'):
    """Return the fastest route and its travel time between two positions.

    Both positions are converted to Dutch RD coordinates, snapped to the
    nearest node of the module-level graph ``G`` (whose nodes are used here as
    (x, y) coordinate pairs), and connected by the path that minimises the
    selected travel-time edge attribute.

    Parameters
    ----------
    point1, point2 : sequence of two floats
        Positions as (longitude, latitude).
    time : str, optional
        ``'peak'`` selects the ``'Peak_travel_time_[s]'`` edge attribute; any
        other value selects ``'Min_travel_time_[s]'``.

    Returns
    -------
    tuple
        ``(route, travel_time)``: the list of nodes on the path and the sum of
        the selected edge attribute along it.

    Raises
    ------
    ValueError
        If either position lies outside the area ``WGS84toDutchRD`` can
        convert. Previously the ``(-1, -1)`` sentinel was silently snapped to
        a graph node, which produced a meaningless travel time.
    networkx.NetworkXNoPath
        If the two snapped nodes are not connected in ``G``.
    """
    # Determine which travel times to use
    time_string = 'Peak_travel_time_[s]' if time == 'peak' else 'Min_travel_time_[s]'

    # Change points to Dutch system, rejecting the out-of-bounds sentinel
    rd_points = []
    for label, point in (('point1', point1), ('point2', point2)):
        rd_x, rd_y = WGS84toDutchRD(point[0], point[1])
        if rd_x == -1 and rd_y == -1:
            raise ValueError(
                "{} {!r} lies outside the area covered by the Dutch RD "
                "conversion".format(label, tuple(point))
            )
        rd_points.append((rd_x, rd_y))

    # Create numpy matrix from nodes
    A = np.array(list(G.nodes()))

    # Get node closest to each point: build the KD-tree once and query both
    # points together instead of constructing a separate tree per point.
    _, nearest = spatial.KDTree(A).query(rd_points)
    node1 = (A[nearest[0]][0], A[nearest[0]][1])
    node2 = (A[nearest[1]][0], A[nearest[1]][1])

    # Get shortest path between nodes. A single search yields both the path
    # and its length, so the graph is no longer traversed twice; the length
    # may differ from the previous value only by floating-point rounding.
    travel_time, route = nx.bidirectional_dijkstra(G, node1, node2, weight=time_string)

    return route, travel_time

def calc_distance(line_wkt):
    """Return the length of a WKT line as the sum of its segment lengths.

    The geometry is parsed with ``ogr.CreateGeometryFromWkt`` and each pair of
    consecutive points is measured with ``geodesic(...).m``.

    NOTE(review): neither ``ogr`` nor ``geodesic`` is imported anywhere in the
    visible notebook, so calling this function raises ``NameError`` unless
    they are provided elsewhere -- presumably ``osgeo.ogr`` and
    ``geopy.distance.geodesic``; confirm. Also confirm that the point order
    produced by ``GetPoints()`` matches the order ``geodesic`` expects.
    """
    vertices = ogr.CreateGeometryFromWkt(line_wkt).GetPoints()
    total = 0
    # Walk the line segment by segment: (v0, v1), (v1, v2), ...
    for idx in range(1, len(vertices)):
        total = total + geodesic(vertices[idx - 1], vertices[idx]).m
    return total

# Load the highway road-segment shapefile with geopandas.
highway_shapefile = 'Shapefiles/Snelheid_Wegvakken.shp'
network_temp = gpd.read_file(highway_shapefile)

# Round-trip sanity check: convert a (longitude, latitude) pair to Dutch RD
# and back again. DutchRDtoWGS84 returns the north component first, so the
# printed tuple is in (latitude, longitude) order.
if __name__=="__main__":
    x, y = WGS84toDutchRD(4.33, 52.04)
    print(DutchRDtoWGS84(x, y))

def WGS84toDutchRD(wgs84East, wgs84North):
    """Convert a WGS84 position to Dutch RD x/y coordinates.

    Returns ``(resultX, resultY)``, or the sentinel ``(-1, -1)`` when the
    position lies outside the bounding box checked below.

    NOTE(review): this is a second, identical definition of the
    ``WGS84toDutchRD`` function defined earlier in this cell; it rebinds the
    same name and is a candidate for removal.
    """
    # translated from Peter Knoppers's code

    # wgs84East: longitude
    # wgs84North: latitude

    # Western boundary of the Dutch RD system.
    WGS84_WEST_LIMIT = 3.2

    # Eastern boundary of the Dutch RD system.
    WGS84_EAST_LIMIT = 7.3

    # Southern boundary of the Dutch RD system.
    WGS84_SOUTH_LIMIT = 50.6

    # Northern boundary of the Dutch RD system.
    WGS84_NORTH_LIMIT = 53.7

    # Outside the bounding box no conversion is attempted.
    if (wgs84North > WGS84_NORTH_LIMIT) or \
        (wgs84North < WGS84_SOUTH_LIMIT) or \
        (wgs84East < WGS84_WEST_LIMIT) or \
        (wgs84East > WGS84_EAST_LIMIT):
        resultX = -1
        resultY = -1
    else:
        # Polynomial coefficients: r[p][q] and s[p][q] multiply
        # dNorth**p * dEast**q for the x and y result respectively.
        r = [[155000.00, 190094.945,   -0.008, -32.391, 0.0],
            [-0.705, -11832.228,    0.0  ,   0.608, 0.0],
            [0.0  ,   -114.221,    0.0  ,   0.148, 0.0],
            [0.0  ,     -2.340,    0.0  ,   0.0  , 0.0],
            [0.0  ,      0.0  ,    0.0  ,   0.0  , 0.0]]
        s = [[463000.00 ,      0.433, 3638.893,   0.0  ,  0.092],
            [309056.544,     -0.032, -157.984,   0.0  , -0.054],
            [73.077,      0.0  ,   -6.439,   0.0  ,  0.0],
            [59.788,      0.0  ,    0.0  ,   0.0  ,  0.0],
            [0.0  ,      0.0  ,    0.0  ,   0.0  ,  0.0]]
        resultX = 0
        resultY = 0
        powNorth = 1
        # Scaled offsets from the reference point of the polynomial.
        dNorth = 0.36 * (wgs84North - 52.15517440)
        dEast = 0.36 * (wgs84East - 5.38720621)

        # Accumulate the polynomial; the running powers are updated
        # incrementally rather than recomputed with **.
        for p in range(5):
            powEast = 1
            for q in range(5):
                resultX = resultX + r[p][q] * powEast * powNorth
                resultY = resultY + s[p][q] * powEast * powNorth
                powEast = powEast * dEast
            powNorth = powNorth * dNorth
    return resultX, resultY

# Road-section speed data (semicolon-separated text file).
road_section_data = pd.read_csv('speed_data.txt', sep=';')

# Road network graph used by travel_time_func. The file is opened in a
# context manager so the handle is closed as soon as the graph is loaded.
# NOTE(review): unpickling can execute arbitrary code -- only load graph
# files that come from a trusted source.
with open('NetworkX_graph_new.pickle', 'rb') as graph_file:
    G = pickle.load(graph_file)

# Read the CSV file with the historical incidents, give the columns stable
# names and parse both timestamps.
df = pd.read_csv('incidents19Q3Q4.csv')
df.columns = ['index', 'id', 'type', 'start_time', 'end_time', 'road_number', 'longitude', 'latitude']
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])

def data_filter(data_input, excluded_index_values=(183130,)):
    """Return the cleaned incidents that lie on A-roads.

    The input frame is not modified. Processing steps:

    1. drop every row that contains a missing value;
    2. map the ``'A12 hrb'``, ``'A16 hrb'`` and ``'A2 hrb'`` road labels onto
       their plain road numbers;
    3. keep only rows whose ``road_number`` starts with ``'A'``;
    4. drop the records whose ``'index'`` column value is listed in
       ``excluded_index_values``.

    Parameters
    ----------
    data_input : pandas.DataFrame
        Incident records with at least the columns ``'road_number'`` and
        ``'index'``. This argument is now actually used; the previous
        implementation ignored it and always read the global ``df``.
    excluded_index_values : iterable, optional
        Values of the ``'index'`` column to discard. Defaults to the single
        record (183130) that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The filtered copy of ``data_input``.
    """
    hrb_aliases = {'A12 hrb': 'A12', 'A16 hrb': 'A16', 'A2 hrb': 'A2'}

    complete_rows = data_input.dropna()
    normalised = complete_rows.assign(
        road_number=complete_rows['road_number'].replace(hrb_aliases)
    )

    on_a_road = normalised['road_number'].str.startswith('A')
    is_excluded = normalised['index'].isin(list(excluded_index_values))
    new_data = normalised[on_a_road & ~is_excluded]

    return new_data

# Cleaned incidents (complete rows on A-roads only).
incidents_df = data_filter(df)

# The dataframe for validation: incident coordinates as an array with the
# columns in (latitude, longitude) order.
data = incidents_df[['latitude', 'longitude']].values




(52.03999999894767, 4.330000046074026)


To reduce the running time, only 0.1% of the data is sampled here. Note that validation should use the remaining data, i.e. the data that was not used for training.

In [2]:
# Sample only 0.1% of the incidents to keep the running time short.
percentage_of_test = 0.001
# `data` holds one row per incident, so its length is the number of incidents.
num_rows_to_select = int(percentage_of_test * len(data))
# A fixed random_state keeps the sample reproducible between runs.
random_dataframe = incidents_df.sample(n=num_rows_to_select, random_state=42)

# Sampled incidents, plus their coordinates as a (latitude, longitude) array.
validation_dataframe = pd.DataFrame(random_dataframe)
validation_data = validation_dataframe[['latitude', 'longitude']].values

Prepare the result of your method and store it as a dataframe.

In [3]:
# Summary statistics of the sampled validation incidents.
validation_dataframe.describe()

Unnamed: 0,index,start_time,end_time,longitude,latitude
count,72.0,72,72,72.0,72.0
mean,218730.138889,2019-10-15 12:37:36.666666752,2019-10-15 13:30:00,5.085964,51.970602
min,628.0,2019-08-06 13:17:00,2019-08-06 13:58:00,4.3082,50.834209
25%,137128.25,2019-09-05 19:55:45,2019-09-05 21:14:00,4.647894,51.693205
50%,221594.5,2019-10-16 13:39:30,2019-10-16 13:58:30,4.963928,52.034362
75%,334360.5,2019-11-23 02:31:00,2019-11-23 03:46:45,5.43948,52.331007
max,373922.0,2019-12-31 18:01:00,2019-12-31 19:16:00,6.647789,53.199619
std,110771.063137,,,0.541366,0.430378


# Put your result here, in the same format as the dataframe below:

Here is an example of the inspector coordinates you should print.

In [4]:
# Inspector positions produced by the method under validation.
result_df = pd.read_csv('Joey result1.csv')

# Coerce both coordinate columns to float arrays.
latitude = np.array(list(map(float, result_df['latitude'].values)))
longitude = np.array(list(map(float, result_df['longitude'].values)))

# One row per inspector, in (longitude, latitude) order -- the order
# travel_time_func expects for its points.
inspector_coordinates = np.stack((longitude, latitude), axis=1)

print(inspector_coordinates)

[[ 4.50187431 52.04574431]
 [ 5.67297483 52.01272177]
 [ 5.19670239 52.09236214]
 [ 6.19055917 52.57923173]
 [ 5.23121567 51.52133068]
 [ 4.53683559 51.92267934]
 [ 5.82605138 51.03960755]
 [ 6.63486046 53.2026885 ]
 [ 4.89588392 52.49408976]
 [ 4.08584293 51.42662118]
 [ 5.51762132 52.25671568]
 [ 5.74317938 52.87121431]
 [ 4.72367748 51.59067395]
 [ 5.88324848 51.38606201]
 [ 4.09880591 51.93189666]
 [ 6.62631029 52.2904633 ]
 [ 5.21408784 53.02402607]
 [ 6.27547179 51.94338812]
 [ 4.72094075 51.83727655]
 [ 5.26226288 51.85850537]
 [ 6.02309835 52.44413292]
 [ 4.31577554 51.47890297]
 [ 6.02899282 51.18835695]
 [ 4.35926842 51.9249212 ]
 [ 4.73594229 52.38630993]
 [ 5.11376255 52.3221628 ]
 [ 5.63050498 52.60517898]
 [ 5.63930105 51.42269475]
 [ 6.51624137 52.87752579]
 [ 6.02155437 52.20606677]
 [ 4.986789   52.18730474]
 [ 4.8851476  51.68728427]
 [ 6.08262744 53.08895083]
 [ 4.64245207 51.75405905]
 [ 5.62490303 51.75670605]
 [ 5.69150101 52.34232821]
 [ 4.70205227 52.03844319]
 

## 1. Day of a week

First we need to classify all data by day of the week.

In [5]:
# Incident coordinates in (longitude, latitude) order, the order
# travel_time_func expects. The array is derived directly from
# validation_dataframe instead of swapping the columns of the existing
# `validation_data` in place, so re-running this cell no longer flips the
# columns back to (latitude, longitude).
validation_data = validation_dataframe[['longitude', 'latitude']].values
validation_data


array([[ 6.00574398, 50.83420944],
       [ 5.22854805, 51.69330978],
       [ 5.71147871, 52.01897049],
       [ 5.78811502, 50.96635056],
       [ 4.37659645, 51.89347839],
       [ 4.91174984, 52.42041016],
       [ 4.98722982, 52.3384819 ],
       [ 4.4807725 , 52.12430954],
       [ 5.43932915, 52.19535828],
       [ 5.28893423, 51.50384903],
       [ 4.78152609, 52.3932991 ],
       [ 4.48322487, 52.04769135],
       [ 4.89559698, 52.33385086],
       [ 6.13935   , 51.91523   ],
       [ 4.64989996, 51.78752899],
       [ 4.61131716, 51.84289169],
       [ 4.50442362, 52.13596725],
       [ 4.46320677, 51.93878937],
       [ 4.84232235, 52.37279892],
       [ 5.43837023, 51.40491104],
       [ 5.25423   , 51.8303    ],
       [ 4.7986002 , 52.33005905],
       [ 4.91259813, 52.28876114],
       [ 4.36975002, 52.05672073],
       [ 5.84753799, 52.38959885],
       [ 5.6237669 , 52.6033287 ],
       [ 5.07707787, 51.9581604 ],
       [ 5.01974011, 52.07426071],
       [ 5.42151546,

Here we calculate the shortest travel time for each incident.

In [6]:
# One [incident position, fastest inspector travel time] entry per incident.
results = []

for i, incident_coord in enumerate(validation_data):
    # Travel time from every inspector to this incident.
    inspector_times = [
        travel_time_func(inspector_coord, incident_coord)[1]
        for inspector_coord in inspector_coordinates
    ]

    # Seeding with infinity keeps the result at inf when there are no
    # inspectors, exactly like a running "smaller than the best so far" scan.
    minimum_travel_time = min([float('inf'), *inspector_times])

    results.append([i, minimum_travel_time])

# Collect the per-incident minima in a DataFrame.
results_df = pd.DataFrame(results, columns=["Incident Index", "Minimum Travel Time"])


In [15]:
# Display the sampled validation incidents.
validation_dataframe

Unnamed: 0,index,id,type,start_time,end_time,road_number,longitude,latitude
69456,333510,RWS03_801224_1,vehicle_obstruction,2019-10-24 07:27:00,2019-10-24 08:09:00,A76,6.005744,50.834209
68709,331758,RWS03_800217_1,vehicle_obstruction,2019-10-22 14:37:00,2019-10-22 15:13:00,A59,5.228548,51.693310
79043,354186,RWS03_821463_1,vehicle_obstruction,2019-12-02 11:36:00,2019-12-02 12:51:00,A12,5.711479,52.018970
23397,224174,RWS02_0000107275_107275,vehicle_obstruction,2019-10-16 17:12:00,2019-10-16 17:13:00,A2,5.788115,50.966351
75224,346437,RWS03_816290_1,vehicle_obstruction,2019-11-21 09:35:00,2019-11-21 10:50:00,A4,4.376596,51.893478
...,...,...,...,...,...,...,...,...
70948,336912,RWS03_804064_1,accident,2019-10-29 08:30:00,2019-10-29 09:08:00,A29,4.449137,51.797321
73510,342741,RWS03_808305_1,vehicle_obstruction,2019-11-04 18:33:00,2019-11-04 19:49:00,A10,4.970390,52.363892
82452,361100,RWS03_825610_1,accident,2019-12-10 17:37:00,2019-12-10 18:52:00,A50,5.498248,51.585007
64576,322391,RWS03_793160_1,general_obstruction,2019-10-09 16:33:00,2019-10-09 19:05:00,A12,4.534351,52.037473


In [22]:
# Attach each incident's start time and coordinates to the travel-time results.
# The values are re-wrapped in new single-column DataFrames so they carry a
# fresh default index rather than validation_dataframe's original row labels.
# NOTE(review): this relies on results_df having one row per incident in the
# same order as validation_dataframe -- confirm both were built from the same
# sample before running this cell.
start_time_column = pd.DataFrame(validation_dataframe['start_time'].values, columns=['start_time'])
longitude_column = pd.DataFrame(validation_dataframe['longitude'].values, columns=['longitude'])
latitude_column = pd.DataFrame(validation_dataframe['latitude'].values, columns=['latitude'])
results_df['start_time'] = start_time_column
results_df['longitude'] = longitude_column
results_df['latitude'] = latitude_column


# Validation in time dimension

Here we classify the data by day of the week and time of day, and then calculate the average travel time for each group.

In [23]:
# Derive the grouping keys from the incident start time with the vectorised
# .dt accessor: day of week (0 = Monday ... 6 = Sunday) and hour of day (0-23).
day_of_week = results_df['start_time'].dt.dayofweek
results_df['day of week'] = day_of_week
results_df['hour of day'] = results_df['start_time'].dt.hour
results_df

Unnamed: 0,Incident Index,Minimum Travel Time,start_time,day of week,hour of day,longitude,latitude
0,0,1616.019997,2019-10-24 07:27:00,3,7,6.005744,50.834209
1,1,262.129535,2019-10-22 14:37:00,1,14,5.228548,51.693310
2,2,421.100000,2019-12-02 11:36:00,0,11,5.711479,52.018970
3,3,994.738611,2019-10-16 17:12:00,2,17,5.788115,50.966351
4,4,756.085859,2019-11-21 09:35:00,3,9,4.376596,51.893478
...,...,...,...,...,...,...,...
67,67,1282.700000,2019-10-29 08:30:00,1,8,4.449137,51.797321
68,68,742.074534,2019-11-04 18:33:00,0,18,4.970390,52.363892
69,69,777.198223,2019-12-10 17:37:00,1,17,5.498248,51.585007
70,70,67.847825,2019-10-09 16:33:00,2,16,4.534351,52.037473


In [9]:
# Travel-time target: 18 minutes, expressed in seconds.
TARGET_TRAVEL_TIME_S = 18 * 60

# Day names indexed by the 'day of week' value (0 = Monday).
DAY_NAMES = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Inclusive hour-of-day ranges; together they cover the hours 0-23.
time_ranges = [(0, 5), (6, 9), (10, 14), (15, 18), (19, 23)]

# The loop deliberately uses its own names (summary_rows, day_rows,
# travel_times) so that it does not rebind `results`, the `data_filter`
# function or the `data` array defined by earlier cells -- rebinding them made
# those cells fail or silently change behaviour when re-run afterwards.
summary_rows = []
for day_number, day_name in enumerate(DAY_NAMES):
    day_rows = results_df[results_df['day of week'] == day_number]

    for start_hour, end_hour in time_ranges:
        in_time_range = day_rows['hour of day'].between(start_hour, end_hour)
        travel_times = day_rows.loc[in_time_range, 'Minimum Travel Time']
        number_of_incidents = len(travel_times)

        # Share of incidents reached within the target time
        if number_of_incidents > 0:
            count_below_18mins = (travel_times < TARGET_TRAVEL_TIME_S).sum()
            pro_below_18mins = "{:.2f}".format(count_below_18mins / number_of_incidents)
        else:
            pro_below_18mins = 'NAN'

        # Average travel time; the mean is NaN when the group has no usable values
        avg_travel_time = travel_times.mean()
        if pd.notna(avg_travel_time):
            avg_travel_time = "{:.2f}".format(avg_travel_time)
        else:
            avg_travel_time = 'NAN'

        summary_rows.append({
            'Day': day_name,
            'Time ranges': f'{start_hour}-{end_hour}',
            'Number of incidents': number_of_incidents,
            'Average Travel Time': avg_travel_time,
            'Probability of <18 mins': pro_below_18mins
        })

df_results = pd.DataFrame(summary_rows)
df_results


Unnamed: 0,Day,Time ranges,Number of incidents,Average Travel Time,Probability of <18 mins
0,Monday,0-5,0,NAN,NAN
1,Monday,6-9,2,743.07,1.00
2,Monday,10-14,2,612.53,1.00
3,Monday,15-18,2,694.76,1.00
4,Monday,19-23,3,534.37,1.00
5,Tuesday,0-5,1,604.99,1.00
6,Tuesday,6-9,3,872.78,0.67
7,Tuesday,10-14,5,546.03,0.80
8,Tuesday,15-18,7,790.71,0.86
9,Tuesday,19-23,3,634.50,0.67


In [10]:
# Save the per-day / per-time-range summary table, without the row index.
df_results.to_csv('results11.csv', index=False)


## Overview Visualization (Time dimension)

In [11]:
pip install plotly





In [78]:
import numpy as np
import plotly.graph_objs as go
import plotly.offline as pyo

# Axis categories, in display order
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
time_ranges = ['0-5', '6-9', '10-14', '15-18', '19-23']

# Build the plotted grids from the summary table instead of random
# placeholder values. df_results stores its numbers as formatted strings and
# uses 'NAN' for empty groups, so they are converted back to floats first
# (unparseable entries become NaN and show up as gaps in the surface).
numeric_summary = df_results.assign(**{
    column: pd.to_numeric(df_results[column], errors='coerce')
    for column in ('Average Travel Time', 'Probability of <18 mins')
})


def _summary_grid(value_column):
    """Return value_column as a (time range x day) array in axis order."""
    grid = numeric_summary.pivot(index='Time ranges', columns='Day', values=value_column)
    return grid.reindex(index=time_ranges, columns=days).to_numpy(dtype=float)


Z = _summary_grid('Average Travel Time')           # surface height
colors = _summary_grid('Probability of <18 mins')  # surface colour, in [0, 1]

# Create a surface plot: height = average travel time, colour = probability
surface = go.Surface(z=Z, surfacecolor=colors, x=days, y=time_ranges, colorscale='Viridis', cmin=0, cmax=1, colorbar=dict(title='Probability of <18 mins'))

# Set axis labels
layout = go.Layout(
    scene=dict(
        xaxis_title='Day',
        yaxis_title='Time ranges',
        zaxis_title='Average Travel Time'
    )
)

# Set custom x-axis labels
layout.scene.xaxis.update(tickvals=days, ticktext=days)

# Set custom y-axis labels
layout.scene.yaxis.update(tickvals=time_ranges, ticktext=time_ranges)

# Create the figure
fig = go.Figure(data=[surface], layout=layout)

# Set subplot title
fig.update_layout(title='Validation Heatmap')

# Save the interactive plot as an HTML file
pyo.plot(fig, filename='Time dimension.html')


'Time dimension.html'

## Overview Visualization (Geographic dimension)

In [77]:
import folium
from folium.plugins import HeatMap
import pandas as pd

# Travel time (in seconds) that maps to full heat intensity. 2000 s was the
# top stop of the previous gradient; longer travel times are capped at it.
MAX_TRAVEL_TIME_S = 2000.0

m = folium.Map(location=[52.399190, 4.893658], zoom_start=10, zoom_control=False)

# One [latitude, longitude, weight] entry per incident. Rows with missing
# values are skipped, and the travel time is scaled to a 0-1 weight.
heat_points = results_df[['latitude', 'longitude', 'Minimum Travel Time']].dropna()
heat_points = heat_points.assign(
    weight=heat_points['Minimum Travel Time'].clip(upper=MAX_TRAVEL_TIME_S) / MAX_TRAVEL_TIME_S
)
heat_data = heat_points[['latitude', 'longitude', 'weight']].values.tolist()

# Gradient stops must lie between 0 and 1. These are the previous
# 0 / 500 / 1000 / 2000 second stops divided by MAX_TRAVEL_TIME_S.
# NOTE(review): the heat layer colours by accumulated intensity, so dense
# clusters of short trips can also appear "hot" -- confirm this is the
# intended reading of the map.
color_gradient = {
    0.0: 'blue',
    0.25: 'green',
    0.5: 'yellow',
    1.0: 'red'
}

# Add a single heat layer; previously the same data was added twice (once
# without and once with the gradient), stacking two layers on the map.
HeatMap(heat_data, gradient=color_gradient).add_to(m)

m.save('Geographic_dimension.html')






