# Final notebook

This is the main notebook for the midterm submission from RWS group 3

## Table of contents
1. Analysis of incident data
2. The travel time of the road sections
3. Optimization methods

In [None]:
## Importing all necessary modules
%matplotlib inline
import geopandas as gpd
import folium 
import json
import pandas as pd
import numpy as np

from folium.plugins import HeatMap
from folium.plugins import MarkerCluster
import branca.colormap as cm
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import pickle
import networkx as nx

## 1. Analysis of incident data

### 1.1 Preparation work

In [None]:
# Functions

def DutchRDtoWGS84(rdX, rdY):
    """Convert a Dutch RD coordinate pair to WGS84 using a polynomial series.

    Parameters
    ----------
    rdX, rdY : float
        RD x and y coordinate of the point.

    Returns
    -------
    tuple
        ``(north, east)``: the WGS84 result, each term of the series being
        divided by 3600 before it is summed. ``(-1, -1)`` is returned as a
        sentinel when the input lies outside the accepted RD bounding box.
    """
    # Accepted RD bounding box; anything outside yields the sentinel.
    x_lo, x_hi = 11000, 280000
    y_lo, y_hi = 300000, 630000
    if rdX < x_lo or rdX > x_hi or rdY < y_lo or rdY > y_hi:
        return -1, -1

    # Offsets from the series' reference point, scaled by 1e5.
    dX = (rdX - 155000.0) / 100000.0
    dY = (rdY - 463000.0) / 100000.0

    # Coefficient tables: row index = power of dX, column index = power of dY.
    north_coeffs = (
        (3600 * 52.15517440, 3235.65389, -0.24750, -0.06550, 0.0),
        (-0.00738, -0.00012, 0.0, 0.0, 0.0),
        (-32.58297, -0.84978, -0.01709, -0.00039, 0.0),
        (0.0, 0.0, 0.0, 0.0, 0.0),
        (0.00530, 0.00033, 0.0, 0.0, 0.0),
        (0.0, 0.0, 0.0, 0.0, 0.0),
    )
    east_coeffs = (
        (3600 * 5.38720621, 0.01199, 0.00022, 0.0, 0.0),
        (5260.52916, 105.94684, 2.45656, 0.05594, 0.00128),
        (-0.00022, 0.0, 0.0, 0.0, 0.0),
        (-0.81885, -0.05607, -0.00256, 0.0, 0.0),
        (0.0, 0.0, 0.0, 0.0, 0.0),
        (0.00026, 0.0, 0.0, 0.0, 0.0),
    )

    # Powers are built by repeated multiplication, starting from 1.
    x_powers = [1]
    for _ in range(5):
        x_powers.append(x_powers[-1] * dX)
    y_powers = [1]
    for _ in range(4):
        y_powers.append(y_powers[-1] * dY)

    north = 0
    east = 0
    for x_pow, north_row, east_row in zip(x_powers, north_coeffs, east_coeffs):
        for y_pow, north_c, east_c in zip(y_powers, north_row, east_row):
            north += north_c * x_pow * y_pow / 3600.0
            east += east_c * x_pow * y_pow / 3600.0
    return north, east

def WGS84toDutchRD(wgs84East, wgs84North):
    """Convert a WGS84 position to Dutch RD coordinates with a polynomial series.

    Translated from Peter Knoppers's code.

    Parameters
    ----------
    wgs84East : float
        Longitude.
    wgs84North : float
        Latitude.

    Returns
    -------
    tuple
        ``(x, y)`` in the Dutch RD system, or the sentinel ``(-1, -1)`` when
        the position lies outside the accepted WGS84 bounding box.
    """
    # Western / eastern boundary of the accepted area (longitude).
    west_limit, east_limit = 3.2, 7.3
    # Southern / northern boundary of the accepted area (latitude).
    south_limit, north_limit = 50.6, 53.7

    if (wgs84North > north_limit or wgs84North < south_limit
            or wgs84East < west_limit or wgs84East > east_limit):
        return -1, -1

    # Coefficient tables: row index = power of d_north, column index = power of d_east.
    x_coeffs = (
        (155000.00, 190094.945, -0.008, -32.391, 0.0),
        (-0.705, -11832.228, 0.0, 0.608, 0.0),
        (0.0, -114.221, 0.0, 0.148, 0.0),
        (0.0, -2.340, 0.0, 0.0, 0.0),
        (0.0, 0.0, 0.0, 0.0, 0.0),
    )
    y_coeffs = (
        (463000.00, 0.433, 3638.893, 0.0, 0.092),
        (309056.544, -0.032, -157.984, 0.0, -0.054),
        (73.077, 0.0, -6.439, 0.0, 0.0),
        (59.788, 0.0, 0.0, 0.0, 0.0),
        (0.0, 0.0, 0.0, 0.0, 0.0),
    )

    # Scaled offsets from the series' reference point.
    d_north = 0.36 * (wgs84North - 52.15517440)
    d_east = 0.36 * (wgs84East - 5.38720621)

    # Powers are built by repeated multiplication, starting from 1.
    north_powers = [1]
    east_powers = [1]
    for _ in range(4):
        north_powers.append(north_powers[-1] * d_north)
        east_powers.append(east_powers[-1] * d_east)

    rd_x = 0
    rd_y = 0
    for north_pow, x_row, y_row in zip(north_powers, x_coeffs, y_coeffs):
        for east_pow, x_c, y_c in zip(east_powers, x_row, y_row):
            rd_x += x_c * east_pow * north_pow
            rd_y += y_c * east_pow * north_pow
    return rd_x, rd_y

def calc_distance(line_wkt):
    """Return the summed length of a WKT line, one segment at a time.

    The geometry is parsed with ``ogr.CreateGeometryFromWkt``; every pair of
    consecutive points is measured with ``geodesic(...).m`` and the segment
    lengths are added up.

    NOTE(review): ``ogr`` and ``geodesic`` are not imported in the notebook's
    visible import cell — confirm they are brought into scope before this
    function is called. Also confirm that the point tuples returned by
    ``GetPoints()`` are in the coordinate order ``geodesic`` expects.
    """
    vertices = ogr.CreateGeometryFromWkt(line_wkt).GetPoints()
    total = 0
    for start, end in zip(vertices, vertices[1:]):
        total += geodesic(start, end).m
    return total

# Round-trip sanity check: convert one WGS84 point (longitude 4.33,
# latitude 52.04) to Dutch RD and back, and print the result so it can be
# compared with the input.
if __name__=="__main__":
    x, y = WGS84toDutchRD(4.33, 52.04)
    print(DutchRDtoWGS84(x, y))

In [None]:
# Read the highway road-section shapefile with geopandas; the result is kept
# in `network_temp` for the plot in the next cell.
highway_shapefile = 'Shapefiles/Snelheid_Wegvakken.shp'
network_temp = gpd.read_file(highway_shapefile)


In [None]:
# Quick visual check of the geometries loaded from the shapefile.
network_temp.plot()

### 1.2 Loading data

In [None]:
# Load the raw incident records and give the columns descriptive names.
df = pd.read_csv('incidents19Q3Q4.csv')
df.columns = [
    'index', 'id', 'type',
    'start_time', 'end_time',
    'road_number', 'longitude', 'latitude',
]

# Parse both timestamp columns into datetimes.
df[['start_time', 'end_time']] = df[['start_time', 'end_time']].apply(pd.to_datetime)

df.head()

### 1.3 Data Filtering: 
A function that:
- (1) Removes the incidents that did not occur on highways (road numbers that do not start with 'A')
- (2) Removes the incidents with missing values, such as a missing road number
- (3) Renames the 'hrb' variants of roads (e.g. 'A12 hrb') to the plain road number
- (4) Removes the incident that lies outside the borders of the Netherlands

In [None]:
def data_filter(data_input, excluded_indices=(183130,)):
    """Return a cleaned copy of the incident table restricted to 'A' roads.

    Steps, applied to ``data_input`` itself (the previous version ignored the
    argument and always read the notebook-level ``df``):

    1. drop every row that contains a missing value;
    2. rename the ``'A12 hrb'``, ``'A16 hrb'`` and ``'A2 hrb'`` road numbers
       to ``'A12'``, ``'A16'`` and ``'A2'``;
    3. keep only rows whose ``road_number`` starts with ``'A'``;
    4. drop the rows whose ``index`` column value is in ``excluded_indices``.

    Parameters
    ----------
    data_input : pandas.DataFrame
        Incident table with at least the columns ``'index'`` and
        ``'road_number'``. It is not modified.
    excluded_indices : iterable, optional
        Values of the ``'index'`` column to remove. Defaults to the single
        record 183130 (per the notebook text, an incident located outside the
        Netherlands).

    Returns
    -------
    pandas.DataFrame
        An independent copy, so later cells can add columns to it without
        triggering pandas' SettingWithCopyWarning.
    """
    cleaned = data_input.dropna().copy()
    cleaned['road_number'] = cleaned['road_number'].replace(
        {'A12 hrb': 'A12', 'A16 hrb': 'A16', 'A2 hrb': 'A2'}
    )

    on_a_road = cleaned['road_number'].str.startswith('A')
    not_excluded = ~cleaned['index'].isin(excluded_indices)
    return cleaned.loc[on_a_road & not_excluded].copy()
# Build the filtered incident table that the following cells work on.
incidents_df = data_filter(df)

In [None]:
# Display the filtered incident table.
incidents_df

### 1.4 Data visualization

#### 1.4.1 Mark all incidents at the road network in heatmap

In [None]:
def draw_incidents(filter_type, keyword):
    """Build a folium heat map of the incidents selected by a filter.

    Reads the notebook-level ``incidents_df``.

    Parameters
    ----------
    filter_type : str
        Kind of filter to apply. Only ``'Incident_type'`` is implemented; for
        any other value the base map is returned without a heat layer.
    keyword : str
        Value the ``'type'`` column must equal for an incident to be drawn.

    Returns
    -------
    folium.Map
        The map, with a ``HeatMap`` layer when the filter type is supported.
    """
    # Define a new map
    m = folium.Map(location=[52.399190, 4.893658])

    if filter_type == 'Incident_type':
        selected = incidents_df.loc[incidents_df['type'] == keyword]
        # Extract [latitude, longitude] pairs in one vectorized step instead
        # of iterating over the rows one by one.
        heat_data = selected[['latitude', 'longitude']].values.tolist()
        # Create a heatmap layer
        HeatMap(heat_data).add_to(m)
    return m

map_new = draw_incidents('Incident_type', 'accident')
map_new


#### 1.4.2 Analysis of time periods when incidents occur (starting time)

Draw a bar chart showing the distribution of the times at which incidents occur

In [None]:
# Hour of the day at which each incident started, then the number of
# incidents per hour.
incidents_df['hour_of_day'] = incidents_df['start_time'].dt.hour
hourly_counts = incidents_df.groupby('hour_of_day').size().reset_index(name='accident_count')

plt.figure(figsize=(12, 6))
plt.bar(hourly_counts['hour_of_day'], hourly_counts['accident_count'])
plt.xlabel('Hour of a day')
plt.ylabel('Accidents count')
plt.title('Accident count by hour of day')
plt.xticks(hourly_counts['hour_of_day'])

# Shade the five time bands. Bars are centred on the integer hour, so each
# band runs from half an hour before its first hour to half an hour after its
# last one; this covers whole bars and leaves no gap between adjacent bands.
time_bands = [
    (0, 5, 'red', '0-5h'),
    (6, 9, 'black', '6-9h'),
    (10, 14, 'green', '10-14h'),
    (15, 18, 'yellow', '15-18h'),
    (19, 23, 'orange', '19-23h'),
]
for first_hour, last_hour, band_color, band_label in time_bands:
    plt.axvspan(first_hour - 0.5, last_hour + 0.5, alpha=0.2, color=band_color, label=band_label)
plt.legend()

plt.show()

Calculate the probability of an incident occurring in each period of the day

In [None]:
# Calculate the share of incidents that falls in each period of the day.
# Counts are looked up by hour label, not by row position: an hour without any
# incident has no row in `hourly_counts`, which would shift positional slices.
counts_by_hour = (
    hourly_counts.set_index('hour_of_day')['accident_count']
    .reindex(range(24), fill_value=0)
)
total_count = counts_by_hour.sum()

# `.loc` slices on labels and includes both end points.
pro_0_5h = counts_by_hour.loc[0:5].sum() / total_count
pro_6_9h = counts_by_hour.loc[6:9].sum() / total_count
pro_10_14h = counts_by_hour.loc[10:14].sum() / total_count
pro_15_18h = counts_by_hour.loc[15:18].sum() / total_count
pro_19_23h = counts_by_hour.loc[19:23].sum() / total_count

result_data = {
    'Time Range': ['0-5 hours', '6-9 hours', '10-14 hours', '15-18 hours', '19-23 hours'],
    'Probability': [pro_0_5h, pro_6_9h, pro_10_14h, pro_15_18h, pro_19_23h]
}

pro_time = pd.DataFrame(result_data)
pro_time

#### 1.4.3 Analysis of the day of the week on which incidents occur

Calculate the probability of an incident occurring on each day of the week

In [None]:
# Weekday number of each incident's start time (0 = Monday ... 6 = Sunday,
# see `day_mapping` below).
incidents_df['day of week'] = incidents_df['start_time'].dt.dayofweek

# `value_counts` orders by frequency; sort on the weekday number so the table
# and the bar chart read Monday -> Sunday.
weekly_count = incidents_df['day of week'].value_counts(normalize=True).sort_index()
day_mapping = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}
weekly_count.index = weekly_count.index.map(day_mapping)

pro_weekly = pd.DataFrame({
    'Day of Week': weekly_count.index,
    'Probability': weekly_count.values
})
pro_weekly

Draw a bar chart to visualize the probability

In [None]:
# Bar chart of the share of incidents per weekday, drawn through the explicit
# figure/axes interface with both axes labelled.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(pro_weekly['Day of Week'], pro_weekly['Probability'])
ax.set_xlabel('Day of week')
ax.set_ylabel('Probability')
ax.set_title('Probability of incident occuring on different days of a week')
plt.show()

#### 1.4.4 Analysis of accident frequency and duration time in each highway

In [None]:
# Number of incident records on each road.
accidents_number = incidents_df.groupby('road_number').size()
accidents_df = accidents_number.rename('accidents_number').reset_index()

# Duration of every incident in minutes, followed by the mean per road.
incidents_df['Duration_time'] = (
    (incidents_df['end_time'] - incidents_df['start_time']).dt.total_seconds() / 60
)
average_duration_by_road = incidents_df.groupby('road_number')['Duration_time'].mean()
duration_df = average_duration_by_road.rename('average_duration').reset_index()

# Left-join the two per-road tables so every counted road keeps its row.
road_number_counts = accidents_df.merge(duration_df, on='road_number', how='left')

In [None]:
# Display the per-road incident count and average duration (in minutes).
road_number_counts

### 1.5 Show the incident hot spots

Here we use the DBSCAN (Density-Based Spatial Clustering of Applications with Noise) clustering algorithm to find incident hotspots, i.e. areas of the map with a high density of incidents

In [None]:
def draw_clusters(eps, min_samples, data):
    """Cluster incidents with DBSCAN and draw the clustered ones on a map.

    scikit-learn's ``'haversine'`` metric expects ``[latitude, longitude]``
    in radians and measures distance as an angle in radians. The coordinates
    are stored in degrees, so both the coordinates and ``eps`` are converted
    to radians before fitting.

    Parameters
    ----------
    eps : float
        Neighbourhood radius as a great-circle angle in degrees, i.e. the
        same unit as the latitude/longitude columns.
    min_samples : int
        Passed to ``DBSCAN`` unchanged.
    data : pandas.DataFrame
        Incident table with the columns ``'latitude'``, ``'longitude'``,
        ``'type'`` and ``'index'``. A ``'cluster'`` column holding the DBSCAN
        labels is written onto this frame.

    Returns
    -------
    folium.Map
        Map with one marker per incident that was assigned to a cluster;
        noise points (label -1) are left out.
    """
    # Create a DBSCAN clustering model (eps converted from degrees to radians)
    dbscan = DBSCAN(eps=np.radians(eps), min_samples=min_samples, metric='haversine', algorithm='ball_tree')

    # Fit the model to the latitude and longitude data, converted to radians
    coords_rad = np.radians(data[['latitude', 'longitude']].values)
    dbscan.fit(coords_rad)

    # Assign cluster labels to data points
    data['cluster'] = dbscan.labels_

    # Filter out noise points (-1 labels)
    clustered_data = data[data['cluster'] != -1]

    m = folium.Map(location=[52.399190, 4.893658], zoom_start=8, zoom_control=False)

    # Create a MarkerCluster layer for clustered data
    marker_cluster = MarkerCluster().add_to(m)

    # Add markers for clustering
    for _, row in clustered_data.iterrows():
        popup_text = f"Cluster: {row['cluster']}<br>Type: {row['type']}<br>Index: {row['index']}"
        folium.Marker([row['latitude'], row['longitude']], icon=None, popup=popup_text).add_to(marker_cluster)

    return m


# Call the function to visualize the clustered incidents with markers
clustered_map = draw_clusters(eps=0.1, min_samples=100, data=incidents_df)
clustered_map


Then we can show the specific longitude and latitude of each hotspot

## 2. The travel time of the road sections

In this section the speed on each road section is determined. Together with the length of each road section, this gives the travel time of each section. For the full notebook with all explanations, see **speed_network_data.ipynb**. 

This section will only show the results obtained from the speed_network_data notebook. 

First there is the following dataframe. It has been obtained by combining the given shapefiles, open source wkd ("Wegkenmerkendatabase") data about the maximum speed and INWEVA ("INtensiteit WEgVAkken") data which gives traffic intensities on road sections. There was some missing data, which has been filled with data from adjacent road sections.

In [None]:
# Load the semicolon-separated road-section table and preview its first rows.
road_section_data = pd.read_csv('speed_data', sep=';')
road_section_data.head()

So far the travel time of each road section has been estimated in optimal conditions (the average speed equals the speed limit) and the speed during peak hours. 

The data from the dataframe has also been converted to a NetworkX graph. This graph can later be used in the optimization methods to calculate the travel time of the shortest path.

In [None]:
# Load the pre-built graph. The file is opened in a `with` block so the handle
# is closed again once loading is done.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced by this project.
with open('NetworkX_graph.pickle', 'rb') as graph_file:
    G = pickle.load(graph_file)

The graph contains all edges with several attributes

In [None]:
# Example: the attribute data of the first edge in the graph. Only the first
# edge is pulled from the edge view instead of building a list of all edges.
G.edges[next(iter(G.edges))]

## 3. Optimization Methods

### Method 3: K-means clustering of incident locations

In [None]:
# NOTE(review): this definition reuses the name `draw_clusters` and therefore
# replaces the DBSCAN-based function of section 1.5 from this cell onwards.
def draw_clusters(k_value, data, random_state=0):
    """Cluster incidents with K-Means and draw them, plus the centres, on a map.

    Parameters
    ----------
    k_value : int
        Number of clusters.
    data : pandas.DataFrame
        Incident table with the columns ``'latitude'``, ``'longitude'``,
        ``'type'`` and ``'index'``. A ``'cluster'`` column holding the K-Means
        labels is written onto this frame.
    random_state : int or None, optional
        Seed passed to ``KMeans`` so that repeated runs give the same
        clusters. Pass ``None`` for a different random start on every run.

    Returns
    -------
    folium.Map
        Map centred on the mean incident position, with one marker per
        incident and a red marker per cluster centre.
    """
    # Create a K-Means clustering model; n_init is given explicitly so the
    # number of restarts does not depend on the installed scikit-learn version.
    kmeans = KMeans(n_clusters=k_value, n_init=10, random_state=random_state)

    # Fit the model to the latitude and longitude data
    locations = data[['latitude', 'longitude']].values
    kmeans.fit(locations)

    # Assign cluster labels to data points
    data['cluster'] = kmeans.labels_

    m = folium.Map(location=[np.mean(locations[:, 0]), np.mean(locations[:, 1])], zoom_start=8, zoom_control=False)

    # Create a MarkerCluster layer for clustered data
    marker_cluster = MarkerCluster().add_to(m)

    # Add markers for clustering
    for cluster_label in range(k_value):
        cluster_data = data[data['cluster'] == cluster_label]
        for _, row in cluster_data.iterrows():
            popup_text = f"Cluster: {row['cluster']}<br>Type: {row['type']}<br>Index: {row['index']}"
            folium.Marker([row['latitude'], row['longitude']], icon=None, popup=popup_text).add_to(marker_cluster)

    # Mark every cluster centre, numbered with the same 0-based label that the
    # incident popups show.
    cluster_centers = kmeans.cluster_centers_
    for i, center in enumerate(cluster_centers):
        center_popup_text = f"Cluster Center {i}"
        folium.Marker([center[0], center[1]], icon=folium.Icon(color='red'), popup=center_popup_text).add_to(m)

    return m

k_value = 20  # Set the number of clusters
clustered_map = draw_clusters(k_value, incidents_df)
clustered_map
