# Modelling

In [11]:
import json
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import requests
from pandas import json_normalize

import geopandas as gpd
import shapely.wkt
from shapely.geometry import Point
from shapely.geometry import Polygon, box, Point  # Import Point from shapely.geometry

import folium
from folium import plugins
from folium.plugins import HeatMap, MarkerCluster

import h3

In [12]:
# Load the per-hexagon modelling table and discard the stray 'Unnamed: 0'
# column that comes along with the CSV.
final_df = (
    pd.read_csv('assets/processed_data/modeling_final.csv')
    .drop(columns='Unnamed: 0')
)
final_df.head()

Unnamed: 0,h3_index,h3_address,latitude,longitude,geometry,weekday_00,weekday_01,weekday_02,weekday_03,weekday_04,...,weekend_14,weekend_15,weekend_16,weekend_17,weekend_18,weekend_19,weekend_20,weekend_21,weekend_22,weekend_23
0,1,876520c18ffffff,1.29095,103.628706,POLYGON ((103.64169005775774 1.287376944150446...,0.032258,0.0,0.0,0.0,0.0,...,252.5,131.714286,176.0,185.444444,168.5,71.75,35.9,35.090909,38.166667,11.75
1,2,876526aeaffffff,1.289424,103.943911,"POLYGON ((103.9568727977856 1.285851866048183,...",0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,876520c82ffffff,1.415494,103.707587,POLYGON ((103.72057022657783 1.411915276730170...,0.0,0.0,0.0,0.0,0.0,...,28.5,60.0,25.0,110.0,35.5,79.0,34.0,6.0,1.428571,1.0
3,4,87652632effffff,1.4305,104.006687,"POLYGON ((104.01964910323045 1.42692106210811,...",0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,876520c93ffffff,1.460598,103.720903,"POLYGON ((103.7338872805112 1.457016714997125,...",0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Dimensions of the loaded table: one row per hexagon, identifier columns plus the hourly slots.
final_df.shape

(246, 53)

In [14]:
# Remove the hexagons that have neither taxi nor public-transport volume data,
# i.e. rows whose value is zero in every hourly slot.
# The hourly columns are picked by name (weekday_HH / weekend_HH) instead of by
# position, so the filter keeps working if the identifier columns are reordered.
hourly_cols = final_df.filter(regex=r'^week(day|end)_\d{2}$').columns
has_volume = final_df[hourly_cols].ne(0).any(axis=1)

# Build a fresh, re-indexed frame rather than mutating the filtered slice in place.
final_df = final_df.loc[has_volume].reset_index(drop=True)
final_df.shape

(113, 53)

In [15]:
# Preview the table after the all-zero hexagons were removed and the index was reset.
final_df.head()

Unnamed: 0,h3_index,h3_address,latitude,longitude,geometry,weekday_00,weekday_01,weekday_02,weekday_03,weekday_04,...,weekend_14,weekend_15,weekend_16,weekend_17,weekend_18,weekend_19,weekend_20,weekend_21,weekend_22,weekend_23
0,1,876520c18ffffff,1.29095,103.628706,POLYGON ((103.64169005775774 1.287376944150446...,0.032258,0.0,0.0,0.0,0.0,...,252.5,131.714286,176.0,185.444444,168.5,71.75,35.9,35.090909,38.166667,11.75
1,3,876520c82ffffff,1.415494,103.707587,POLYGON ((103.72057022657783 1.411915276730170...,0.0,0.0,0.0,0.0,0.0,...,28.5,60.0,25.0,110.0,35.5,79.0,34.0,6.0,1.428571,1.0
2,7,876520d99ffffff,1.326402,103.765495,POLYGON ((103.77847131144851 1.322826893715314...,8.513995,0.0,0.0,0.0,0.0,...,543.20202,545.030928,652.952381,937.138462,635.010638,625.914634,434.247525,416.472527,240.203704,94.903226
3,9,876520d81ffffff,1.270614,103.791103,POLYGON ((103.80407506487525 1.267042182687771...,3.89759,0.0,0.0,0.0,0.0,...,156.75,245.166667,303.666667,643.333333,371.714286,430.333333,181.652174,88.851852,75.7,54.55
4,10,876520ca0ffffff,1.337127,103.726561,POLYGON ((103.73954044620316 1.333551410689863...,14.362981,0.0,0.0,0.0,0.0,...,364.603175,368.069767,362.380597,506.128205,418.970149,415.574074,357.308411,242.564516,153.428571,73.051136


In [16]:
# Total of the weekend_16 column over all remaining hexagons (presumably a quick sanity check).
final_df['weekend_16'].sum()

31397.3141766953

In [17]:
def convert_to_geometry(geom):
    """Return ``geom`` as a Shapely geometry.

    Parameters
    ----------
    geom : str or object
        A WKT string, or a value that is already a geometry.

    Returns
    -------
    object
        The geometry parsed with ``shapely.wkt.loads`` when ``geom`` is a
        string; otherwise ``geom`` itself, unchanged. Because non-strings pass
        straight through, applying the function to an already-converted column
        is a no-op.
    """
    return shapely.wkt.loads(geom) if isinstance(geom, str) else geom

# Parse the WKT strings read from the CSV into Shapely geometries. Values that
# are already geometries pass through unchanged, so this cell can be re-run.
final_df['geometry'] = final_df['geometry'].apply(convert_to_geometry)

# Wrap the table in a GeoDataFrame so the hexagons can be handled as geometries.
gdf = gpd.GeoDataFrame(final_df, geometry='geometry')

# Centre the map on the centroid of the union of all hexagons.
# GeoPandas 1.0 deprecates `unary_union` in favour of `union_all()`; fall back
# to the old attribute on releases that do not have the new method yet.
hexagons = gdf.geometry
merged = hexagons.union_all() if hasattr(hexagons, 'union_all') else hexagons.unary_union
centroid = merged.centroid
center_lat, center_lon = centroid.y, centroid.x

# Folium takes [lat, lon]; the CartoDB Positron tiles are used as the base layer.
m = folium.Map(location=[center_lat, center_lon], zoom_start=11, tiles='cartodbpositron')

# Overlay every hexagon. Iterating the geometry column directly avoids building
# a full row Series per hexagon, which `iterrows()` would do.
for hexagon in hexagons:
    folium.GeoJson(hexagon.__geo_interface__).add_to(m)

# Leave the map as the last expression so the notebook renders it inline.
m

In [18]:
# Keep only h3_index and the hourly slots; the address, coordinates and
# geometry are not required for modeling.
non_model_cols = ['h3_address', 'longitude', 'latitude', 'geometry']
final_df_model = final_df.drop(columns=non_model_cols)
final_df_model.head()

Unnamed: 0,h3_index,weekday_00,weekday_01,weekday_02,weekday_03,weekday_04,weekday_05,weekday_06,weekday_07,weekday_08,...,weekend_14,weekend_15,weekend_16,weekend_17,weekend_18,weekend_19,weekend_20,weekend_21,weekend_22,weekend_23
0,1,0.032258,0.0,0.0,0.0,0.0,0.0,365.4,40.938776,11.080357,...,252.5,131.714286,176.0,185.444444,168.5,71.75,35.9,35.090909,38.166667,11.75
1,3,0.0,0.0,0.0,0.0,0.0,0.0,23.5,20.6,12.894737,...,28.5,60.0,25.0,110.0,35.5,79.0,34.0,6.0,1.428571,1.0
2,7,8.513995,0.0,0.0,0.0,0.0,41.783394,523.509506,1003.231939,1285.225962,...,543.20202,545.030928,652.952381,937.138462,635.010638,625.914634,434.247525,416.472527,240.203704,94.903226
3,9,3.89759,0.0,0.0,0.0,0.0,11.236842,351.52381,278.0,271.686047,...,156.75,245.166667,303.666667,643.333333,371.714286,430.333333,181.652174,88.851852,75.7,54.55
4,10,14.362981,0.0,0.0,0.0,0.0,139.108974,560.31383,1146.800633,933.769481,...,364.603175,368.069767,362.380597,506.128205,418.970149,415.574074,357.308411,242.564516,153.428571,73.051136


In [19]:
# Pivot the table so that each row is a timeframe and each column is a hexagon.
hour_by_hex = final_df_model.transpose()

# Promote the h3_index row (the first row after the transpose) to column labels.
# NOTE(review): the transpose upcasts the integer ids to float, so the labels
# appear as 1.0, 3.0, ... - confirm that consumers of the exported data expect that.
hour_by_hex.columns = hour_by_hex.iloc[0]

# Keep only the timeframe rows; the h3_index row itself is no longer needed.
final_df_model = hour_by_hex.iloc[1:]

final_df_model

h3_index,1.0,3.0,7.0,9.0,10.0,11.0,12.0,14.0,15.0,22.0,...,227.0,228.0,229.0,230.0,231.0,234.0,235.0,242.0,243.0,246.0
weekday_00,0.032258,0.0,8.513995,3.89759,14.362981,0.0,0.153846,0.0,4.422803,5.380313,...,5.850289,5.792593,3.69186,11.722307,0.0,1.951501,14.315385,11.18932,0.0,3.768448
weekday_01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
weekday_02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
weekday_03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
weekday_04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
weekday_05,0.0,0.0,41.783394,11.236842,139.108974,0.0,0.0,0.0,16.873016,33.237589,...,22.184729,34.31003,12.617021,109.661738,0.0,36.077966,33.716418,187.909091,0.0,20.940767
weekday_06,365.4,23.5,523.509506,351.52381,560.31383,367.6,233.9375,1164.0,182.544379,164.885387,...,169.4847,222.220877,92.960699,522.188433,0.375,185.143345,202.564706,995.879121,159.923077,176.416382
weekday_07,40.938776,20.6,1003.231939,278.0,1146.800633,77.227273,101.6,262.666667,246.440644,223.730382,...,192.854727,447.597173,189.516746,1162.903226,0.142857,339.136719,294.020833,1856.820225,95.833333,237.392857
weekday_08,11.080357,12.894737,1285.225962,271.686047,933.769481,28.830769,9.92233,175.0,379.043062,259.165306,...,275.838736,496.104129,193.342222,977.573333,0.043478,280.773381,360.847059,1241.363636,41.941176,263.730375
weekday_09,6.201835,14.421053,563.418251,89.5,269.614035,9.085106,5.725806,50.75,203.561776,83.084123,...,145.543829,325.31087,159.487654,547.469136,0.03125,156.789668,133.782875,612.234375,19.380952,103.253247


#### Weekdays

In [20]:
# The first 24 rows of the transposed table are the weekday slots
# (weekday_00 .. weekday_23). Skip weekday_00 to weekday_04 because public
# transport starts operating at 05:00 AM, which leaves positions 5 to 23.
weekday_model = final_df_model.iloc[5:24]
weekday_model.head()

h3_index,1.0,3.0,7.0,9.0,10.0,11.0,12.0,14.0,15.0,22.0,...,227.0,228.0,229.0,230.0,231.0,234.0,235.0,242.0,243.0,246.0
weekday_05,0.0,0.0,41.783394,11.236842,139.108974,0.0,0.0,0.0,16.873016,33.237589,...,22.184729,34.31003,12.617021,109.661738,0.0,36.077966,33.716418,187.909091,0.0,20.940767
weekday_06,365.4,23.5,523.509506,351.52381,560.31383,367.6,233.9375,1164.0,182.544379,164.885387,...,169.4847,222.220877,92.960699,522.188433,0.375,185.143345,202.564706,995.879121,159.923077,176.416382
weekday_07,40.938776,20.6,1003.231939,278.0,1146.800633,77.227273,101.6,262.666667,246.440644,223.730382,...,192.854727,447.597173,189.516746,1162.903226,0.142857,339.136719,294.020833,1856.820225,95.833333,237.392857
weekday_08,11.080357,12.894737,1285.225962,271.686047,933.769481,28.830769,9.92233,175.0,379.043062,259.165306,...,275.838736,496.104129,193.342222,977.573333,0.043478,280.773381,360.847059,1241.363636,41.941176,263.730375
weekday_09,6.201835,14.421053,563.418251,89.5,269.614035,9.085106,5.725806,50.75,203.561776,83.084123,...,145.543829,325.31087,159.487654,547.469136,0.03125,156.789668,133.782875,612.234375,19.380952,103.253247


In [21]:
# Dimensions of the weekday table: hourly rows from weekday_05 onward by one column per hexagon.
weekday_model.shape

(19, 113)

In [22]:
# Relabel the rows from 'weekday_HH' to 'HH:MM' strings (e.g. 'weekday_05' -> '05:00').
# The guard keeps the cell re-runnable: once relabelled, '05:00' no longer
# matches format='%H' and pd.to_datetime would raise on a second execution.
if weekday_model.index.str.startswith('weekday_').all():
    # regex=False: the prefix is a plain literal, not a pattern.
    hour_labels = weekday_model.index.str.replace('weekday_', '', regex=False)
    weekday_model.index = pd.to_datetime(hour_labels, format='%H').strftime('%H:%M')
weekday_model.head()

h3_index,1.0,3.0,7.0,9.0,10.0,11.0,12.0,14.0,15.0,22.0,...,227.0,228.0,229.0,230.0,231.0,234.0,235.0,242.0,243.0,246.0
05:00,0.0,0.0,41.783394,11.236842,139.108974,0.0,0.0,0.0,16.873016,33.237589,...,22.184729,34.31003,12.617021,109.661738,0.0,36.077966,33.716418,187.909091,0.0,20.940767
06:00,365.4,23.5,523.509506,351.52381,560.31383,367.6,233.9375,1164.0,182.544379,164.885387,...,169.4847,222.220877,92.960699,522.188433,0.375,185.143345,202.564706,995.879121,159.923077,176.416382
07:00,40.938776,20.6,1003.231939,278.0,1146.800633,77.227273,101.6,262.666667,246.440644,223.730382,...,192.854727,447.597173,189.516746,1162.903226,0.142857,339.136719,294.020833,1856.820225,95.833333,237.392857
08:00,11.080357,12.894737,1285.225962,271.686047,933.769481,28.830769,9.92233,175.0,379.043062,259.165306,...,275.838736,496.104129,193.342222,977.573333,0.043478,280.773381,360.847059,1241.363636,41.941176,263.730375
09:00,6.201835,14.421053,563.418251,89.5,269.614035,9.085106,5.725806,50.75,203.561776,83.084123,...,145.543829,325.31087,159.487654,547.469136,0.03125,156.789668,133.782875,612.234375,19.380952,103.253247


In [23]:
# Display the weekday table with its relabelled 'HH:MM' index.
weekday_model

h3_index,1.0,3.0,7.0,9.0,10.0,11.0,12.0,14.0,15.0,22.0,...,227.0,228.0,229.0,230.0,231.0,234.0,235.0,242.0,243.0,246.0
05:00,0.0,0.0,41.783394,11.236842,139.108974,0.0,0.0,0.0,16.873016,33.237589,...,22.184729,34.31003,12.617021,109.661738,0.0,36.077966,33.716418,187.909091,0.0,20.940767
06:00,365.4,23.5,523.509506,351.52381,560.31383,367.6,233.9375,1164.0,182.544379,164.885387,...,169.4847,222.220877,92.960699,522.188433,0.375,185.143345,202.564706,995.879121,159.923077,176.416382
07:00,40.938776,20.6,1003.231939,278.0,1146.800633,77.227273,101.6,262.666667,246.440644,223.730382,...,192.854727,447.597173,189.516746,1162.903226,0.142857,339.136719,294.020833,1856.820225,95.833333,237.392857
08:00,11.080357,12.894737,1285.225962,271.686047,933.769481,28.830769,9.92233,175.0,379.043062,259.165306,...,275.838736,496.104129,193.342222,977.573333,0.043478,280.773381,360.847059,1241.363636,41.941176,263.730375
09:00,6.201835,14.421053,563.418251,89.5,269.614035,9.085106,5.725806,50.75,203.561776,83.084123,...,145.543829,325.31087,159.487654,547.469136,0.03125,156.789668,133.782875,612.234375,19.380952,103.253247
10:00,9.375,25.363636,245.382653,43.97166,197.793684,27.717949,6.993103,69.75,100.325452,62.763789,...,67.542301,205.198758,97.425,377.759259,0.052632,95.532468,77.083507,534.045455,21.603774,64.756364
11:00,16.25,31.785714,270.469697,90.751634,188.50431,73.0,16.02439,82.333333,87.304079,84.579276,...,79.362923,129.641168,82.424242,275.022869,0.138889,92.13,101.380457,346.477477,39.714286,57.47191
12:00,52.454545,154.666667,352.293413,195.463158,269.39881,180.909091,55.9375,164.5,101.457931,122.020772,...,124.631873,144.333811,90.783465,255.957968,0.064516,92.242038,188.175258,348.136752,17.384615,88.877083
13:00,30.088235,93.142857,410.179878,166.195402,270.285714,75.166667,37.058824,212.0,91.927389,106.374821,...,133.339531,153.217327,140.066667,262.719611,0.0,127.854103,188.027668,274.853333,37.4375,98.080831
14:00,34.102564,117.857143,421.52963,125.478723,268.746032,57.870968,29.745098,116.5,118.50764,121.903537,...,154.879802,142.691505,125.586957,257.374775,0.4375,103.802867,144.623494,317.65812,46.666667,107.10582


#### Weekends

In [24]:
# Rows from position 24 onward are the weekend slots (weekend_00 .. weekend_23).
# Skip weekend_00 to weekend_04 because public transport starts operating at
# 05:00 AM, so the slice starts five rows into the weekend section.
weekend_model = final_df_model.iloc[24 + 5:]
weekend_model.head()

h3_index,1.0,3.0,7.0,9.0,10.0,11.0,12.0,14.0,15.0,22.0,...,227.0,228.0,229.0,230.0,231.0,234.0,235.0,242.0,243.0,246.0
weekend_05,0.0,0.0,18.391667,5.6,56.286765,0.0,0.0,0.0,8.610526,18.02439,...,10.611801,22.793578,7.064516,51.91,0.0,16.938356,26.576923,71.949153,0.0,7.568
weekend_06,87.875,17.0,237.974684,52.9,239.087591,794.0,306.75,367.0,90.5,61.777778,...,98.937008,96.579167,41.333333,189.830688,0.4,73.110092,203.785714,332.463415,176.4,79.27551
weekend_07,85.538462,46.0,389.282353,513.625,389.040323,141.166667,257.875,195.5,146.959016,95.145455,...,157.264706,181.813953,94.246154,402.786667,0.0,118.873786,169.454545,643.09375,518.0,139.141176
weekend_08,27.580645,145.0,536.926829,248.75,561.444444,65.25,49.0,114.0,242.705882,147.482517,...,205.925676,257.366834,142.486486,447.304598,0.2,171.5,359.115385,922.259259,100.833333,179.684783
weekend_09,39.92,38.0,647.818182,133.758621,598.736842,62.0,46.552632,359.0,267.514493,126.857143,...,181.513453,237.724891,153.772152,506.174699,0.0,137.716667,280.3125,854.25,106.727273,169.884615


In [25]:
# Dimensions of the weekend table: hourly rows from weekend_05 onward by one column per hexagon.
weekend_model.shape

(19, 113)

In [26]:
# Relabel the rows from 'weekend_HH' to 'HH:MM' strings (e.g. 'weekend_05' -> '05:00').
# The guard keeps the cell re-runnable: once relabelled, '05:00' no longer
# matches format='%H' and pd.to_datetime would raise on a second execution.
if weekend_model.index.str.startswith('weekend_').all():
    # regex=False: the prefix is a plain literal, not a pattern.
    hour_labels = weekend_model.index.str.replace('weekend_', '', regex=False)
    weekend_model.index = pd.to_datetime(hour_labels, format='%H').strftime('%H:%M')
weekend_model.head()

h3_index,1.0,3.0,7.0,9.0,10.0,11.0,12.0,14.0,15.0,22.0,...,227.0,228.0,229.0,230.0,231.0,234.0,235.0,242.0,243.0,246.0
05:00,0.0,0.0,18.391667,5.6,56.286765,0.0,0.0,0.0,8.610526,18.02439,...,10.611801,22.793578,7.064516,51.91,0.0,16.938356,26.576923,71.949153,0.0,7.568
06:00,87.875,17.0,237.974684,52.9,239.087591,794.0,306.75,367.0,90.5,61.777778,...,98.937008,96.579167,41.333333,189.830688,0.4,73.110092,203.785714,332.463415,176.4,79.27551
07:00,85.538462,46.0,389.282353,513.625,389.040323,141.166667,257.875,195.5,146.959016,95.145455,...,157.264706,181.813953,94.246154,402.786667,0.0,118.873786,169.454545,643.09375,518.0,139.141176
08:00,27.580645,145.0,536.926829,248.75,561.444444,65.25,49.0,114.0,242.705882,147.482517,...,205.925676,257.366834,142.486486,447.304598,0.2,171.5,359.115385,922.259259,100.833333,179.684783
09:00,39.92,38.0,647.818182,133.758621,598.736842,62.0,46.552632,359.0,267.514493,126.857143,...,181.513453,237.724891,153.772152,506.174699,0.0,137.716667,280.3125,854.25,106.727273,169.884615


#### Export the preprocessed data for modelling

In [27]:
# Write both preprocessed tables to CSV (row index included) for the modelling step.
export_targets = {
    'assets/processed_data/weekday_model.csv': weekday_model,
    'assets/processed_data/weekend_model.csv': weekend_model,
}
for csv_path, frame in export_targets.items():
    frame.to_csv(csv_path)