In [4]:
import json
import warnings
from datetime import datetime, timedelta
from pathlib import Path

import folium
import geopandas as gpd
import h3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from folium import plugins
from folium.plugins import HeatMap, MarkerCluster
from pandas import json_normalize
from shapely.geometry import Point
from shapely.geometry import Polygon, box, Point  # Import Point from shapely.geometry

# Taxi Availability

Write code that calls the data.gov.sg API for taxi availability.

April Data

In [5]:
def fetch_taxi_data(start_datetime, end_datetime, interval, verbose=False, timeout=30):
    """Fetch taxi-availability snapshots from data.gov.sg at a fixed interval.

    One request is sent per step from ``start_datetime`` to ``end_datetime``
    (inclusive). Every coordinate pair in a snapshot becomes one row, tagged
    with the snapshot's hour and whether it fell on a weekday or a weekend.

    Parameters
    ----------
    start_datetime, end_datetime : datetime
        Inclusive bounds of the sampling window.
    interval : timedelta
        Step between consecutive snapshots; must be strictly positive.
    verbose : bool, default False
        Print a progress line for every request.
    timeout : float, default 30
        Seconds to wait for each HTTP request before treating it as failed.

    Returns
    -------
    pandas.DataFrame
        Columns ``longitude``, ``latitude``, ``timestamp`` (``'HH:00'``) and
        ``day_type`` (``'weekday'`` or ``'weekend'``). The frame is empty, but
        still has these columns, when nothing could be retrieved.

    Raises
    ------
    ValueError
        If ``interval`` is zero or negative (the window would never advance).

    Notes
    -----
    Snapshots that fail (network error, non-200 status, or an unexpected
    payload) are skipped; a single ``RuntimeWarning`` summarises them at the
    end so they are not lost when ``verbose`` is off.

    NOTE(review): datetimes are sent without a timezone offset; the API
    presumably interprets them as Singapore time -- confirm.
    """
    # A non-positive step would make the window loop below spin forever.
    if interval.total_seconds() <= 0:
        raise ValueError("interval must be a positive timedelta")

    url = "https://api.data.gov.sg/v1/transport/taxi-availability"
    columns = ['longitude', 'latitude', 'timestamp', 'day_type']

    # Build the list of snapshot times covering the window
    snapshot_times = []
    current_datetime = start_datetime
    while current_datetime <= end_datetime:
        snapshot_times.append(current_datetime)
        current_datetime += interval

    # Empty window: nothing to request
    if not snapshot_times:
        if verbose:
            print("No data retrieved.")
        return pd.DataFrame(columns=columns)

    taxi_data_list = []
    failed = []

    # A single session reuses the HTTP connection across all requests
    with requests.Session() as session:
        for snapshot in snapshot_times:
            dt = snapshot.strftime('%Y-%m-%dT%H:%M:%S')
            if verbose:
                print(f"Fetching data for: {dt}")

            try:
                response = session.get(url, params={'date_time': dt}, timeout=timeout)
            except requests.RequestException as exc:
                # One dropped connection should not abort the whole window
                failed.append(dt)
                if verbose:
                    print(f"Error for {dt}: {exc}")
                continue

            if response.status_code != 200:
                failed.append(dt)
                if verbose:
                    print(f"Error for {dt}: {response.status_code}")
                continue

            try:
                # The coordinates are read from the first feature's geometry
                taxi_data = response.json()['features'][0]['geometry']['coordinates']
                taxi_df = pd.DataFrame(taxi_data, columns=['longitude', 'latitude'])
            except (ValueError, KeyError, IndexError, TypeError) as exc:
                failed.append(dt)
                if verbose:
                    print(f"Error for {dt}: unexpected response payload ({exc!r})")
                continue

            if verbose:
                print(f"Successful request for: {dt}")

            # Keep only the hour of the snapshot, plus the weekday/weekend flag
            taxi_df['timestamp'] = snapshot.strftime('%H:00')
            taxi_df['day_type'] = 'weekday' if snapshot.weekday() < 5 else 'weekend'
            taxi_data_list.append(taxi_df)

    if failed:
        warnings.warn(
            f"{len(failed)} of {len(snapshot_times)} taxi-availability requests "
            f"failed or returned unusable data (first: {failed[0]})",
            RuntimeWarning,
            stacklevel=2,
        )

    if not taxi_data_list:
        if verbose:
            print("No data retrieved.")
        return pd.DataFrame(columns=columns)

    all_taxi_data = pd.concat(taxi_data_list, ignore_index=True)
    if verbose:
        print("Data successfully retrieved and concatenated.")
    return all_taxi_data

In [6]:
# Sampling window: 1 April 2024 00:00 through 30 April 2024 23:59
start_datetime = datetime(2024, 4, 1)
end_datetime = datetime(2024, 4, 30, 23, 59)

# One snapshot per hour
interval = timedelta(hours=1)

# Pull every snapshot in the window, logging each request as it runs
taxi_april = fetch_taxi_data(
    start_datetime=start_datetime,
    end_datetime=end_datetime,
    interval=interval,
    verbose=True,
)

# Show the combined DataFrame as the cell output
taxi_april

Fetching data for: 2024-04-01T00:00:00
Successful request for: 2024-04-01T00:00:00
Fetching data for: 2024-04-01T01:00:00
Successful request for: 2024-04-01T01:00:00
Fetching data for: 2024-04-01T02:00:00
Successful request for: 2024-04-01T02:00:00
Fetching data for: 2024-04-01T03:00:00
Successful request for: 2024-04-01T03:00:00
Fetching data for: 2024-04-01T04:00:00
Successful request for: 2024-04-01T04:00:00
Fetching data for: 2024-04-01T05:00:00
Successful request for: 2024-04-01T05:00:00
Fetching data for: 2024-04-01T06:00:00
Successful request for: 2024-04-01T06:00:00
Fetching data for: 2024-04-01T07:00:00
Successful request for: 2024-04-01T07:00:00
Fetching data for: 2024-04-01T08:00:00
Successful request for: 2024-04-01T08:00:00
Fetching data for: 2024-04-01T09:00:00
Successful request for: 2024-04-01T09:00:00
Fetching data for: 2024-04-01T10:00:00
Successful request for: 2024-04-01T10:00:00
Fetching data for: 2024-04-01T11:00:00
Successful request for: 2024-04-01T11:00:00
Fetc

Unnamed: 0,longitude,latitude,timestamp,day_type
0,103.625536,1.305826,00:00,weekday
1,103.667031,1.298071,00:00,weekday
2,103.670050,1.319960,00:00,weekday
3,103.678860,1.318840,00:00,weekday
4,103.680730,1.322490,00:00,weekday
...,...,...,...,...
1342616,104.001260,1.379150,23:00,weekday
1342617,104.001820,1.377500,23:00,weekday
1342618,104.002220,1.375970,23:00,weekday
1342619,104.002220,1.375970,23:00,weekday


In [7]:
# Work on an independent copy: a plain assignment would only alias `taxi_april`,
# so any in-place change made downstream would also alter the fetched frame.
taxi_april_data = taxi_april.copy()

In [8]:
def preprocess_taxi_data(taxi_data):
    """One-hot encode the day-type/hour combination of each taxi observation.

    ``day_type`` and the first two characters of ``timestamp`` are joined into
    a key such as ``weekday_07``. The two source columns are dropped and the
    key is expanded into one 0/1 indicator column per distinct value, named
    after the key itself.

    Parameters
    ----------
    taxi_data : pandas.DataFrame
        Must contain string columns ``timestamp`` and ``day_type``; all other
        columns are passed through unchanged. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the remaining columns followed by the indicator
        columns.
    """
    prefix_to_remove = 'day_type_time_'

    # Combined key, e.g. 'weekday' + '_' + '07'
    day_type_time = taxi_data['day_type'] + "_" + taxi_data['timestamp'].str[:2]

    # drop() and assign() both return new frames, so the caller's data is untouched
    keyed = (
        taxi_data
        .drop(columns=['timestamp', 'day_type'])
        .assign(day_type_time=day_type_time)
    )

    # Pin the dtype so the indicators are 0/1 integers on every pandas version
    # (pandas 2.0 changed the get_dummies default to bool).
    taxi_data_dummified = pd.get_dummies(keyed, columns=['day_type_time'], dtype='uint8')

    # Strip the 'day_type_time_' prefix that get_dummies adds to the new columns
    rename_mapping = {
        col: col.replace(prefix_to_remove, '')
        for col in taxi_data_dummified.columns
        if col.startswith(prefix_to_remove)
    }
    return taxi_data_dummified.rename(columns=rename_mapping)

# Example usage:
# Assuming taxi_data is your DataFrame
# taxi_data_processed = preprocess_taxi_data(taxi_data)
# print(taxi_data_processed)


In [10]:
# Expand the day-type/hour key of every observation into indicator columns
taxi_april_df = preprocess_taxi_data(taxi_data=taxi_april_data)

# Show the encoded DataFrame as the cell output
taxi_april_df

Unnamed: 0,longitude,latitude,weekday_00,weekday_01,weekday_02,weekday_03,weekday_04,weekday_05,weekday_06,weekday_07,...,weekend_14,weekend_15,weekend_16,weekend_17,weekend_18,weekend_19,weekend_20,weekend_21,weekend_22,weekend_23
0,103.625536,1.305826,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,103.667031,1.298071,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,103.670050,1.319960,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,103.678860,1.318840,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,103.680730,1.322490,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1342616,104.001260,1.379150,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1342617,104.001820,1.377500,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1342618,104.002220,1.375970,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1342619,104.002220,1.375970,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Persist the encoded frame. Create the target folder first so the cell also
# works on a fresh checkout where assets/processed_data/ does not exist yet.
output_path = Path('assets/processed_data/taxi_availability.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
taxi_april_df.to_csv(output_path)