# Load Dataset

In [1]:
# Colab-only step: mount the user's Google Drive into the runtime filesystem at
# /content/drive. This triggers an interactive authorization prompt.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

In [None]:
# Open the Singapore partition of the parquet dataset. The path is relative, so it
# is resolved against the kernel's current working directory.
dataset_SG = pq.ParquetDataset('Data/city=Singapore')

In [None]:
# Read the parquet dataset into memory and convert it straight to a pandas
# DataFrame; `dataset_SG` is rebound from the dataset handle to the DataFrame.
dataset_SG = dataset_SG.read().to_pandas()

# SG dataset preprocess

In [None]:
# Convert the epoch-second ping times to pandas datetimes. `pd.to_datetime` already
# returns a datetime64 column, so no further DatetimeIndex wrapping is needed.
dataset_SG['timestamp'] = pd.to_datetime(dataset_SG['pingtimestamp'], unit='s')

# Work on a copy without the `driving_mode` column and release the raw frame.
SG_car = dataset_SG.drop(columns=['driving_mode'])
del dataset_SG

SG_car['trj_id'] = SG_car['trj_id'].astype('int')
SG_car = SG_car.sort_values(by=['trj_id', 'timestamp'])

# First and last ping of every trajectory, broadcast back to each of its rows.
# The built-in 'min'/'max' reducers avoid calling a Python lambda once per group.
trj_timestamps = SG_car.groupby('trj_id')['timestamp']
first_ping = trj_timestamps.transform('min')
last_ping = trj_timestamps.transform('max')

# Travel time = last ping - first ping of the trajectory, as float seconds.
# `.dt.total_seconds()` is used instead of `.astype('timedelta64[s]')` because the
# latter returns a timedelta dtype (not a number) on pandas >= 2.0.
SG_car['travel_time'] = (last_ping - first_ping).dt.total_seconds()
SG_car['pickup_time'] = first_ping
SG_car['dropoff_time'] = last_ping

In [None]:
# Optional housekeeping cell: force a garbage-collection pass. Run it when RAM is
# almost full or after deleting large intermediates; it does not affect the data.
import gc
gc.collect()

0

In [None]:
def extractCoordinateAndUpdateDF(df, variable_name, timestamp_col, coordinate_component):
    """Add a per-trajectory coordinate column taken at a reference timestamp.

    For every ``trj_id`` the value of ``coordinate_component`` is read from the row
    whose ``timestamp`` equals that row's ``timestamp_col`` (e.g. the pickup or
    drop-off time) and broadcast to all rows of the trajectory.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``trj_id``, ``timestamp``, ``timestamp_col`` and
        ``coordinate_component`` columns.
    variable_name : str
        Suffix for the new column, which is named
        ``"<coordinate_component>_<variable_name>"``.
    timestamp_col : str
        Column holding the reference timestamp of each trajectory.
    coordinate_component : str
        Column to sample at the reference timestamp (e.g. ``'latitude'``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows as ``df`` (fresh integer index, as produced
        by ``merge``) plus the new coordinate column.
    """
    # Rows recorded exactly at the reference timestamp, selected with a vectorised
    # mask rather than a per-group `apply`, whose result shape (and the 'level_1'
    # index name it relied on) varies with the number of groups and index naming.
    at_reference = df.loc[df['timestamp'] == df[timestamp_col],
                          ['trj_id', coordinate_component]]

    # Keep a single value per trajectory so the merge below cannot multiply rows
    # when several pings share the reference timestamp.
    at_reference = at_reference.drop_duplicates(subset='trj_id', keep='first')

    # The overlapping coordinate column from the right side receives the suffix.
    return df.merge(at_reference, how='left', on='trj_id',
                    suffixes=('', '_{}'.format(variable_name)))

In [None]:
# Give the raw GPS columns the generic names the coordinate helpers expect.
SG_car = SG_car.rename(
    columns={
        'rawlat': 'latitude',
        'rawlng': 'longitude',
    }
)

In [None]:
# Merge the pickup ("origin") and drop-off ("destination") coordinates into SG_car.
# Order matters only for column layout: origin latitude, origin longitude,
# destination latitude, destination longitude.
for endpoint_label, endpoint_time_col in (('origin', 'pickup_time'),
                                          ('destination', 'dropoff_time')):
    for coordinate in ('latitude', 'longitude'):
        SG_car = extractCoordinateAndUpdateDF(SG_car, endpoint_label, endpoint_time_col, coordinate)

In [None]:
# Renumber the rows 0..n-1 and discard the old index. `drop=True` does this in one
# step instead of materialising the old index as an 'index' column and then
# dropping it (which would raise a KeyError if the index had a name).
SG_car = SG_car.reset_index(drop=True)

# Time-based features

In [None]:
# Collapse the ping-level frame to one row per trajectory (the first ping).
SG_car = SG_car.drop_duplicates(subset=['trj_id', 'pickup_time'], keep='first')

# Pickup hour (0-23) and weekday (Monday=0 ... Sunday=6). Timestamps were parsed
# from epoch seconds, so these are UTC values, not Singapore local time.
SG_car['hour_of_day'] = SG_car['pickup_time'].dt.hour
SG_car['day_of_week'] = SG_car['pickup_time'].dt.dayofweek

# Saturday (5) and Sunday (6) are the weekend; is_weekday is its complement.
SG_car['is_weekend'] = np.where(SG_car['day_of_week'].isin([5,6]), 1, 0)
SG_car['is_weekday'] = np.where(SG_car['day_of_week'].isin([5,6]), 0, 1)

# Wee hours = 1 AM - 5 AM SGT.
# So, it will be 5 PM - 9 PM UTC.
SG_car['is_wee_hours'] = np.where(SG_car['hour_of_day'].isin([17,18,19,20,21]), 1, 0)

# Rush hour = 7.30 AM - 9.30 AM SGT or 5 PM - 8 PM SGT.
# So, it will be 11.30 PM - 1.30 AM UTC or 9 AM - 12 PM UTC.
# Zero-padded 'HH:MM:SS' strings sort in chronological order, so plain string
# comparison works. The pickup time is used, consistent with hour_of_day above.
pickup_clock = SG_car['pickup_time'].dt.strftime('%H:%M:%S')

# The morning window wraps past midnight in UTC, so it is "late evening OR early
# morning"; between('23:30:00', '01:30:00') has left > right and never matches.
SG_car['is_rush_hours_morning'] = np.where(
    (pickup_clock >= '23:30:00') | (pickup_clock <= '01:30:00'), 1, 0)
SG_car['is_rush_hours_evening'] = np.where(
    pickup_clock.between('09:00:00', '12:00:00'), 1, 0)

# Hour and day to cyclical data
We will use sine and cosine functions to encode the hour of day and the day of week as continuous cyclical features.

In [None]:
# Project hour-of-day (period 24) and day-of-week (period 7) onto the unit circle
# so that the end of a cycle sits next to its start (e.g. hour 23 next to hour 0).
SG_car = SG_car.assign(
    sin_hour_of_day=lambda frame: np.sin(2*np.pi*frame['hour_of_day']/24),
    cos_hour_of_day=lambda frame: np.cos(2*np.pi*frame['hour_of_day']/24),
    sin_day_of_week=lambda frame: np.sin(2*np.pi*frame['day_of_week']/7),
    cos_day_of_week=lambda frame: np.cos(2*np.pi*frame['day_of_week']/7),
)

# Location-based feature

In [None]:
def haversine_km(lon1, lat1, lon2, lat2):
    """
    Great-circle (haversine) distance between two points, in kilometres.

    Coordinates are given in decimal degrees, longitude first. Each argument
    may be a scalar or an array-like; array-likes must have equal length and
    the distance is computed element-wise. An Earth radius of 6367 km is used.
    """
    lam1 = np.radians(lon1)
    phi1 = np.radians(lat1)
    lam2 = np.radians(lon2)
    phi2 = np.radians(lat2)

    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1

    # Haversine of the central angle, split into its latitude and longitude parts.
    lat_term = np.sin(delta_phi/2.0)**2
    lon_term = np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2.0)**2

    central_angle = 2 * np.arcsin(np.sqrt(lat_term + lon_term))
    return 6367 * central_angle

def haversine_m(lon1, lat1, lon2, lat2):
    """
    Great-circle (haversine) distance between two points, scaled by 3956.

    Coordinates are given in decimal degrees, longitude first. Each argument
    may be a scalar or an array-like; array-likes must have equal length and
    the distance is computed element-wise.

    NOTE(review): 3956 is the Earth radius in miles (about 6367 km), so the
    result is in miles; the ``_m`` suffix does not mean metres.
    """
    lam1 = np.radians(lon1)
    phi1 = np.radians(lat1)
    lam2 = np.radians(lon2)
    phi2 = np.radians(lat2)

    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1

    # Haversine of the central angle, split into its latitude and longitude parts.
    lat_term = np.sin(delta_phi/2.0)**2
    lon_term = np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2.0)**2

    central_angle = 2 * np.arcsin(np.sqrt(lat_term + lon_term))
    return 3956 * central_angle

# Straight-line distance between pickup and drop-off for every trip, computed with
# both haversine helpers. Argument order is (lon1, lat1, lon2, lat2).
_endpoint_columns = ['longitude_origin', 'latitude_origin',
                     'longitude_destination', 'latitude_destination']

SG_car['haversine_km'] = haversine_km(*(SG_car[col] for col in _endpoint_columns))
SG_car['haversine_m'] = haversine_m(*(SG_car[col] for col in _endpoint_columns))

In [None]:
# Persist the engineered per-trip features as CSV (path relative to the working
# directory). No `index` argument is passed, so the pandas default applies and the
# row index is written as an extra, unnamed first column.
SG_car.to_csv("Dataset/SG_car.csv")