In [40]:
# Google BQ connector
from google.cloud import bigquery

# Data processing libraries
import pandas as pd
import numpy as np

# Data visualisation tools
import matplotlib.pyplot as plt
from IPython.display import clear_output

# sklearn
from sklearn.cluster import KMeans

# pickle
import pickle

# IPython display
from IPython.display import clear_output

# datetime
import datetime

In [3]:
# Create the BigQuery client (`bqclient`) using the library's default
# constructor arguments; no project or credentials are passed explicitly here.
bqclient = bigquery.Client()

In [4]:
# SQL text for the load step: selects every column of the
# `grab_raw_singapore_74361` table, with no WHERE or LIMIT clause.
query = """
SELECT
  *
FROM
  `jjkoh95.jjkoh.grab_raw_singapore_74361`
"""

In [55]:
# Submit `query` through the BigQuery client, take the job's result and
# convert it to the DataFrame `df` that the cells below operate on.
df = bqclient.query(query).result().to_dataframe()

In [56]:
# Numeric tag embedded in both model filenames, kept in one place so the two
# loads cannot drift apart.
# NOTE(review): presumably this is the number of clusters the models were
# fitted with - confirm against the training notebook.
MODEL_VARIANT = 20

# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only load model files that were produced by a trusted source.
with open("models/traffic-cluster-speed-{0}-kmeans.pkl".format(MODEL_VARIANT), "rb") as pkl:
    speed_kmeans = pickle.load(pkl)

with open("models/traffic-cluster-density-{0}-kmeans.pkl".format(MODEL_VARIANT), "rb") as pkl:
    density_kmeans = pickle.load(pkl)

In [57]:
# https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
from math import radians, cos, sin, asin, sqrt

def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.

    Returns
    -------
    Distance in kilometres, computed on a sphere of radius 6371 km.
    """
    # convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # `a` is mathematically within [0, 1], but rounding can push it a few ulps
    # above 1 for (near-)antipodal points, which would make asin() raise
    # ValueError. Clamping leaves every in-range result unchanged.
    a = min(1.0, a)
    c = 2 * asin(sqrt(a))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

In [58]:
# https://stackoverflow.com/questions/54873868/python-calculate-bearing-between-two-lat-long
from geographiclib.geodesic import Geodesic

def get_bearing(lat1, lat2, long1, long2):
    """
    Return the 'azi1' value of the WGS84 inverse geodesic problem between
    point 1 (lat1, long1) and point 2 (lat2, long2).

    Note the parameter order: both latitudes come first, then both
    longitudes. They are re-paired as (lat, long) per point before being
    handed to Geodesic.WGS84.Inverse.
    """
    inverse_solution = Geodesic.WGS84.Inverse(lat1, long1, lat2, long2)
    return inverse_solution['azi1']

In [89]:
# density_cluster, speed_cluster
def predict_cluster(row):
    """
    Label a single row with the cluster predicted by each k-means model.

    Reads row['rawlat'] and row['rawlng'], feeds them as one [lat, lng]
    sample to `density_kmeans` and `speed_kmeans`, and stores the predicted
    labels under 'density_cluster' and 'speed_cluster'. The same row object
    is modified and returned.
    """
    location = [[row['rawlat'], row['rawlng']]]
    density_label = density_kmeans.predict(location)[0]
    speed_label = speed_kmeans.predict(location)[0]
    row['density_cluster'] = density_label
    row['speed_cluster'] = speed_label
    return row

# Label every row with one batched predict() call per model instead of two
# model calls per row via a whole-frame df.apply(axis=1). The samples use the
# same [rawlat, rawlng] feature order as predict_cluster().
#
# Casting after prediction is what actually yields uint8 columns: rebuilding
# the frame through apply(axis=1) re-infers every dtype, so the earlier
# zero-initialise-then-astype lines had no lasting effect.
coordinates = df[['rawlat', 'rawlng']].to_numpy(dtype='float64')
df['density_cluster'] = density_kmeans.predict(coordinates).astype('uint8')
df['speed_cluster'] = speed_kmeans.predict(coordinates).astype('uint8')

In [90]:
# day, hour
def epoch_to_day_hour(row, tz=datetime.timezone(datetime.timedelta(hours=8))):
    """
    Add 'day' (weekday, Monday=0 .. Sunday=6) and 'hour' (0..23) to a row,
    derived from the epoch seconds stored in row['pingtimestamp'].

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series) with a 'pingtimestamp' entry.
    tz : tzinfo used to interpret the timestamp. Defaults to a fixed UTC+8
        offset so the result no longer depends on the timezone of the
        machine running the notebook. Pass tz=None to get the previous
        machine-local behaviour.
        NOTE(review): UTC+8 is assumed from the Singapore source table
        name - confirm this is the intended local time.

    Returns
    -------
    The same row object, with 'day' and 'hour' set.
    """
    dt = datetime.datetime.fromtimestamp(row['pingtimestamp'], tz=tz)
    row['day'] = dt.weekday()
    row['hour'] = dt.hour
    return row

# Run the row helper on the timestamp column only and copy back just the two
# new columns. Replacing the whole frame with df.apply(axis=1) re-infers every
# column's dtype, which silently undid the uint8 casts (and any earlier
# compact dtypes); casting after the computation makes them stick.
time_parts = df[['pingtimestamp']].apply(epoch_to_day_hour, axis=1)
df['day'] = time_parts['day'].astype('uint8')
df['hour'] = time_parts['hour'].astype('uint8')

In [91]:
# day_sin, day_cos, hour_sin, hour_cos
def get_time_meta(row):
    """
    Add cyclic (sine/cosine) encodings of row['day'] and row['hour'].

    Each value is mapped onto a full circle, angle = 2*pi*value/period, with
    period 7 for the weekday and 24 for the hour, so the last value of a
    cycle sits next to the first one (Sunday next to Monday, 23h next to 0h).
    The previous version scaled both by pi/7: the hour used the weekday
    period, and neither encoding wrapped around.

    Sets 'day_sin', 'day_cos', 'hour_sin', 'hour_cos' on the row and returns
    the same row object.
    """
    # Compute both angles before writing to the row, so the inputs are read
    # in their original form.
    day_angle = 2 * np.pi * row['day'] / 7
    hour_angle = 2 * np.pi * row['hour'] / 24

    row['day_sin'] = sin(day_angle)
    row['day_cos'] = cos(day_angle)
    row['hour_sin'] = sin(hour_angle)
    row['hour_cos'] = cos(hour_angle)
    return row

# Apply the helper to the two input columns only, then copy the four encoded
# columns back as float32. Replacing the whole frame with df.apply(axis=1)
# re-infers every dtype, so the float32 casts made beforehand were lost.
time_meta = df[['day', 'hour']].apply(get_time_meta, axis=1)
for cyclic_column in ('day_sin', 'day_cos', 'hour_sin', 'hour_cos'):
    df[cyclic_column] = time_meta[cyclic_column].astype('float32')

In [107]:
# origin
# for cluster in [density.cluster, speed.cluster]
#   distance-from-origin-cluster
#   bearing-from-origin-cluster

def get_origin_meta(row):
    """
    Add distance and bearing between a row's position and the centres of the
    density and speed clusters it was assigned to.

    Reads 'rawlat', 'rawlng', 'density_cluster' and 'speed_cluster' from the
    row. Cluster centres are taken as [lat, lng], matching the feature order
    passed to predict() in predict_cluster().

    Sets on the row (and returns the same row object):
      distance_from_density_center / distance_from_speed_center
          haversine distance in kilometres.
      bearing_from_density_center / bearing_from_speed_center
          value returned by get_bearing() with the row's position as point 1
          and the cluster centre as point 2.
          NOTE(review): that is the azimuth at the row's position toward the
          centre, although the column name says "from ... center" - confirm
          which direction is intended.
    """
    lat, lng = row['rawlat'], row['rawlng']

    # int(): rows taken from an all-numeric frame arrive as floats, which
    # numpy rejects as array indices.
    density_center = density_kmeans.cluster_centers_[int(row['density_cluster'])]
    speed_center = speed_kmeans.cluster_centers_[int(row['speed_cluster'])]

    # haversine() takes (lat1, lon1, lat2, lon2) whereas get_bearing() takes
    # (lat1, lat2, long1, long2). The distance calls previously used the
    # get_bearing() ordering, mixing latitudes with longitudes.
    row['distance_from_density_center'] = haversine(lat, lng, density_center[0], density_center[1])
    row['bearing_from_density_center'] = get_bearing(lat, density_center[0], lng, density_center[1])

    row['distance_from_speed_center'] = haversine(lat, lng, speed_center[0], speed_center[1])
    row['bearing_from_speed_center'] = get_bearing(lat, speed_center[0], lng, speed_center[1])

    return row

# Compute the origin features row by row (haversine/get_bearing are scalar
# helpers), then copy only the four new columns back onto `df`. Rebinding `df`
# to the apply() result would re-infer the dtype of every existing column;
# pre-filling the new columns with 0.0 is not needed because the helper sets
# them on each row.
origin_meta = df.apply(get_origin_meta, axis=1)
for origin_column in (
    'distance_from_density_center',
    'bearing_from_density_center',
    'distance_from_speed_center',
    'bearing_from_speed_center',
):
    df[origin_column] = origin_meta[origin_column].astype('float64')

In [None]:
# destination
# for cluster in [density.cluster, speed.cluster]
#   distance-from-destination-cluster
#   bearing-from-destination-cluster

In [None]:
# origin-destination
# distance-from-origin-to-destination
# bearing-from-origin-to-destination