In [20]:
import math
import os
from functools import lru_cache
from functools import partial

import folium
import geopandas as gpd
import mplleaflet
import numpy as np
import pandas as pd
import pyproj
from folium.vector_layers import Polygon
from shapely.geometry import Point
from shapely.ops import cascaded_union
from shapely.ops import transform
from shapely.ops import unary_union
from sklearn.cluster import DBSCAN

In [21]:
def get_hour(ts):
    """Return the hour component of a timestamp-like object.

    Args:
        ts: Any object exposing an ``hour`` attribute.

    Returns:
        The value of ``ts.hour``.
    """
    hour_of_day = ts.hour
    return hour_of_day

In [22]:
def get_minute(ts):
    """Encode a timestamp as an HHMM-style integer rounded down to a quarter hour.

    The hour is multiplied by 100 and the start minute of the 15-minute slot
    (0, 15, 30 or 45) is added, so 07:50 becomes 745.

    Args:
        ts: Any object exposing ``hour`` and ``minute`` attributes.

    Returns:
        ``ts.hour * 100`` plus the slot's starting minute.
    """
    quarter_index = int(ts.minute / 15)
    slot_start_minute = quarter_index * 15
    return ts.hour * 100 + slot_start_minute

In [23]:
def load_day(day):
    """Load one day's records from ``data/siri.201301DD.csv``.

    The file is read without a header row; the column names below are applied
    to it. Missing ``line_id`` / ``stop_id`` values are replaced by 0 so both
    columns can be stored as ``int32``. ``timestamp`` is interpreted as
    microseconds since the epoch and converted to datetimes, and an ``hour``
    column is derived from it.

    Args:
        day: Day of the month; formatted as two digits into the file name.

    Returns:
        A DataFrame with the 15 named columns plus ``hour``.

    Raises:
        FileNotFoundError: If the CSV for that day does not exist.
    """
    header = ['timestamp', 'line_id', 'direction', 'jrny_patt_id', 'time_frame', 'journey_id', 'operator',
              'congestion', 'lon', 'lat', 'delay', 'block_id', 'vehicle_id', 'stop_id', 'at_stop']
    types = {'timestamp': np.int64,
             'journey_id': np.int32,
             'congestion': np.int8,
             'lon': np.float64,
             'lat': np.float64,
             'delay': np.int8,
             'vehicle_id': np.int32,
             'at_stop': np.int8}
    file_name = 'data/siri.201301{0:02d}.csv'.format(day)
    # `infer_datetime_format` is deliberately not passed: it only ever was a
    # speed hint, is deprecated as a no-op since pandas 2.0, and omitting it
    # keeps this call valid on both old and new pandas releases.
    df = pd.read_csv(file_name, header=None, names=header, dtype=types,
                     parse_dates=['time_frame'])
    # NaN cannot live in an integer column, so fill before casting.
    df = df.fillna(value={'line_id': 0, 'stop_id': 0})
    df = df.astype({'line_id': np.int32, 'stop_id': np.int32})
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='us')
    # Vectorized accessor instead of a per-row Python call.
    df['hour'] = df['timestamp'].dt.hour
    return df

In [24]:
def load_data():
    """Load all 31 daily files ``data/siri.20130101.csv`` .. ``siri.20130131.csv``.

    Every file is read without a header row and the frames are concatenated
    once at the end (each file keeps its own row index, so index labels repeat
    across days). Missing ``line_id`` / ``stop_id`` values become 0 and both
    columns are cast to ``int32``. ``timestamp`` is interpreted as microseconds
    since the epoch and converted to datetimes, from which two columns are
    derived:

    * ``hour``   - hour of day.
    * ``minute`` - ``hour * 100`` plus the start minute of the 15-minute slot
      (0, 15, 30 or 45), e.g. 07:50 -> 745.

    Returns:
        One DataFrame holding every day's rows plus ``hour`` and ``minute``.

    Raises:
        FileNotFoundError: If any of the 31 daily files is missing.
    """
    header = ['timestamp', 'line_id', 'direction', 'jrny_patt_id', 'time_frame', 'journey_id', 'operator',
              'congestion', 'lon', 'lat', 'delay', 'block_id', 'vehicle_id', 'stop_id', 'at_stop']
    types = {'timestamp': np.int64,
             'journey_id': np.int32,
             'congestion': np.int8,
             'lon': np.float64,
             'lat': np.float64,
             'delay': np.int8,
             'vehicle_id': np.int32,
             'at_stop': np.int8}

    # Collect the per-day frames and concatenate once: DataFrame.append copied
    # the whole accumulated frame on every iteration and no longer exists in
    # pandas 2.x.
    frames = []
    for day in range(1, 32):
        file_name = 'data/siri.201301{0:02d}.csv'.format(day)
        # `infer_datetime_format` is not passed; it is a deprecated no-op
        # since pandas 2.0 and was only a speed hint before that.
        frames.append(pd.read_csv(file_name, header=None, names=header, dtype=types,
                                  parse_dates=['time_frame']))
    data = pd.concat(frames)

    # NaN cannot live in an integer column, so fill before casting.
    data = data.fillna(value={'line_id': 0, 'stop_id': 0})
    data = data.astype({'line_id': np.int32, 'stop_id': np.int32})
    data['timestamp'] = pd.to_datetime(data['timestamp'], unit='us')

    # Work on plain arrays so the repeated index labels never need aligning.
    hours = data['timestamp'].dt.hour.values
    minutes = data['timestamp'].dt.minute.values
    data['hour'] = hours
    data['minute'] = hours * 100 + (minutes // 15) * 15
    return data

In [25]:
def load_day_hour(day, hour):
    """Load one day via ``load_day`` and keep only the rows of a single hour.

    Args:
        day: Day of the month, forwarded to ``load_day``.
        hour: Value the ``hour`` column must equal for a row to be kept.

    Returns:
        The subset of the day's DataFrame whose ``hour`` equals ``hour``.
    """
    day_df = load_day(day)
    in_requested_hour = day_df['hour'] == hour
    return day_df[in_requested_hour]

In [26]:
def create_radian_columns(df):
    """Add radian versions of the ``lon`` and ``lat`` columns.

    ``rad_lng`` is created from ``lon`` and ``rad_lat`` from ``lat``. The
    frame passed in is modified in place and also returned.

    Args:
        df: DataFrame with ``lon`` and ``lat`` columns.

    Returns:
        The same DataFrame object, now carrying ``rad_lng`` and ``rad_lat``.
    """
    for radian_col, degree_col in (('rad_lng', 'lon'), ('rad_lat', 'lat')):
        df[radian_col] = np.radians(df[degree_col].values)
    return df

In [27]:
def density_cluster(df, eps_in_meters=50, num_samples=15):
    """Cluster the rows of ``df`` with DBSCAN using the haversine metric.

    The neighbourhood radius is given in meters and turned into an angle by
    dividing by an Earth perimeter of 40,070,000 m and multiplying by 2*pi.

    Args:
        df: DataFrame with ``rad_lat`` and ``rad_lng`` columns (radians).
        eps_in_meters: Neighbourhood radius, in meters.
        num_samples: Passed to DBSCAN as ``min_samples``.

    Returns:
        Whatever ``DBSCAN.fit_predict`` returns for the ``[rad_lat, rad_lng]``
        columns: one cluster label per row. The rest of this notebook treats
        only labels ``>= 0`` as real clusters.
    """
    earth_perimeter = 40070000.0  # In meters
    # Same operation order as before so the radius is bit-for-bit unchanged.
    eps_in_radians = eps_in_meters / earth_perimeter * (2 * math.pi)

    clusterer = DBSCAN(
        metric='haversine',
        algorithm='ball_tree',
        eps=eps_in_radians,
        min_samples=num_samples,
    )
    # Latitude first, then longitude.
    coordinates = df[['rad_lat', 'rad_lng']]
    return clusterer.fit_predict(coordinates)

In [28]:
def generate_blob_clusters(df, eps_in_meters=5):
    """Merge the points of every cluster into one "blob" geometry.

    Each point of a cluster is turned into a buffer via ``buffer_in_meters``
    and the buffers of one cluster are unioned into a single geometry. Rows
    whose ``cluster`` value is negative are ignored.

    Args:
        df: DataFrame with ``cluster``, ``lon`` and ``lat`` columns.
        eps_in_meters: Radius handed to ``buffer_in_meters`` for every point.

    Returns:
        A GeoDataFrame with one row per cluster and the columns ``cluster``
        (label), ``polygon`` (the unioned geometry, set as the geometry
        column) and ``count`` (number of points in the cluster). Its CRS is
        set to EPSG:4326.
    """
    clusters = []
    blobs = []
    counts = []

    for cluster_id, points in df.groupby('cluster'):
        if cluster_id < 0:
            continue
        buffers = [buffer_in_meters(lon, lat, eps_in_meters)
                   for lon, lat in zip(points['lon'], points['lat'])]
        # unary_union supersedes the deprecated cascaded_union.
        blobs.append(unary_union(buffers))
        clusters.append(cluster_id)
        counts.append(len(points))

    # Create the GeoDataFrame from the cluster numbers and blobs
    data = {'cluster': clusters, 'polygon': blobs, 'count': counts}

    cluster_gdf = gpd.GeoDataFrame(pd.DataFrame(data), geometry='polygon')
    cluster_gdf.crs = {'init': 'epsg:4326'}
    return cluster_gdf

In [29]:
@lru_cache(maxsize=None)
def _buffer_projections():
    """Build, once, the EPSG:4326 <-> EPSG:3857 coordinate transformers."""
    proj_meters = pyproj.Proj(init='epsg:3857')
    proj_latlng = pyproj.Proj(init='epsg:4326')
    project_to_meters = partial(pyproj.transform, proj_latlng, proj_meters)
    project_to_latlng = partial(pyproj.transform, proj_meters, proj_latlng)
    return project_to_meters, project_to_latlng


def buffer_in_meters(lng, lat, radius):
    """Return a buffer of ``radius`` around a lon/lat point, in lon/lat coordinates.

    The point is projected from EPSG:4326 to EPSG:3857, buffered there, and
    the resulting geometry is projected back to EPSG:4326.

    Args:
        lng: Longitude of the point (EPSG:4326).
        lat: Latitude of the point (EPSG:4326).
        radius: Buffer radius expressed in EPSG:3857 units.

    Returns:
        The buffered geometry with EPSG:4326 coordinates.
    """
    # NOTE(review): EPSG:3857 is Web Mercator, whose units stretch with
    # latitude, so `radius` is presumably not an exact ground distance away
    # from the equator - confirm this is acceptable for the analysis.
    #
    # The projection objects do not depend on the point, so they are built
    # once and reused instead of being recreated for every single call.
    project_to_meters, project_to_latlng = _buffer_projections()

    pt_meters = transform(project_to_meters, Point(lng, lat))
    buffer_meters = pt_meters.buffer(radius)
    return transform(project_to_latlng, buffer_meters)

In [30]:
def parse_lon_lat(lon_lat):
    """Turn a ``'lon lat'`` string into a ``(lat, lon)`` tuple of floats.

    Note the swap: the text lists longitude first, the result latitude first.

    Args:
        lon_lat: Two numbers separated by exactly one space.

    Returns:
        ``(latitude, longitude)`` as floats.

    Raises:
        ValueError: If the text does not split into exactly two parts or a
            part is not a valid float.
    """
    lon_text, lat_text = lon_lat.split(' ')
    latitude = float(lat_text)
    longitude = float(lon_text)
    return latitude, longitude

In [31]:
def parse_polygon(polygon_wkt):
    """Extract the first ring of a WKT polygon as ``(lat, lon)`` tuples.

    The first ring is the text between the first ``)`` and the ``(`` that
    opens it. For ``POLYGON ((...), (...))`` that is the first listed ring;
    for ``MULTIPOLYGON (((...)), ((...)))`` it is the first ring of the first
    polygon - any further rings or polygons are ignored.

    Args:
        polygon_wkt: WKT text whose coordinates are written ``lon lat`` and
            separated by commas.

    Returns:
        A list of ``(lat, lon)`` float tuples, or an empty list when no ring
        could be parsed (the offending WKT is printed in that case).
    """
    try:
        ring_end = polygon_wkt.index(')')
        ring_start = polygon_wkt.rindex('(', 0, ring_end) + 1
        locations = []
        for item in polygon_wkt[ring_start:ring_end].split(','):
            lon_text, lat_text = item.split()
            # WKT is "lon lat"; the result is latitude first.
            locations.append((float(lat_text), float(lon_text)))
        return locations
    except ValueError:
        # Previously a failure here fell through to `return locations` with
        # the name unbound, masking the real problem with UnboundLocalError.
        print('Faulty Polygon:')
        print(polygon_wkt)
        return []

In [32]:
def show_blob_map(df):
    """Draw every cluster blob of ``df`` as a red polygon on a new folium map.

    The blobs come from ``generate_blob_clusters``; each geometry's WKT is
    converted to a list of locations by ``parse_polygon`` and added to the map
    as a filled polygon.

    Args:
        df: Clustered observations, forwarded to ``generate_blob_clusters``.

    Returns:
        The ``folium.Map`` holding one polygon per cluster.
    """
    blob_map = folium.Map()

    for geometry in generate_blob_clusters(df)['polygon']:
        outline = parse_polygon(geometry.wkt)
        Polygon(
            outline,
            fill=True,
            opacity=0.6,
            color='#ff0000',
            fill_color='#ff0000',
            fill_opacity=0.6,
        ).add_to(blob_map)

    return blob_map

In [33]:
# Read all 31 daily files into one DataFrame; load_data also adds the
# `hour` and `minute` (quarter-hour slot) columns used further down.
data = load_data()

In [34]:
# Add the `rad_lng` / `rad_lat` columns that density_cluster reads.
# create_radian_columns modifies the frame in place and returns it.
data = create_radian_columns(data)

In [36]:
# map.save() does not create missing folders, so make sure the output
# directory exists before the first map is written.
os.makedirs('maps', exist_ok=True)

# One map per quarter-hour slot from 05:00 to 23:45. `time` uses the same
# HHMM encoding as the `minute` column (e.g. 745 for the 07:45 slot), so each
# slot pools the matching rows of every loaded day.
# NOTE(review): `map` shadows the Python builtin and `time` would shadow the
# stdlib module if it were imported; the names are kept because later cells
# may read them - confirm before renaming.
for hour in range(5, 24):
    for minute in range(0, 4):
        time = hour * 100 + minute * 15

        print('Time : {0}'.format(time))

        # Congested observations (congestion == 1) that fall in this slot.
        in_slot = (data['minute'] == time) & (data['congestion'] == 1)
        df = data[in_slot].copy()

        if len(df) > 0:
            df['cluster'] = density_cluster(df, eps_in_meters=5, num_samples=5)

            # Total congested points / points assigned to a cluster.
            print('Count: {0} / {1}'.format(len(df), len(df[df['cluster'] >= 0])))

            map = show_blob_map(df)
            map.save('maps/congestion_{0:04d}.html'.format(time))

Time : 500
Time : 515
Time : 530
Time : 545
Time : 600
Count: 239 / 222
Time : 615
Count: 793 / 745
Time : 630
Count: 1488 / 1397
Time : 645
Count: 3852 / 3716
Time : 700
Count: 7365 / 7180
Time : 715
Count: 8794 / 8476
Time : 730
Count: 9746 / 9220
Time : 745
Count: 9770 / 9189
Time : 800
Count: 11592 / 10661
Time : 815
Count: 12046 / 10467
Time : 830
Count: 12093 / 10001
Time : 845
Count: 12360 / 9899
Time : 900
Count: 9743 / 7994
Time : 915
Count: 9825 / 8644
Time : 930
Count: 10277 / 9630
Time : 945
Count: 10560 / 9938
Time : 1000
Count: 9594 / 9121
Time : 1015
Count: 9848 / 9351
Time : 1030
Count: 8312 / 7961
Time : 1045
Count: 9065 / 8758
Time : 1100
Count: 7571 / 7276
Time : 1115
Count: 7667 / 7399
Time : 1130
Count: 5916 / 5691
Time : 1145
Count: 6807 / 6568
Time : 1200
Count: 6376 / 6072
Time : 1215
Count: 8012 / 7698
Time : 1230
Count: 7985 / 7621
Time : 1245
Count: 8035 / 7665
Time : 1300
Count: 6486 / 6079
Time : 1315
Count: 7265 / 6895
Time : 1330
Count: 7478 / 7074
Time :