In [None]:
import os
import pickle

import numpy as np
import pandas as pd
import pandas.io.sql as pdsql

import psycopg2

from numpy.random import uniform
from numpy.linalg import svd
from scipy.linalg import eigh
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import KMeans

from shapely.wkt import loads as wkt_loads
from mpl_toolkits.basemap import Basemap

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib.collections import LineCollection, PatchCollection

%matplotlib qt
axis_font = {'size':'20'}
mpl.rcParams['xtick.labelsize'] = 20
mpl.rcParams['ytick.labelsize'] = 20

In [None]:
# Read the hourly count table (count_by_hour_ct) and the census tract table (ct).
# The connection string can be overridden through the NYC_TAXI_DSN environment variable.
# NOTE(review): the fallback DSN still embeds a password; drop it once NYC_TAXI_DSN is
# configured wherever this notebook runs.
dsn = os.environ.get(
    'NYC_TAXI_DSN',
    "dbname='nyc_taxi' user='postgres' host='localhost' password='organon'")
conn = psycopg2.connect(dsn)
try:
    cdf = pdsql.read_sql("SELECT * FROM count_by_hour_ct", conn, params=None)
    #cdf = pdsql.read_sql("SELECT * FROM count_by_hour_ct_green", conn, params=None)
    ct = pdsql.read_sql("SELECT gid, boro_name, shape_area, boro_ct201, ntaname FROM ct", conn, coerce_float=True, params=None)
finally:
    # Release the connection even when one of the queries fails
    conn.close()

In [None]:
# Remove the NaN entries.
# A row whose pickup side is missing copies gid/hour from its dropoff side and gets a
# pickup count of 0 (and vice versa). The masks are computed once, before any value is
# filled in, and the pickup side is filled first.
pickup_missing = pd.isnull(cdf['pickup_gid'])
dropoff_missing = pd.isnull(cdf['dropoff_gid'])
idx_nan_pickup = np.where(pickup_missing.values)[0].tolist()
idx_nan_dropoff = np.where(dropoff_missing.values)[0].tolist()

# Boolean-mask assignment replaces the per-row DataFrame.set_value calls (removed in
# pandas 1.0) and the O(n) `row in list` test that ran for every row.
cdf.loc[pickup_missing, 'pickup_gid'] = cdf.loc[pickup_missing, 'dropoff_gid']
cdf.loc[pickup_missing, 'pickup_hour'] = cdf.loc[pickup_missing, 'dropoff_hour']
cdf.loc[pickup_missing, 'count_mean_pickup'] = 0.0

cdf.loc[dropoff_missing, 'dropoff_gid'] = cdf.loc[dropoff_missing, 'pickup_gid']
cdf.loc[dropoff_missing, 'dropoff_hour'] = cdf.loc[dropoff_missing, 'pickup_hour']
cdf.loc[dropoff_missing, 'count_mean_dropoff'] = 0.0

# gid/hour are used as array indices below, so they must be integers.
# astype(int) raises if a row had both sides missing (NaN would remain).
for col in ['pickup_gid', 'pickup_hour', 'dropoff_gid', 'dropoff_hour']:
    cdf[col] = cdf[col].astype(int)

# Construct the numpy array where each row corresponds to one census tract (row = gid - 1):
# columns 0-23 hold the mean pickup count for each hour of the day, columns 24-47 the
# mean dropoff count for each hour of the day.
n_census_tracts = 2166
X = np.zeros((n_census_tracts, 48), dtype='float64')

# NOTE(review): as in the original loop, the dropoff counts are placed using pickup_gid /
# pickup_hour; this presumes both sides of a row share gid and hour -- confirm against the
# query that builds count_by_hour_ct. Also presumes one row per (gid, hour) pair.
tract_idx = cdf['pickup_gid'].values - 1
hour_idx = cdf['pickup_hour'].values
X[tract_idx, hour_idx] = cdf['count_mean_pickup'].values
X[tract_idx, hour_idx + 24] = cdf['count_mean_dropoff'].values

In [None]:
# Study only the census tracts where there are enough pickup and dropoffs per day
min_daily_count = 4500
#min_daily_count = 300
gids_popular = np.where(np.sum(X, axis=1) > min_daily_count)[0] + 1

X_popular = X[gids_popular-1, :]
Y = np.dot(np.diag(1 / np.sum(X_popular, axis=1)), X_popular) # Normalize each row of X_popular

[np.sum(label==k) for k in range(K)]

In [None]:
# Spectral clustering to cluster the hourly pickup-dropoff patterns for all 2166 census tracks
f =180 # The factor used to set epsilon according to the median of the square pairwise Euclidean distance
#f = 160

distance = pdist(Y, 'euclidean');
epsilon = np.median(distance ** 2) / f;
W = np.exp(-squareform(distance) ** 2 / epsilon) # The adjacency matrix

# Spectral clustering
K = 2
L = np.diag(sum(W)) - W
_, V_spec = eigh(L)
v = V_spec[:, 1:2] # The reduced dimension data, simply the second smallest eigen vector of L
label = KMeans(n_clusters=K, random_state=8).fit_predict(v)
fig, ax = plt.subplots(figsize=(10, 7))

c = ['b', 'g', 'r', 'purple', 'yellow', 'magenta', 'cyan', 'black', 'white']
for k in range(K):
    ax.hist(v[label==k, 0], 20, facecolor=c[k], alpha=1, label='cluster {0}'.format(k))
ax.legend(loc=1, fontsize=20)
ax.grid(True)
ax.set_xlabel('$v$', **axis_font)
ax.set_ylabel('count', **axis_font)

fig.savefig('./figures/clustering/hist_yellow.jpg', dpi=100, bbox_inches='tight')
#fig.savefig('./figures/clustering/hist_green.jpg', dpi=100, bbox_inches='tight')

In [None]:
# The function to evaluate the gap statistic for k-means with different number of clusters
def get_gapstat(X, k_range=np.arange(1, 9), n_sample=20, random_state=17):
    """Evaluate the gap statistic of k-means for several numbers of clusters.

    For every k in `k_range`, k-means is fitted on `X` and on `n_sample` reference
    data sets drawn uniformly from the per-column [min, max] range of `X`. The gap is
    log(inertia of a reference set) - log(inertia of `X`).

    Parameters
    ----------
    X : 2-D numpy array of shape (n, p)
        The data to cluster.
    k_range : sequence of int
        The numbers of clusters to evaluate.
    n_sample : int
        The number of reference data sets.
    random_state : int, numpy.random.RandomState or None
        Passed to KMeans and also used to draw the reference data sets, so an
        integer seed makes the whole result reproducible.

    Returns
    -------
    numpy array of shape (n_sample, len(k_range))
        One gap value per reference data set (row) and per k (column).
    """
    n, p = X.shape

    # Draw the reference data from a generator controlled by random_state instead of
    # the global numpy generator, so that repeated calls give the same reference sets.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Column bounds reshaped to (1, p, 1) so they broadcast over rows and samples
    col_low = np.min(X, axis=0)[np.newaxis, :, np.newaxis]
    col_high = np.max(X, axis=0)[np.newaxis, :, np.newaxis]
    X_ref = rng.uniform(low=col_low, high=col_high, size=(n, p, n_sample))

    gapstat = np.empty((n_sample, len(k_range)), dtype='float64')
    for i_k, k in enumerate(k_range):
        km = KMeans(n_clusters=k, random_state=random_state)

        # The inertia of the actual data
        km.fit(X)
        w = km.inertia_

        # The inertia of the reference data
        w_ref = np.empty(n_sample, dtype='float64')
        for i_sample in range(n_sample):
            km.fit(X_ref[:, :, i_sample])
            w_ref[i_sample] = km.inertia_

        gapstat[:, i_k] = np.log(w_ref) - np.log(w)

    return gapstat

k_range = np.arange(1, 20)
n_sample = 20

gap_spec = get_gapstat(v, k_range=k_range, n_sample=n_sample)

# Plot the Gap statistics curve: mean over the reference samples, with error bars
# sqrt(1 + 1/n_sample) * std. The division is written with 1.0 so that it is a float
# division on every Python version (1/n_sample is 0 with Python 2 integers).
gap_mean = np.mean(gap_spec, axis=0)
gap_err = np.sqrt(1 + 1.0 / n_sample) * np.std(gap_spec, axis=0)

fig, ax = plt.subplots(figsize=(10, 7))
ax.errorbar(k_range, gap_mean, yerr=gap_err)
ax.grid(True)
ax.set_xlabel('$k$', **axis_font)
ax.set_ylabel('Gap', **axis_font)

fig.savefig('./figures/clustering/gap_yellow.jpg', dpi=100, bbox_inches='tight')
#fig.savefig('./figures/clustering/gap_green.jpg', dpi=100, bbox_inches='tight')

In [None]:
# Map extent (west, south, east, north); the last assignment is the one that is plotted
west, south, east, north = -74.15, 40.50, -73.65, 40.95 # New York
west, south, east, north = -74.03, 40.7, -73.93, 40.8 # Manhattan

n_ct_popular = gids_popular.shape[0]
ct_color = {gid: c[k] for gid, k in zip(gids_popular, label)} # The color of each census tract
ct2gid = dict(zip(ct.boro_ct201, ct.gid)) # boro_ct201 code -> gid

# Plot the CT boundaries
m = Basemap(llcrnrlon=west, llcrnrlat=south, urcrnrlon=east, urcrnrlat=north,
                 resolution='i', projection='merc', lat_0=(south+north)/2, lon_0=(west+east)/2)
fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(111)

m.readshapefile('./datasets/ct_2010/geo_export_670644cc-4c81-49e9-9d6d-fb574f649fff', 'nyc', linewidth=2, zorder=1)

# Plot the coloring for each cluster: one polygon per shape of a popular census tract
patches = []
for info, shape in zip(m.nyc_info, m.nyc):
    gid = ct2gid[info['boro_ct201']]
    if gid in ct_color: # ct_color has exactly the popular gids as keys
        patches.append(Polygon(np.array(shape), closed=True, facecolor=ct_color[gid]))

# Build the collection once, after all patches are known, and only if there is something
# to draw (previously it was rebuilt for every patch and undefined when nothing matched)
if patches:
    p = PatchCollection(patches, match_original=True, alpha=0.6, linewidths=1, zorder=2)
    ax.add_collection(p)

# Plot the scatterplot of the hotspots tagged as bars/lounges, dance clubs and formal restaurants
# Each layer: (tags to select, marker color, legend entry)
hotspot_layers = [
    (('bars', 'lounges'), 'r', 'bars/lounges'),
    (('dance clubs',), 'orange', 'dance clubs'),
    (('formals',), 'm', 'formal restaurants'),
]

# The connection string can be overridden through the NYC_TAXI_DSN environment variable.
# NOTE(review): the fallback DSN still embeds a password; drop it once NYC_TAXI_DSN is
# configured wherever this notebook runs.
dsn = os.environ.get(
    'NYC_TAXI_DSN',
    "dbname='nyc_taxi' user='postgres' host='localhost' password='organon'")
conn = psycopg2.connect(dsn)
try:
    cur = conn.cursor()
    for tags, marker_color, legend_entry in hotspot_layers:
        # psycopg2 adapts a Python tuple to the value list of the SQL IN operator
        cur.execute("SELECT ST_X(lonlat), ST_Y(lonlat) FROM hotspot WHERE tag IN %s", (tags,))
        lonlat = np.array(cur.fetchall())
        if lonlat.size == 0:
            continue # No hotspot with these tags: nothing to plot for this layer
        x, y = m(lonlat[:, 0], lonlat[:, 1])
        m.scatter(x, y, s=80, facecolor=marker_color, lw=1, zorder=3, label=legend_entry)
finally:
    # Release the connection even when a query or a plotting call fails
    conn.close()

ax.legend(loc=1, fontsize=20)

fig.savefig('./figures/clustering/coloring_yellow_tagged.jpg', dpi=100, bbox_inches='tight')
#fig.savefig('./figures/clustering/coloring_green_tagged.jpg', dpi=100, bbox_inches='tight')

In [None]:
# Plot the daily pickup/dropoff pattern for each cluster:
# pickups above the axis, dropoffs mirrored below it, each with a +/- one std band
fig, ax = plt.subplots(figsize=(10, 7))
t = np.arange(24, dtype='int32')
for k in range(K):
    Y_subset = Y[label==k, :]

    # Column-wise statistics over the tracts of this cluster, split into the
    # pickup half (columns 0-23) and the dropoff half (columns 24-47)
    profile_mean = np.mean(Y_subset, axis=0)
    profile_std = np.std(Y_subset, axis=0)
    pickup_mean, dropoff_mean = profile_mean[0:24], profile_mean[24:48]
    pickup_std, dropoff_std = profile_std[0:24], profile_std[24:48]

    cluster_color = c[k]
    ax.plot(t, pickup_mean, color=cluster_color, linewidth=2,
            label="pickup, $k={0}$".format(k))
    ax.plot(t, -dropoff_mean, color=cluster_color, linewidth=2, linestyle='--',
            label="-dropoff, $k={0}$".format(k))

    ax.fill_between(t, pickup_mean-pickup_std, pickup_mean+pickup_std,
                    facecolor=cluster_color, alpha=0.3)
    ax.fill_between(t, -dropoff_mean-dropoff_std, -dropoff_mean+dropoff_std,
                    facecolor=cluster_color, alpha=0.3)

ax.legend(prop={'size':16}, loc=0)
ax.grid()
ax.set_xlabel('hour', **axis_font)
ax.set_xlim([0, 23])
ax.set_ylabel('pickup-dropoff dstr.', **axis_font)

fig.savefig('./figures/clustering/curve_yellow.jpg', dpi=100, bbox_inches='tight')
#fig.savefig('./figures/clustering/curve_green.jpg', dpi=100, bbox_inches='tight')