In [3]:
import numpy as np
import pandas as pd
import pandas.io.sql as pdsql
import pickle

import psycopg2

from scipy.linalg import eigh
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import KMeans

from shapely.wkt import loads as wkt_loads
from mpl_toolkits.basemap import Basemap

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

# Select the Qt backend for matplotlib in this kernel
%matplotlib qt
# Font settings unpacked into the axis-label calls of the plotting cells
axis_font = {'size':'16'}
# Tick labels on both axes use the same size as the axis labels
mpl.rcParams['xtick.labelsize'] = 16
mpl.rcParams['ytick.labelsize'] = 16

In [1]:
# Housekeeping: close a database connection left open by an earlier run of
# this notebook. On a fresh kernel `conn` does not exist yet, so the call is
# guarded instead of raising NameError and breaking "Restart & Run All".
try:
    conn.close()
except NameError:
    pass

In [4]:
# Load the count_by_hour table from the local PostgreSQL database into `cdf`.
# The password is deliberately not stored in the notebook: when it is omitted
# from the connection parameters, libpq (and therefore psycopg2) reads it from
# the PGPASSWORD environment variable or from the ~/.pgpass password file.
conn = psycopg2.connect(dbname='nyc_taxi', user='postgres', host='localhost')
try:
    # Later cells read the columns pickup_gid, pickup_hour, dropoff_gid,
    # dropoff_hour, count_mean_pickup and count_mean_dropoff from this frame.
    cdf = pdsql.read_sql("SELECT * FROM count_by_hour", conn)
finally:
    # Release the connection even if the query fails
    conn.close()

In [5]:
# Row positions whose pickup key (resp. dropoff key) is missing. The key
# columns cannot be cast to int while they still contain these NaNs.
idx_nan_pickup = np.flatnonzero(cdf['pickup_gid'].isnull().values).tolist()
idx_nan_dropoff = np.flatnonzero(cdf['dropoff_gid'].isnull().values).tolist()

In [6]:
# Records with no pickup side: take the NTA id and the hour from the dropoff
# columns and record a mean pickup count of zero.
# Done with boolean-mask .loc assignments: DataFrame.set_value was removed in
# pandas 1.0, and testing `row in idx_nan_pickup` for every row was quadratic.
missing_pickup = cdf.index.isin(idx_nan_pickup)
cdf.loc[missing_pickup, 'pickup_gid'] = cdf.loc[missing_pickup, 'dropoff_gid']
cdf.loc[missing_pickup, 'pickup_hour'] = cdf.loc[missing_pickup, 'dropoff_hour']
cdf.loc[missing_pickup, 'count_mean_pickup'] = 0.0

# Symmetric case: idx_nan_dropoff was computed but never used. Records with no
# dropoff side get a mean dropoff count of zero so that no NaN can reach the
# feature matrix (assumes a missing dropoff_gid means "no dropoffs" -- confirm
# against how count_by_hour is built).
missing_dropoff = cdf.index.isin(idx_nan_dropoff)
cdf.loc[missing_dropoff, 'count_mean_dropoff'] = 0.0

In [7]:
# With the missing pickup keys filled in, store both key columns as integers
for key_col in ('pickup_gid', 'pickup_hour'):
    cdf[key_col] = cdf[key_col].astype(int)

In [8]:
# Feature matrix: one row per NTA (gid 1..195). Columns 0-23 hold the mean
# pickup count for each hour of the day, columns 24-47 the mean dropoff count.
# (gid, hour) pairs that do not occur in cdf keep their initial value of 0.
n_gid, n_hour = 195, 24
X = np.zeros((n_gid, 2 * n_hour), dtype='float64')

# Add the rows where count is 0
# (both lists are left empty by this cell)
gid_append = []
hour_append = []

# Place every record by its own (gid, hour) key. The previous sequential scan
# assumed cdf was sorted by (pickup_gid, pickup_hour), which the SQL query does
# not guarantee: a single out-of-order or out-of-range row stalled the scan and
# silently left every later entry at zero.
gid_idx = cdf['pickup_gid'].values - 1
hour_idx = cdf['pickup_hour'].values
in_range = (gid_idx >= 0) & (gid_idx < n_gid) & (hour_idx >= 0) & (hour_idx < n_hour)
gid_idx, hour_idx = gid_idx[in_range], hour_idx[in_range]
X[gid_idx, hour_idx] = cdf['count_mean_pickup'].values[in_range]
X[gid_idx, hour_idx + n_hour] = cdf['count_mean_dropoff'].values[in_range]

In [62]:
# Normalise each row of X so that its 48 values sum to 1.
# Rows whose total is 0 are left as all zeros; the previous 1 / sum produced
# inf there and turned the whole row into NaN. Dividing by the row totals
# directly also avoids building a dense 195 x 195 diagonal matrix.
row_total = X.sum(axis=1, keepdims=True)
Y = np.divide(X, row_total, out=np.zeros_like(X), where=row_total != 0)

In [86]:
# Spectral clustering to cluster the hourly pickup-dropoff patterns for all 195 NTAs
f = 100 # The factor used to set epsilon according to the median of the square pairwise Euclidean distance

# Pairwise Euclidean distance between the rows of (pickup - dropoff), i.e.
# columns 0-23 of X minus columns 24-47. The earlier pdist over the full X was
# overwritten before being used and has been removed.
distance = pdist(X[:, 0:24] - X[:, 24:48], 'euclidean')
# Kernel width: median squared distance divided by f
epsilon = np.median(distance ** 2) / f
# Gaussian-kernel affinity between every pair of NTAs (square, symmetric)
W = np.exp(-squareform(distance) ** 2 / epsilon)

In [87]:
# Spectral clustering
K = 4
# Graph Laplacian L = D - W, where D is the diagonal matrix of the column sums
# of W. Uses the ndarray reduction with an explicit axis instead of the Python
# built-in sum, which loops over the rows at interpreter level.
L = np.diag(W.sum(axis=0)) - W
# eigh returns the eigenvalues in ascending order, so column 1 of V_spec is the
# eigenvector that belongs to the second smallest eigenvalue.
_, V_spec = eigh(L)
v = V_spec[:, 1:2] # The reduced dimension data, simply the second smallest eigen vector of L
# Fixed random_state keeps the cluster numbering reproducible between runs
label = KMeans(n_clusters=K, random_state=8).fit_predict(v)

fig, ax = plt.subplots(figsize=(7, 6))

# Colour palette indexed by cluster id (also used to colour the map below)
c = ['b', 'g', 'r', 'purple', 'yellow', 'magenta', 'cyan', 'black', 'white']
# One histogram of the embedding coordinate per cluster
for k in range(K):
    ax.hist(v[label==k, 0], 20, facecolor=c[k], alpha=1, label='cluster {0}'.format(k))
ax.legend(loc=1, fontsize=16)
ax.set_xlabel('$v$', **axis_font)
ax.set_ylabel('count', **axis_font)

<matplotlib.text.Text at 0x7ff3be3ff4e0>

In [67]:
# Display the cluster id assigned to each row of the embedding `v`
# (entry gid - 1 is the one looked up for NTA `gid` when colouring the map)
label

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [51]:
n_nta = 195
# Fill colour of each NTA, indexed by gid - 1: the palette entry of its cluster
nta_color = [c[label[i]] for i in range(195)]

In [52]:
# Fetch (gid, boundary as WKT text) for every row of the nta table.
# The password is not stored in the notebook: when omitted, libpq reads it from
# the PGPASSWORD environment variable or from the ~/.pgpass password file.
tol = 1e-4 # Tolerance passed to ST_SIMPLIFY when fetching the NTA boundary

conn = psycopg2.connect(dbname='nyc_taxi', user='postgres', host='localhost')
try:
    cur = conn.cursor()
    try:
        # The tolerance is passed as a bound parameter rather than formatted
        # into the SQL string.
        cur.execute("SELECT gid, ST_AsText(ST_SIMPLIFY(geom, %s)) FROM nta;", (tol,))
        vals = cur.fetchall()
    finally:
        cur.close()
finally:
    # The connection was previously left open; close it even on failure
    conn.close()

In [53]:
def create_vectors_multipolygon(m, multipolygon):
    """Project the exterior ring of each polygon into map coordinates.

    Parameters
    ----------
    m : callable
        Called as ``m(x, y)`` for every exterior coordinate; its return value
        becomes one row of the output array (e.g. a Basemap instance).
    multipolygon : object
        Either an object exposing ``.geoms`` (such as a Shapely MultiPolygon),
        a single polygon exposing ``.exterior``, or any iterable of polygons.
        Each polygon must provide ``.exterior.coords``.

    Returns
    -------
    list of numpy.ndarray
        One array per polygon, holding the projected exterior coordinates in
        their original order. Interior rings are not included.
    """
    # Iterating a MultiPolygon directly no longer works in Shapely 2.x, so
    # prefer .geoms; also accept a lone Polygon, which simplification may yield.
    if hasattr(multipolygon, 'geoms'):
        polygons = multipolygon.geoms
    elif hasattr(multipolygon, 'exterior'):
        polygons = [multipolygon]
    else:
        polygons = multipolygon

    vectors = []
    for polygon in polygons:
        # Only the first two components are used, so 3-D coordinates are fine
        seg = [m(coord[0], coord[1]) for coord in polygon.exterior.coords]
        # Append once per polygon. The append used to sit inside the
        # coordinate loop and emitted a growing partial ring for every point.
        vectors.append(np.asarray(seg))
    return vectors

In [54]:
# Longitude/latitude bounds of the map window
west, south, east, north = -74.15, 40.50, -73.65, 40.95

# Mercator basemap covering that window at full coastline resolution
m = Basemap(
    projection='merc',
    resolution='f',
    llcrnrlon=west,
    llcrnrlat=south,
    urcrnrlon=east,
    urcrnrlat=north,
    lat_0=(south+north)/2,
    lon_0=(west+east)/2,
)

# Large single-axes figure that the map and the NTA polygons are drawn on
fig, ax = plt.subplots(figsize=(30, 30))

# Background layers: water in aqua, land in grey, then coastlines and rivers
m.drawmapboundary(fill_color='aqua')
m.fillcontinents(color='grey', lake_color='aqua', zorder=0)
m.drawcoastlines()
m.drawrivers()

<matplotlib.collections.LineCollection at 0x7ff3c1fa3c88>

In [55]:
# Add one collection per NTA: black 0.5-wide outline, face colour taken from
# nta_color (indexed by gid - 1)
for gid, boundary_wkt in vals:
    vectors = create_vectors_multipolygon(m, wkt_loads(boundary_wkt))
    lines = LineCollection(vectors, antialiaseds=(1, ))
    lines.set_linewidth(0.5)
    lines.set_edgecolors('black')
    lines.set_facecolors(nta_color[gid-1])
    ax.add_collection(lines)