In [18]:
import numpy as np
from scipy.linalg import eigh
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import KMeans

import psycopg2
from shapely.wkt import loads as wkt_loads
from mpl_toolkits.basemap import Basemap

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

%matplotlib qt
axis_font = {'size':'16'}
mpl.rcParams['xtick.labelsize'] = 16
mpl.rcParams['ytick.labelsize'] = 16

In [10]:
conn = psycopg2.connect("dbname='nyc_taxi' user='postgres' host='localhost' password='organon'")
cur = conn.cursor()

In [None]:
# Try plotting a heat map over census tract based on the total number of pickups from the region1

In [9]:
cur.close()

In [3]:
# Cluster NTAs and visualize based on the distribution of the dropoff location of all trips from there
cur.execute("""SELECT COUNT(*) FROM nta;""")
n_nta = cur.fetchall()[0][0] # Total number of census tracts
# The (i,j)-th element can be viewed as the probability for a trip from i to j, 
count_trip_from = np.zeros((n_nta, n_nta), dtype="int32")

In [11]:
for gid_nta in range(1, n_nta+1):
    cur.execute("""SELECT dropoff_gid, COUNT(trip_id)
    FROM trip_nta
    WHERE pickup_gid={0}
    GROUP BY dropoff_gid
    ORDER BY dropoff_gid;""".format(gid_nta))
    result = np.array(cur.fetchall(), dtype="int32")
    
    if result.size != 0:
        count_trip_from[gid_nta-1, result[:, 0]-1] = result[:, 1]

In [56]:
count_threshold = 200 # Minimum number of trips from one census tract
gid_nta_nz = np.where(np.sum(count_trip_from, axis=1) > count_threshold)[0] + 1 # The indices (gid-1) of the census tracts to be clustered
count_trip_from = count_trip_from[gid_nta_nz-1, :]
W = squareform(pdist(count_trip_from, 'correlation')) # Pairwise distance between census tracts

In [63]:
# Spectral clustering
K = 7
L = np.diag(sum(W)) - W
_, V_spec = eigh(L)
v = V_spec[:, 1:2] # The reduced dimension data, simply the second smallest eigen vector of L
label = KMeans(n_clusters=K, random_state=8).fit_predict(v)

fig, ax = plt.subplots(figsize=(7, 6))

c = ['b', 'g', 'r', 'purple', 'yellow', 'magenta', 'cyan', 'black', 'white']
for k in range(K):
    ax.hist(v[label==k, 0], 20, facecolor=c[k], alpha=1, label='cluster {0}'.format(k))
ax.legend(loc=1, fontsize=16)
ax.set_xlabel('$v$', **axis_font)
ax.set_ylabel('count', **axis_font)
#fig.savefig('spec_hist.pdf', dpi=10)

<matplotlib.text.Text at 0x7fa50edc2198>

In [58]:
nta_color = ['grey',] * n_nta # The color of each census tract
for i, gid in enumerate(gid_nta_nz):
    nta_color[gid-1] = c[label[i]]

In [64]:
label

array([0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       6, 6, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,
       0, 0, 0, 0, 0, 6, 6, 0, 0, 0, 0, 0, 0, 0, 2, 1, 6, 0, 0, 0, 0, 0, 3,
       2, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0], dtype=int32)

In [59]:
tol = 1e-4 # Tolerance on fetching the census tract boundary
cur.execute("""SELECT gid, ST_AsText(ST_SIMPLIFY(geom, {0})) FROM nta;""".format(tol))
vals = cur.fetchall()

In [60]:
def create_vectors_multipolygon(m, multipolygon):
    'Create the vectors for MultiPolygons'
    vectors = []
    for polygon in multipolygon:
        seg = []
        for coord in list(polygon.exterior.coords):
            seg.append(m(coord[0], coord[1]))
            vectors.append(np.asarray(seg))
    return vectors


In [61]:
west, south, east, north = -74.15, 40.50, -73.65, 40.95
m = Basemap(llcrnrlon=west, llcrnrlat=south, urcrnrlon=east, urcrnrlat=north,
             resolution='f', projection='merc', lat_0=(south+north)/2, lon_0=(west+east)/2)
fig = plt.figure(figsize=(30,30))
ax = fig.add_subplot(111)

m.drawmapboundary(fill_color='aqua')
m.fillcontinents(color='grey',lake_color='aqua',zorder=0)
m.drawcoastlines()
m.drawrivers()

<matplotlib.collections.LineCollection at 0x7fa514253c50>

In [62]:
for val in vals:
    vectors = create_vectors_multipolygon(m, wkt_loads(val[1]))
    lines = LineCollection(vectors, antialiaseds=(1, ))
    lines.set_facecolors(nta_color[val[0]-1])
    lines.set_edgecolors('black')
    lines.set_linewidth(0.5)
    ax.add_collection(lines)
