# Use Case: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) 

In [1]:
!pip install folium



In [2]:
import pandas as pd
import numpy as np
import folium


In [3]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../../DATA/train_cleaned.csv')

In [4]:
#quick look at the data
train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,...,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,pickup_borough,dropoff_borough,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
0,0,0,0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,...,0,0,0,0,0,0.640487,queens,queens,0,0
1,1,1,1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,...,0,0,0,0,0,5.25067,manhattan,manhattan,1,0
2,2,2,2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,...,0,0,0,0,0,0.863411,manhattan,manhattan,0,0
3,3,3,3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,...,0,0,0,0,0,1.739386,manhattan,manhattan,1,0
4,4,4,4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,...,0,0,0,0,0,1.242218,manhattan,manhattan,0,0


In [5]:
train.shape

(400000, 32)

In [6]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

In [7]:
coordinates.head()

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,40.721319,-73.844311,40.712278,-73.84161
1,40.711303,-74.016048,40.782004,-73.979268
2,40.76127,-73.982738,40.750562,-73.991242
3,40.733143,-73.98713,40.758092,-73.991567
4,40.768008,-73.968095,40.783762,-73.956655


## Clustering
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [8]:
from sklearn.cluster import KMeans

In [9]:
#define number of clusters and create instance
k=20
myKMeans=KMeans(n_clusters=k, n_jobs=-1)#parallelize to all cores

In [10]:
#train model
myKMeans.fit(coordinates.to_numpy())

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=20, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [11]:
#get cluster centers
centers=myKMeans.cluster_centers_
labels=myKMeans.labels_    

In [12]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(k):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [13]:
cluster_map

In [14]:
labels

array([10, 15,  3, ...,  9,  9,  1])

In [15]:
#add cluster labels to DataFrame
train['clusterID']=labels

In [16]:
#GroupBy Clusters
clusters=train.groupby('clusterID')

In [17]:
clusters['fare_amount'].count()

clusterID
0     36858
1     83922
2     42294
3      3261
4      8398
5        98
6      4787
7      3114
8      7324
9     37017
10     8616
11    48243
12    60413
13     8136
14       53
15     1705
16    40308
17     4639
18      721
19       93
Name: fare_amount, dtype: int64

In [18]:
clusters.mean()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_day,...,dropoff_longitude_round3,is_pickup_JFK,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
clusterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,200449.239975,204792.150225,204792.150225,12.783697,-73.978046,40.758748,-73.998118,40.725842,1.711325,15.586657,...,-73.998116,0.0,0.0,0.0,0.0,0.0,0.0,2.60862,0.037061,0.932036
1,199821.832011,204151.197743,204151.197743,7.145418,-73.98447,40.754408,-73.984251,40.754737,1.683563,15.711029,...,-73.98425,0.0,0.0,0.0,0.0,0.0,0.0,0.854686,0.078716,0.088201
2,199890.64534,204221.267461,204221.267461,10.497251,-73.980783,40.755386,-73.959386,40.778945,1.673311,15.637514,...,-73.959389,0.0,0.0,0.0,0.0,0.0,0.0,2.143308,0.145458,0.0
3,203346.316467,207751.881938,207751.881938,12.687418,-73.884607,40.762208,-73.880419,40.755488,1.712052,15.729837,...,-73.880409,0.0,0.00184,0.0,0.0,0.351119,0.122355,1.892858,0.0,0.0
4,200965.175042,205319.178376,205319.178376,28.299266,-73.980247,40.752672,-73.874539,40.763262,1.685044,15.696952,...,-73.874542,0.0,0.0,0.0,0.0,0.0,0.580019,5.883421,0.246606,0.0
5,208037.27551,212544.540816,212544.540816,9.795102,-73.149918,41.366595,-73.147641,41.368939,1.387755,16.173469,...,-73.147255,0.0,0.0,0.0,0.0,0.0,0.0,0.242551,0.0,0.0
6,201548.479215,205915.184249,205915.184249,48.654126,-73.784774,40.646475,-73.9711,40.739403,1.800292,15.726551,...,-73.971108,0.975977,0.0,0.0,0.001044,0.0,0.011907,12.108439,0.0,0.17652
7,199682.375401,204008.953115,204008.953115,49.317871,-73.974369,40.753215,-73.785372,40.654195,1.75562,15.933526,...,-73.785326,0.0,0.838793,0.0,0.0,0.052344,0.0,12.314195,0.234746,0.0
8,198398.921491,202697.169443,202697.169443,31.206137,-73.871454,40.770442,-73.97803,40.747636,1.726242,15.751638,...,-73.978032,0.0,0.0,0.0,0.0,0.916166,0.0,6.158529,0.0,0.2003
9,199603.304049,203927.869492,203927.869492,7.125219,-73.960455,40.784254,-73.958885,40.785773,1.661048,15.657103,...,-73.958887,0.0,0.0,0.0,0.0,0.0,0.0,1.063618,0.0,2.7e-05


In [19]:
clusters.var()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_day,...,dropoff_longitude_round3,is_pickup_JFK,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
clusterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,13281890000.0,13859130000.0,13859130000.0,32.811603,0.000132,0.000126,0.000106,0.000161,1.743726,75.069803,...,0.000106,0.0,0.0,0.0,0.0,0.0,0.0,1.512946,0.035689,0.063346
1,13352880000.0,13933220000.0,13933220000.0,14.248006,7.3e-05,7.1e-05,8.2e-05,7.7e-05,1.716408,75.188179,...,8.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.175509,0.072521,0.080422
2,13335350000.0,13914860000.0,13914860000.0,23.79953,0.00011,0.000146,0.000141,0.00016,1.694728,74.915984,...,0.000141,0.0,0.0,0.0,0.0,0.0,0.0,1.600355,0.124303,0.0
3,13476280000.0,14062030000.0,14062030000.0,193.189293,0.000826,0.000577,0.001425,0.001005,1.825342,76.675149,...,0.001425,0.0,0.001837,0.0,0.0,0.227904,0.107417,3.612828,0.0,0.0
4,13462010000.0,14047150000.0,14047150000.0,77.929454,0.000255,0.000485,0.000491,0.000354,1.685833,74.639483,...,0.000491,0.0,0.0,0.0,0.0,0.0,0.243626,2.486413,0.185814,0.0
5,13682800000.0,14276860000.0,14276860000.0,54.881506,0.005296,0.000353,0.004019,0.000394,0.610983,91.217021,...,0.004026,0.0,0.0,0.0,0.0,0.0,0.0,2.412272,0.0,0.0
6,13662490000.0,14256470000.0,14256470000.0,131.234187,0.000129,6.4e-05,0.001003,0.002172,1.902858,76.56562,...,0.001003,0.023451,0.0,0.0,0.001044,0.0,0.011768,3.441175,0.0,0.145391
7,13563880000.0,14153370000.0,14153370000.0,101.284053,0.000968,0.000585,0.00056,0.000629,1.803735,74.004896,...,0.000561,0.0,0.135263,0.0,0.0,0.04962,0.0,2.623336,0.179698,0.0
8,13289350000.0,13866860000.0,13866860000.0,78.731481,0.00015,0.000116,0.00033,0.001001,1.795455,75.978865,...,0.00033,0.0,0.0,0.0,0.0,0.076816,0.0,2.360374,0.0,0.160202
9,13347860000.0,13927990000.0,13927990000.0,29.421093,0.000143,0.000211,0.000153,0.000172,1.702079,75.17205,...,0.000153,0.0,0.0,0.0,0.0,0.0,0.0,0.648314,0.0,2.7e-05
