# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [1]:
import pandas as pd
import numpy as np
import folium


In [2]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../../DATA/train_cleaned.csv')

In [3]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [4]:
from sklearn.cluster import KMeans

In [5]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [6]:
%%time
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster

Wall time: 1min 52s


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=100, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [7]:
#get cluster centers
centers=myKMeans.cluster_centers_
    

In [8]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [9]:
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [5]:
clusters=100
data = coordinates
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores
model = myKMeans.fit(data.to_numpy()[:100000,:])    


In [15]:
def show_cluster(cluster_number, data, model, my_map):
    cluster_pred = model.predict(data)
    cluster_data = data[cluster_pred == cluster_number].to_numpy()
    for i in range(len(cluster_data)):
        folium.CircleMarker([cluster_data[i,0], cluster_data[i,1]], radius=1,                
                            color="green", 
                            fill_opacity=0.5
                           ).add_to(my_map)
    
        folium.CircleMarker([cluster_data[i,2], cluster_data[i,3]], radius=1,                
                            color="red", 
                            fill_opacity=0.5
                            ).add_to(my_map)
    

In [20]:
cluster_map_2 = folium.Map(location = [40.730610,-73.935242],zoom_start = 11,)
show_cluster(99, data, model,cluster_map_2)
cluster_map_2

## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [21]:
# k = 100
clusters_100=100
data = coordinates
myKMeans_100=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores
model_100 = myKMeans_100.fit(data.to_numpy()[:100000,:])    


In [22]:
# k = 10
clusters_10=10
data = coordinates
myKMeans_10=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores
model_10 = myKMeans_10.fit(data.to_numpy()[:100000,:])    

In [28]:
def show_var(cluster_number, data, model):

    cluster_pred = model.predict(data)
    cluster_data = data[cluster_pred == cluster_number].to_numpy()
    
    #get cluster centers
    centers= model.cluster_centers_
    cluster_data = data[cluster_pred == cluster_number].to_numpy()
    print("Intra cluster variance")
    print(np.var(cluster_data, axis=0))
    print("Extra cluster variance")
    print(np.var(centers, axis=0))

In [29]:
print("Variance with 100 clusters:")
show_var(55, data, model_100)

print("\nVariance with 10 clusters:")
show_var(55, data, model_10)

Variance with 100 clusters:
(1331, 4)
Intra cluster variance
[6.80964232e-05 7.71928824e-05 4.57576461e-04 2.08703672e-04]
Extra cluster variance
[0.02211261 0.02478358 0.01602791 0.02414183]

Variance with 10 clusters:
(3017, 4)
Intra cluster variance
[1.19381910e-04 8.98108200e-05 9.04893554e-05 7.56971382e-05]
Extra cluster variance
[0.02695083 0.02988812 0.02129943 0.03128663]


### Summary
clusters with k = 100 has lower variances than clusters with k = 10
