In [None]:
import os
import re

from pyspark.sql import SparkSession

In [None]:
# Reuse the active Spark session if one exists, otherwise start one
# registered under the application name "nyc-taxi-hdfs".
spark = (
    SparkSession.builder
    .appName("nyc-taxi-hdfs")
    .getOrCreate()
)

In [None]:
# Base URL of the HDFS directory holding the trip-data files.  File names are
# appended to it directly, so it is normalised to end with exactly one '/'.
# Set the NYC_TAXI_HDFS_URL environment variable to point at another cluster;
# when it is unset the original hard-coded address is used.
spark_url = os.environ.get('NYC_TAXI_HDFS_URL', 'hdfs://192.168.1.84:9000/').rstrip('/') + '/'

In [None]:
def get_green_cab_df(month,year='2015'):
    """Load one month of green-cab trip records as a Spark DataFrame.

    Reads ``green_tripdata_<year>-<MM>.csv.gz`` from the directory given by
    the module-level ``spark_url``, using the first row as the header and
    letting Spark infer the column types.

    Parameters
    ----------
    month : int or str
        Calendar month, 1 to 12. Zero-padded to two digits in the file name.
    year : int or str, optional
        Year component of the file name (default ``'2015'``).

    Returns
    -------
    pyspark.sql.DataFrame
        The parsed CSV contents.

    Raises
    ------
    ValueError
        If ``month`` cannot be read as an integer between 1 and 12.
    """
    try:
        month_number = int(month)
    except (TypeError, ValueError):
        raise ValueError('month must be an integer in 1..12, got %r' % (month,))
    # Reject out-of-range values explicitly: simply keeping the last two
    # characters would silently turn e.g. 101 into '01' and load January.
    if not 1 <= month_number <= 12:
        raise ValueError('month must be an integer in 1..12, got %r' % (month,))

    file_name = 'green_tripdata_{}-{:02d}.csv.gz'.format(year, month_number)
    return spark.read.load(spark_url + file_name, inferSchema='true', format='csv', header='true')

In [None]:
# Green-cab trips for month 1 of the default year ('2015').
january = get_green_cab_df(1)

In [None]:
# Show the column names read from the CSV header.
january.columns

In [None]:
# Green-cab trips for month 12 of the default year ('2015').
december = get_green_cab_df(12)

In [None]:
# Stack the two months into one DataFrame.
# NOTE(review): union() matches columns by position, not by name - this
# assumes both files share the same column order; confirm.
jan_dec = january.union(december)

In [None]:
# Keep only the four coordinate columns and draw a sample without
# replacement at a fraction of 1/4000 of the rows.
jan_dec_sample = (
    jan_dec
    .select(
        'Pickup_latitude',
        'Pickup_longitude',
        'Dropoff_latitude',
        'Dropoff_longitude',
    )
    .sample(withReplacement=False, fraction=1.0 / 4000)
)

In [None]:
# Number of rows in the sample (triggers a Spark job).
jan_dec_sample.count()

In [None]:
# Pull the sampled rows back to the driver as a local list of Row objects.
sample_points = jan_dec_sample.collect()

In [None]:
# Split every sampled row into a [latitude, longitude] pickup pair and a
# [latitude, longitude] drop-off pair, keeping the row order in both lists.
gpickup = []
gdropoff = []
for row in sample_points:
    gpickup.append([row.Pickup_latitude, row.Pickup_longitude])
    gdropoff.append([row.Dropoff_latitude, row.Dropoff_longitude])

In [None]:
import statistics

# Collect every latitude (element 0) and longitude (element 1) across both
# pickup and drop-off points.  These are real lists rather than map()
# objects: on Python 3 a map is a one-shot iterator that statistics.mean()
# would leave exhausted, so any later cell inspecting these names would see
# nothing.
glattitudes = [point[0] for point in gpickup + gdropoff]
glongitudes = [point[1] for point in gpickup + gdropoff]

# [mean latitude, mean longitude] - used as the initial map location.
gcentre = [statistics.mean(glattitudes), statistics.mean(glongitudes)]

In [None]:
import folium

# Map positioned at the mean sampled coordinate; pickups are drawn first with
# a green fill, then drop-offs with a blue fill.
green_map = folium.Map(location=gcentre)
for points, colour in ((gpickup, 'green'), (gdropoff, 'blue')):
    for point in points:
        marker = folium.CircleMarker(point, fill_color=colour, radius=40)
        marker.add_to(green_map)
# Bare last expression so the notebook renders the map.
green_map

In [None]:
from numpy import array

# RDD of numpy arrays, one [Pickup_latitude, Pickup_longitude] array per
# December row.
december_rdd = (
    december
    .select('Pickup_latitude', 'Pickup_longitude')
    .rdd
    .map(list)
    .map(array)
)

In [None]:
# Peek at the first five coordinate arrays.
december_rdd.take(5)

In [None]:
from pyspark.mllib.clustering import KMeans

# Train one k-means model on the December pickup coordinates for each
# cluster count k = 3, 4, 5, 6, 7 (random initialisation, at most 10
# iterations), then wrap the resulting list of models in an RDD.
# The SparkContext is taken from the `spark` session created in this
# notebook: no bare `sc` is defined here, so relying on it raises NameError
# on a fresh kernel unless the environment happens to inject one.
cluster_sets = spark.sparkContext.parallelize([
    KMeans.train(december_rdd, k, maxIterations=10, initializationMode="random")
    for k in range(3, 8)
])

In [None]:
def cluster_map(k):
    """Draw the cluster centres of one trained model on a folium map.

    Parameters
    ----------
    k : int
        Position of the model within ``cluster_sets.collect()``. This is a
        list index, not a number of clusters.

    Returns
    -------
    folium.Map
        Map located at ``gcentre`` with one green-filled circle marker per
        retained centre.
    """
    model = cluster_sets.collect()[k]
    cmap = folium.Map(location=gcentre)
    for centre in model.centers:
        # Skip centres whose coordinates sum to (almost) zero.
        # NOTE(review): presumably this drops centres sitting at (0, 0) - confirm.
        if abs(sum(centre)) > 0.1:
            marker = folium.CircleMarker(list(centre), fill_color='green', radius=200)
            marker.add_to(cmap)
    return cmap

In [None]:
# The argument is an index into the collected list of models, so 4 selects
# the fifth model, which was trained with k=7 (models cover k = 3..7).
cluster_map(4)