In [1]:
import os
import sys

# Put the parent directory on the import path so that packages living next to
# this notebook's folder (e.g. `commons`, `Preprocessing`) can be imported.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# FIXME: take the root directory as an argument instead of hard-coding it.
# Checkpoint directory and compression flag (presumably for the pickle helper;
# neither name is read by the cells visible in this notebook).
root_dir, is_compressed = "../../data/picklerCheckpoints/", True


In [2]:
import numpy as np
import pandas as pd
import nbimporter
import geopandas as gpd
import commons.utility.pickle_util as pickle
import Preprocessing.DataEnrichment as de
from shapely.geometry import Point

# CRS that the taxi-zone shapes are re-projected to in loadNyCordData (EPSG:4326).
# NOTE(review): the {'init': ...} dict spelling is deprecated by newer pyproj/geopandas
# releases in favour of the plain string 'EPSG:4326' — confirm the installed versions
# before changing it, and keep it in sync with the CRS literal used in spatialJoin.
coord_system = {'init': 'epsg:4326'}


Importing Jupyter notebook from D:\Academics\MLRepoNew\ML2020-Uber-Data-Analysis\commons\utility\pickle_util.ipynb
Importing Jupyter notebook from D:\Academics\MLRepoNew\ML2020-Uber-Data-Analysis\Preprocessing\DataEnrichment.ipynb


In [3]:
# Import for spatial mapping operations
import bokeh, bokeh.plotting, bokeh.models
from bokeh.io import output_notebook, show
# Configure bokeh so that show() renders plots inline in the notebook.
output_notebook()
import sklearn.preprocessing
from shapely.geometry import Point
import rtree


In [4]:
# Load the enriched Uber dataset for April through the DataEnrichment notebook.
# NOTE(review): `data` is not read by any later visible cell — run() loads each
# month itself — so this cell looks like a leftover smoke test; confirm before removing.
data = de.getUberDataforMonth('apr')

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Loading file ../data/picklerCheckpoints/enrichedData/uber-enriched-data-apr14.p
Decompressing ../data/picklerCheckpoints/enrichedData/uber-enriched-data-apr14.p.pbz2..............Done
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Dataset loaded


In [5]:
def loadNyCordData():
    """Read the NYC taxi-zone shapefile and return it re-projected to `coord_system`.

    The 'Shape_Area', 'Shape_Leng' and 'OBJECTID' columns are dropped from
    the returned frame.
    """
    unused_columns = ['Shape_Area', 'Shape_Leng', 'OBJECTID']
    zones = gpd.read_file('../data/NewYorkCoords/taxi_zones.shp')
    return zones.to_crs(coord_system).drop(unused_columns, axis=1)

In [6]:
def generateBoundingBoxes(df):
    """Return a copy of `df` with each geometry replaced by its envelope.

    A 'borough_categ' column is added holding the label-encoded value of the
    'borough' column. The input frame is left unmodified.

    Note: the notebook's displayed join output shows a single pickup point
    matching several zones, i.e. the envelopes of different zones overlap.
    """
    boxes = df.copy()
    boxes['geometry'] = df.geometry.envelope
    borough_encoder = sklearn.preprocessing.LabelEncoder()
    boxes['borough_categ'] = borough_encoder.fit_transform(boxes['borough'])
    return boxes

In [7]:
def displayBoundingBoxes(df):
    """Show the zone bounding boxes in `df` as an interactive bokeh plot.

    `df` must provide `to_json()`; the plot reads the 'borough_categ' field
    for the fill colour and '@zone', '@borough' and '@LocationID' for the
    hover tooltip. Nothing is returned; the figure is passed to `show`.
    """
    geo_source = bokeh.models.GeoJSONDataSource(geojson=df.to_json())

    fig = bokeh.plotting.figure(
        title="NYC Taxi Districts Bounding Boxes",
        tools="pan,wheel_zoom,reset,hover,save",
        x_axis_location=None,
        y_axis_location=None,
    )

    # Colour every box by its encoded borough.
    borough_fill = {
        'field': 'borough_categ',
        'transform': bokeh.models.LinearColorMapper(palette=bokeh.palettes.Viridis256),
    }
    fig.patches(
        'xs', 'ys',
        fill_color=borough_fill,
        fill_alpha=0.7,
        line_color="black",
        line_width=0.5,
        source=geo_source,
    )

    fig.grid.grid_line_color = None

    # The 'hover' entry in the tools string creates the HoverTool fetched here.
    hover_tool = fig.select_one(bokeh.models.HoverTool)
    hover_tool.point_policy = "follow_mouse"
    hover_tool.tooltips = u"""
    <div> 
        <div class="bokeh_hover_tooltip">Name : @zone</div>
        <div class="bokeh_hover_tooltip">Borough : @borough</div>
        <div class="bokeh_hover_tooltip">Zone ID : @LocationID</div>
        <div class="bokeh_hover_tooltip">(Lon, Lat) : ($x ˚E, $y ˚N)</div>
    </div>
    """

    # One fixed marker at lon -73.966, lat 40.78 (presumably a visual reference
    # point; its purpose is not stated in the notebook).
    fig.circle([-73.966], [40.78], size=10, fill_color='magenta',
               line_color='yellow', line_width=1, alpha=1.0)

    show(fig)

In [8]:
def spatialJoin(bbxDf, dataDf):
    """Left-join every row of `dataDf` to the boxes in `bbxDf` that contain it.

    Points are built from the `Lon`/`Lat` columns of `dataDf` (EPSG:4326) and
    matched with the 'within' test. Because the join is a left join, rows that
    fall inside no box are kept, and a row that falls inside several boxes
    is repeated once per match.
    """
    pickup_points = gpd.points_from_xy(dataDf.Lon, dataDf.Lat)
    pickups = gpd.GeoDataFrame(dataDf, geometry=pickup_points, crs={'init': 'epsg:4326'})
    return gpd.sjoin(pickups, bbxDf, how='left', op='within')

In [9]:
def mapCoordinatesToNY(df):
    """Spatially join `df` against the NYC taxi-zone bounding boxes.

    The bounding-box frame is loaded from the pickle checkpoint when it exists
    (either as '.p' or '.p.pbz2'); otherwise it is rebuilt from the shapefile
    and saved to the checkpoint directory before the join.
    Returns the result of spatialJoin(boxes, df).
    """
    store_dir = "../data/picklerCheckpoints/spatialJoinData/"
    cache_path = store_dir + 'NY_boundingBoxData.p'
    is_cached = os.path.exists(cache_path) or os.path.exists(cache_path + '.pbz2')

    if not is_cached:
        zone_boxes = generateBoundingBoxes(loadNyCordData())
        # The trailing True presumably selects the compressed (.pbz2) format,
        # judging by the loader's log output elsewhere in this notebook.
        pickle.save(zone_boxes, store_dir, 'NY_boundingBoxData', True)
    else:
        zone_boxes = pickle.load(cache_path, True)

    # displayBoundingBoxes(zone_boxes)  # uncomment to inspect the boxes visually
    return spatialJoin(zone_boxes, df)

In [10]:
def run(months):
    """Return the spatially joined Uber data for the requested months, merged.

    Parameters
    ----------
    months : iterable of str
        Month abbreviations out of 'apr', 'may', 'jun', 'jul', 'aug', 'sep'.
        The argument is not modified. Unknown entries are reported and skipped.

    Returns
    -------
    The concatenation of the per-month spatial-join frames with every row
    containing a missing value dropped. When exactly the six valid months are
    requested and the all-months checkpoint exists, that checkpoint is returned
    instead.

    Raises
    ------
    ValueError
        If none of the requested months is valid (nothing to merge).

    Side effects
    ------------
    Missing per-month checkpoints are regenerated and saved; the all-months
    checkpoint is written only when all six valid months were requested.
    """
    validMonths = ['apr', 'may', 'jun', 'jul', 'aug', 'sep']
    pickleStore = "../data/picklerCheckpoints/spatialJoinData/"
    globalPath = pickleStore + 'uber-spatial-data-14.p'

    # sorted() builds new lists. The previous `months.sort() == validMonths.sort()`
    # compared None with None (always True) and reordered the caller's list.
    requestedMonths = sorted(months)
    coversAllMonths = requestedMonths == sorted(validMonths)

    if coversAllMonths and (os.path.exists(globalPath) or os.path.exists(globalPath + '.pbz2')):
        print("Spatial join of all months found...Loading...")
        return pickle.load(globalPath, True)

    toMerge = []
    # Iterate in sorted order: that is the order the old in-place sort produced,
    # so freshly merged output keeps the same row order as existing checkpoints.
    for month in requestedMonths:
        if month not in validMonths:
            print("Skipping unknown month :" + str(month))
            continue

        path = pickleStore + 'uber-spatial-data-' + month + '14.p'
        if os.path.exists(path) or os.path.exists(path + '.pbz2'):
            print("Data for the month :" + month + " Found.....")
            toMerge.append(pickle.load(path, True))
        else:
            print("Data for the month :" + month + " is Missing...")
            print("Regenerating data.....")
            spatialJoinData = mapCoordinatesToNY(de.getUberDataforMonth(month))
            pickle.save(spatialJoinData, pickleStore, 'uber-spatial-data-' + month + '14', True)
            toMerge.append(spatialJoinData)

    if not toMerge:
        raise ValueError("No valid months requested; expected some of " + str(validMonths))

    # dropna() removes rows with any missing value, which includes points the
    # left join could not place inside any bounding box.
    mergedData = pd.concat(toMerge).dropna()
    if coversAllMonths:
        pickle.save(mergedData, pickleStore, 'uber-spatial-data-14', True)

    return mergedData

In [11]:
# Build (or load from the checkpoint files) the spatially joined data for all six months.
outDf = run(['apr', 'may', 'jun', 'jul','aug', 'sep'])

Data for the month :apr Found.....
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Loading file ../data/picklerCheckpoints/spatialJoinData/uber-spatial-data-apr14.p
Decompressing ../data/picklerCheckpoints/spatialJoinData/uber-spatial-data-apr14.p.pbz2..............Done
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Data for the month :aug Found.....
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Loading file ../data/picklerCheckpoints/spatialJoinData/uber-spatial-data-aug14.p
Decompressing ../data/picklerCheckpoints/spatialJoinData/uber-spatial-data-aug14.p.pbz2..............Done
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Data for the month :jul Found.....
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Loading file

In [12]:
# Display the merged result. The same trip can appear on several rows: the join is
# against bounding boxes, and one pickup point may fall inside more than one zone's box.
outDf

Unnamed: 0,Date/Time,Lat,Lon,Base,datetime,weatherCondition,humidity,pressure,temperature,windDirection,windSpeed,geometry,index_right,zone,LocationID,borough,borough_categ
0,4/1/2014 0:11:00,40.7690,-73.9549,B02512,04-01-2014:00,scattered clouds,40.0,1016.0,282.96,330.0,5.0,POINT (-73.95490 40.76900),42.0,Central Park,43.0,Manhattan,3.0
0,4/1/2014 0:11:00,40.7690,-73.9549,B02512,04-01-2014:00,scattered clouds,40.0,1016.0,282.96,330.0,5.0,POINT (-73.95490 40.76900),139.0,Lenox Hill East,140.0,Manhattan,3.0
0,4/1/2014 0:11:00,40.7690,-73.9549,B02512,04-01-2014:00,scattered clouds,40.0,1016.0,282.96,330.0,5.0,POINT (-73.95490 40.76900),140.0,Lenox Hill West,141.0,Manhattan,3.0
0,4/1/2014 0:11:00,40.7690,-73.9549,B02512,04-01-2014:00,scattered clouds,40.0,1016.0,282.96,330.0,5.0,POINT (-73.95490 40.76900),201.0,Roosevelt Island,202.0,Manhattan,3.0
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512,04-01-2014:00,scattered clouds,40.0,1016.0,282.96,330.0,5.0,POINT (-73.98730 40.73160),78.0,East Village,79.0,Manhattan,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1028134,9/30/2014 22:58:00,40.7081,-74.0066,B02764,09-30-2014:22,broken clouds,72.0,1014.0,292.54,80.0,3.0,POINT (-74.00660 40.70810),86.0,Financial District North,87.0,Manhattan,3.0
1028134,9/30/2014 22:58:00,40.7081,-74.0066,B02764,09-30-2014:22,broken clouds,72.0,1014.0,292.54,80.0,3.0,POINT (-74.00660 40.70810),208.0,Seaport,209.0,Manhattan,3.0
1028135,9/30/2014 22:58:00,40.7140,-73.9496,B02764,09-30-2014:22,broken clouds,72.0,1014.0,292.54,80.0,3.0,POINT (-73.94960 40.71400),79.0,East Williamsburg,80.0,Brooklyn,1.0
1028135,9/30/2014 22:58:00,40.7140,-73.9496,B02764,09-30-2014:22,broken clouds,72.0,1014.0,292.54,80.0,3.0,POINT (-73.94960 40.71400),254.0,Williamsburg (North Side),255.0,Brooklyn,1.0
