In [1]:
## Import libraries
import os
import json
import pandas as pd
import numpy as np
from shapely.geometry import Point, Polygon
import shapely.wkt
from itertools import permutations

%matplotlib inline

In [2]:
## Define the working directory. (This should be executed just once)
os.chdir(os.path.join('..'))
working_path = os.getcwd()

print('The working directory is {}'.format(working_path))

# Define the path where the data sets are located
datasets_path = os.path.join(working_path, 'Datasets')

print('The datasets folder is  {}'.format(datasets_path))

The working directory is C:\Users\Hector\Documents\DS4A_datathon
The datasets folder is  C:\Users\Hector\Documents\DS4A_datathon\Datasets


In [3]:
## Define paths for each dataset
boroughs_path = os.path.join(datasets_path, 'boroughs.json')
uber_trips_2014 = os.path.join(datasets_path, 'uber_trips_2014.csv')
uber_trips_2015 = os.path.join(datasets_path, 'uber_trips_2015.csv')
zones = os.path.join(datasets_path, 'zones.csv')


## Create dataframe for each dataset

df_uber_trips_2014 = pd.read_csv(uber_trips_2014)
df_uber_trips_2015 = pd.read_csv(uber_trips_2015)
df_zones = pd.read_csv(zones)

In [4]:
# Open json file to get Borough's Polygons
with open(boroughs_path) as json_file:
    boroughs_data = json.load(json_file)

In [5]:
# Create objects to store significant data
boroughs_polygon = dict()
boroughs_name = list()

# Fill objects
boroughs = boroughs_data['data']
for borough in boroughs:
    boroughs_polygon[borough[10]] = shapely.wkt.loads(borough[9])
    boroughs_name.append(borough[10])

In [6]:
# Create function to check if two polygons have intersection
def exists_intersection(p1, p2):
    '''
    Check wether or not polygons have an intersection area
    '''
    return p1.intersection(p2).area != 0

In [7]:
# Check the polygons don't intersect each other
for name in permutations(boroughs_name, 2):
    if exists_intersection(boroughs_polygon[name[0]], boroughs_polygon[name[1]]):
        print('The boroughs {} and {} are not disjunt'.format(name[0], name[1]))

In [8]:
def append_polygon(df, polygon_dict, latitud_col='lat', longitude_col='lon', 
                   error_msg='Does not belong to any', final_col_name='polygon',
                   sample_size=None):
    
    def which_polygon(row):
        
        lat = row[latitud_col]
        long = row[longitude_col]
        
        for id_, pol in polygon_dict.items():
            if pol.contains(Point(long, lat)):
                return id_
        return error_msg
    
    
    if sample_size is None:
        df2 = df.copy()
    else:
        df2 = df[:sample_size].copy()
    
    df2[final_col_name] = df2.apply(lambda row: which_polygon(row), axis=1)
    return df2

In [9]:
df_uber_trips_2014.columns

Index(['pickup_datetime', 'pickup_latitude', 'pickup_longitude', 'base'], dtype='object')

In [10]:
df_uber_trips_2015.columns

Index(['pickup_datetime', 'pickup_location_id', 'dispatch_base',
       'affiliate_base'],
      dtype='object')

In [11]:
# Columns that will be in the final dataset
final_columns = ['pickup_datetime', 'affiliate_base', 'borough']

In [12]:
df_procesado_2014 = append_polygon(df_uber_trips_2014, boroughs_polygon,
                              latitud_col='pickup_latitude',
                              longitude_col='pickup_longitude',
                              error_msg='Does not belong to any borough',
                              final_col_name='borough', sample_size=None).rename(columns={'base': 'affiliate_base'})


df_procesado_2014['pickup_datetime'] = pd.to_datetime(df_procesado_2014['pickup_datetime'])
df_procesado_2014 = df_procesado_2014[final_columns]

In [13]:
df_raw = pd.merge(
            df_uber_trips_2015, 
            df_zones, 
            how='left', 
            left_on='pickup_location_id', 
            right_on='location_id')

df_procesado_2015 = df_raw[['pickup_datetime', 'dispatch_base', 'affiliate_base', 'zone', 'borough', 'service_zone', 'nta_code']]
df_procesado_2015['pickup_datetime'] = pd.to_datetime(df_procesado_2015['pickup_datetime'])
df_procesado_2015 = df_procesado_2015[final_columns]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [14]:
path_proccess_file = os.path.join(datasets_path, 'uber_processed.csv')

df_uber = pd.concat([df_procesado_2014, df_procesado_2015], ignore_index=True)
df_uber.to_csv(path_proccess_file)

In [15]:
df_uber.head()

Unnamed: 0,pickup_datetime,affiliate_base,borough
0,2014-04-01 00:11:00,B02512,Manhattan
1,2014-04-01 00:17:00,B02512,Does not belong to any borough
2,2014-04-01 00:21:00,B02512,Manhattan
3,2014-04-01 00:28:00,B02512,Manhattan
4,2014-04-01 00:33:00,B02512,Manhattan
