In [1]:
## Import libraries
import os
import json
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from IPython.display import Markdown
from shapely.geometry import Point, Polygon
import shapely.wkt
from sqlalchemy import create_engine

%matplotlib inline

In [2]:
## Define the working directory (safe to re-run)
# Remember the directory the kernel was in the first time this cell runs.
# Changing directory relative to that fixed anchor, instead of relative to the
# current directory, means re-executing the cell no longer climbs one level
# higher on every run.
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()

os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))
working_path = os.getcwd()

print('The working directory will be {}'.format(working_path))

# Define the path where the data sets are located
datasets_path = os.path.join(working_path, 'Datasets')

print('The datasets will be placed in {}'.format(datasets_path))

The working directory will be C:\Users\Hector\Documents\DS4A_datathon
The datasets will be placed in C:\Users\Hector\Documents\DS4A_datathon\Datasets


In [3]:
## Define paths for each dataset
# Every source file lives directly inside the data sets folder, so the four
# paths are built in one pass and unpacked in the same order as the file names.
boroughs_path, yellow_trips, green_trips, weather = (
    os.path.join(datasets_path, file_name)
    for file_name in (
        'boroughs.json',
        'yellow_trips_new.csv',
        'green_trips_new_2.csv',
        'weather.csv',
    )
)

In [4]:
# Load the three CSV sources with pandas' default parsing options:
# yellow taxi trips, green taxi trips and the weather table, in that order.
df_yellow_trips, df_green_trips, df_weather = map(
    pd.read_csv, (yellow_trips, green_trips, weather)
)

In [5]:
# Tag every trip with a one-letter code for the frame it came from, so the
# origin is still known once both frames are stacked together.
for trips_frame, colour_code in ((df_yellow_trips, 'Y'), (df_green_trips, 'G')):
    trips_frame['type'] = colour_code

In [6]:
# Stack the green and yellow trips into a single frame (green rows first).
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0. ignore_index=True already produces a fresh 0..n-1
# index, so no separate reset_index call is required.
df_taxi_raw = pd.concat([df_green_trips, df_yellow_trips], ignore_index=True)

# Parse the pickup timestamps into datetime values.
df_taxi_raw['pickup_datetime'] = pd.to_datetime(df_taxi_raw['pickup_datetime'])

In [7]:
# Open the JSON file to get the boroughs' polygons.
# The encoding is stated explicitly: JSON is UTF-8 by specification, while
# open() otherwise falls back to the platform's locale encoding (which is
# commonly not UTF-8 on Windows).
with open(boroughs_path, encoding='utf-8') as json_file:
    boroughs_data = json.load(json_file)

# Each record under 'data' is a positional list: item 9 is parsed as a WKT
# geometry and item 10 is used as the key that identifies it.
boroughs = boroughs_data['data']

# Keys in the order they appear in the file
boroughs_name = [borough[10] for borough in boroughs]

# Parsed geometry for each key
boroughs_polygon = {
    borough[10]: shapely.wkt.loads(borough[9]) for borough in boroughs
}
    

In [8]:
def append_polygon(df, polygon_dict, latitud_col='lat', longitude_col='lon',
                   error_msg='Does not belong to any', final_col_name='polygon',
                   sample_size=None):
    """Label each row of ``df`` with the id of the polygon containing its point.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one point per row. It is never modified; a copy is returned.
    polygon_dict : dict
        Maps an id to an object exposing ``contains(point)``. Polygons are
        tried in the dict's iteration order and the first match wins.
    latitud_col : str
        Name of the column holding the latitude of each point.
    longitude_col : str
        Name of the column holding the longitude of each point.
    error_msg : object
        Value stored for rows whose point is not contained in any polygon.
    final_col_name : str
        Name of the column that receives the matching id (or ``error_msg``).
    sample_size : int or None
        When given, only the first ``sample_size`` rows (positional slice) are
        labelled and returned; ``None`` processes the whole frame.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` (or of its leading slice) with ``final_col_name`` added.
    """

    def which_polygon(lat, long):
        # Build the point once per row instead of once per polygon tested.
        # Point takes (x, y), i.e. longitude first.
        point = Point(long, lat)
        for id_, pol in polygon_dict.items():
            if pol.contains(point):
                return id_
        return error_msg

    df2 = df.copy() if sample_size is None else df[:sample_size].copy()

    # Iterate over the two coordinate columns directly. This avoids building a
    # Series for every row, and it still yields a plain (possibly empty) list
    # when the frame has no rows, where DataFrame.apply(axis=1) may hand back
    # a DataFrame that cannot be assigned to a single column.
    df2[final_col_name] = [
        which_polygon(lat, long)
        for lat, long in zip(df2[latitud_col], df2[longitude_col])
    ]
    return df2

In [9]:
# Preview the first rows of the combined green + yellow trips frame
df_taxi_raw.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,total_amount,type
0,2015-02-01 01:26:45,2015-02-01 01:49:58,-73.953545,40.811172,-73.984764,40.728386,1,8.11,27.8,G
1,2015-01-02 20:06:28,2015-01-02 20:14:04,-73.946709,40.714321,-73.961571,40.711475,1,1.29,9.8,G
2,2014-09-27 17:55:38,2014-09-27 18:19:56,-73.957626,40.718094,-73.947304,40.777813,5,6.12,26.3,G
3,2014-04-27 02:27:04,2014-04-27 02:39:02,-73.949501,40.713997,-73.987785,40.718582,2,3.68,17.3,G
4,2014-05-26 18:32:19,2014-05-26 18:44:13,-73.944092,40.672195,-73.977325,40.664013,1,2.4,11.5,G


In [10]:
# Tag each trip with the borough whose polygon contains its pickup point.
# Trips whose pickup lies outside every polygon receive the fallback message.
df_taxi = append_polygon(
    df=df_taxi_raw,
    polygon_dict=boroughs_polygon,
    longitude_col='pickup_longitude',
    latitud_col='pickup_latitude',
    final_col_name='borough',
    error_msg='Does not belong to any borough',
    sample_size=None,
)

In [11]:
# Save the borough-tagged trips next to the source data sets. The DataFrame
# index is written as the first (unnamed) column of the CSV.
path_proccess_file = os.path.join(datasets_path, 'taxi_processed.csv')
df_taxi.to_csv(path_proccess_file, index=True)

In [13]:
# Preview the first rows now that the 'borough' column has been added
df_taxi.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,total_amount,type,borough
0,2015-02-01 01:26:45,2015-02-01 01:49:58,-73.953545,40.811172,-73.984764,40.728386,1,8.11,27.8,G,Manhattan
1,2015-01-02 20:06:28,2015-01-02 20:14:04,-73.946709,40.714321,-73.961571,40.711475,1,1.29,9.8,G,Brooklyn
2,2014-09-27 17:55:38,2014-09-27 18:19:56,-73.957626,40.718094,-73.947304,40.777813,5,6.12,26.3,G,Brooklyn
3,2014-04-27 02:27:04,2014-04-27 02:39:02,-73.949501,40.713997,-73.987785,40.718582,2,3.68,17.3,G,Brooklyn
4,2014-05-26 18:32:19,2014-05-26 18:44:13,-73.944092,40.672195,-73.977325,40.664013,1,2.4,11.5,G,Brooklyn
