# NYC Taxi Trip Duration - Modeling


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd

from shapely.geometry import Point, Polygon, LineString

%matplotlib inline
pd.set_option("display.float_format", "{:.2f}".format)

In [2]:
# Define constants
lib_loc = "/Users/nomic/Desktop/Nomi/develop/kiggle"
nyc_coord = [40.730610, -73.935242] #lat, long

In [3]:
# Load the Data
train_data = pd.read_csv(lib_loc + '/data/train.csv')
test_data = pd.read_csv(lib_loc +'/data/test.csv')
train_df = train_data.copy() # Create a copy df that we will edit instead of the raw data

In [4]:
# Define functions
def get_distances(df):
    '''
    takes in a dataframe containing columns with pickup and dropoff long an lat, converts them to points on the
    NYC coordinate system and returns distances in meters between pickup and dropoff.
    '''
    # Turn long and lat into points and convert to NYC coordinate system
    df['pickup_points']  = gpd.points_from_xy(df['pickup_longitude'], df['pickup_latitude'], crs = "EPSG:4326").to_crs('EPSG:2263')
    df['dropoff_points'] =  gpd.points_from_xy(df['dropoff_longitude'], df['dropoff_latitude'], crs = "EPSG:4326").to_crs('EPSG:2263')
    # Convert to geo data frame
    df = gpd.GeoDataFrame(df, geometry=df['pickup_points'])
    distances = df['pickup_points'].distance(df['dropoff_points']) # distances in feet
    distances = distances/ 3.281 # convert distance to meters
    return distances

def get_bearing_cat(lat1, lng1, lat2, lng2):
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    bearings = np.degrees(np.arctan2(y, x))
    return pd.cut(bearings, 8, labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])

def get_loc_cat(lat,long):
    return lat.round(1).astype(str)+long.round(1).astype(str)    

In [5]:
def feature_engineering(df):
    # Create time columns
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime']).dt.round('15min')
    df['day'] = df['pickup_datetime'].dt.day_name()
    df['pickup_time'] = df['pickup_datetime'].dt.time
    df['pickup_month'] = df['pickup_datetime'].dt.month
    #df['pickup_date'] = df['pickup_datetime'].dt.date
    df['is_workday'] = df['pickup_datetime'].dt.dayofweek
    df['is_workday'] = df['is_workday'] < 5
    
    df['distances'] = get_distances(df)
    df['bearings_cats'] = get_bearing_cat(df['pickup_latitude'], df['pickup_longitude'], df['dropoff_latitude'], df['dropoff_longitude'])
    df['pickup_loc_cats'] = get_loc_cat(df['pickup_latitude'], df['pickup_longitude'])
    
    # Convert columnns to correct data types
    df['vendor_id'] = df['vendor_id'].astype('category')
    df['store_and_fwd_flag'] = df['store_and_fwd_flag'].astype('category')

In [6]:
feature_engineering(train_df)

In [7]:
train_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,day,pickup_time,pickup_month,is_workday,pickup_points,dropoff_points,distances,bearings_cats,pickup_loc_cats
0,id2875421,2,2016-03-14 17:30:00,2016-03-14 17:32:30,1,-73.98,40.77,-73.96,40.77,N,455,Monday,17:30:00,3,True,POINT (989193.010 219057.514),POINT (994047.681 218208.428),1502.09,G,40.8-74.0
1,id2377394,1,2016-06-12 00:45:00,2016-06-12 00:54:38,1,-73.98,40.74,-74.0,40.73,N,663,Sunday,00:45:00,6,False,POINT (989677.250 208356.028),POINT (984393.722 205654.999),1808.57,B,40.7-74.0
2,id3858529,2,2016-01-19 11:30:00,2016-01-19 12:10:48,1,-73.98,40.76,-74.01,40.71,N,2124,Tuesday,11:30:00,1,True,POINT (990059.826 217601.179),POINT (982771.449 197980.484),6379.35,A,40.8-74.0
3,id3504673,2,2016-04-06 19:30:00,2016-04-06 19:39:40,1,-74.01,40.72,-74.01,40.71,N,429,Wednesday,19:30:00,4,True,POINT (981466.797 201581.600),POINT (980848.593 196753.467),1483.56,A,40.7-74.0
4,id2181028,2,2016-03-26 13:30:00,2016-03-26 13:38:10,1,-73.97,40.79,-73.97,40.78,N,435,Saturday,13:30:00,3,False,POINT (991711.384 228265.723),POINT (991748.501 224371.453),1186.97,H,40.8-74.0


In [8]:
train_df.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration', 'day', 'pickup_time', 'pickup_month', 'is_workday',
       'pickup_points', 'dropoff_points', 'distances', 'bearings_cats',
       'pickup_loc_cats'],
      dtype='object')

In [11]:
data = pd.get_dummies(train_df,columns=['vendor_id','store_and_fwd_flag','day','bearings_cats', 'pickup_loc_cats'])
data.shape

(1458644, 152)

In [12]:
data.head()

Unnamed: 0,id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,pickup_time,...,pickup_loc_cats_41.4-75.2,pickup_loc_cats_41.6-79.6,pickup_loc_cats_41.7-73.9,pickup_loc_cats_42.5-70.5,pickup_loc_cats_43.0-70.9,pickup_loc_cats_43.1-72.6,pickup_loc_cats_43.5-74.2,pickup_loc_cats_43.9-71.9,pickup_loc_cats_44.4-67.0,pickup_loc_cats_51.9-72.8
0,id2875421,2016-03-14 17:30:00,2016-03-14 17:32:30,1,-73.98,40.77,-73.96,40.77,455,17:30:00,...,False,False,False,False,False,False,False,False,False,False
1,id2377394,2016-06-12 00:45:00,2016-06-12 00:54:38,1,-73.98,40.74,-74.0,40.73,663,00:45:00,...,False,False,False,False,False,False,False,False,False,False
2,id3858529,2016-01-19 11:30:00,2016-01-19 12:10:48,1,-73.98,40.76,-74.01,40.71,2124,11:30:00,...,False,False,False,False,False,False,False,False,False,False
3,id3504673,2016-04-06 19:30:00,2016-04-06 19:39:40,1,-74.01,40.72,-74.01,40.71,429,19:30:00,...,False,False,False,False,False,False,False,False,False,False
4,id2181028,2016-03-26 13:30:00,2016-03-26 13:38:10,1,-73.97,40.79,-73.97,40.78,435,13:30:00,...,False,False,False,False,False,False,False,False,False,False
