# prepare

In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from pyproj import Geod
import scipy

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgbm

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.float_format', lambda x: '%.4f' % x)

# Data Clean

In [2]:
def cutomizedCoordinationFix(df):
    df = df.assign(rev=df.dropoff_latitude<df.dropoff_longitude)
    idx = (df['rev'] == 1)
    df.loc[idx,['dropoff_longitude','dropoff_latitude']] = df.loc[idx,['dropoff_latitude','dropoff_longitude']].values
    df.loc[idx,['pickup_longitude','pickup_latitude']] = df.loc[idx,['pickup_latitude','pickup_longitude']].values
    df = df.drop(columns=['rev'])
    return df

def clean_df(df):    
    #reverse incorrectly assigned longitude/latitude values
    df = cutomizedCoordinationFix(df)
    df = df[(df.fare_amount > 0)  & (df.fare_amount <= 500) &
          (df.passenger_count >= 0) & (df.passenger_count <= 8)  &
           ((df.pickup_longitude != 0) & (df.pickup_latitude != 0) & (df.dropoff_longitude != 0) & (df.dropoff_latitude != 0) )]
    
    return df

# Use Featuretools to create feature

In [3]:
import featuretools as ft
print(f"featuretools version is {ft.__version__}")

from featuretools.primitives import TransformPrimitive
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Double, LatLong, Datetime, Boolean



featuretools version is 1.18.0


# featuretools related function

In [9]:
from woodwork.logical_types import Ordinal

def produce_featuretools_entityset(es, df):
    trip_logical_types = {
        'passenger_count': Ordinal(order=list(range(0, 10))), 
        'pickup_latlong': 'LatLong',
        'dropoff_latlong': 'LatLong',
    }

    es.add_dataframe(dataframe_name="trips",
                     dataframe=df,
                     index="id",
                     time_index='pickup_datetime',
                     logical_types=trip_logical_types)

    return es


In [10]:
from featuretools.primitives import IsInGeoBox

class Bearing(TransformPrimitive):
    name = "bearing"
    input_types = [ColumnSchema(logical_type=LatLong), ColumnSchema(logical_type=LatLong)]
    return_type = ColumnSchema(logical_type=Double, semantic_tags={'numeric'})
    number_output_features = 1
    commutative=True
    def get_function(self):
        def bearing(latlong1, latlong2):
            lat1 = np.array([x[0] for x in latlong1])
            lon1 = np.array([x[1] for x in latlong1])
            lat2 = np.array([x[0] for x in latlong2])
            lon2 = np.array([x[1] for x in latlong2])
            delta_lon = np.radians(lon2 - lon1)
            lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
            x = np.cos(lat2) * np.sin(delta_lon)
            y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(delta_lon)
            return np.degrees(np.arctan2(x, y))
        return bearing
    
class DistanceToLocation(TransformPrimitive):
    name = "distance_to_location"
    input_types = [ColumnSchema(logical_type=LatLong)]
    return_type = ColumnSchema(logical_type=Double, semantic_tags={'numeric'})
    number_output_features = 1
    commutative=True
    def __init__(self, point=(0, 0)):
        self.point = point
        self.lat = point[0]
        self.lon = point[1]
        
    def get_function(self):
        def distance_to_location(latlong):
            lat = np.array([x[0] for x in latlong])
            lon = np.array([x[1] for x in latlong])
            tgt_lat = len(lat) * self.lat
            tgt_lon = len(lon) * self.lon
            return self.sphere_dist(tgt_lat, tgt_lon, lat, lon)
        return distance_to_location
    
    def sphere_dist(self, lat1, lon1, lat2, lon2):
        """
        Return distance along great radius between pickup and dropoff coordinates.
        """
        #Define earth radius (km)
        R_earth = 6371
        #Convert degrees to radians
        lat1, lon1, lat2, lon2 = map(np.radians,[lat1, lon1, lat2, lon2])
        #Compute distances along lat, lon dimensions
        dlat = lat2 - lat1
        dlon = lon2 - lon1

        #Compute haversine distance
        a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
        return 2 * R_earth * np.arcsin(np.sqrt(a))


def get_coordination(df):
    df["pickup_latlong"] = df[['pickup_latitude', 'pickup_longitude']].apply(tuple, axis=1)
    df["dropoff_latlong"] = df[['dropoff_latitude', 'dropoff_longitude']].apply(tuple, axis=1)
    df = df.drop(["pickup_latitude", "pickup_longitude", "dropoff_latitude", "dropoff_longitude"], axis = 1)
    return df

def modelling_features(df, feature_list = None, features_only = False):
    df = get_coordination(df)
    print(df.dtypes)

    es = ft.EntitySet("nyc_taxi_fare")
    es = produce_featuretools_entityset(es, df)
    
    cutoff_time = es['trips'][['id', 'pickup_datetime']]
    
    if feature_list:
        df = ft.calculate_feature_matrix(feature_list, entityset=es, cutoff_time=cutoff_time, verbose=True)
        return df, es, feature_list
    
    # airport coordination
    coordination_dicts = {
        "jfk_coord": (40.639722, -73.778889),
        "ewr_coord": (40.6925, -74.168611),
        "lga_coord": (40.77725, -73.872611),
        "sol_coord": (40.6892,-74.0445), # Statue of Liberty
        "nyc_coord": (40.7141667,-74.0063889) 
    }
    
    trans_primitives = ["day", "year", "month", "weekday", "haversine", "hour", "is_weekend", "is_working_hours", "part_of_day"]
    trans_primitives += ["cityblock_distance", Bearing,
                         IsInGeoBox((40.62, -73.85), (40.70, -73.75)),
                         IsInGeoBox((40.70, -73.97), (40.77, -73.9))]
    trans_primitives += [DistanceToLocation(x) for n, x in coordination_dicts.items()]

    # calculate feature_matrix using deep feature synthesis
    
    ret = ft.dfs(entityset=es,
                      target_dataframe_name="trips",
                      trans_primitives=trans_primitives,
                      verbose=True,
                      cutoff_time=cutoff_time,
                      approximate='36d',
                      max_depth=3,
                      max_features=40, 
                      features_only = features_only)
    if features_only:
        features = ret
    else:
        features = ret[1]
        df = ret[0]
        #df_encoded, features_encoded = ft.encode_features(df, features)
    
    return df, es, features

# Pipeline

In [13]:
# small data to test
from utils import Timer
from IPython.display import display

path = "../data/"
TRAIN_PATH = f'{path}/train.csv'
TEST_PATH = f'{path}/test.csv'

cols = [
    'fare_amount', 'pickup_datetime','pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude', 'passenger_count'
]
 
#sampled_line = 100000
with Timer(f"Load train full"):
    train = pd.read_csv(TRAIN_PATH, usecols=cols)

with Timer("Data Wrangling for train"):
    train = clean_df(train)

top_features = ft.load_features("featuretools_humanknowledge_nyc_taxi_top_features.txt")
train, es, features = modelling_features(train)
    
es.plot()
display(train)

Load train full took 59.77826909907162 sec
Data Wrangling for train took 9.222446422092617 sec
fare_amount        float64
pickup_datetime     object
passenger_count      int64
pickup_latlong      object
dropoff_latlong     object
dtype: object
Built 27 features
Elapsed: 37:43 | Progress: 100%|██████████


Unnamed: 0_level_0,fare_amount,passenger_count,"BEARING(dropoff_latlong, pickup_latlong)","CITYBLOCK_DISTANCE(dropoff_latlong, pickup_latlong)",DAY(pickup_datetime),"DISTANCE_TO_LOCATION(dropoff_latlong, point=(40.639722, -73.778889))","DISTANCE_TO_LOCATION(pickup_latlong, point=(40.639722, -73.778889))","DISTANCE_TO_LOCATION(dropoff_latlong, point=(40.6892, -74.0445))","DISTANCE_TO_LOCATION(pickup_latlong, point=(40.6892, -74.0445))","DISTANCE_TO_LOCATION(dropoff_latlong, point=(40.6925, -74.168611))",...,"IS_IN_GEOBOX(dropoff_latlong, point1=(40.62, -73.85), point2=(40.7, -73.75))","IS_IN_GEOBOX(pickup_latlong, point1=(40.62, -73.85), point2=(40.7, -73.75))","IS_IN_GEOBOX(dropoff_latlong, point1=(40.7, -73.97), point2=(40.77, -73.9))","IS_IN_GEOBOX(pickup_latlong, point1=(40.7, -73.97), point2=(40.77, -73.9))",IS_WEEKEND(pickup_datetime),IS_WORKING_HOURS(pickup_datetime),MONTH(pickup_datetime),PART_OF_DAY(pickup_datetime),WEEKDAY(pickup_datetime),YEAR(pickup_datetime)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
43310508,30.2000,1,102.8172,11.6828,1,14469.4146,14467.4524,15033.0715,15027.9185,13095.9168,...,False,True,False,False,False,False,1,midnight,3,2009
862908,15.0000,1,35.9675,4.4392,1,14479.5095,14483.9264,15043.6104,15047.3952,13106.4920,...,False,False,False,False,False,False,1,midnight,3,2009
13073257,4.2000,1,-139.9329,0.2755,1,14475.1238,14474.8641,15039.2532,15039.0359,13102.2720,...,False,False,False,False,False,False,1,midnight,3,2009
647957,5.8000,2,11.8035,0.9387,1,14474.2722,14475.5415,15038.5334,15039.7469,13101.6663,...,False,False,False,False,False,False,1,midnight,3,2009
12655086,14.6000,1,-118.1861,4.3052,1,14479.5413,14476.6862,15042.4027,15040.4922,13104.4286,...,False,False,True,False,False,False,1,midnight,3,2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40210315,24.5000,2,-15.5490,5.8763,30,5826.7865,5819.4497,18529.6489,18527.6189,14447.8442,...,False,False,False,False,False,False,6,midnight,1,2015
13957545,6.0000,2,38.2798,1.2413,30,5819.2635,5818.1230,18527.6739,18526.3431,14455.3819,...,False,False,False,False,False,False,6,midnight,1,2015
48940597,33.5000,1,-45.3376,10.3840,30,5822.5045,5814.4422,18520.8876,18523.8375,14452.0795,...,False,False,False,False,False,False,6,midnight,1,2015
22295217,9.5000,1,105.4674,2.5932,30,5821.5804,5822.3945,18528.4082,18526.0685,14453.0613,...,False,False,False,True,False,False,6,midnight,1,2015


In [14]:
train.to_parquet("featuretools_humanknowledge_nyc_taxi_55M.parquet")

# EvalML Train

In [None]:
train = pd.read_parquet("featuretools_process_nyc_taxi.parquet")
train = train.head(10000000)

def get_split_sets(train):
    x = train.drop(columns=['fare_amount'])
    y = train['fare_amount'].values
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1, random_state=123)
    return x_train, x_val, y_train, y_val

with Timer("split train and val"):
    x_train, x_val, y_train, y_val = get_split_sets(train)
    
# looking for right ml pipeline
import evalml
from evalml import AutoMLSearch

automl = AutoMLSearch(X_train=x_train,
                      y_train=y_train,
                      X_holdout=X_val,
                      y_holdout=y_val,
                      problem_type="regression",
                      objective="root mean squared error",
                      verbose=True,)
automl.search()

best_pipeline = automl.best_pipeline
with Timer("train"):
    best_pipeline.fit(x_train, y_train)
    
best_pipeline.score(X_val, y_val, objectives=["root mean squared error"])

In [21]:
train.columns

Index(['fare_amount', 'passenger_count',
       'BEARING(dropoff_latlong, pickup_latlong)',
       'CITYBLOCK_DISTANCE(dropoff_latlong, pickup_latlong)',
       'DAY(pickup_datetime)',
       'DISTANCE_TO_LOCATION(dropoff_latlong, point=(40.639722, -73.778889))',
       'DISTANCE_TO_LOCATION(pickup_latlong, point=(40.639722, -73.778889))',
       'DISTANCE_TO_LOCATION(dropoff_latlong, point=(40.6892, -74.0445))',
       'DISTANCE_TO_LOCATION(pickup_latlong, point=(40.6892, -74.0445))',
       'DISTANCE_TO_LOCATION(dropoff_latlong, point=(40.6925, -74.168611))',
       'DISTANCE_TO_LOCATION(pickup_latlong, point=(40.6925, -74.168611))',
       'DISTANCE_TO_LOCATION(dropoff_latlong, point=(40.7141667, -74.0063889))',
       'DISTANCE_TO_LOCATION(pickup_latlong, point=(40.7141667, -74.0063889))',
       'DISTANCE_TO_LOCATION(dropoff_latlong, point=(40.77725, -73.872611))',
       'DISTANCE_TO_LOCATION(pickup_latlong, point=(40.77725, -73.872611))',
       'HAVERSINE(dropoff_latlong, picku

# LGBM Train

In [20]:
from utils import Timer
import pandas as pd
from sklearn.metrics import mean_squared_error
import lightgbm as lgbm
import numpy as np
import re
            
# with Timer("read train"):
#     train = pd.read_parquet("featuretools_default_nyc_taxi.parquet")

x = train.drop(columns=['fare_amount'])
y = train['fare_amount'].values

def get_split_sets(x, y):
    from sklearn.model_selection import train_test_split

    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1, random_state=123)
    return x_train, x_val, y_train, y_val

with Timer("split train and val"):
    x_train, x_val, y_train, y_val = get_split_sets(x, y)
    x_train_save = x_train
    x_val_save = x_val

def fix_df_types(df):
    Int64_cols = list(n for n, x in zip(df.dtypes.index.to_list(), df.dtypes.to_list()) if x.name == 'Int64')
    df[Int64_cols] = df[Int64_cols].fillna(0).astype('int64')
    
    Boolean_cols = list(n for n, x in zip(df.dtypes.index.to_list(), df.dtypes.to_list()) if x.name == 'boolean')
    df[Boolean_cols] = df[Boolean_cols].fillna(0).astype('int8')
    
    return df

def fix_column_name(df):
    return df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
  
x_train = fix_df_types(x_train)
x_val = fix_df_types(x_val)

x_train = fix_column_name(x_train)
x_val = fix_column_name(x_val)
           
params = {
        'boosting_type':'gbdt',
        'objective': 'regression',
        'nthread': 4,
        'num_leaves': 31,
        'learning_rate': 0.05,
        'max_depth': -1,
        'subsample': 0.8,
        'bagging_fraction' : 1,
        'max_bin' : 5000 ,
        'bagging_freq': 20,
        'colsample_bytree': 0.6,
        'metric': 'rmse',
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 10,
        'scale_pos_weight':1,
        'zero_as_missing': True,
        'seed':0,
        'num_rounds':1000,
        'num_boost_round': 10000,
        'early_stopping_rounds': 50
    }


lgbm_train = lgbm.Dataset(x_train, y_train, silent=False)
lgbm_val = lgbm.Dataset(x_val, y_val, silent=False)

with Timer("train"):
    model = lgbm.train(params=params, train_set=lgbm_train, valid_sets=lgbm_val, verbose_eval=100)
    
with Timer("predict"):
    pred = model.predict(x_val, num_iteration=model.best_iteration)
    
with Timer("calculate rmse"):
    rmse = np.sqrt(mean_squared_error(y_val, pred))

print('LightGBM RMSE', rmse)

split train and val took 29.33571505965665 sec
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65110
[LightGBM] [Info] Number of data points in the train set: 48884359, number of used features: 26
[LightGBM] [Info] Start training from score 11.324502
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 4.12604
[200]	valid_0's rmse: 4.03279
[300]	valid_0's rmse: 3.99394
[400]	valid_0's rmse: 3.96793
[500]	valid_0's rmse: 3.94898
[600]	valid_0's rmse: 3.9338
[700]	valid_0's rmse: 3.92029
[800]	valid_0's rmse: 3.90816
[900]	valid_0's rmse: 3.89684
[1000]	valid_0's rmse: 3.88656
[1100]	valid_0's rmse: 3.87921
[1200]	valid_0's rmse: 3.87236
[1300]	valid_0's rmse: 3.86486
[1400]	valid_0's rmse: 3.85872
[1500]	valid_0's rmse: 3.85221
[1600]	valid_0's rmse: 3.84583
[1700]	valid_0's rmse: 3.83988
[1800]	valid_0's rmse: 3.83316
[1900]	valid_0's rmse: 3.82758
[2000]	valid_0's rmse: 3.82152
[2100]	valid_0's rmse: 3.81609
[2200]	v

# EDA

In [None]:
def exploration_features(df):
    """adds features for use in the EDA section"""
    df = shared_features(df)
    df = (
        df
        .assign(
            hour=df.pickup_datetime.dt.hour,
            close_to_airport='No',
            fare_per_km=df.fare_amount*1000/df.distance,
            direction_bucket = pd.cut(df.direction, np.linspace(-180, 180, 37)),

            #small location buckets
            pickup_long_bucket=pd.cut(df.pickup_longitude, bins=2550, labels=False),
            pickup_lat_bucket=pd.cut(df.pickup_latitude, bins=2200, labels=False),
            dropoff_long_bucket=pd.cut(df.dropoff_longitude, bins=2550, labels=False),
            dropoff_lat_bucket=pd.cut(df.dropoff_latitude, bins=2200, labels=False),


            #large location buckets
            pickup_long_bucket_big=pd.cut(df.pickup_longitude, bins=255, labels=False),
            pickup_lat_bucket_big=pd.cut(df.pickup_latitude, bins=220, labels=False),
            dropoff_long_bucket_big=pd.cut(df.dropoff_longitude, bins=255, labels=False),
            dropoff_lat_bucket_big=pd.cut(df.dropoff_latitude, bins=220, labels=False)
        )
        .drop(columns='pickup_datetime')
        .query("0 < distance")
    )

    df.loc[((df['pickup_dist_jfk']<1500) | (df['dropoff_dist_jfk']<1500)), 'close_to_airport'] = 'JFK'
    df.loc[((df['pickup_dist_lga']<1500) | (df['dropoff_dist_lga']<1500)), 'close_to_airport'] = 'LaGuardia'
    df.loc[((df['pickup_dist_nla']<1500) | (df['dropoff_dist_nla']<1500)), 'close_to_airport'] = 'Newark'  
    return df

