# prepare

In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from pyproj import Geod
import scipy

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgbm

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.float_format', lambda x: '%.4f' % x)

# Data Clean

In [7]:
def convertToDateTime(s):
    s = s.str.slice(0, 15)
    return pd.to_datetime(s, utc=True, format='%Y-%m-%d %H:%M')

def cutomizedCoordinationFix(df):
    df = df.assign(rev=df.dropoff_latitude<df.dropoff_longitude)
    idx = (df['rev'] == 1)
    df.loc[idx,['dropoff_longitude','dropoff_latitude']] = df.loc[idx,['dropoff_latitude','dropoff_longitude']].values
    df.loc[idx,['pickup_longitude','pickup_latitude']] = df.loc[idx,['pickup_latitude','pickup_longitude']].values
    return df

def cutomizedExcludeOutOfRangeLocation(df):
    criteria = (
    " 0 < fare_amount <= 500"
    " and 0 < passenger_count <= 6 "
    " and -75 <= pickup_longitude <= -72 "
    " and -75 <= dropoff_longitude <= -72 "
    " and 40 <= pickup_latitude <= 42 "
    " and 40 <= dropoff_latitude <= 42 "
    )
    df = (df
          .dropna()
          .query(criteria)
          .reset_index()
          .drop(columns=['rev', 'index'])          
         )
    return df

def clean_df(df):
    # format datetime to minute
    df['pickup_datetime'] = convertToDateTime(df['pickup_datetime'])
    
    #reverse incorrectly assigned longitude/latitude values
    df = cutomizedCoordinationFix(df)
    
    #remove data points outside appropriate ranges
    df = cutomizedExcludeOutOfRangeLocation(df)
    
    return df

# Customize Feature Definition

In [13]:
def distance(lon1,lat1,lon2,lat2):
    az12,az21,dist = Geod(ellps='WGS84').inv(lon1,lat1,lon2,lat2)
    return dist

def direction(lon1,lat1,lon2,lat2):
    az12,az21,dist = Geod(ellps='WGS84').inv(lon1,lat1,lon2,lat2)
    return az12

def date_features(df):
    df = df.assign(
        #time features
        year=df.pickup_datetime.dt.year,
        dayofyear=df.pickup_datetime.dt.dayofyear,
        weekday=df.pickup_datetime.dt.dayofweek,
        time=(df.pickup_datetime.dt.hour+df.pickup_datetime.dt.minute/5)
    )
    df = df.assign(
        sin_time=np.sin(2*np.pi*df['time']/24),
        cos_time=np.cos(2*np.pi*df['time']/24),
        sin_dayofyear=np.sin(2*np.pi*df['dayofyear']/365),
        cos_dayofyear=np.cos(2*np.pi*df['dayofyear']/365),
        sin_weekday=np.sin(2*np.pi*df['weekday']/6),
        cos_weekday=np.cos(2*np.pi*df['weekday']/6)
    )
    return df

def location_features(df):
    pickup_long = df.pickup_longitude.tolist()
    pickup_lat = df.pickup_latitude.tolist()
    dropoff_long = df.dropoff_longitude.tolist()
    dropoff_lat = df.dropoff_latitude.tolist()
    
    df = df.assign(
        #distance between pickup and dropoff, and bearing from pickup to dropoff
        distance=distance(pickup_long, pickup_lat, dropoff_long, dropoff_lat),
        direction=direction(pickup_long, pickup_lat, dropoff_long, dropoff_lat)
    )
    
    df = df.assign(
        sin_direction=np.sin(2*np.pi*df['direction']/360),
        cos_direction=np.cos(2*np.pi*df['direction']/360),
        direction_bucket=pd.cut(df['direction'], bins=37, labels=False)
    )
    return df

def customized_features_with_airport_coordination(df):
    """adds features that will be used by both the modelling and EDA dataframes"""
    # define coordination of airport
    rows = len(df)
    nyc_long, nyc_lat = [-74.001541]*rows, [40.724944]*rows    
    jfk_long, jfk_lat = [-73.785937]*rows, [40.645494]*rows
    lga_long, lga_lat = [-73.872067]*rows, [40.774071]*rows
    nla_long, nla_lat = [-74.177721]*rows, [40.690764]*rows
    chp_long, chp_lat = [-73.137393]*rows, [41.366138]*rows
    exp_long, exp_lat = [-74.0375]*rows, [40.736]*rows
 
    # add new features based on the distance to airport
    pickup_long = df.pickup_longitude.tolist()
    pickup_lat = df.pickup_latitude.tolist()
    dropoff_long = df.dropoff_longitude.tolist()
    dropoff_lat = df.dropoff_latitude.tolist()
    df = df.assign(     
        #distance from locations
        pickup_dist_nyc=pd.Series(distance(pickup_long, pickup_lat, nyc_long, nyc_lat)),
        dropoff_dist_nyc=pd.Series(distance(dropoff_long, dropoff_lat, nyc_long, nyc_lat)),
        pickup_dist_jfk=pd.Series(distance(pickup_long, pickup_lat, jfk_long, jfk_lat)),
        dropoff_dist_jfk=pd.Series(distance(dropoff_long, dropoff_lat, jfk_long, jfk_lat)),
        pickup_dist_lga=pd.Series(distance(pickup_long, pickup_lat, lga_long, lga_lat)),
        dropoff_dist_lga=pd.Series(distance(dropoff_long, dropoff_lat, lga_long, lga_lat)),
        pickup_dist_nla=pd.Series(distance(pickup_long, pickup_lat, nla_long, nla_lat)),
        dropoff_dist_nla=pd.Series(distance(dropoff_long, dropoff_lat, nla_long, nla_lat)),
        pickup_dist_chp=pd.Series(distance(pickup_long, pickup_lat, chp_long, chp_lat)),
        dropoff_dist_chp=pd.Series(distance(dropoff_long, dropoff_lat, chp_long, chp_lat)),
        pickup_dist_exp=pd.Series(distance(pickup_long, pickup_lat, exp_long, exp_lat)),
        dropoff_dist_exp=pd.Series(distance(dropoff_long, dropoff_lat, exp_long, exp_lat))
    )
    return df

def customized_features_with_normalized_distance(df):
    # create_dictionary_for_fares_by_direction
    df['fare_per_km'] = df['fare_amount']*1000/(df.distance+5)
    fares_by_direction = df.query('5 < distance').groupby('direction_bucket')['fare_per_km'].mean()

    # use mean_price of direction / max_price to do normalization
    df['normalized_value'] = [fares_by_direction[i] for i in df.direction_bucket] * fares_by_direction.max()
    df['normalized_distance'] = df.normalized_value * df.distance

    return df

def modelling_features(df):
    
    df = date_features(df)
    df = location_features(df)
    df = customized_features_with_airport_coordination(df)
    df = customized_features_with_normalized_distance(df)
    
    # drop off original features
    df = df.drop(columns=['pickup_datetime', 'time', 'direction', 'weekday', 'dayofyear'])
    df = df.drop(columns=['fare_per_km', 'direction_bucket', 'direction', 'weekday', 'dayofyear'])
    
    return df

# util func define

In [4]:
def get_split_sets(train):
    x = train.drop(columns=['fare_amount'])
    y = train['fare_amount'].values
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=0)
    return x_train, x_val, y_train, y_val

# Pipeline

In [15]:
from utils import Timer
from IPython.display import display

path = "../data/"
TRAIN_PATH = f'{path}/train.csv'
TEST_PATH = f'{path}/test.csv'

cols = [
    'fare_amount', 'pickup_datetime','pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude', 'passenger_count'
]
with Timer("Load test"):
    test = pd.read_csv(TEST_PATH)
    
with Timer(f"Load train full"):
    train = pd.read_csv(TRAIN_PATH, usecols=cols, numRows=1000)

print("Raw data")
display(train)

with Timer("Data Wrangling for train"):
    train = clean_df(train)
    
with Timer("enrich feature for train"):
    print(f"\n*** before enrich ***")
    print(train.dtypes)
    
    train = modelling_features(train)
    
    print(f"\n*** after enrich ***")
    print(train.dtypes)

with Timer("enrich feature for test"):
    test['pickup_datetime'] = test['pickup_datetime'].str.slice(0, 15)
    test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'], utc=True, format='%Y-%m-%d %H:%M')
    test = modelling_features(test)

with Timer("sample train upon test"):
    train = (train
        .query(f'{test.pickup_longitude.min()-0.1} <= pickup_longitude <= {test.pickup_longitude.max()+0.1}')
        .query(f'{test.pickup_latitude.min()-0.1} <= pickup_latitude <= {test.pickup_latitude.max()+0.1}')
        .query(f'{test.dropoff_longitude.min()-0.1} <= dropoff_longitude <= {test.dropoff_longitude.max()+0.1}')
        .query(f'{test.dropoff_latitude.min()-0.1} <= dropoff_latitude <= {test.dropoff_latitude.max()+0.1}')
    )
     
print("Before Training")
display(train)


Raw data


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5000,2009-06-15 17:26:21 UTC,-73.8443,40.7213,-73.8416,40.7123,1
1,16.9000,2010-01-05 16:52:16 UTC,-74.0160,40.7113,-73.9793,40.7820,1
2,5.7000,2011-08-18 00:35:00 UTC,-73.9827,40.7613,-73.9912,40.7506,2
3,7.7000,2012-04-21 04:30:42 UTC,-73.9871,40.7331,-73.9916,40.7581,1
4,5.3000,2010-03-09 07:51:00 UTC,-73.9681,40.7680,-73.9567,40.7838,1
...,...,...,...,...,...,...,...
9999995,5.7000,2012-08-12 01:18:00 UTC,-73.9995,40.7285,-73.9933,40.7421,2
9999996,5.5000,2013-08-07 10:28:00 UTC,-73.9685,40.7594,-73.9650,40.7690,1
9999997,14.0000,2013-10-29 08:29:00 UTC,-73.9980,40.7337,-73.9734,40.7591,5
9999998,10.5000,2012-04-07 16:41:33 UTC,-73.9927,40.7520,-73.9647,40.7728,1


Data Wrangling for train took 5.578062933403999 sec

*** before enrich ***
fare_amount                      float64
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dtype: object

*** after enrich ***
fare_amount          float64
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
year                   int64
sin_time             float64
cos_time             float64
sin_dayofyear        float64
cos_dayofyear        float64
sin_weekday          float64
cos_weekday          float64
distance             float64
sin_direction        float64
cos_direction        float64
direction_bucket       int64
pickup_dist_nyc      float64
dropoff_dist_nyc     float64
pickup_dist_jfk      float64
dropoff_dist_jfk     f

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,sin_time,cos_time,sin_dayofyear,...,pickup_dist_jfk,dropoff_dist_jfk,pickup_dist_lga,dropoff_dist_lga,pickup_dist_nla,dropoff_dist_nla,pickup_dist_chp,dropoff_dist_chp,pickup_dist_exp,dropoff_dist_exp
0,4.5000,-73.8443,40.7213,-73.8416,40.7123,1,2009,-0.9877,-0.1564,0.2802,...,9759.6783,8783.6979,6309.6620,7328.3744,28379.3227,28506.2184,93062.7752,93696.3858,16401.6895,16758.1056
1,16.9000,-74.0160,40.7113,-73.9793,40.7820,1,2010,-0.9659,-0.2588,0.0860,...,20781.5454,22286.0870,14016.8362,9092.3433,13852.7181,19587.3631,103667.4720,95989.2202,3287.3107,7090.6553
2,5.7000,-73.9827,40.7613,-73.9912,40.7506,2,2011,0.1564,0.9877,-0.7296,...,21021.7652,20909.8842,9451.4581,10395.8801,18238.7310,17097.7034,97779.7939,99120.9967,5409.6429,4228.4207
3,7.7000,-73.9871,40.7331,-73.9916,40.7581,1,2012,0.9336,0.3584,0.9369,...,19594.9373,21409.2351,10727.2331,10244.4636,16778.5257,17413.8976,100224.8950,98563.0739,4266.6567,4589.9636
4,5.3000,-73.9681,40.7680,-73.9567,40.7838,1,2010,0.8660,-0.5000,0.9210,...,20544.0568,21067.5987,8135.1366,7221.1964,19676.9620,21338.6360,96369.2999,94462.8004,6854.7979,8644.8414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9758845,5.7000,-73.9995,40.7285,-73.9933,40.7421,2,2012,0.3090,0.9511,-0.6681,...,20264.8380,20549.4720,11891.7956,10835.2563,15633.9114,16592.7379,101331.8105,99899.1747,3320.6128,3794.4430
9758846,5.5000,-73.9685,40.7594,-73.9650,40.7690,1,2013,0.4067,-0.9135,-0.5878,...,19946.6451,20422.4948,8301.3368,7863.0596,19250.1901,19963.9905,97058.9761,96100.4961,6381.5834,7139.4187
9758847,14.0000,-73.9980,40.7337,-73.9734,40.7591,5,2013,0.8090,-0.5878,-0.8841,...,20424.3702,20257.0185,11536.5786,8719.2194,15921.7568,18853.5257,100830.7620,97379.6582,3350.2825,5987.9902
9758848,10.5000,-73.9927,40.7520,-73.9647,40.7728,1,2012,-0.9511,-0.3090,0.9933,...,21102.4943,20693.4947,10475.9306,7821.9172,17048.0979,20171.6649,99097.6301,95790.6810,4181.1968,7384.7224


In [11]:
with Timer("split train and val"):
    x_train, x_val, y_train, y_val = get_split_sets(train)

lgbm_params = {
    'objective': 'regression',
    'boosting': 'gbdt',
    'reg_sqrt': True,
    'learning_rate': 0.03,
    'num_leaves': 1200,
    'max_depth': -1,
    'max_bin': 5000,
    'num_rounds': 1200,
    'early_stopping_round': 50,
    'metric': 'rmse'
}

lgbm_train = lgbm.Dataset(x_train, y_train, silent=False)
lgbm_val = lgbm.Dataset(x_val, y_val, silent=False)

with Timer("train"):
    model = lgbm.train(params=lgbm_params, train_set=lgbm_train, valid_sets=lgbm_val, verbose_eval=100)
    
with Timer("predict"):
    pred = model.predict(x_val, num_iteration=model.best_iteration)
    
with Timer("calculate rmse"):
    rmse = np.sqrt(mean_squared_error(y_val, pred))

print('LightGBM RMSE', rmse)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 95910
[LightGBM] [Info] Number of data points in the train set: 8782965, number of used features: 28
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 3.66438
[200]	valid_0's rmse: 3.51499
[300]	valid_0's rmse: 3.47728
[400]	valid_0's rmse: 3.46114
[500]	valid_0's rmse: 3.45575
[600]	valid_0's rmse: 3.4524
[700]	valid_0's rmse: 3.44984
[800]	valid_0's rmse: 3.44767
[900]	valid_0's rmse: 3.44563
[1000]	valid_0's rmse: 3.44408
[1100]	valid_0's rmse: 3.44362
Early stopping, best iteration is:
[1054]	valid_0's rmse: 3.44347
LightGBM RMSE 3.443467779522016


# EDA

In [None]:
def exploration_features(df):
    """adds features for use in the EDA section"""
    df = shared_features(df)
    df = (
        df
        .assign(
            hour=df.pickup_datetime.dt.hour,
            close_to_airport='No',
            fare_per_km=df.fare_amount*1000/df.distance,
            direction_bucket = pd.cut(df.direction, np.linspace(-180, 180, 37)),

            #small location buckets
            pickup_long_bucket=pd.cut(df.pickup_longitude, bins=2550, labels=False),
            pickup_lat_bucket=pd.cut(df.pickup_latitude, bins=2200, labels=False),
            dropoff_long_bucket=pd.cut(df.dropoff_longitude, bins=2550, labels=False),
            dropoff_lat_bucket=pd.cut(df.dropoff_latitude, bins=2200, labels=False),


            #large location buckets
            pickup_long_bucket_big=pd.cut(df.pickup_longitude, bins=255, labels=False),
            pickup_lat_bucket_big=pd.cut(df.pickup_latitude, bins=220, labels=False),
            dropoff_long_bucket_big=pd.cut(df.dropoff_longitude, bins=255, labels=False),
            dropoff_lat_bucket_big=pd.cut(df.dropoff_latitude, bins=220, labels=False)
        )
        .drop(columns='pickup_datetime')
        .query("0 < distance")
    )

    df.loc[((df['pickup_dist_jfk']<1500) | (df['dropoff_dist_jfk']<1500)), 'close_to_airport'] = 'JFK'
    df.loc[((df['pickup_dist_lga']<1500) | (df['dropoff_dist_lga']<1500)), 'close_to_airport'] = 'LaGuardia'
    df.loc[((df['pickup_dist_nla']<1500) | (df['dropoff_dist_nla']<1500)), 'close_to_airport'] = 'Newark'  
    return df

