In [1]:
# data set schema
import pandas as pd
from pathlib import Path
from utils import Timer
import os, sys

# Location of the NYC taxi training CSV on the shared data disk.
file = "/mnt/DP_disk1/ht/datasets/autofe/nyc_taxi/train.csv"

# Only these columns are read: the fare target plus the raw trip attributes.
cols = [
    'fare_amount',
    'pickup_datetime',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
]

train_data = pd.read_csv(file, usecols=cols)
# Bare last expression so the notebook renders the (truncated) frame.
train_data

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.841610,40.712278,1
1,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.761270,-73.991242,40.750562,2
3,7.7,2012-04-21 04:30:42 UTC,-73.987130,40.733143,-73.991567,40.758092,1
4,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1
...,...,...,...,...,...,...,...
55423851,14.0,2014-03-15 03:28:00 UTC,-74.005272,40.740027,-73.963280,40.762555,1
55423852,4.2,2009-03-24 20:46:20 UTC,-73.957784,40.765530,-73.951640,40.773959,1
55423853,14.1,2011-04-02 22:04:24 UTC,-73.970505,40.752325,-73.960537,40.797342,1
55423854,28.9,2011-10-26 05:57:51 UTC,-73.980901,40.764629,-73.870605,40.773963,1


In [2]:
import featuretools as ft
from featuretools.primitives import TransformPrimitive
from woodwork.logical_types import LatLong, Ordinal

import pandas as pd
from utils import Timer

def manual_coordination_convert(df):
    """Pack the separate latitude/longitude columns into ``(lat, lon)`` tuples.

    Adds ``pickup_latlong`` and ``dropoff_latlong`` columns (each value is a
    ``(latitude, longitude)`` tuple) and removes the four scalar coordinate
    columns they were built from.

    Args:
        df: DataFrame containing ``pickup_latitude``, ``pickup_longitude``,
            ``dropoff_latitude`` and ``dropoff_longitude`` columns.

    Returns:
        A new DataFrame; the input frame is left untouched.

    Raises:
        KeyError: if any of the four coordinate columns is missing.
    """
    # zip() over the two columns builds the tuples directly, avoiding the
    # per-row Python call made by DataFrame.apply(tuple, axis=1). It also
    # behaves sensibly on an empty frame.
    pickup = pd.Series(
        list(zip(df["pickup_latitude"], df["pickup_longitude"])),
        index=df.index,
        dtype=object,
    )
    dropoff = pd.Series(
        list(zip(df["dropoff_latitude"], df["dropoff_longitude"])),
        index=df.index,
        dtype=object,
    )
    # assign() returns a new frame, so the caller's DataFrame is not mutated
    # (and no chained-assignment warning is raised on filtered slices).
    converted = df.assign(pickup_latlong=pickup, dropoff_latlong=dropoff)
    return converted.drop(
        columns=["pickup_latitude", "pickup_longitude", "dropoff_latitude", "dropoff_longitude"]
    )

def cutomizedCoordinationFix(df):
    """Swap longitude/latitude on rows whose dropoff pair looks reversed.

    A row is flagged when ``dropoff_latitude < dropoff_longitude``. For every
    flagged row both the dropoff pair and the pickup pair have their
    longitude and latitude values exchanged.

    Args:
        df: DataFrame with pickup/dropoff longitude and latitude columns.

    Returns:
        A new DataFrame with the same columns; the input is not modified.
    """
    # Work on the copy produced by assign(); "rev" is a temporary flag column.
    flagged = df.assign(rev=df["dropoff_latitude"] < df["dropoff_longitude"])
    swap_rows = flagged["rev"] == 1

    # NOTE(review): the pickup pair is swapped using the dropoff-based mask,
    # not a check on the pickup values themselves -- confirm this is intended.
    coordinate_pairs = (
        ("dropoff_longitude", "dropoff_latitude"),
        ("pickup_longitude", "pickup_latitude"),
    )
    for lon_col, lat_col in coordinate_pairs:
        # .values strips the labels so the assignment is positional, which is
        # what actually exchanges the two columns.
        swapped = flagged.loc[swap_rows, [lat_col, lon_col]].values
        flagged.loc[swap_rows, [lon_col, lat_col]] = swapped

    return flagged.drop(columns=["rev"])

def clean_df(df):
    """Repair swapped coordinates and drop rows with implausible values.

    Rows are kept only when all of the following hold:
      * ``0 < fare_amount <= 500``
      * ``0 <= passenger_count <= 8``
      * none of the four pickup/dropoff coordinates equals 0

    Args:
        df: raw trips DataFrame.

    Returns:
        The filtered DataFrame (coordinate fix applied first).
    """
    #reverse incorrectly assigned longitude/latitude values
    fixed = cutomizedCoordinationFix(df)

    fare_ok = (fixed.fare_amount > 0) & (fixed.fare_amount <= 500)
    passengers_ok = (fixed.passenger_count >= 0) & (fixed.passenger_count <= 8)
    coords_ok = (
        (fixed.pickup_longitude != 0)
        & (fixed.pickup_latitude != 0)
        & (fixed.dropoff_longitude != 0)
        & (fixed.dropoff_latitude != 0)
    )

    return fixed[fare_ok & passengers_ok & coords_ok]

# Stage 1: clean the raw trips and pack each latitude/longitude pair into a
# single tuple column so it can be typed as LatLong below.
with Timer("manually convert geo points to coordination"):
    #prepare feature tool entityset
    train_data = clean_df(train_data)
    train_data = manual_coordination_convert(train_data)

# Stage 2: register the trips frame in a featuretools EntitySet.
with Timer("Load data to entityset"):
    es = ft.EntitySet("nyc_taxi_fare")
    trip_logical_types = {
        'passenger_count': Ordinal(order=list(range(0, 10))),
        'pickup_latlong': 'LatLong',
        'dropoff_latlong': 'LatLong',
    }
    # train_data has no "id" column, so ask featuretools to create the integer
    # index explicitly (make_index=True) instead of relying on its implicit
    # "index not found, creating new integer column" fallback warning.
    es.add_dataframe(dataframe_name="trips",
                     dataframe=train_data,
                     index="id",
                     make_index=True,
                     time_index='pickup_datetime',
                     logical_types=trip_logical_types)

# Stage 3: run deep feature synthesis with datetime and geo transform primitives.
with Timer("DFS feature generation"):
    # One cutoff per trip: its own pickup time.
    cutoff_time = es['trips'][['id', 'pickup_datetime']]
    trans_primitives = ["day", "year", "month", "weekday", "hour", "is_weekend", "is_working_hours", "part_of_day"]
    trans_primitives += ["cityblock_distance", "haversine"]
    # calculate feature_matrix using deep feature synthesis
    ret_df, features = ft.dfs(entityset=es,
                      target_dataframe_name="trips",
                      trans_primitives=trans_primitives,
                      verbose=True,
                      cutoff_time=cutoff_time,
                      approximate='36d',
                      max_depth=3,
                      max_features=40)
# Bare last expression so the notebook renders the (truncated) feature matrix.
ret_df



manually convert geo points to coordination took 470.0405107643455 sec




Load data to entityset took 1550.5524989403784 sec




Built 12 features
Elapsed: 12:52 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████
DFS feature generation took 803.8471550680697 sec


Unnamed: 0_level_0,fare_amount,passenger_count,"CITYBLOCK_DISTANCE(dropoff_latlong, pickup_latlong)",DAY(pickup_datetime),"HAVERSINE(dropoff_latlong, pickup_latlong)",HOUR(pickup_datetime),IS_WEEKEND(pickup_datetime),IS_WORKING_HOURS(pickup_datetime),MONTH(pickup_datetime),PART_OF_DAY(pickup_datetime),WEEKDAY(pickup_datetime),YEAR(pickup_datetime)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
43310508,30.2,1,11.682842,1,9.756261,0,False,False,1,midnight,3,2009
862908,15.0,1,4.439169,1,3.177903,0,False,False,1,midnight,3,2009
13073257,4.2,1,0.275526,1,0.195552,0,False,False,1,midnight,3,2009
647957,5.8,2,0.938679,1,0.793177,0,False,False,1,midnight,3,2009
12655086,14.6,1,4.305175,1,3.180219,0,False,False,1,midnight,3,2009
...,...,...,...,...,...,...,...,...,...,...,...,...
40210315,24.5,2,5.876328,30,4.770929,23,False,False,6,midnight,1,2015
13957545,6.0,2,1.241293,30,0.883764,23,False,False,6,midnight,1,2015
48940597,33.5,1,10.384043,30,7.340707,23,False,False,6,midnight,1,2015
22295217,9.5,1,2.593244,30,2.107357,23,False,False,6,midnight,1,2015


In [3]:
# Print the feature matrix summary (index range, column names and dtypes).
ret_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54315955 entries, 43310508 to 9085761
Data columns (total 12 columns):
 #   Column                                               Dtype   
---  ------                                               -----   
 0   fare_amount                                          float64 
 1   passenger_count                                      category
 2   CITYBLOCK_DISTANCE(dropoff_latlong, pickup_latlong)  float64 
 3   DAY(pickup_datetime)                                 category
 4   HAVERSINE(dropoff_latlong, pickup_latlong)           float64 
 5   HOUR(pickup_datetime)                                category
 6   IS_WEEKEND(pickup_datetime)                          boolean 
 7   IS_WORKING_HOURS(pickup_datetime)                    boolean 
 8   MONTH(pickup_datetime)                               category
 9   PART_OF_DAY(pickup_datetime)                         category
 10  WEEKDAY(pickup_datetime)                             category
 11  YEA

In [4]:
# Cast pandas nullable-boolean feature columns to plain numpy bool, then strip
# spaces and commas from the generated feature names
# (e.g. "HAVERSINE(a, b)" -> "HAVERSINE(a_b)").
#
# Both steps are done once for the whole frame: calling ret_df.rename(...)
# inside a per-column loop rebuilds the entire multi-million-row frame on
# every iteration.
bool_cols = [col for col in ret_df.columns if isinstance(ret_df[col].dtype, pd.BooleanDtype)]
if bool_cols:
    ret_df = ret_df.astype({col: bool for col in bool_cols})

# Same replacement order as before: spaces become underscores, commas are dropped.
ret_df = ret_df.rename(columns=lambda name: name.replace(' ', '_').replace(',', ''))

ret_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54315955 entries, 43310508 to 9085761
Data columns (total 12 columns):
 #   Column                                              Dtype   
---  ------                                              -----   
 0   fare_amount                                         float64 
 1   passenger_count                                     category
 2   CITYBLOCK_DISTANCE(dropoff_latlong_pickup_latlong)  float64 
 3   DAY(pickup_datetime)                                category
 4   HAVERSINE(dropoff_latlong_pickup_latlong)           float64 
 5   HOUR(pickup_datetime)                               category
 6   IS_WEEKEND(pickup_datetime)                         bool    
 7   IS_WORKING_HOURS(pickup_datetime)                   bool    
 8   MONTH(pickup_datetime)                              category
 9   PART_OF_DAY(pickup_datetime)                        category
 10  WEEKDAY(pickup_datetime)                            category
 11  YEAR(pickup_date

In [5]:
from utils import Timer
import pandas as pd
from sklearn.metrics import mean_squared_error
import lightgbm as lgbm
import numpy as np
           
# LightGBM training configuration.
# NOTE(review): several keys below are LightGBM aliases of the same parameter:
#   * 'subsample' and 'bagging_fraction' (given conflicting values 0.8 and 1)
#   * 'num_rounds' and 'num_boost_round' (both alias num_iterations)
# LightGBM only honours one value per parameter, so one of the two bagging
# settings is ignored -- confirm which one is intended.
params = {
        'boosting_type':'gbdt',
        'objective': 'regression',
        'nthread': 4,
        'num_leaves': 31,
        'learning_rate': 0.05,
        'max_depth': -1,
        'subsample': 0.8,
        'bagging_fraction' : 1,
        'max_bin' : 5000 ,
        'bagging_freq': 20,
        'colsample_bytree': 0.6,
        'metric': 'rmse',
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 10,
        'scale_pos_weight':1,
        'zero_as_missing': True,
        'seed':0,
        'num_rounds':2000,
        'num_boost_round': 2000,
        'early_stopping_rounds': 50
    }

# Hold out 10% of the rows for validation; the remainder is used for training.
with Timer("split data"):
    # Seed the sampler with the model seed so the split (and therefore the
    # reported validation RMSE) is reproducible across kernel restarts.
    test_sample = ret_df.sample(frac = 0.1, random_state=params['seed'])
    train_sample = ret_df.drop(test_sample.index)

with Timer("prepare train and validate for lgbm"):
    # Features are every column except the target 'fare_amount'.
    x_train = train_sample.drop(columns=['fare_amount'])
    y_train = train_sample['fare_amount'].values

    x_val = test_sample.drop(columns=['fare_amount'])
    y_val = test_sample['fare_amount'].values

    lgbm_train = lgbm.Dataset(x_train, y_train, silent=False)
    lgbm_val = lgbm.Dataset(x_val, y_val, silent=False)

with Timer("train"):
    # Validation RMSE is logged every 100 rounds; early stopping is driven by
    # 'early_stopping_rounds' in params.
    model = lgbm.train(params=params, train_set=lgbm_train, valid_sets=lgbm_val, verbose_eval=100)

split data took 9.893682170659304 sec
prepare train and validate for lgbm took 0.28838229924440384 sec




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10103
[LightGBM] [Info] Number of data points in the train set: 48884359, number of used features: 11




[LightGBM] [Info] Start training from score 11.324008
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 4.39228
[200]	valid_0's rmse: 4.33113
[300]	valid_0's rmse: 4.31135
[400]	valid_0's rmse: 4.30178
[500]	valid_0's rmse: 4.29572
[600]	valid_0's rmse: 4.29151
[700]	valid_0's rmse: 4.2877
[800]	valid_0's rmse: 4.28528
[900]	valid_0's rmse: 4.28245
[1000]	valid_0's rmse: 4.28056
[1100]	valid_0's rmse: 4.27782
[1200]	valid_0's rmse: 4.27599
[1300]	valid_0's rmse: 4.27415
[1400]	valid_0's rmse: 4.27285
[1500]	valid_0's rmse: 4.27231
[1600]	valid_0's rmse: 4.27082
[1700]	valid_0's rmse: 4.26996
[1800]	valid_0's rmse: 4.26869
[1900]	valid_0's rmse: 4.26769
[2000]	valid_0's rmse: 4.26718
Did not meet early stopping. Best iteration is:
[2000]	valid_0's rmse: 4.26718
train took 961.6252472493798 sec
