# auto feature engineering on NYC Taxi Fare

* ### [example 1](#Example-1): use Featuretools to process, took 2908 secs, enrich from 6 features to 12 features
* ### [example 2](#Example-2): use RecDP w/pandas to process, took 5183 secs, enrich from 6 features to 15 features
* ### [example 3](#Example-3): use RecDP w/spark to process, took 350 secs, enrich from 6 features to 15 features

In [11]:
# data set schema
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54315955 entries, 43310508 to 9085761
Data columns (total 6 columns):
 #   Column           Dtype         
---  ------           -----         
 0   id               int64         
 1   fare_amount      float64       
 2   pickup_datetime  datetime64[ns]
 3   passenger_count  category      
 4   pickup_latlong   object        
 5   dropoff_latlong  object        
dtypes: category(1), datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 2.5+ GB


# Example 1

### Using Featuretools for 55M record, took 2908secs

In [8]:
import featuretools as ft
from featuretools.primitives import TransformPrimitive
from woodwork.logical_types import LatLong, Ordinal

import pandas as pd
from utils import Timer

def manual_coordination_convert(df):
    df["pickup_latlong"] = df[['pickup_latitude', 'pickup_longitude']].apply(tuple, axis=1)
    df["dropoff_latlong"] = df[['dropoff_latitude', 'dropoff_longitude']].apply(tuple, axis=1)
    df = df.drop(["pickup_latitude", "pickup_longitude", "dropoff_latitude", "dropoff_longitude"], axis = 1)
    return df

with Timer("read train data from csv"):
    train_data = pd.read_csv("nyc_taxi_fare_cleaned.csv")
    print(f"train_data shape is {train_data.shape}")

with Timer("manually convert geo points to coordination"):
    #prepare feature tool entityset
    train_data = manual_coordination_convert(train_data)

with Timer("Load data to entityset"):
    es = ft.EntitySet("nyc_taxi_fare")
    trip_logical_types = {
        'passenger_count': Ordinal(order=list(range(0, 10))), 
        'pickup_latlong': 'LatLong',
        'dropoff_latlong': 'LatLong',
    }
    es.add_dataframe(dataframe_name="trips",
                     dataframe=train_data,
                     index="id",
                     time_index='pickup_datetime',
                     logical_types=trip_logical_types)
    
with Timer("DFS feature generation"):
    cutoff_time = es['trips'][['id', 'pickup_datetime']]
    trans_primitives = ["day", "year", "month", "weekday", "hour", "is_weekend", "is_working_hours", "part_of_day"]
    trans_primitives += ["cityblock_distance", "haversine"]
    # calculate feature_matrix using deep feature synthesis
    ret_df, features = ft.dfs(entityset=es,
                      target_dataframe_name="trips",
                      trans_primitives=trans_primitives,
                      verbose=True,
                      cutoff_time=cutoff_time,
                      approximate='36d',
                      max_depth=3,
                      max_features=40)
ret_df

train_data shape is (54315955, 7)
read train data from csv took 46.042598474770784 sec
manually convert geo points to coordination took 426.60777373984456 sec




Load data to entityset took 1607.2522095814347 sec




Built 12 features
Elapsed: 13:58 | Progress: 100%|██████████
DFS feature generation took 875.3592023644596 sec


Unnamed: 0_level_0,fare_amount,passenger_count,"CITYBLOCK_DISTANCE(dropoff_latlong, pickup_latlong)",DAY(pickup_datetime),"HAVERSINE(dropoff_latlong, pickup_latlong)",HOUR(pickup_datetime),IS_WEEKEND(pickup_datetime),IS_WORKING_HOURS(pickup_datetime),MONTH(pickup_datetime),PART_OF_DAY(pickup_datetime),WEEKDAY(pickup_datetime),YEAR(pickup_datetime)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
43310508,30.2,1,11.682842,1,9.756261,0,False,False,1,midnight,3,2009
862908,15.0,1,4.439169,1,3.177903,0,False,False,1,midnight,3,2009
13073257,4.2,1,0.275526,1,0.195552,0,False,False,1,midnight,3,2009
647957,5.8,2,0.938679,1,0.793177,0,False,False,1,midnight,3,2009
12655086,14.6,1,4.305175,1,3.180219,0,False,False,1,midnight,3,2009
...,...,...,...,...,...,...,...,...,...,...,...,...
40210315,24.5,2,5.876328,30,4.770929,23,False,False,6,midnight,1,2015
13957545,6.0,2,1.241293,30,0.883764,23,False,False,6,midnight,1,2015
48940597,33.5,1,10.384043,30,7.340707,23,False,False,6,midnight,1,2015
22295217,9.5,1,2.593244,30,2.107357,23,False,False,6,midnight,1,2015


In [9]:
ret_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54315955 entries, 43310508 to 9085761
Data columns (total 12 columns):
 #   Column                                               Dtype   
---  ------                                               -----   
 0   fare_amount                                          float64 
 1   passenger_count                                      category
 2   CITYBLOCK_DISTANCE(dropoff_latlong, pickup_latlong)  float64 
 3   DAY(pickup_datetime)                                 category
 4   HAVERSINE(dropoff_latlong, pickup_latlong)           float64 
 5   HOUR(pickup_datetime)                                category
 6   IS_WEEKEND(pickup_datetime)                          boolean 
 7   IS_WORKING_HOURS(pickup_datetime)                    boolean 
 8   MONTH(pickup_datetime)                               category
 9   PART_OF_DAY(pickup_datetime)                         category
 10  WEEKDAY(pickup_datetime)                             category
 11  YEA

# Example 2

### Using Pandas for 55M record, took about 5183secs

In [1]:
import sys
sys.path.append('../../../')

import pandas as pd
from utils import Timer

with Timer("read train data from csv"):
    train_data = pd.read_csv("nyc_taxi_fare_cleaned.csv")
    print(f"train_data shape is {train_data.shape}")

from pyrecdp.autofe import FeatureWrangler
with Timer("initiate autofe pipeline"):
    pipeline = FeatureWrangler(dataset=train_data, label="fare_amount")

with Timer("transform"):
    ret = pipeline.fit_transform(engine_type = 'pandas')
    
print(f"transformed shape is {ret.shape}")
ret

train_data shape is (54315955, 7)
read train data from csv took 48.9096081731841 sec




initiate autofe pipeline took 5.635765580460429 sec
Transformation of <function DataframeConvertFeatureGenerator.get_function_pd.<locals>.convert_df at 0x7fa273f0c310> took 0.000 secs
Transformation of <function FillNaFeatureGenerator.get_function_pd.<locals>.fill_na at 0x7fa273f0c670> took 3.765 secs
Transformation of <function TypeInferFeatureGenerator.get_function_pd.<locals>.type_infer at 0x7fa273f0c5e0> took 4681.568 secs
Transformation of <function CoordinatesInferFeatureGenerator.get_function_pd.<locals>.type_infer at 0x7fa273f0c550> took 417.431 secs
Transformation of <function FeaturetoolsBasedFeatureGenerator.get_function_pd.<locals>.generate_ft_feature at 0x7fa273f0c700> took 43.267 secs
Transformation of <function GeoFeatureGenerator.get_function_pd.<locals>.generate_ft_feature at 0x7fa273f0c790> took 37.485 secs
Transformation of <function DataframeTransformFeatureGenerator.get_function_pd.<locals>.transform_df at 0x7fa273f0c820> took 0.000 secs
transform took 5183.5176176

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_coordinates,dropoff_coordinates,pickup_datetime.day,pickup_datetime.month,pickup_datetime.weekday,pickup_datetime.year,pickup_datetime.hour,pickup_datetime.part_of_day,haversine_pickup_coordinates_dropoff_coordinates
0,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.841610,40.712278,1,"(40.721319, -73.844311)","(40.712278, -73.84161)",15,6,0,2009,17,evening,0.640488
1,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1,"(40.711303, -74.016048)","(40.782004, -73.979268)",5,1,1,2010,16,afternoon,5.250677
2,2011-08-18 00:35:00+00:00,-73.982738,40.761270,-73.991242,40.750562,2,"(40.76127, -73.982738)","(40.750562, -73.991242)",18,8,3,2011,0,midnight,0.863412
3,2012-04-21 04:30:42+00:00,-73.987130,40.733143,-73.991567,40.758092,1,"(40.733143, -73.98713)","(40.758092, -73.991567)",21,4,5,2012,4,dawn,1.739388
4,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1,"(40.768008, -73.968095)","(40.783762, -73.956655)",9,3,1,2010,7,early morning,1.242220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54315950,2014-03-15 03:28:00+00:00,-74.005272,40.740027,-73.963280,40.762555,1,"(40.740027, -74.005272)","(40.762555, -73.96328)",15,3,5,2014,3,midnight,2.693273
54315951,2009-03-24 20:46:20+00:00,-73.957784,40.765530,-73.951640,40.773959,1,"(40.76553, -73.957784)","(40.773959, -73.95164)",24,3,1,2009,20,night,0.665235
54315952,2011-04-02 22:04:24+00:00,-73.970505,40.752325,-73.960537,40.797342,1,"(40.752325, -73.970505)","(40.797342, -73.960537)",2,4,5,2011,22,night,3.153803
54315953,2011-10-26 05:57:51+00:00,-73.980901,40.764629,-73.870605,40.773963,1,"(40.764629, -73.980901)","(40.773963, -73.870605)",26,10,2,2011,5,dawn,5.807441


In [14]:
ret.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54315955 entries, 0 to 54315954
Data columns (total 15 columns):
 #   Column                                            Dtype              
---  ------                                            -----              
 0   pickup_datetime                                   datetime64[ns, UTC]
 1   pickup_longitude                                  float64            
 2   pickup_latitude                                   float64            
 3   dropoff_longitude                                 float64            
 4   dropoff_latitude                                  float64            
 5   passenger_count                                   int64              
 6   pickup_coordinates                                object             
 7   dropoff_coordinates                               object             
 8   pickup_datetime.day                               int64              
 9   pickup_datetime.month                             int64

# Example 3

### Using spark for 55M records, took about 350secs

In [13]:
import sys
sys.path.append('../../../')

import pandas as pd
from utils import Timer

with Timer("read train data from csv"):
    train_data = pd.read_csv("nyc_taxi_fare_cleaned.csv")
    print(f"train_data shape is {train_data.shape}")

from pyrecdp.autofe import FeatureWrangler
with Timer("initiate autofe pipeline"):
    pipeline = FeatureWrangler(dataset=train_data, label="fare_amount")

with Timer("transform"):
    ret = pipeline.fit_transform(engine_type = 'spark')
    
print(f"transformed shape is {ret.shape}")
ret

train_data shape is (54315955, 7)
read train data from csv took 45.0155632654205 sec
pipeline includes below steps:
["Stage 0: [<class 'pyrecdp.primitives.generators.dataframe.DataframeConvertFeatureGenerator'>]", "Stage 1: [<class 'pyrecdp.primitives.generators.fillna.FillNaFeatureGenerator'>, <class 'pyrecdp.primitives.generators.type.TypeInferFeatureGenerator'>, <class 'pyrecdp.primitives.generators.geograph.CoordinatesInferFeatureGenerator'>]", "Stage 2: [<class 'pyrecdp.primitives.generators.datetime.DatetimeFeatureGenerator'>, <class 'pyrecdp.primitives.generators.geograph.GeoFeatureGenerator'>]", "Stage 3: [<class 'pyrecdp.primitives.generators.dataframe.DataframeTransformFeatureGenerator'>]", 'Stage 4: []', 'Stage 5: []', 'Stage 6: []']
initiate autofe pipeline took 3.5842366172000766 sec


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/01/12 17:54:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/01/12 17:54:45 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


per core memory size is 10.417 GB and shuffle_disk maximum capacity is 8589934592.000 GB


23/01/12 17:55:21 WARN TaskSetManager: Stage 0 contains a task of very large size (78034 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

DataframeConvert partition pandas dataframe to spark RDD took 42.148 secs


23/01/12 17:55:30 WARN TaskSetManager: Stage 1 contains a task of very large size (78034 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

DataframeTransform took 292.536 secs, processed 54315955 rows with num_partitions as 200
DataframeTransform combine to one pandas dataframe took 6.072 secs
transform took 348.05814038962126 sec
transformed shape is (54315955, 15)


Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_coordinates,dropoff_coordinates,pickup_datetime.day,pickup_datetime.month,pickup_datetime.weekday,pickup_datetime.year,pickup_datetime.hour,pickup_datetime.part_of_day,haversine_pickup_coordinates_dropoff_coordinates
0,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.841610,40.712278,1,"(40.721319, -73.844311)","(40.712278, -73.84161)",15,6,0,2009,17,evening,0.640488
1,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1,"(40.711303, -74.016048)","(40.782004, -73.979268)",5,1,1,2010,16,afternoon,5.250677
2,2011-08-18 00:35:00+00:00,-73.982738,40.761270,-73.991242,40.750562,2,"(40.76127, -73.982738)","(40.750562, -73.991242)",18,8,3,2011,0,midnight,0.863412
3,2012-04-21 04:30:42+00:00,-73.987130,40.733143,-73.991567,40.758092,1,"(40.733143, -73.98713)","(40.758092, -73.991567)",21,4,5,2012,4,dawn,1.739388
4,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1,"(40.768008, -73.968095)","(40.783762, -73.956655)",9,3,1,2010,7,early morning,1.242220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54315950,2014-03-15 03:28:00+00:00,-74.005272,40.740027,-73.963280,40.762555,1,"(40.740027, -74.005272)","(40.762555, -73.96328)",15,3,5,2014,3,midnight,2.693273
54315951,2009-03-24 20:46:20+00:00,-73.957784,40.765530,-73.951640,40.773959,1,"(40.76553, -73.957784)","(40.773959, -73.95164)",24,3,1,2009,20,night,0.665235
54315952,2011-04-02 22:04:24+00:00,-73.970505,40.752325,-73.960537,40.797342,1,"(40.752325, -73.970505)","(40.797342, -73.960537)",2,4,5,2011,22,night,3.153803
54315953,2011-10-26 05:57:51+00:00,-73.980901,40.764629,-73.870605,40.773963,1,"(40.764629, -73.980901)","(40.773963, -73.870605)",26,10,2,2011,5,dawn,5.807441


In [14]:
ret.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54315955 entries, 0 to 54315954
Data columns (total 15 columns):
 #   Column                                            Dtype              
---  ------                                            -----              
 0   pickup_datetime                                   datetime64[ns, UTC]
 1   pickup_longitude                                  float64            
 2   pickup_latitude                                   float64            
 3   dropoff_longitude                                 float64            
 4   dropoff_latitude                                  float64            
 5   passenger_count                                   int64              
 6   pickup_coordinates                                object             
 7   dropoff_coordinates                               object             
 8   pickup_datetime.day                               int64              
 9   pickup_datetime.month                             int64