# prepare

In [1]:
# define function
import numpy as np 
import pandas as pd
import scipy as scipy
import datetime as dt
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
import os
import gc
from utils import Timer


def clean_df(df):
    return df[(df.fare_amount > 0)  & (df.fare_amount <= 500) &
          # (df.passenger_count >= 0) & (df.passenger_count <= 8)  &
           ((df.pickup_longitude != 0) & (df.pickup_latitude != 0) & (df.dropoff_longitude != 0) & (df.dropoff_latitude != 0) )]


# To Compute Haversine distance
def sphere_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return distance along great radius between pickup and dropoff coordinates.
    """
    #Define earth radius (km)
    R_earth = 6371
    #Convert degrees to radians
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(np.radians,
                                                             [pickup_lat, pickup_lon, 
                                                              dropoff_lat, dropoff_lon])
    #Compute distances along lat, lon dimensions
    dlat = dropoff_lat - pickup_lat
    dlon = dropoff_lon - pickup_lon
    
    #Compute haversine distance
    a = np.sin(dlat/2.0)**2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(dlon/2.0)**2
    return 2 * R_earth * np.arcsin(np.sqrt(a))

    
def sphere_dist_bear(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return distance along great radius between pickup and dropoff coordinates.
    """
    #Define earth radius (km)
    R_earth = 6371
    #Convert degrees to radians
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(np.radians,
                                                             [pickup_lat, pickup_lon, 
                                                              dropoff_lat, dropoff_lon])
    #Compute distances along lat, lon dimensions
    dlat = dropoff_lat - pickup_lat
    dlon = pickup_lon - dropoff_lon
    
    #Compute bearing distance
    a = np.arctan2(np.sin(dlon * np.cos(dropoff_lat)),np.cos(pickup_lat) * np.sin(dropoff_lat) - np.sin(pickup_lat) * np.cos(dropoff_lat) * np.cos(dlon))
    return a

def radian_conv(degree):
    """
    Return radian.
    """
    return  np.radians(degree)    

def add_airport_dist(dataset):
    """
    Return minumum distance from pickup or dropoff coordinates to each airport.
    JFK: John F. Kennedy International Airport
    EWR: Newark Liberty International Airport
    LGA: LaGuardia Airport
    SOL: Statue of Liberty 
    NYC: Newyork Central
    """
    jfk_coord = (40.639722, -73.778889)
    ewr_coord = (40.6925, -74.168611)
    lga_coord = (40.77725, -73.872611)
    sol_coord = (40.6892,-74.0445) # Statue of Liberty
    nyc_coord = (40.7141667,-74.0063889) 
     
    pickup_lat = dataset['pickup_latitude']
    dropoff_lat = dataset['dropoff_latitude']
    pickup_lon = dataset['pickup_longitude']
    dropoff_lon = dataset['dropoff_longitude']
    
    pickup_jfk = sphere_dist(pickup_lat, pickup_lon, jfk_coord[0], jfk_coord[1]) 
    dropoff_jfk = sphere_dist(jfk_coord[0], jfk_coord[1], dropoff_lat, dropoff_lon) 
    pickup_ewr = sphere_dist(pickup_lat, pickup_lon, ewr_coord[0], ewr_coord[1])
    dropoff_ewr = sphere_dist(ewr_coord[0], ewr_coord[1], dropoff_lat, dropoff_lon) 
    pickup_lga = sphere_dist(pickup_lat, pickup_lon, lga_coord[0], lga_coord[1]) 
    dropoff_lga = sphere_dist(lga_coord[0], lga_coord[1], dropoff_lat, dropoff_lon)
    pickup_sol = sphere_dist(pickup_lat, pickup_lon, sol_coord[0], sol_coord[1]) 
    dropoff_sol = sphere_dist(sol_coord[0], sol_coord[1], dropoff_lat, dropoff_lon)
    pickup_nyc = sphere_dist(pickup_lat, pickup_lon, nyc_coord[0], nyc_coord[1]) 
    dropoff_nyc = sphere_dist(nyc_coord[0], nyc_coord[1], dropoff_lat, dropoff_lon)
    
    dataset['jfk_dist'] = pickup_jfk + dropoff_jfk
    dataset['ewr_dist'] = pickup_ewr + dropoff_ewr
    dataset['lga_dist'] = pickup_lga + dropoff_lga
    dataset['sol_dist'] = pickup_sol + dropoff_sol
    dataset['nyc_dist'] = pickup_nyc + dropoff_nyc
    
    return dataset
    
def add_datetime_info(dataset):
    #Convert to datetime format
    dataset['pickup_datetime'] = pd.to_datetime(dataset['pickup_datetime'],format="%Y-%m-%d %H:%M:%S UTC")
    
    dataset['hour'] = dataset.pickup_datetime.dt.hour
    dataset['day'] = dataset.pickup_datetime.dt.day
    dataset['month'] = dataset.pickup_datetime.dt.month
    dataset['weekday'] = dataset.pickup_datetime.dt.weekday
    dataset['year'] = dataset.pickup_datetime.dt.year
    
    return dataset



# Menu
* [Full Scale Pipeline](#Full-Scale-Pipeline)
* [Sampled Pipeline](#Sampled-Pipeline)
* [Sampled Pipeline(automl)](#Sampled-Pipeline-automl)

# Full Scale Pipeline

In [1]:
%%time
# === main ===

path = "../data/"
with Timer("read train data"):
    # Reading Data
    train_df =  pd.read_csv(f'{path}/train.csv')
    #train_df.sample(frac=0.1, replace=True, random_state=1)

with Timer("Train data wrangling"):    
    #Drop rows with null values
    train_df = train_df.dropna(how = 'any', axis = 'rows')
    train_df = clean_df(train_df)
    train_df = add_datetime_info(train_df)
    train_df = add_airport_dist(train_df)
    train_df['distance'] = sphere_dist(train_df['pickup_latitude'], train_df['pickup_longitude'], 
                                       train_df['dropoff_latitude'] , train_df['dropoff_longitude']) 

    train_df['bearing'] = sphere_dist_bear(train_df['pickup_latitude'], train_df['pickup_longitude'], 
                                       train_df['dropoff_latitude'] , train_df['dropoff_longitude'])                                    
    train_df['pickup_latitude'] = radian_conv(train_df['pickup_latitude'])
    train_df['pickup_longitude'] = radian_conv(train_df['pickup_longitude'])
    train_df['dropoff_latitude'] = radian_conv(train_df['dropoff_latitude'])
    train_df['dropoff_longitude'] = radian_conv(train_df['dropoff_longitude'])
    train_df.drop(columns=['key', 'pickup_datetime'], inplace=True)

    y = train_df['fare_amount']
    train_df = train_df.drop(columns=['fare_amount'])
    print(train_df.head())

with Timer("Train/Valid split"):
    x_train,x_test,y_train,y_test = train_test_split(train_df,y,random_state=123,test_size=0.10)

del train_df
del y
gc.collect()

params = {
        'boosting_type':'gbdt',
        'objective': 'regression',
        'nthread': 4,
        'num_leaves': 31,
        'learning_rate': 0.05,
        'max_depth': -1,
        'subsample': 0.8,
        'bagging_fraction' : 1,
        'max_bin' : 5000 ,
        'bagging_freq': 20,
        'colsample_bytree': 0.6,
        'metric': 'rmse',
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 10,
        'scale_pos_weight':1,
        'zero_as_missing': True,
        'seed':0,
        'num_rounds':50000
    }

with Timer("Prepare DataLoader"):
    train_set = lgbm.Dataset(x_train, y_train, silent=False,categorical_feature=['year','month','day','weekday'])
    valid_set = lgbm.Dataset(x_test, y_test, silent=False,categorical_feature=['year','month','day','weekday'])
    
with Timer("train lgbm model"):
    model = lgbm.train(params, train_set = train_set, num_boost_round=10000,early_stopping_rounds=500,verbose_eval=500, valid_sets=valid_set)

del x_train
del y_train
del x_test
del y_test
gc.collect()
   
with Timer("read test data"):
    test_df =  pd.read_csv(f'{path}/test.csv')
print(test_df.head())

with Timer("test data wrangling"):
    test_df = add_datetime_info(test_df)
    test_df = add_airport_dist(test_df)
    test_df['distance'] = sphere_dist(test_df['pickup_latitude'], test_df['pickup_longitude'], 
                                       test_df['dropoff_latitude'] , test_df['dropoff_longitude'])

    test_df['bearing'] = sphere_dist_bear(test_df['pickup_latitude'], test_df['pickup_longitude'], 
                                        test_df['dropoff_latitude'] , test_df['dropoff_longitude'])  
    test_df['pickup_latitude'] = radian_conv(test_df['pickup_latitude'])
    test_df['pickup_longitude'] = radian_conv(test_df['pickup_longitude'])
    test_df['dropoff_latitude'] = radian_conv(test_df['dropoff_latitude'])
    test_df['dropoff_longitude'] = radian_conv(test_df['dropoff_longitude'])

    test_key = test_df['key']
    test_df = test_df.drop(columns=['key', 'pickup_datetime'])

with Timer("predict lgbm model"):
    #Predict from test set
    prediction = model.predict(test_df, num_iteration = model.best_iteration)
    
with Timer("save prediction"):
    submission = pd.DataFrame({
            "key": test_key,
            "fare_amount": prediction
    })

    submission.to_csv(f'{path}/taxi_fare_submission.csv',index=False)

read train data took 89.32287148898467 sec
   pickup_longitude  pickup_latitude  dropoff_longitude  dropoff_latitude  \
0         -1.288826         0.710721          -1.288779          0.710563   
1         -1.291824         0.710546          -1.291182          0.711780   
2         -1.291242         0.711418          -1.291391          0.711231   
3         -1.291319         0.710927          -1.291396          0.711363   
4         -1.290987         0.711536          -1.290787          0.711811   

   passenger_count  hour  day  month  weekday  year   jfk_dist   ewr_dist  \
0                1    17   15      6        0  2009  20.265840  55.176046   
1                1    16    5      1        1  2010  44.667679  31.832358   
2                2     0   18      8        3  2011  43.597686  33.712082   
3                1     4   21      4        5  2012  42.642965  32.556289   
4                1     7    9      3        1  2010  43.329953  39.406828   

    lga_dist   sol_dist   nyc_d



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 55093
[LightGBM] [Info] Number of data points in the train set: 48884102, number of used features: 17




[LightGBM] [Info] Start training from score 11.323812
Training until validation scores don't improve for 500 rounds
[500]	valid_0's rmse: 3.63438
[1000]	valid_0's rmse: 3.57345
[1500]	valid_0's rmse: 3.54202
[2000]	valid_0's rmse: 3.52315
[2500]	valid_0's rmse: 3.51118
[3000]	valid_0's rmse: 3.50184
[3500]	valid_0's rmse: 3.4938
[4000]	valid_0's rmse: 3.48643
[4500]	valid_0's rmse: 3.48115
[5000]	valid_0's rmse: 3.47663
[5500]	valid_0's rmse: 3.47138
[6000]	valid_0's rmse: 3.4681
[6500]	valid_0's rmse: 3.46498
[7000]	valid_0's rmse: 3.46262
[7500]	valid_0's rmse: 3.45953
[8000]	valid_0's rmse: 3.45725
[8500]	valid_0's rmse: 3.4553
[9000]	valid_0's rmse: 3.45308
[9500]	valid_0's rmse: 3.45192
[10000]	valid_0's rmse: 3.45018
[10500]	valid_0's rmse: 3.44849
[11000]	valid_0's rmse: 3.44704
[11500]	valid_0's rmse: 3.44548
[12000]	valid_0's rmse: 3.44411
[12500]	valid_0's rmse: 3.44248
[13000]	valid_0's rmse: 3.44126
[13500]	valid_0's rmse: 3.44005
[14000]	valid_0's rmse: 3.43887
[14500]	val

# Sampled Pipeline

In [3]:
%%time
# === main ===

path = "../data/"
with Timer("read train data"):
    # Reading Data
    train_df =  pd.read_csv(f'{path}/train.csv')
    train_df = train_df.sample(frac=0.1, replace=True, random_state=1)

with Timer("Train data wrangling"):    
    #Drop rows with null values
    train_df = train_df.dropna(how = 'any', axis = 'rows')
    train_df = clean_df(train_df)
    train_df = add_datetime_info(train_df)
    train_df = add_airport_dist(train_df)
    train_df['distance'] = sphere_dist(train_df['pickup_latitude'], train_df['pickup_longitude'], 
                                       train_df['dropoff_latitude'] , train_df['dropoff_longitude']) 

    train_df['bearing'] = sphere_dist_bear(train_df['pickup_latitude'], train_df['pickup_longitude'], 
                                       train_df['dropoff_latitude'] , train_df['dropoff_longitude'])                                    
    train_df['pickup_latitude'] = radian_conv(train_df['pickup_latitude'])
    train_df['pickup_longitude'] = radian_conv(train_df['pickup_longitude'])
    train_df['dropoff_latitude'] = radian_conv(train_df['dropoff_latitude'])
    train_df['dropoff_longitude'] = radian_conv(train_df['dropoff_longitude'])
    train_df.drop(columns=['key', 'pickup_datetime'], inplace=True)

    y = train_df['fare_amount']
    train_df = train_df.drop(columns=['fare_amount'])
    print(train_df.head())

with Timer("Train/Valid split"):
    x_train,x_test,y_train,y_test = train_test_split(train_df,y,random_state=123,test_size=0.10)

del train_df
del y
gc.collect()

params = {
        'boosting_type':'gbdt',
        'objective': 'regression',
        'nthread': 4,
        'num_leaves': 31,
        'learning_rate': 0.05,
        'max_depth': -1,
        'subsample': 0.8,
        'bagging_fraction' : 1,
        'max_bin' : 5000 ,
        'bagging_freq': 20,
        'colsample_bytree': 0.6,
        'metric': 'rmse',
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 10,
        'scale_pos_weight':1,
        'zero_as_missing': True,
        'seed':0,
        'num_rounds':50000
    }

with Timer("Prepare DataLoader"):
    train_set = lgbm.Dataset(x_train, y_train, silent=False,categorical_feature=['year','month','day','weekday'])
    valid_set = lgbm.Dataset(x_test, y_test, silent=False,categorical_feature=['year','month','day','weekday'])
    
with Timer("train lgbm model"):
    model = lgbm.train(params, train_set = train_set, num_boost_round=10000,early_stopping_rounds=500,verbose_eval=500, valid_sets=valid_set)

del x_train
del y_train
del x_test
del y_test
gc.collect()
   
with Timer("read test data"):
    test_df =  pd.read_csv(f'{path}/test.csv')
print(test_df.head())

with Timer("test data wrangling"):
    test_df = add_datetime_info(test_df)
    test_df = add_airport_dist(test_df)
    test_df['distance'] = sphere_dist(test_df['pickup_latitude'], test_df['pickup_longitude'], 
                                       test_df['dropoff_latitude'] , test_df['dropoff_longitude'])

    test_df['bearing'] = sphere_dist_bear(test_df['pickup_latitude'], test_df['pickup_longitude'], 
                                        test_df['dropoff_latitude'] , test_df['dropoff_longitude'])  
    test_df['pickup_latitude'] = radian_conv(test_df['pickup_latitude'])
    test_df['pickup_longitude'] = radian_conv(test_df['pickup_longitude'])
    test_df['dropoff_latitude'] = radian_conv(test_df['dropoff_latitude'])
    test_df['dropoff_longitude'] = radian_conv(test_df['dropoff_longitude'])

    test_key = test_df['key']
    test_df = test_df.drop(columns=['key', 'pickup_datetime'])

with Timer("predict lgbm model"):
    #Predict from test set
    prediction = model.predict(test_df, num_iteration = model.best_iteration)
    
with Timer("save prediction"):
    submission = pd.DataFrame({
            "key": test_key,
            "fare_amount": prediction
    })

    submission.to_csv(f'{path}/taxi_fare_submission.csv',index=False)

read train data took 93.80535237956792 sec
          pickup_longitude  pickup_latitude  dropoff_longitude  \
46265381         -1.291350         0.710727          -1.291295   
55017707         -1.291284         0.711086          -1.291520   
6762380          -1.291555         0.711088          -1.291458   
45880392         -1.291709         0.710533          -1.291314   
491263           -1.291047         0.711509          -1.291177   

          dropoff_latitude  passenger_count  hour  day  month  weekday  year  \
46265381          0.711553                1    18    2      3        5  2013   
55017707          0.711466                3    23   12      6        4  2009   
6762380           0.711222                6    11   27      6        5  2015   
45880392          0.711286                1    15    8      6        0  2009   
491263            0.711776                1    18   19      9        4  2014   

           jfk_dist   ewr_dist   lga_dist   sol_dist   nyc_dist  distance  \
46



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55092
[LightGBM] [Info] Number of data points in the train set: 4888315, number of used features: 17




[LightGBM] [Info] Start training from score 11.325111
Training until validation scores don't improve for 500 rounds
[500]	valid_0's rmse: 3.6887
[1000]	valid_0's rmse: 3.63159
[1500]	valid_0's rmse: 3.60437
[2000]	valid_0's rmse: 3.58597
[2500]	valid_0's rmse: 3.5734
[3000]	valid_0's rmse: 3.56274
[3500]	valid_0's rmse: 3.55445
[4000]	valid_0's rmse: 3.54866
[4500]	valid_0's rmse: 3.54413
[5000]	valid_0's rmse: 3.53884
[5500]	valid_0's rmse: 3.53441
[6000]	valid_0's rmse: 3.52975
[6500]	valid_0's rmse: 3.52494
[7000]	valid_0's rmse: 3.52273
[7500]	valid_0's rmse: 3.51989
[8000]	valid_0's rmse: 3.51725
[8500]	valid_0's rmse: 3.51448
[9000]	valid_0's rmse: 3.51071
[9500]	valid_0's rmse: 3.50907
[10000]	valid_0's rmse: 3.50696
[10500]	valid_0's rmse: 3.50558
[11000]	valid_0's rmse: 3.5041
[11500]	valid_0's rmse: 3.50279
[12000]	valid_0's rmse: 3.50129
[12500]	valid_0's rmse: 3.49986
[13000]	valid_0's rmse: 3.49898
[13500]	valid_0's rmse: 3.49791
[14000]	valid_0's rmse: 3.49664
[14500]	val















Early stopping, best iteration is:
[36373]	valid_0's rmse: 3.4731
train lgbm model took 2274.639255899936 sec
read test data took 0.022678366862237453 sec
                           key          pickup_datetime  pickup_longitude  \
0  2015-01-27 13:08:24.0000002  2015-01-27 13:08:24 UTC        -73.973320   
1  2015-01-27 13:08:24.0000003  2015-01-27 13:08:24 UTC        -73.986862   
2  2011-10-08 11:53:44.0000002  2011-10-08 11:53:44 UTC        -73.982524   
3  2012-12-01 21:12:12.0000002  2012-12-01 21:12:12 UTC        -73.981160   
4  2012-12-01 21:12:12.0000003  2012-12-01 21:12:12 UTC        -73.966046   

   pickup_latitude  dropoff_longitude  dropoff_latitude  passenger_count  
0        40.763805         -73.981430         40.743835                1  
1        40.719383         -73.998886         40.739201                1  
2        40.751260         -73.979654         40.746139                1  
3        40.767807         -73.990448         40.751635                1  
4      

# Sampled Pipeline automl

In [2]:
%%time
# === main ===
path = "../data/"
with Timer("read train data"):
    # Reading Data
    train_df =  pd.read_csv(f'{path}/train.csv')
    train_df = train_df.sample(frac=0.1, replace=True, random_state=1)

with Timer("Train data wrangling"):    
    #Drop rows with null values
    train_df = train_df.dropna(how = 'any', axis = 'rows')
    train_df = clean_df(train_df)
    train_df = add_datetime_info(train_df)
    train_df = add_airport_dist(train_df)
    train_df['distance'] = sphere_dist(train_df['pickup_latitude'], train_df['pickup_longitude'], 
                                       train_df['dropoff_latitude'] , train_df['dropoff_longitude']) 

    train_df['bearing'] = sphere_dist_bear(train_df['pickup_latitude'], train_df['pickup_longitude'], 
                                       train_df['dropoff_latitude'] , train_df['dropoff_longitude'])                                    
    train_df['pickup_latitude'] = radian_conv(train_df['pickup_latitude'])
    train_df['pickup_longitude'] = radian_conv(train_df['pickup_longitude'])
    train_df['dropoff_latitude'] = radian_conv(train_df['dropoff_latitude'])
    train_df['dropoff_longitude'] = radian_conv(train_df['dropoff_longitude'])
    train_df.drop(columns=['key', 'pickup_datetime'], inplace=True)

    y = train_df['fare_amount']
    train_df = train_df.drop(columns=['fare_amount'])
    print(train_df.head())

with Timer("Train/Valid split"):
    x_train,x_test,y_train,y_test = train_test_split(train_df,y,random_state=123,test_size=0.10)

del train_df
del y
gc.collect()

# looking for right ml pipeline
import evalml
from evalml import AutoMLSearch

automl = AutoMLSearch(X_train=x_train,
                      y_train=y_train,
                      problem_type="regression",
                      objective="root mean squared error")
with Timer("find best ml pipeline"):
    automl.search()
    
automl.describe_pipeline(automl.rankings.iloc[1]["id"])

with Timer("train ml pipeline"):
    pipeline = automl.best_pipeline
    print
    pipeline.fit(x_train, y_train)

read train data took 87.87712919898331 sec
          pickup_longitude  pickup_latitude  dropoff_longitude  \
46265381         -1.291350         0.710727          -1.291295   
55017707         -1.291284         0.711086          -1.291520   
6762380          -1.291555         0.711088          -1.291458   
45880392         -1.291709         0.710533          -1.291314   
491263           -1.291047         0.711509          -1.291177   

          dropoff_latitude  passenger_count  hour  day  month  weekday  year  \
46265381          0.711553                1    18    2      3        5  2013   
55017707          0.711466                3    23   12      6        4  2009   
6762380           0.711222                6    11   27      6        5  2015   
45880392          0.711286                1    15    8      6        0  2009   
491263            0.711776                1    18   19      9        4  2014   

           jfk_dist   ewr_dist   lga_dist   sol_dist   nyc_dist  distance  \
46



Train/Valid split took 1.9631664017215371 sec


pandas.core.index is deprecated and will be removed in a future version. The public classes are available in the top-level namespace.
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.235e+07, tolerance: 1.136e+04
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.239e+07, tolerance: 1.136e+04


find best ml pipeline took 306.7217131811194 sec

***************************************************************************


INFO:evalml.pipelines.pipeline_base.describe:
***************************************************************************


* Random Forest Regressor w/ Replace Nullable Types Transformer + Imputer *


INFO:evalml.pipelines.pipeline_base.describe:* Random Forest Regressor w/ Replace Nullable Types Transformer + Imputer *


***************************************************************************


INFO:evalml.pipelines.pipeline_base.describe:***************************************************************************





INFO:evalml.pipelines.pipeline_base.describe:


Problem Type: regression


INFO:evalml.pipelines.pipeline_base.describe:Problem Type: regression


Model Family: Random Forest


INFO:evalml.pipelines.pipeline_base.describe:Model Family: Random Forest





INFO:evalml.pipelines.pipeline_base.describe:


Pipeline Steps


INFO:evalml.pipelines.pipeline_base.describe:Pipeline Steps






1. Replace Nullable Types Transformer


INFO:evalml.pipelines.component_graph.describe:1. Replace Nullable Types Transformer


2. Imputer


INFO:evalml.pipelines.component_graph.describe:2. Imputer


	 * categorical_impute_strategy : most_frequent


INFO:evalml.pipelines.components.component_base.describe:	 * categorical_impute_strategy : most_frequent


	 * numeric_impute_strategy : mean


INFO:evalml.pipelines.components.component_base.describe:	 * numeric_impute_strategy : mean


	 * boolean_impute_strategy : most_frequent


INFO:evalml.pipelines.components.component_base.describe:	 * boolean_impute_strategy : most_frequent


	 * categorical_fill_value : None


INFO:evalml.pipelines.components.component_base.describe:	 * categorical_fill_value : None


	 * numeric_fill_value : None


INFO:evalml.pipelines.components.component_base.describe:	 * numeric_fill_value : None


	 * boolean_fill_value : None


INFO:evalml.pipelines.components.component_base.describe:	 * boolean_fill_value : None


3. Random Forest Regressor


INFO:evalml.pipelines.component_graph.describe:3. Random Forest Regressor


	 * n_estimators : 100


INFO:evalml.pipelines.components.component_base.describe:	 * n_estimators : 100


	 * max_depth : 6


INFO:evalml.pipelines.components.component_base.describe:	 * max_depth : 6


	 * n_jobs : -1


INFO:evalml.pipelines.components.component_base.describe:	 * n_jobs : -1





INFO:evalml.automl.automl_search.describe_pipeline:


Training


INFO:evalml.automl.automl_search.describe_pipeline:Training






Training for regression problems.


INFO:evalml.automl.automl_search.describe_pipeline:Training for regression problems.


Total training time (including CV): 38.5 seconds


INFO:evalml.automl.automl_search.describe_pipeline:Total training time (including CV): 38.5 seconds





INFO:evalml.automl.automl_search.describe_pipeline:


Cross Validation


INFO:evalml.automl.automl_search.describe_pipeline:Cross Validation


----------------


INFO:evalml.automl.automl_search.describe_pipeline:----------------


            Root Mean Squared Error ExpVariance MaxError MedianAE    MSE   MAE    R2 # Training # Validation
0                             4.294       0.803  450.959    1.343 18.440 2.134 0.803  1,222,078    3,666,237
mean                          4.294       0.803  450.959    1.343 18.440 2.134 0.803          -            -
std                               -           -        -        -      -     -     -          -            -
coef of var                       -           -        -        -      -     -     -          -            -


INFO:evalml.automl.automl_search.describe_pipeline:            Root Mean Squared Error ExpVariance MaxError MedianAE    MSE   MAE    R2 # Training # Validation
0                             4.294       0.803  450.959    1.343 18.440 2.134 0.803  1,222,078    3,666,237
mean                          4.294       0.803  450.959    1.343 18.440 2.134 0.803          -            -
std                               -           -        -        -      -     -     -          -            -
coef of var                       -           -        -        -      -     -     -          -            -


train ml pipeline took 11.103634589817375 sec
CPU times: user 1h 22min 3s, sys: 4min 32s, total: 1h 26min 36s
Wall time: 7min 14s
