In [1]:
import pandas as pd
from utils import Timer

def cutomizedCoordinationFix(df):
    """Return a copy of ``df`` with reversed longitude/latitude pairs swapped back.

    A row is treated as reversed when its ``dropoff_latitude`` is smaller than
    its ``dropoff_longitude``. For every such row the dropoff pair is exchanged
    and the pickup pair is exchanged as well: the pickup swap is driven by the
    same dropoff-based flag, not by a separate check on the pickup columns.

    Args:
        df: DataFrame holding ``pickup_longitude``, ``pickup_latitude``,
            ``dropoff_longitude`` and ``dropoff_latitude`` columns.

    Returns:
        A new DataFrame; the temporary ``rev`` helper column used internally
        is removed before returning.
    """
    # ``assign`` works on a new frame; ``rev`` temporarily stores the flag.
    flagged = df.assign(rev=df.dropoff_latitude < df.dropoff_longitude)
    needs_swap = flagged['rev'] == 1

    coordinate_pairs = (
        ('dropoff_longitude', 'dropoff_latitude'),
        ('pickup_longitude', 'pickup_latitude'),
    )
    for lon_col, lat_col in coordinate_pairs:
        # ``.values`` strips the column labels so the assignment is positional;
        # a labelled right-hand side would be re-aligned by column name.
        flagged.loc[needs_swap, [lon_col, lat_col]] = (
            flagged.loc[needs_swap, [lat_col, lon_col]].values
        )

    return flagged.drop(columns=['rev'])

def clean_df(df):
    """Repair reversed coordinates, then keep only rows that pass basic sanity checks.

    A row is kept when all of the following hold:
      * ``fare_amount`` is greater than 0 and at most 500
      * ``passenger_count`` is between 0 and 8 (both ends included)
      * none of the four pickup/dropoff coordinate values equals 0

    Args:
        df: DataFrame with ``fare_amount``, ``passenger_count`` and the four
            pickup/dropoff longitude/latitude columns.

    Returns:
        The filtered DataFrame; surviving rows keep their original index labels.
    """
    # Reverse incorrectly assigned longitude/latitude values before filtering.
    df = cutomizedCoordinationFix(df)

    fare_ok = df.fare_amount.gt(0) & df.fare_amount.le(500)
    passengers_ok = df.passenger_count.between(0, 8)

    coordinate_cols = [
        'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude', 'dropoff_latitude',
    ]
    coords_ok = df[coordinate_cols].ne(0).all(axis=1)

    return df[fare_ok & passengers_ok & coords_ok]

# Only these CSV columns are loaded; ``usecols`` skips everything else in the file.
cols = [
    "fare_amount",
    "pickup_datetime",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "passenger_count",
]

# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
file = "/mnt/DP_disk1/ht/datasets/autofe/nyc_taxi/train.csv"

# Timer is imported from the local ``utils`` module; the recorded output shows
# it reporting "<label> took <seconds> sec" for each timed block.
with Timer("Load train full"):
    train_data = pd.read_csv(file, usecols=cols)

# The raw frame is replaced by its cleaned version under the same name.
with Timer("Data Wrangling for train"):
    train_data = clean_df(train_data)


Load train full took 51.84882614016533 sec
Data Wrangling for train took 6.58887935988605 sec


In [2]:
# Display the cleaned training frame as the cell's output.
train_data

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.841610,40.712278,1
1,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.761270,-73.991242,40.750562,2
3,7.7,2012-04-21 04:30:42 UTC,-73.987130,40.733143,-73.991567,40.758092,1
4,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1
...,...,...,...,...,...,...,...
55423851,14.0,2014-03-15 03:28:00 UTC,-74.005272,40.740027,-73.963280,40.762555,1
55423852,4.2,2009-03-24 20:46:20 UTC,-73.957784,40.765530,-73.951640,40.773959,1
55423853,14.1,2011-04-02 22:04:24 UTC,-73.970505,40.752325,-73.960537,40.797342,1
55423854,28.9,2011-10-26 05:57:51 UTC,-73.980901,40.764629,-73.870605,40.773963,1


In [3]:
from autogluon.tabular import TabularDataset, TabularPredictor

# Model-type keys to train; each one is mapped to an empty settings dict below,
# i.e. no per-model overrides are supplied.
MODEL_KEYS = ("GBM", "RF", "XT", "KNN", "LR")

# Regress on ``fare_amount`` using the remaining columns of ``train_data``.
# NOTE(review): no ``time_limit`` is passed; the recorded run reports a total
# runtime of 11838.16s on the full training set.
model = TabularPredictor(label="fare_amount")
predictor = model.fit(
    train_data,
    hyperparameters={key: {} for key in MODEL_KEYS},
)

No path specified. Models will be saved in: "AutogluonModels/ag-20230628_062921/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230628_062921/"
AutoGluon Version:  0.7.0
Python Version:     3.8.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #74-Ubuntu SMP Wed Feb 22 14:14:39 UTC 2023
Train Data Rows:    54315955
Train Data Columns: 6
Label Column: fare_amount
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (500.0, 0.01, 11.32425, 9.68662)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may s

[1000]	valid_set's rmse: 3.98709
[2000]	valid_set's rmse: 3.92522
[3000]	valid_set's rmse: 3.9037
[4000]	valid_set's rmse: 3.88413
[5000]	valid_set's rmse: 3.87263
[6000]	valid_set's rmse: 3.86414
[7000]	valid_set's rmse: 3.85463
[8000]	valid_set's rmse: 3.85004
[9000]	valid_set's rmse: 3.84715
[10000]	valid_set's rmse: 3.84339


	-3.8434	 = Validation score   (-root_mean_squared_error)
	1424.7s	 = Training   runtime
	6.34s	 = Validation runtime
Fitting model: RandomForest ...
	-3.7168	 = Validation score   (-root_mean_squared_error)
	4721.84s	 = Training   runtime
	0.71s	 = Validation runtime
Fitting model: ExtraTrees ...
	-8.3763	 = Validation score   (-root_mean_squared_error)
	1033.71s	 = Training   runtime
	0.5s	 = Validation runtime
Fitting model: LinearModel ...
	-8.9437	 = Validation score   (-root_mean_squared_error)
	167.24s	 = Training   runtime
	0.48s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-3.6748	 = Validation score   (-root_mean_squared_error)
	3.38s	 = Training   runtime
	0.01s	 = Validation runtime
AutoGluon training complete, total runtime = 11838.16s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20230628_062921/")


In [4]:
# Fetch the predictor summary once, then show the entry for the best model.
info = predictor.info()
best_model_name = info['best_model']
info['model_info'][best_model_name]

{'name': 'WeightedEnsemble_L2',
 'model_type': 'WeightedEnsembleModel',
 'problem_type': 'regression',
 'eval_metric': 'root_mean_squared_error',
 'stopping_metric': 'root_mean_squared_error',
 'fit_time': 3.3817596435546875,
 'num_classes': None,
 'quantile_levels': None,
 'predict_time': 0.0055277347564697266,
 'val_score': -3.6748304739901507,
 'hyperparameters': {'use_orig_features': False,
  'max_base_models': 25,
  'max_base_models_per_type': 5,
  'save_bag_folds': True},
 'hyperparameters_fit': {},
 'hyperparameters_nondefault': ['save_bag_folds'],
 'ag_args_fit': {'max_memory_usage_ratio': 1.0,
  'max_time_limit_ratio': 1.0,
  'max_time_limit': None,
  'min_time_limit': 0,
  'valid_raw_types': None,
  'valid_special_types': None,
  'ignored_type_group_special': None,
  'ignored_type_group_raw': None,
  'get_features_kwargs': None,
  'get_features_kwargs_extra': None,
  'predict_1_batch_size': None,
  'temperature_scalar': None,
  'drop_unique': False},
 'num_features': 2,
 'fea