# Model Selection

## Selecting Models

Open questions to analyze:
- Missing values: should they be imputed or dropped? The regression input should not contain missing values.
- Restricting trips to the NYC boundary area: does it improve the results?

In [1]:
from playground import *

In [2]:
# Load the dataset (presumably the output of the EDA stage, judging by the file name);
# the path is relative to the notebook's working directory.
df = pd.read_csv("csv_ml/eda_01.csv")

In [3]:
# Inspect the schema and null counts. In the recorded run only distance_osrm has
# missing values (1458627 non-null out of 1458644 rows).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 14 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   id                       1458644 non-null  object 
 1   vendor_id                1458644 non-null  int64  
 2   pickup_datetime          1458644 non-null  object 
 3   dropoff_datetime         1458644 non-null  object 
 4   passenger_count          1458644 non-null  int64  
 5   pickup_longitude         1458644 non-null  float64
 6   pickup_latitude          1458644 non-null  float64
 7   dropoff_longitude        1458644 non-null  float64
 8   dropoff_latitude         1458644 non-null  float64
 9   store_and_fwd_flag       1458644 non-null  object 
 10  trip_duration            1458644 non-null  int64  
 11  distance_osrm            1458627 non-null  float64
 12  pickup_dist_NYC_center   1458644 non-null  float64
 13  dropoff_dist_NYC_center  1458644 non-null 

In [4]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,distance_osrm,pickup_dist_NYC_center,dropoff_dist_NYC_center
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,2.1587,6.445619,6.825864
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,2.5113,3.585708,2.111028
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,9.9354,6.118901,0.306521
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,1.7776,0.866388,0.85828
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,1.6064,9.352795,8.230915


In [5]:
# Separate the predictors from the target column. `columns=` already selects the
# axis, so the redundant `axis=1` argument is omitted.
X = df.drop(columns="trip_duration")
y = df["trip_duration"]

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [7]:
%%time
# De-duplicate the training split only; the test split is not passed through this step.
rm_duplicates = RemoveDuplicates()
X_train, y_train = rm_duplicates.transform(X_train, y_train)

>>>> Starting the process of removing duplicates ...
No duplicates found.
CPU times: total: 5.88 s
Wall time: 6.38 s


In [8]:
%%time
# Apply the same dtype conversion to both splits. Per the log of the recorded run this
# casts vendor_id to object and the pickup/dropoff timestamps to datetime64[ns].
to_dtypes = ToDataTypes()
X_train, y_train = to_dtypes.transform(X_train, y_train)
X_test, y_test = to_dtypes.transform(X_test, y_test)

>>>> Starting data type conversion process...
Column 'vendor_id' changed from int64 to object
Column 'pickup_datetime' changed from object to datetime64[ns]
Column 'dropoff_datetime' changed from object to datetime64[ns]
>>>> Starting data type conversion process...
Column 'vendor_id' changed from int64 to object
Column 'pickup_datetime' changed from object to datetime64[ns]
Column 'dropoff_datetime' changed from object to datetime64[ns]
CPU times: total: 5.22 s
Wall time: 5.39 s


In [9]:
%%time
# Derive pickup_month, pickup_day, pickup_day_of_week and pickup_hour on both splits;
# as reported in the log, the raw pickup/dropoff datetime columns are dropped afterwards.
datetime_break = DateTimeBreak()
X_train = datetime_break.transform(X_train)
X_test = datetime_break.transform(X_test)

>>>> Starting datetime feature extraction...
Extracted features: ['pickup_month', 'pickup_day', 'pickup_day_of_week', 'pickup_hour']
Dropped columns: ['pickup_datetime', 'dropoff_datetime']
>>>> Starting datetime feature extraction...
Extracted features: ['pickup_month', 'pickup_day', 'pickup_day_of_week', 'pickup_hour']
Dropped columns: ['pickup_datetime', 'dropoff_datetime']
CPU times: total: 5.44 s
Wall time: 5.61 s


In [10]:
%time
miss_val_input = MissValInput()
X_train, y_train = miss_val_input.fit_transform(X_train, y_train)
X_test, y_test = miss_val_input.transform(X_test, y_test)

CPU times: total: 0 ns
Wall time: 0 ns
>>>> Starting missing value imputation...
Dropped 16 entries (0.00%) from column 'distance_osrm' due to missing values.
Initial data length: 1166915
Removed data: 16 (0.00%)
Final data length: 1166899
>>>> Starting missing value imputation...
Dropped 1 entries (0.00%) from column 'distance_osrm' due to missing values.
Initial data length: 291729
Removed data: 1 (0.00%)
Final data length: 291728


In [11]:
%%time
# Restrict value ranges on the training split only (the test split is not filtered).
# The recorded log shows the resulting bounds for trip_duration, distance_osrm and
# passenger_count.
# NOTE(review): the log also reports that the speed_osrm restriction is skipped because
# the column is absent — confirm this is intended.
feature_restriction = FeatureRestriction()
X_train, y_train = feature_restriction.transform(X_train, y_train)

>>>> Starting features restriction ...
The dataset size: 1166899 rows
trip_duration (old) -> [min, max]: [1, 3526282]
trip_duration (new) -> [min, max]: [60, 86392]
distance_osrm (old) -> [min, max]: [0.0, 765.6445]
distance_osrm (new) -> [min, max]: [0.1001, 97.7243]
speed_osrm column not found, skipping restriction on 'speed_osrm'.
passenger_count (old) -> [min, max]: [0, 8]
passenger_count (new) -> [min, max]: [1, 8]
Total removed data: 11632 (1.00%)
CPU times: total: 2.02 s
Wall time: 2.07 s


In [12]:
# %%time
# outlier_mapper = OutlierMapper(map_title="outliers_map_ml_baseline", csv_dir="csv_ml_baseline", html_dir="html_ml_baseline")
# X_train, y_train = outlier_mapper.transform(X_train, y_train)
# X_test.drop(columns=['pickup_dist_NYC_center', 'dropoff_dist_NYC_center'], inplace=True)
# NOTE(review): disabled for the "without restricting boundary area" run. Presumably this
# cell is enabled (and the plain column drop in the next cell skipped) to reproduce the
# "with restricting boundary area" results — confirm.

In [13]:
# without restricting boundary area
# Remove the distance-to-center columns from both splits. The frames are reassigned
# instead of mutated with inplace=True, and the column list is written once so the two
# splits cannot drift apart.
nyc_center_dist_cols = ['pickup_dist_NYC_center', 'dropoff_dist_NYC_center']
X_train = X_train.drop(columns=nyc_center_dist_cols)
X_test = X_test.drop(columns=nyc_center_dist_cols)

In [14]:
%%time
# Encode both splits with the same encoder instance. Per the log: 'id' is dropped,
# vendor_id and store_and_fwd_flag are dummy-encoded, and the pickup_* calendar features
# are replaced by cyclical (sin/cos) columns.
# NOTE(review): transform() is called on each split independently with no fit step; if the
# dummy columns are derived from the values present in each frame, the two splits could
# end up with different columns — confirm.
feature_encoding = FeatureEncoding()
X_train = feature_encoding.transform(X_train)
X_test = feature_encoding.transform(X_test)

>>>> Starting to encode the features ...
Starting transformations...
Dropping 'id' column...
Performing dummy encoding on 'vendor_id' column...
Performing dummy encoding on 'store_and_fwd_flag' column...
Performing cyclical encoding on 'pickup_month' column...
Cyclical encoding for 'pickup_month' completed.
Performing cyclical encoding on 'pickup_day' column...
Cyclical encoding for 'pickup_day' completed.
Performing cyclical encoding on 'pickup_day_of_week' column...
Cyclical encoding for 'pickup_day_of_week' completed.
Performing cyclical encoding on 'pickup_hour' column...
Cyclical encoding for 'pickup_hour' completed.
Dropping original columns after encoding...
Transformation completed.
>>>> Starting to encode the features ...
Starting transformations...
Dropping 'id' column...
Performing dummy encoding on 'vendor_id' column...
Performing dummy encoding on 'store_and_fwd_flag' column...
Performing cyclical encoding on 'pickup_month' column...
Cyclical encoding for 'pickup_month' co

In [15]:
# Sanity-check the encoded training features (16 columns, 1155267 rows in the recorded run).
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1155267 entries, 1053743 to 121958
Data columns (total 16 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   passenger_count         1155267 non-null  int64  
 1   pickup_longitude        1155267 non-null  float64
 2   pickup_latitude         1155267 non-null  float64
 3   dropoff_longitude       1155267 non-null  float64
 4   dropoff_latitude        1155267 non-null  float64
 5   distance_osrm           1155267 non-null  float64
 6   vendor_id_2             1155267 non-null  uint8  
 7   store_and_fwd_flag_Y    1155267 non-null  uint8  
 8   pickup_month_sin        1155267 non-null  float64
 9   pickup_month_cos        1155267 non-null  float64
 10  pickup_day_sin          1155267 non-null  float64
 11  pickup_day_cos          1155267 non-null  float64
 12  pickup_day_of_week_sin  1155267 non-null  float64
 13  pickup_day_of_week_cos  1155267 non-null  float64
 1

In [16]:
# Sanity-check the encoded test features (16 columns, 291728 rows in the recorded run);
# the column set should match the training frame.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 291728 entries, 67250 to 589044
Data columns (total 16 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   passenger_count         291728 non-null  int64  
 1   pickup_longitude        291728 non-null  float64
 2   pickup_latitude         291728 non-null  float64
 3   dropoff_longitude       291728 non-null  float64
 4   dropoff_latitude        291728 non-null  float64
 5   distance_osrm           291728 non-null  float64
 6   vendor_id_2             291728 non-null  uint8  
 7   store_and_fwd_flag_Y    291728 non-null  uint8  
 8   pickup_month_sin        291728 non-null  float64
 9   pickup_month_cos        291728 non-null  float64
 10  pickup_day_sin          291728 non-null  float64
 11  pickup_day_cos          291728 non-null  float64
 12  pickup_day_of_week_sin  291728 non-null  float64
 13  pickup_day_of_week_cos  291728 non-null  float64
 14  pickup_hour_sin 

In [17]:
# Candidate regressors, as (label, estimator) pairs, compared on identical splits.
# Seeds are pinned to the same value used for the train/test split so that repeated runs
# give comparable scores; this assumes the names are the scikit-learn / xgboost / lightgbm
# estimators, all of which accept `random_state`.
models = [
    ('AB', AdaBoostRegressor(random_state=42)),
    ('XGBR', XGBRegressor(random_state=42)),
    ('LGBM', LGBMRegressor(verbose=0, random_state=42))
]

In [18]:
# Fit every candidate on the training split; the printed report gives RMSLE and fit time
# per model (presumably scored on the test split — confirm in evaluate_models).
result = evaluate_models(models, X_train, y_train, X_test, y_test)

AB - RMSLE: 0.6384, Fit time: 28.4388 seconds
XGBR - RMSLE: 0.5942, Fit time: 11.3917 seconds
LGBM - RMSLE: 0.5452, Fit time: 11.8531 seconds


## Result

### With Restricting Boundary Area

In [26]:
# NOTE(review): this cell prints the same `result` variable as the "without restricting"
# cell but shows different numbers, and its execution count (26) is out of order. The
# output presumably comes from a separate run with the boundary restriction enabled and
# will not be reproduced by Restart & Run All.
print(result)

AB - RMSLE: 0.5954, Fit time: 38.6425 seconds
XGBR - RMSLE: 0.5948, Fit time: 11.9088 seconds
LGBM - RMSLE: 0.5461, Fit time: 11.8881 seconds


### Without Restricting Boundary Area

In [19]:
# Print the report from the model evaluation run without the boundary restriction
# (the figures match the output of the evaluate_models cell).
print(result)

AB - RMSLE: 0.6384, Fit time: 28.4388 seconds
XGBR - RMSLE: 0.5942, Fit time: 11.3917 seconds
LGBM - RMSLE: 0.5452, Fit time: 11.8531 seconds


## Final Model

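LGBM (LightGBM) is selected as the final model: it has the lowest RMSLE both with (0.5461) and without (0.5452) the boundary restriction, with a fit time comparable to XGBR.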

In [1]:
# NOTE(review): the star import appears to supply every name used in this section
# (pd, train_test_split, the transformer classes, the regressors, ...), which hides
# where each one comes from.
from playground import *
# Reload the dataset so this section starts again from the raw frame, then inspect
# its schema and null counts.
df = pd.read_csv("csv_ml/eda_01.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 14 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   id                       1458644 non-null  object 
 1   vendor_id                1458644 non-null  int64  
 2   pickup_datetime          1458644 non-null  object 
 3   dropoff_datetime         1458644 non-null  object 
 4   passenger_count          1458644 non-null  int64  
 5   pickup_longitude         1458644 non-null  float64
 6   pickup_latitude          1458644 non-null  float64
 7   dropoff_longitude        1458644 non-null  float64
 8   dropoff_latitude         1458644 non-null  float64
 9   store_and_fwd_flag       1458644 non-null  object 
 10  trip_duration            1458644 non-null  int64  
 11  distance_osrm            1458627 non-null  float64
 12  pickup_dist_NYC_center   1458644 non-null  float64
 13  dropoff_dist_NYC_center  1458644 non-null 

In [2]:
# Separate the predictors from the target column. `columns=` already selects the
# axis, so the redundant `axis=1` argument is omitted.
X = df.drop(columns="trip_duration")
y = df["trip_duration"]

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [4]:
%%time
# De-duplicate the training split only; the test split is not passed through this step.
rm_duplicates = RemoveDuplicates()
X_train, y_train = rm_duplicates.transform(X_train, y_train)

>>>> Starting the process of removing duplicates ...
No duplicates found.
CPU times: total: 5.41 s
Wall time: 5.49 s


In [5]:
%%time
# Apply the same dtype conversion to both splits. Per the log of the recorded run this
# casts vendor_id to object and the pickup/dropoff timestamps to datetime64[ns].
to_dtypes = ToDataTypes()
X_train, y_train = to_dtypes.transform(X_train, y_train)
X_test, y_test = to_dtypes.transform(X_test, y_test)

>>>> Starting data type conversion process...
Column 'vendor_id' changed from int64 to object
Column 'pickup_datetime' changed from object to datetime64[ns]
Column 'dropoff_datetime' changed from object to datetime64[ns]
>>>> Starting data type conversion process...
Column 'vendor_id' changed from int64 to object
Column 'pickup_datetime' changed from object to datetime64[ns]
Column 'dropoff_datetime' changed from object to datetime64[ns]
CPU times: total: 5.17 s
Wall time: 5.26 s


In [6]:
%%time
# Derive pickup_month, pickup_day, pickup_day_of_week and pickup_hour on both splits;
# as reported in the log, the raw pickup/dropoff datetime columns are dropped afterwards.
datetime_break = DateTimeBreak()
X_train = datetime_break.transform(X_train)
X_test = datetime_break.transform(X_test)

>>>> Starting datetime feature extraction...
Extracted features: ['pickup_month', 'pickup_day', 'pickup_day_of_week', 'pickup_hour']
Dropped columns: ['pickup_datetime', 'dropoff_datetime']
>>>> Starting datetime feature extraction...
Extracted features: ['pickup_month', 'pickup_day', 'pickup_day_of_week', 'pickup_hour']
Dropped columns: ['pickup_datetime', 'dropoff_datetime']
CPU times: total: 5.67 s
Wall time: 5.72 s


In [7]:
%time
miss_val_input = MissValInput()
X_train, y_train = miss_val_input.fit_transform(X_train, y_train)
X_test, y_test = miss_val_input.transform(X_test, y_test)

CPU times: total: 0 ns
Wall time: 0 ns
>>>> Starting missing value imputation...
Dropped 16 entries (0.00%) from column 'distance_osrm' due to missing values.
Initial data length: 1166915
Removed data: 16 (0.00%)
Final data length: 1166899
>>>> Starting missing value imputation...
Dropped 1 entries (0.00%) from column 'distance_osrm' due to missing values.
Initial data length: 291729
Removed data: 1 (0.00%)
Final data length: 291728


In [8]:
%%time
# Restrict value ranges on the training split only (the test split is not filtered).
# The recorded log shows the resulting bounds for trip_duration, distance_osrm and
# passenger_count.
# NOTE(review): the log also reports that the speed_osrm restriction is skipped because
# the column is absent — confirm this is intended.
feature_restriction = FeatureRestriction()
X_train, y_train = feature_restriction.transform(X_train, y_train)

>>>> Starting features restriction ...
The dataset size: 1166899 rows
trip_duration (old) -> [min, max]: [1, 3526282]
trip_duration (new) -> [min, max]: [60, 86392]
distance_osrm (old) -> [min, max]: [0.0, 765.6445]
distance_osrm (new) -> [min, max]: [0.1001, 97.7243]
speed_osrm column not found, skipping restriction on 'speed_osrm'.
passenger_count (old) -> [min, max]: [0, 8]
passenger_count (new) -> [min, max]: [1, 8]
Total removed data: 11632 (1.00%)
CPU times: total: 1.91 s
Wall time: 1.94 s


In [9]:
%%time
outlier_mapper = OutlierMapper(map_title="outliers_map_ml_baseline", csv_dir="csv_ml_baseline", html_dir="html_ml_baseline")
X_train, y_train = outlier_mapper.transform(X_train, y_train)
X_test.drop(columns=['pickup_dist_NYC_center', 'dropoff_dist_NYC_center'], inplace=True)

>>>> Starting New York City map restriction ...
Outliers saved to 'csv_ml_baseline\outliers_map_ml_baseline.csv'
Map saved as 'html_ml_baseline/outliers_map_ml_baseline.html'
Removed 71 (0.01%) records outside NYC boundaries.
CPU times: total: 2.45 s
Wall time: 2.51 s


In [10]:
%%time
# Encode both splits with the same encoder instance. Per the log: 'id' is dropped,
# vendor_id and store_and_fwd_flag are dummy-encoded, and the pickup_* calendar features
# are replaced by cyclical (sin/cos) columns.
# NOTE(review): transform() is called on each split independently with no fit step; if the
# dummy columns are derived from the values present in each frame, the two splits could
# end up with different columns — confirm.
feature_encoding = FeatureEncoding()
X_train = feature_encoding.transform(X_train)
X_test = feature_encoding.transform(X_test)

>>>> Starting to encode the features ...
Starting transformations...
Dropping 'id' column...
Performing dummy encoding on 'vendor_id' column...
Performing dummy encoding on 'store_and_fwd_flag' column...
Performing cyclical encoding on 'pickup_month' column...
Cyclical encoding for 'pickup_month' completed.
Performing cyclical encoding on 'pickup_day' column...
Cyclical encoding for 'pickup_day' completed.
Performing cyclical encoding on 'pickup_day_of_week' column...
Cyclical encoding for 'pickup_day_of_week' completed.
Performing cyclical encoding on 'pickup_hour' column...
Cyclical encoding for 'pickup_hour' completed.
Dropping original columns after encoding...
Transformation completed.
>>>> Starting to encode the features ...
Starting transformations...
Dropping 'id' column...
Performing dummy encoding on 'vendor_id' column...
Performing dummy encoding on 'store_and_fwd_flag' column...
Performing cyclical encoding on 'pickup_month' column...
Cyclical encoding for 'pickup_month' co

In [11]:
# Sanity-check the encoded training features (16 columns, 1155196 rows in the recorded run).
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1155196 entries, 1053743 to 121958
Data columns (total 16 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   passenger_count         1155196 non-null  int64  
 1   pickup_longitude        1155196 non-null  float64
 2   pickup_latitude         1155196 non-null  float64
 3   dropoff_longitude       1155196 non-null  float64
 4   dropoff_latitude        1155196 non-null  float64
 5   distance_osrm           1155196 non-null  float64
 6   vendor_id_2             1155196 non-null  uint8  
 7   store_and_fwd_flag_Y    1155196 non-null  uint8  
 8   pickup_month_sin        1155196 non-null  float64
 9   pickup_month_cos        1155196 non-null  float64
 10  pickup_day_sin          1155196 non-null  float64
 11  pickup_day_cos          1155196 non-null  float64
 12  pickup_day_of_week_sin  1155196 non-null  float64
 13  pickup_day_of_week_cos  1155196 non-null  float64
 1

In [12]:
# Sanity-check the encoded test features (16 columns, 291728 rows in the recorded run);
# the column set should match the training frame.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 291728 entries, 67250 to 589044
Data columns (total 16 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   passenger_count         291728 non-null  int64  
 1   pickup_longitude        291728 non-null  float64
 2   pickup_latitude         291728 non-null  float64
 3   dropoff_longitude       291728 non-null  float64
 4   dropoff_latitude        291728 non-null  float64
 5   distance_osrm           291728 non-null  float64
 6   vendor_id_2             291728 non-null  uint8  
 7   store_and_fwd_flag_Y    291728 non-null  uint8  
 8   pickup_month_sin        291728 non-null  float64
 9   pickup_month_cos        291728 non-null  float64
 10  pickup_day_sin          291728 non-null  float64
 11  pickup_day_cos          291728 non-null  float64
 12  pickup_day_of_week_sin  291728 non-null  float64
 13  pickup_day_of_week_cos  291728 non-null  float64
 14  pickup_hour_sin 

In [13]:
# Only the selected final model (LGBM) is evaluated from here on; the list shape
# (label, estimator) is kept so evaluate_models can be reused unchanged.
models = [("LGBM", LGBMRegressor(verbose=0))]

In [14]:
# Candidate feature counts: every k from 1 up to the number of available columns.
k_range = range(1, len(X_train.columns) + 1)
results = dict()  # k -> report returned by evaluate_models

In [15]:
for k in k_range:
    print(f"Evaluating for k={k}...")

    # Keep the k columns ranked highest by f_regression. The ranking is learned from
    # the training split only and then applied unchanged to the test split.
    selector = SelectKBest(f_regression, k=k)
    selector.fit(X_train, y_train)
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)

    # Fit and score the model list on the reduced feature set and file the returned
    # report under its k.
    # NOTE(review): if evaluate_models scores on the test split, choosing k from these
    # numbers tunes on test data — consider a validation split or cross-validation.
    stage_results = evaluate_models(models, X_train_selected, y_train, X_test_selected, y_test)
    results[k] = stage_results

Evaluating for k=1...
LGBM - RMSLE: 0.5918, Fit time: 5.6630 seconds
Evaluating for k=2...
LGBM - RMSLE: 0.5847, Fit time: 3.4909 seconds
Evaluating for k=3...
LGBM - RMSLE: 0.5846, Fit time: 4.1136 seconds
Evaluating for k=4...
LGBM - RMSLE: 0.5789, Fit time: 4.2470 seconds
Evaluating for k=5...
LGBM - RMSLE: 0.5724, Fit time: 5.2788 seconds
Evaluating for k=6...
LGBM - RMSLE: 0.5805, Fit time: 5.8019 seconds
Evaluating for k=7...
LGBM - RMSLE: 0.5663, Fit time: 6.6725 seconds
Evaluating for k=8...
LGBM - RMSLE: 0.5637, Fit time: 7.2219 seconds
Evaluating for k=9...
LGBM - RMSLE: 0.5602, Fit time: 7.6097 seconds
Evaluating for k=10...
LGBM - RMSLE: 0.5518, Fit time: 7.7974 seconds
Evaluating for k=11...
LGBM - RMSLE: 0.5528, Fit time: 8.7655 seconds
Evaluating for k=12...
LGBM - RMSLE: 0.5533, Fit time: 8.8822 seconds
Evaluating for k=13...
LGBM - RMSLE: 0.5516, Fit time: 10.3007 seconds
Evaluating for k=14...
LGBM - RMSLE: 0.5516, Fit time: 9.4740 seconds
Evaluating for k=15...
LGBM 

In [16]:
# Parse the report strings stored per k into {k: {model_name: rmsle}}.
# Each report line is expected to look like
#   "<model> - RMSLE: <value>, Fit time: <seconds> seconds"
parsed_results = {}

for k, metrics_str in results.items():
    print(f"Results for k={k}:")
    metrics = {}
    # splitlines() plus the blank-line guard tolerate a trailing newline or empty line
    # in the report, which would otherwise make the unpacking below raise ValueError.
    for line in metrics_str.splitlines():
        if not line.strip():
            continue
        # Split on the last " - " so a model label that itself contains " - " stays intact.
        model_name, metric_str = line.rsplit(" - ", 1)
        # The RMSLE value is the text between the first ": " and the first comma.
        rmsle_value = float(metric_str.split(": ")[1].split(",")[0])
        metrics[model_name] = rmsle_value
        print(f"{model_name}: RMSLE = {rmsle_value}")
    parsed_results[k] = metrics

Results for k=1:
LGBM: RMSLE = 0.5918
Results for k=2:
LGBM: RMSLE = 0.5847
Results for k=3:
LGBM: RMSLE = 0.5846
Results for k=4:
LGBM: RMSLE = 0.5789
Results for k=5:
LGBM: RMSLE = 0.5724
Results for k=6:
LGBM: RMSLE = 0.5805
Results for k=7:
LGBM: RMSLE = 0.5663
Results for k=8:
LGBM: RMSLE = 0.5637
Results for k=9:
LGBM: RMSLE = 0.5602
Results for k=10:
LGBM: RMSLE = 0.5518
Results for k=11:
LGBM: RMSLE = 0.5528
Results for k=12:
LGBM: RMSLE = 0.5533
Results for k=13:
LGBM: RMSLE = 0.5516
Results for k=14:
LGBM: RMSLE = 0.5516
Results for k=15:
LGBM: RMSLE = 0.5457
Results for k=16:
LGBM: RMSLE = 0.5461
