## Picking the right model

Creating a basic linear regression model using the insights captured from the NYC Taxi EDA.

In [None]:
import requests

import datetime as dt

from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

import holidays

import pandas as pd
import numpy as np

from geopy.distance import geodesic
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
taxi_train_df = pd.read_csv("data/train.csv")

In [4]:
BLIZZARD_DATES = [dt.datetime(2016, 1, 23).date(), dt.datetime(2016, 1, 24).date()]
HOURLY_BIN_EDGES = [-1, 3, 6, 9, 12, 15, 18, 21, 24]
HOURLY_BUCKETS = [
    "0-3",
    "3-6",
    "6-9",
    "9-12",
    "12-15",
    "15-18",
    "18-21",
    "21-24",
]
PASSENGER_BIN_EDGES = [-1, 0, 3, 6, 9]
PASSENGER_BUCKETS = ["0", "1-3", "4-6", "7-9"]

In [5]:
def osrm_distance(lat1, lon1, lat2, lon2, max_retries=3):
    url = f"http://127.0.0.1:5000/route/v1/driving/{lon1},{lat1};{lon2},{lat2}?overview=false"
    retries = Retry(
        total=max_retries, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]
    )
    session = requests.Session()
    adapter = HTTPAdapter(max_retries=retries)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    for attempt in range(max_retries + 1):
        try:
            response = session.get(url)
            response.raise_for_status()

            if response.status_code == 200:
                distance = response.json()["routes"][0]["distance"] / 1000
                return distance
            else:
                print(f"Error: {response.status_code}")
                return np.nan

        except requests.exceptions.RequestException as e:
            print(f"Request failed on attempt {attempt + 1}/{max_retries + 1}: {e}")

    print(f"Maximum number of retries reached. Unable to complete the request.")
    return np.nan

In [6]:
def haversine_distance(lat1, lon1, lat2, lon2):
    coords_1 = (lat1, lon1)
    coords_2 = (lat2, lon2)
    return geodesic(coords_1, coords_2).kilometers

In [7]:
def calculate_osrm_distance(lat1, lon1, lat2, lon2, dist):
    if np.isnan(dist):
        distance = osrm_distance(lat1, lon1, lat2, lon2)
        if np.isnan(distance):
            distance = haversine_distance(lat1, lon1, lat2, lon2)
        return distance
    else:
        return dist

In [8]:
def display_results(grid_result):
    results = pd.DataFrame(grid_result.cv_results_)
    columns_of_interest = ["params", "mean_test_score", "std_test_score"]
    results_subset = results[columns_of_interest]
    results_subset.sort_values(by="mean_test_score", ascending=False, inplace=True)
    results_subset.reset_index(drop=True, inplace=True)
    return results_subset

## Data Preparation

### Datetime Columns

In [9]:
taxi_train_df["pickup_datetime"] = pd.to_datetime(taxi_train_df["pickup_datetime"])

In [10]:
taxi_train_df["pickup_dayofweek"] = taxi_train_df["pickup_datetime"].dt.dayofweek
taxi_train_df["pickup_date"] = taxi_train_df["pickup_datetime"].dt.date
taxi_train_df["pickup_hour"] = taxi_train_df[
    "pickup_datetime"
].dt.hour
taxi_train_df["pickup_timeofday"] = pd.cut(
    taxi_train_df["pickup_hour"],
    bins=HOURLY_BIN_EDGES,
    labels=HOURLY_BUCKETS,
    right=False,
)

In [11]:
taxi_train_df = taxi_train_df.loc[
    ~taxi_train_df["pickup_date"].isin(BLIZZARD_DATES)
]

In [12]:
taxi_train_df.drop(
    ["pickup_datetime", "pickup_date", "pickup_hour", "dropoff_datetime"],
    axis=1,
    inplace=True,
)

### Categorical Data

In [13]:
taxi_train_df["passenger_count_bucket"] = pd.cut(
    taxi_train_df["passenger_count"],
    bins=PASSENGER_BIN_EDGES,
    labels=PASSENGER_BUCKETS,
)

In [14]:
columns_to_encode = [
    "vendor_id",
    "store_and_fwd_flag",
    "pickup_dayofweek",
    "pickup_timeofday",
    "passenger_count_bucket",
]
taxi_ohe_train_df = pd.get_dummies(taxi_train_df, columns=columns_to_encode, dtype=int)

### Distances

In [15]:
# these were calculated previously using a local OSRM instance
precalculated_osrm_distances_df = pd.read_csv("data/train_osrm_distances.csv")

In [16]:
taxi_ohe_train_df = pd.merge(
    taxi_ohe_train_df,
    precalculated_osrm_distances_df[["id", "distance_osrm"]],
    on="id",
    how="left",
)

In [17]:
taxi_ohe_train_df["distance_osrm"] = taxi_ohe_train_df.apply(
    lambda x: calculate_osrm_distance(
        x["pickup_latitude"],
        x["pickup_longitude"],
        x["dropoff_latitude"],
        x["dropoff_longitude"],
        x["distance_osrm"],
    ),
    axis=1,
)

Request failed on attempt 1/4: 400 Client Error: Bad Request for url: http://127.0.0.1:5000/route/v1/driving/-74.04054260253906,40.71708679199219;-74.04055023193358,40.717090606689446?overview=false
Request failed on attempt 2/4: 400 Client Error: Bad Request for url: http://127.0.0.1:5000/route/v1/driving/-74.04054260253906,40.71708679199219;-74.04055023193358,40.717090606689446?overview=false
Request failed on attempt 3/4: 400 Client Error: Bad Request for url: http://127.0.0.1:5000/route/v1/driving/-74.04054260253906,40.71708679199219;-74.04055023193358,40.717090606689446?overview=false
Request failed on attempt 4/4: 400 Client Error: Bad Request for url: http://127.0.0.1:5000/route/v1/driving/-74.04054260253906,40.71708679199219;-74.04055023193358,40.717090606689446?overview=false
Maximum number of retries reached. Unable to complete the request.
Request failed on attempt 1/4: 400 Client Error: Bad Request for url: http://127.0.0.1:5000/route/v1/driving/-74.00809478759764,40.725883

Request failed on attempt 2/4: 400 Client Error: Bad Request for url: http://127.0.0.1:5000/route/v1/driving/-73.993896484375,40.75139617919922;-73.99386596679686,40.75139617919922?overview=false
Request failed on attempt 3/4: 400 Client Error: Bad Request for url: http://127.0.0.1:5000/route/v1/driving/-73.993896484375,40.75139617919922;-73.99386596679686,40.75139617919922?overview=false
Request failed on attempt 4/4: 400 Client Error: Bad Request for url: http://127.0.0.1:5000/route/v1/driving/-73.993896484375,40.75139617919922;-73.99386596679686,40.75139617919922?overview=false
Maximum number of retries reached. Unable to complete the request.
Request failed on attempt 1/4: 400 Client Error: Bad Request for url: http://127.0.0.1:5000/route/v1/driving/-74.0278091430664,40.753456115722656;-74.02771759033203,40.75362014770508?overview=false
Request failed on attempt 2/4: 400 Client Error: Bad Request for url: http://127.0.0.1:5000/route/v1/driving/-74.0278091430664,40.753456115722656;-

* Now we know that this data is normally distributed, given the distribution plot we saw previously.
* Let's scale this so the regression algorithms can converge.

In [18]:
distance_scaler = StandardScaler()
distance_data = taxi_ohe_train_df["distance_osrm"].values.reshape(-1, 1)
distance_scaler.fit(distance_data)
taxi_ohe_train_df["scaled_distance_osrm"] = distance_scaler.transform(distance_data)

In [19]:
taxi_ohe_train_df.drop(
    [
        "passenger_count",
        "id",
        "pickup_longitude",
        "distance_osrm",
        "pickup_latitude",
        "dropoff_longitude",
        "dropoff_latitude",
    ],
    axis=1,
    inplace=True,
)

In [20]:
taxi_ohe_train_df.sample(2).T

Unnamed: 0,22353,636913
trip_duration,241.0,1813.0
vendor_id_1,0.0,1.0
vendor_id_2,1.0,0.0
store_and_fwd_flag_N,1.0,1.0
store_and_fwd_flag_Y,0.0,0.0
pickup_dayofweek_0,0.0,0.0
pickup_dayofweek_1,0.0,0.0
pickup_dayofweek_2,0.0,1.0
pickup_dayofweek_3,1.0,0.0
pickup_dayofweek_4,0.0,0.0


## Hyperparameter Tuning

In [21]:
# Splitting data into features and target variable
X = taxi_ohe_train_df.drop("trip_duration", axis=1)
y = taxi_ohe_train_df["trip_duration"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

### Linear Regression

* Since we don't know the kind of impact some of these features will have on the predicted value, we'll use linear regression with various forms of regularization to check how well our model generalizes.
* We'll also use cross validation for selecting our hyperparameters.

In [22]:
linear_model = LinearRegression()
param_grid = {
    'fit_intercept': [True, False],
    'positive': [True, False]
}

In [23]:
grid_search = GridSearchCV(
    linear_model, param_grid, scoring="neg_mean_squared_error", cv=kfold, verbose=2
)
grid_result = grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] END ..................fit_intercept=True, positive=True; total time=   1.1s
[CV] END ..................fit_intercept=True, positive=True; total time=   1.1s
[CV] END ..................fit_intercept=True, positive=True; total time=   1.1s
[CV] END ..................fit_intercept=True, positive=True; total time=   1.1s
[CV] END ..................fit_intercept=True, positive=True; total time=   1.1s
[CV] END ..................fit_intercept=True, positive=True; total time=   1.1s
[CV] END ..................fit_intercept=True, positive=True; total time=   1.1s
[CV] END ..................fit_intercept=True, positive=True; total time=   1.0s
[CV] END ..................fit_intercept=True, positive=True; total time=   1.1s
[CV] END ..................fit_intercept=True, positive=True; total time=   1.1s
[CV] END .................fit_intercept=True, positive=False; total time=   0.5s
[CV] END .................fit_intercept=True, po

In [24]:
print("Linear Regression Results:")
display_results(grid_result)

Linear Regression Results:


Unnamed: 0,params,mean_test_score,std_test_score
0,"{'fit_intercept': True, 'positive': False}",-27277240.0,32265210.0
1,"{'fit_intercept': True, 'positive': True}",-27277260.0,32265220.0
2,"{'fit_intercept': False, 'positive': True}",-27277270.0,32265230.0
3,"{'fit_intercept': False, 'positive': False}",-27277270.0,32265220.0


In [25]:
best_params = grid_result.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'fit_intercept': True, 'positive': False}


In [26]:
best_model = grid_result.best_estimator_
y_pred = best_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmsle = np.sqrt(mean_squared_error(np.log1p(y_val), np.log1p(y_pred)))
mae = mean_absolute_error(y_val, y_pred)

In [27]:
print("RMSE:", rmse)
print("RMSLE:", rmsle)
print("MAE:", mae)

RMSE: 5179.400797576766
RMSLE: 0.6614995742307068
MAE: 449.5094728826522


### Ridge Regression Model

In [28]:
ridge_model = Ridge()

In [29]:
ridge_param_grid = {
    "alpha": [0.1, 1, 10],
    "fit_intercept": [True, False],
    "solver": ["auto", "svd", "lsqr"],
}

In [30]:
ridge_grid_search = GridSearchCV(
    ridge_model, ridge_param_grid, scoring="neg_mean_squared_error", cv=kfold, verbose=2
)
ridge_grid_result = ridge_grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 18 candidates, totalling 180 fits
[CV] END .........alpha=0.1, fit_intercept=True, solver=auto; total time=   0.2s
[CV] END .........alpha=0.1, fit_intercept=True, solver=auto; total time=   0.2s
[CV] END .........alpha=0.1, fit_intercept=True, solver=auto; total time=   0.3s
[CV] END .........alpha=0.1, fit_intercept=True, solver=auto; total time=   0.3s
[CV] END .........alpha=0.1, fit_intercept=True, solver=auto; total time=   0.2s
[CV] END .........alpha=0.1, fit_intercept=True, solver=auto; total time=   0.2s
[CV] END .........alpha=0.1, fit_intercept=True, solver=auto; total time=   0.2s
[CV] END .........alpha=0.1, fit_intercept=True, solver=auto; total time=   0.2s
[CV] END .........alpha=0.1, fit_intercept=True, solver=auto; total time=   0.2s
[CV] END .........alpha=0.1, fit_intercept=True, solver=auto; total time=   0.2s
[CV] END ..........alpha=0.1, fit_intercept=True, solver=svd; total time=   0.6s
[CV] END ..........alpha=0.1, fit_intercept=Tr

[CV] END ...........alpha=1, fit_intercept=False, solver=svd; total time=   0.6s
[CV] END ...........alpha=1, fit_intercept=False, solver=svd; total time=   0.6s
[CV] END ...........alpha=1, fit_intercept=False, solver=svd; total time=   0.7s
[CV] END ...........alpha=1, fit_intercept=False, solver=svd; total time=   0.7s
[CV] END ...........alpha=1, fit_intercept=False, solver=svd; total time=   0.6s
[CV] END ...........alpha=1, fit_intercept=False, solver=svd; total time=   0.6s
[CV] END ...........alpha=1, fit_intercept=False, solver=svd; total time=   0.6s
[CV] END ...........alpha=1, fit_intercept=False, solver=svd; total time=   0.7s
[CV] END ...........alpha=1, fit_intercept=False, solver=svd; total time=   0.7s
[CV] END ..........alpha=1, fit_intercept=False, solver=lsqr; total time=   0.4s
[CV] END ..........alpha=1, fit_intercept=False, solver=lsqr; total time=   0.2s
[CV] END ..........alpha=1, fit_intercept=False, solver=lsqr; total time=   0.2s
[CV] END ..........alpha=1, 

In [34]:
print("Ridge Regression Results:")
display_results(ridge_grid_result)

Ridge Regression Results:


Unnamed: 0,params,mean_test_score,std_test_score
0,"{'alpha': 10, 'fit_intercept': True, 'solver':...",-27277070.0,32265270.0
1,"{'alpha': 1, 'fit_intercept': True, 'solver': ...",-27277070.0,32265270.0
2,"{'alpha': 0.1, 'fit_intercept': True, 'solver'...",-27277070.0,32265270.0
3,"{'alpha': 10, 'fit_intercept': False, 'solver'...",-27277080.0,32265240.0
4,"{'alpha': 1, 'fit_intercept': False, 'solver':...",-27277080.0,32265240.0
5,"{'alpha': 0.1, 'fit_intercept': False, 'solver...",-27277080.0,32265240.0
6,"{'alpha': 10, 'fit_intercept': False, 'solver'...",-27277220.0,32265240.0
7,"{'alpha': 10, 'fit_intercept': False, 'solver'...",-27277220.0,32265240.0
8,"{'alpha': 10, 'fit_intercept': True, 'solver':...",-27277230.0,32265240.0
9,"{'alpha': 10, 'fit_intercept': True, 'solver':...",-27277230.0,32265240.0


In [32]:
best_params = ridge_grid_result.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'alpha': 10, 'fit_intercept': True, 'solver': 'lsqr'}


In [40]:
best_model = ridge_grid_result.best_estimator_
y_pred = best_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmsle = np.sqrt(mean_squared_error(np.log1p(y_val), np.log1p(y_pred)))
mae = mean_absolute_error(y_val, y_pred)

In [41]:
print("RMSE:", rmse)
print("RMSLE:", rmsle)
print("MAE:", mae)

RMSE: 5179.390374015261
RMSLE: 0.6613375620022275
MAE: 449.5095178212238


### Lasso Regression Model

In [37]:
lasso_model = Lasso()

In [42]:
lasso_param_grid = {
    'alpha': [0.1, 1, 10],
    'fit_intercept': [True, False],
    'selection': ['cyclic', 'random']
}

In [44]:
lasso_grid_search = GridSearchCV(
    lasso_model, lasso_param_grid, scoring="neg_mean_squared_error", cv=kfold, verbose=2
)
lasso_grid_result = lasso_grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV] END ....alpha=0.1, fit_intercept=True, selection=cyclic; total time=   2.2s
[CV] END ....alpha=0.1, fit_intercept=True, selection=cyclic; total time=   2.1s
[CV] END ....alpha=0.1, fit_intercept=True, selection=cyclic; total time=   2.5s
[CV] END ....alpha=0.1, fit_intercept=True, selection=cyclic; total time=   2.6s
[CV] END ....alpha=0.1, fit_intercept=True, selection=cyclic; total time=   4.3s
[CV] END ....alpha=0.1, fit_intercept=True, selection=cyclic; total time=   2.8s
[CV] END ....alpha=0.1, fit_intercept=True, selection=cyclic; total time=   3.8s
[CV] END ....alpha=0.1, fit_intercept=True, selection=cyclic; total time=   2.6s
[CV] END ....alpha=0.1, fit_intercept=True, selection=cyclic; total time=   5.9s
[CV] END ....alpha=0.1, fit_intercept=True, selection=cyclic; total time=   2.9s
[CV] END ....alpha=0.1, fit_intercept=True, selection=random; total time=   2.4s
[CV] END ....alpha=0.1, fit_intercept=True, se

[CV] END ....alpha=10, fit_intercept=False, selection=cyclic; total time=   3.1s
[CV] END ....alpha=10, fit_intercept=False, selection=cyclic; total time=   4.8s
[CV] END ....alpha=10, fit_intercept=False, selection=cyclic; total time=   3.4s
[CV] END ....alpha=10, fit_intercept=False, selection=cyclic; total time=   2.9s
[CV] END ....alpha=10, fit_intercept=False, selection=cyclic; total time=   2.8s
[CV] END ....alpha=10, fit_intercept=False, selection=cyclic; total time=   3.0s
[CV] END ....alpha=10, fit_intercept=False, selection=cyclic; total time=   2.8s
[CV] END ....alpha=10, fit_intercept=False, selection=cyclic; total time=   3.5s
[CV] END ....alpha=10, fit_intercept=False, selection=cyclic; total time=   3.0s
[CV] END ....alpha=10, fit_intercept=False, selection=random; total time=   6.5s
[CV] END ....alpha=10, fit_intercept=False, selection=random; total time=  10.5s
[CV] END ....alpha=10, fit_intercept=False, selection=random; total time=   3.5s
[CV] END ....alpha=10, fit_i

In [45]:
print("\nLasso Regression Results:")
display_results(lasso_grid_result)


Lasso Regression Results:


Unnamed: 0,params,mean_test_score,std_test_score
0,"{'alpha': 0.1, 'fit_intercept': False, 'select...",-27277060.0,32265270.0
1,"{'alpha': 0.1, 'fit_intercept': True, 'selecti...",-27277060.0,32265260.0
2,"{'alpha': 0.1, 'fit_intercept': True, 'selecti...",-27277060.0,32265250.0
3,"{'alpha': 0.1, 'fit_intercept': False, 'select...",-27277090.0,32265280.0
4,"{'alpha': 1, 'fit_intercept': True, 'selection...",-27277110.0,32265170.0
5,"{'alpha': 1, 'fit_intercept': True, 'selection...",-27277110.0,32265170.0
6,"{'alpha': 1, 'fit_intercept': False, 'selectio...",-27277200.0,32265310.0
7,"{'alpha': 1, 'fit_intercept': False, 'selectio...",-27277270.0,32265300.0
8,"{'alpha': 10, 'fit_intercept': True, 'selectio...",-27283960.0,32265440.0
9,"{'alpha': 10, 'fit_intercept': True, 'selectio...",-27283960.0,32265440.0


In [46]:
best_params = ridge_grid_result.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'alpha': 10, 'fit_intercept': True, 'solver': 'lsqr'}


In [47]:
best_model = ridge_grid_result.best_estimator_
y_pred = best_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmsle = np.sqrt(mean_squared_error(np.log1p(y_val), np.log1p(y_pred)))
mae = mean_absolute_error(y_val, y_pred)

In [48]:
print("RMSE:", rmse)
print("RMSLE:", rmsle)
print("MAE:", mae)

RMSE: 5179.390374015261
RMSLE: 0.6613375620022275
MAE: 449.5095178212238


### Random Forests

* We know that the data has some non-linear patterns while we performed our EDA.
* Rather than using linear regression with L1/L2 regularization, let's use a tree-based algorithm to capture these patterns.

In [50]:
rf_model = RandomForestRegressor()

In [54]:
rf_param_grid = {
    "n_estimators": [100],
    "max_depth": [10, 20],
    "min_samples_split": [5, 10],
    "min_samples_leaf": [2, 4],
    "bootstrap": [True],
}

In [55]:
rf_grid_search = GridSearchCV(
    rf_model, rf_param_grid, scoring="neg_mean_squared_error", cv=kfold, verbose=2
)
rf_grid_result = rf_grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 2.0min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 2.0min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 2.1min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 2.0min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 2.0min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 2.0min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 2.0min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 2.0min
[CV

[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time= 3.1min
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time= 3.1min
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time= 3.1min
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time= 3.2min
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time= 3.1min
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time= 3.1min
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time= 3.1min
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time= 3.1min
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=

In [56]:
print("\nRandom Forest Regression Results:")
display_results(rf_grid_result)


Random Forest Regression Results:


Unnamed: 0,params,mean_test_score,std_test_score
0,"{'bootstrap': True, 'max_depth': 10, 'min_samp...",-28442790.0,31974790.0
1,"{'bootstrap': True, 'max_depth': 10, 'min_samp...",-28557870.0,31912560.0
2,"{'bootstrap': True, 'max_depth': 20, 'min_samp...",-28776970.0,31902320.0
3,"{'bootstrap': True, 'max_depth': 20, 'min_samp...",-28898780.0,31885150.0
4,"{'bootstrap': True, 'max_depth': 10, 'min_samp...",-29234710.0,31443140.0
5,"{'bootstrap': True, 'max_depth': 10, 'min_samp...",-29365840.0,31468180.0
6,"{'bootstrap': True, 'max_depth': 20, 'min_samp...",-30079150.0,31389040.0
7,"{'bootstrap': True, 'max_depth': 20, 'min_samp...",-30187250.0,31340470.0


In [57]:
best_params = rf_grid_result.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 100}


In [58]:
best_model = rf_grid_result.best_estimator_
y_pred = best_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmsle = np.sqrt(mean_squared_error(np.log1p(y_val), np.log1p(y_pred)))
mae = mean_absolute_error(y_val, y_pred)

In [59]:
print("RMSE:", rmse)
print("RMSLE:", rmsle)
print("MAE:", mae)

RMSE: 5451.827282354736
RMSLE: 0.5691504801989471
MAE: 425.3022511058619


Random Forest looks promising, so let's create a mini-pipeline using that.