# Building a model

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import (
    RandomizedSearchCV,
    GridSearchCV,
    train_test_split,
    cross_validate,
)
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, Pool
from boruta import BorutaPy

%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Master switch for the expensive tuning cells, which are all guarded by `if TUNING:`.
# With a falsy value such as 0 those cells are skipped; set it to 1 to re-run them.
TUNING = 0

<IPython.core.display.Javascript object>

In [3]:
# Load the cleaned train and test splits; both parse the same two date columns.
train_df, test_df = [
    pd.read_csv(csv_path, parse_dates=["Scheduled Date", "Delivery Date"])
    for csv_path in ("data/train_cleaned.csv", "data/test_cleaned.csv")
]

<IPython.core.display.Javascript object>

## Boruta for Feature importance

Columns we're going to be using (`Cost` is the prediction target; the others are candidate features):
```python
[
    "Artist Reputation",
    "Height",
    "Width",
    "Weight",
    "Material",
    "Price Of Sculpture",
    "Base Shipping Price",
    "International",
    "Express Shipment",
    "Installation Included",
    "Transport",
    "Fragile",
    "Customer Information",
    "Remote Location",
    "Cost",
    "delivery_offset",
    "scheduled_year_month",
    "scheduled_month",
    "Customer State",
    "Area",
    "Price per unit weight",
]
```

In [4]:
# Candidate predictors handed to Boruta, one-hot encoded by pandas.
# `Cost` is deliberately left out of this selection: it is the target below.
train_X_dummified = pd.get_dummies(
    data=train_df.loc[
        :,
        [
            "Artist Reputation", "Height", "Width", "Weight",
            "Material", "Price Of Sculpture", "Base Shipping Price",
            "International", "Express Shipment", "Installation Included",
            "Transport", "Fragile", "Customer Information", "Remote Location",
            "delivery_offset", "scheduled_year_month", "scheduled_month",
            "Customer State", "Area", "Price per unit weight",
        ],
    ]
)

# Target kept as a single-column DataFrame.
train_y = train_df.loc[:, ["Cost"]]

<IPython.core.display.Javascript object>

In [5]:
if TUNING:
    # Shallow random forest that Boruta refits on every trial. Both the forest
    # and Boruta are seeded so that re-running the cell selects the same features.
    rfr_boruta = RandomForestRegressor(n_jobs=-1, max_depth=5, random_state=42)
    boruta = BorutaPy(
        estimator=rfr_boruta,
        n_estimators="auto",
        max_iter=100,  # number of trials to perform
        random_state=42,
    )

    # Boruta accepts np.array, not pd.DataFrame. The single-column target frame
    # is flattened to 1-D so the wrapped forest is not handed an (n, 1) column
    # vector on every fit.
    boruta.fit(np.array(train_X_dummified), np.array(train_y).ravel())

    # Confirmed ("green") and tentative ("blue") features, mapped back to names.
    green_area = train_X_dummified.columns[boruta.support_].to_list()
    blue_area = train_X_dummified.columns[boruta.support_weak_].to_list()

    print("features in the green area:", green_area)
    print("features in the blue area:", blue_area)

<IPython.core.display.Javascript object>

According to Boruta:

* features in the green area: ['Artist Reputation', 'Price Of Sculpture', 'Base Shipping Price', 'Customer State_ID']
* features in the blue area: ['Weight', 'scheduled_month']

In [6]:
if TUNING:
    # Persist each dummified column next to its Boruta rank, in ascending rank order.
    (
        pd.DataFrame(
            {
                "column": list(train_X_dummified.columns),
                "rank": boruta.ranking_,
            }
        )
        .sort_values("rank")
        .to_csv("data/boruta_feature_ranking.csv", index=False)
    )

<IPython.core.display.Javascript object>

Based on the rankings, these are the features we'll be using:

```python
[
    "Artist Reputation",
    "Customer State",
    "Price Of Sculpture",
    "Base Shipping Price",
    "scheduled_month",
    "Area",
    "Weight",
    "delivery_offset",
    "Transport",
    "Material",
    "Customer Information",
    "Installation Included",
    "Express Shipment",
    "Fragile",
    "Remote Location",
    "International",
]
```

In [7]:
# Predictor columns kept for modelling; used to select columns from `train_df`
# for the train/validation split and for the cross-validation runs.
keep_features_X = [
    "Artist Reputation",
    "Customer State",
    "Price Of Sculpture",
    "Base Shipping Price",
    "scheduled_month",
    "Area",
    "Weight",
    "delivery_offset",
    "Transport",
    "Material",
    "Customer Information",
    "Installation Included",
    "Express Shipment",
    "Fragile",
    "Remote Location",
    "International",
]

<IPython.core.display.Javascript object>

In [8]:
# The kept predictors that are treated as categorical: these columns of
# `train_df` are cast to `object` before the data is one-hot encoded.
keep_cat_features_X = [
    "Customer State",
    "scheduled_month",
    "Transport",
    "Material",
    "Customer Information",
    "Installation Included",
    "Express Shipment",
    "Fragile",
    "Remote Location",
    "International",
]

<IPython.core.display.Javascript object>

In [9]:
# Cast the categorical predictors to `object` in place on `train_df`.
# NOTE(review): presumably this is so that numerically coded columns such as
# `scheduled_month` get one-hot encoded by `pd.get_dummies` instead of being
# passed through as numbers — confirm. `test_df` is not cast here.
train_df[keep_cat_features_X] = train_df[keep_cat_features_X].astype(object)

<IPython.core.display.Javascript object>

## Random Forests

We're going to use a Random Search and then a Grid Search to find the optimal hyperparameters for a Random Forest model.

### Train-Validation Split

In [10]:
# Hold out 35% of the training rows for validation; the fixed seed keeps the
# split repeatable. The target stays a single-column DataFrame.
train_X_df, val_X_df, train_y_df, val_y_df = train_test_split(
    train_df[keep_features_X], train_df[["Cost"]], random_state=42, test_size=0.35
)

<IPython.core.display.Javascript object>

In [11]:
# Dummifying variables: one-hot encode the categorical columns of the training split.
train_X_df = pd.get_dummies(train_X_df)

# Encode the validation split, then force it onto the training columns.
# Encoding the two splits independently can yield different column sets when a
# category occurs in only one of them; a model fitted on the training columns
# could then not score the validation frame. Categories unseen in training are
# dropped, and training categories absent from validation become all-zero columns.
val_X_df = pd.get_dummies(val_X_df).reindex(columns=train_X_df.columns, fill_value=0)

<IPython.core.display.Javascript object>

### Random Search

In [12]:
# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]

# Number of features to consider at every split.
# 1.0 means "all features", which is what "auto" meant for a forest regressor;
# the "auto" string itself is deprecated/removed in newer scikit-learn releases
# and would make every candidate that samples it fail.
max_features = [1.0, "sqrt"]

# Maximum number of levels in tree: 10, 20, ..., 110, plus None for unlimited depth
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid sampled by the randomized search
random_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "bootstrap": bootstrap,
}

<IPython.core.display.Javascript object>

In [13]:
# Base estimator cloned by the randomized search. It is seeded so that the
# forests — and therefore the reported best parameters — are reproducible;
# seeding only the search would fix the sampled candidates but not the trees.
random_rf_reg = RandomForestRegressor(random_state=42)

<IPython.core.display.Javascript object>

In [14]:
# Randomized hyperparameter search over `random_grid`: 100 sampled settings,
# each scored with 5-fold cross-validation, run on all available cores.
# No explicit `scoring` is passed.
rf_random_search = RandomizedSearchCV(
    estimator=random_rf_reg,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

<IPython.core.display.Javascript object>

In [15]:
if TUNING:
    # Run the randomized search on the one-hot encoded training split only;
    # the validation split is not used here.
    rf_random_search.fit(train_X_df, train_y_df)

<IPython.core.display.Javascript object>

In [16]:
if TUNING:
    # Show the winning parameter combination found by the fitted search.
    print(rf_random_search.best_params_)

<IPython.core.display.Javascript object>

Random Search best params:

```python
{
    "n_estimators": 400,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_features": "sqrt",
    "max_depth": None,
    "bootstrap": False,
}
```

In [17]:
# Forest rebuilt from the hard-coded best parameters of the random search, so
# it is available even when the tuning cells are skipped.
# Note: this rebinds `rf_random_search`, which previously referred to the
# RandomizedSearchCV object.
rf_random_search = RandomForestRegressor(
    n_estimators=400,
    max_features="sqrt",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
)

<IPython.core.display.Javascript object>

In [18]:
if TUNING:
    # 5-fold cross-validation of the tuned forest on the full training set,
    # one-hot encoded on the fly.
    scores = cross_validate(
        estimator=rf_random_search,
        X=pd.get_dummies(train_df[keep_features_X]),
        y=train_df["Cost"],
        cv=5,
        scoring=("r2", "neg_mean_squared_error"),
        return_train_score=True,
    )
    # Mean held-out R^2, then mean held-out negated MSE, one per line.
    print(
        scores["test_r2"].mean(),
        scores["test_neg_mean_squared_error"].mean(),
        sep="\n",
    )

<IPython.core.display.Javascript object>

In [19]:
if TUNING:
    # Baseline for comparison: a forest with default settings, evaluated with
    # 5-fold cross-validation on the full one-hot encoded training set.
    rf_base_reg = RandomForestRegressor()
    scores = cross_validate(
        estimator=rf_base_reg,
        X=pd.get_dummies(train_df[keep_features_X]),
        y=train_df["Cost"],
        cv=5,
        scoring=("r2", "neg_mean_squared_error"),
        return_train_score=True,
    )
    # Mean held-out R^2, then mean held-out negated MSE, one per line.
    print(
        scores["test_r2"].mean(),
        scores["test_neg_mean_squared_error"].mean(),
        sep="\n",
    )

<IPython.core.display.Javascript object>

Since the MSE values are very high, we'll try a different model.

## CatBoost

In [20]:
# Names of the one-hot columns that are flagged as categorical for CatBoost.
# Each name is "<source column>_<level>"; the mapping below lists the levels
# per source column, and the resulting list keeps this exact order.
cat_features = [
    f"{prefix}_{level}"
    for prefix, levels in {
        "Customer State": (
            "AK AL AP AR AZ CA CO CT DC DE DP FL FP GA HI IA ID IL IN KS KY LA "
            "MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC "
            "SD TN TX UT VA VT WA WI WV WY"
        ).split(),
        "scheduled_month": range(1, 13),
        "Transport": ["Airways", "Roadways", "Waterways"],
        "Material": [
            "Aluminium",
            "Brass",
            "Bronze",
            "Clay",
            "Marble",
            "Stone",
            "Wood",
        ],
        "Customer Information": ["Wealthy", "Working Class"],
        "Installation Included": ["No", "Yes"],
        "Express Shipment": ["No", "Yes"],
        "Fragile": ["No", "Yes"],
        "Remote Location": ["No", "Yes"],
        "International": ["No", "Yes"],
    }.items()
    for level in levels
]

<IPython.core.display.Javascript object>

In [21]:
# Wrap the one-hot encoded training split and its target in a CatBoost Pool,
# flagging the columns named in `cat_features` as categorical.
# NOTE(review): every name in `cat_features` has to exist as a column of
# `train_X_df` — confirm the training split contains all listed levels.
train_pool = Pool(data=train_X_df, label=train_y_df, cat_features=cat_features)

<IPython.core.display.Javascript object>

In [22]:
# CatBoost regressor used as the base model for the grid search:
# RMSE objective, 2500 iterations, fixed seed, training log silenced.
cat_reg = CatBoostRegressor(
    loss_function="RMSE",
    iterations=2500,
    random_seed=2504,
    silent=True,
)

<IPython.core.display.Javascript object>

In [23]:
# Hyperparameter grid for the CatBoost search: 3 learning rates x 3 depths x
# 5 L2 regularisation strengths = 45 combinations.
grid = dict(
    learning_rate=[0.03, 0.06, 0.1],
    depth=[4, 6, 10],
    l2_leaf_reg=[1, 3, 5, 7, 9],
)

<IPython.core.display.Javascript object>

In [24]:
if TUNING:
    # Search the combinations in `grid` on the training pool with cv=4 and the
    # interactive plot enabled; the returned result is kept in
    # `grid_search_result`.
    grid_search_result = cat_reg.grid_search(grid, cv=4, X=train_pool, plot=True)

<IPython.core.display.Javascript object>

The RMSE is very high, so we will have to revisit feature engineering.