### Backpack Prediction Challenge using CatBoostRegressor
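This notebook trains a CatBoostRegressor for the regression task. Preprocessing fills missing values and converts the categorical columns to pandas `category` dtype so that CatBoost can consume them directly as categorical features (no label encoding is applied). The model is evaluated with 5-fold cross-validation.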

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor, Pool
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import warnings
import optuna
warnings.filterwarnings("ignore")

In [2]:
# All competition files are read from one directory so the three paths cannot drift apart.
DATA_DIR = "/kaggle/input/playground-series-s5e2"

# 'id' is dropped from the train/test frames; the sample submission is loaded unchanged.
train = pd.read_csv(f"{DATA_DIR}/train.csv").drop(columns='id')
test = pd.read_csv(f"{DATA_DIR}/test.csv").drop(columns='id')
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
train.head()

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [3]:
# Columns that are filled with a 'None' placeholder and converted to the
# pandas 'category' dtype during preprocessing.
cols_to_update = [
    'Brand',
    'Material',
    'Size',
    'Compartments',
    'Laptop Compartment',
    'Waterproof',
    'Style',
    'Color',
]
cols_to_update

['Brand',
 'Material',
 'Size',
 'Compartments',
 'Laptop Compartment',
 'Waterproof',
 'Style',
 'Color']

In [4]:
def prepare_features(frame, categorical_columns, weight_fill_value):
    """Return a preprocessed copy of ``frame``; the input frame is not modified.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw train or test data containing ``categorical_columns`` and
        'Weight Capacity (kg)'.
    categorical_columns : list of str
        Columns whose missing values become the string 'None' and which are
        converted to the 'category' dtype.
    weight_fill_value : float
        Value used to fill missing 'Weight Capacity (kg)' entries.

    Returns
    -------
    pandas.DataFrame
        Copy of ``frame`` with the categorical columns converted, the weight
        column filled and cast to float64, and an added
        'Weight Capacity (kg) categorical' string column.
    """
    prepared = frame.copy()

    # Cast to string *before* filling: the placeholder is then never written
    # into a float or already-categorical column, so this also works when the
    # cell is executed again on frames that were already processed.
    prepared[categorical_columns] = (
        prepared[categorical_columns]
        .astype('string')
        .fillna('None')
        .astype('category')
    )

    # The "Weight Capacity (kg)" feature is crucial and is split into two:
    # one categorical (string) column and one numerical column.
    weight = prepared['Weight Capacity (kg)'].fillna(weight_fill_value).astype('float64')
    prepared['Weight Capacity (kg) categorical'] = weight.astype('string')
    prepared['Weight Capacity (kg)'] = weight
    return prepared


# The fill value is computed on the training data only and reused for the test data.
median_weight = train['Weight Capacity (kg)'].median()

train = prepare_features(train, cols_to_update, median_weight)
X = train.drop('Price', axis=1)
y = train.Price

# Same goes for test data
test = prepare_features(test, cols_to_update, median_weight)

In [5]:
# Preview the first rows of the training frame after preprocessing,
# including the added 'Weight Capacity (kg) categorical' column.
train.head()

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price,Weight Capacity (kg) categorical
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875,11.611722805222309
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056,27.07853658053123
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732,16.643759949103497
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793,12.937220306632067
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312,17.749338465908988


In [6]:
# Summarise column dtypes and non-null counts of the preprocessed training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                            Non-Null Count   Dtype   
---  ------                            --------------   -----   
 0   Brand                             300000 non-null  category
 1   Material                          300000 non-null  category
 2   Size                              300000 non-null  category
 3   Compartments                      300000 non-null  category
 4   Laptop Compartment                300000 non-null  category
 5   Waterproof                        300000 non-null  category
 6   Style                             300000 non-null  category
 7   Color                             300000 non-null  category
 8   Weight Capacity (kg)              300000 non-null  float64 
 9   Price                             300000 non-null  float64 
 10  Weight Capacity (kg) categorical  300000 non-null  string  
dtypes: category(8), float64(2), string(1)
m

In [7]:
# Columns handed to CatBoost as categorical features.
cat_cols = ['Brand', 'Material', 'Size', 'Compartments', 'Laptop Compartment',
       'Waterproof', 'Style', 'Color', 'Weight Capacity (kg) categorical']

# Set to True to run the Optuna search; False reuses the stored hyperparameters below.
trial = False


def build_catboost_params(learning_rate, depth, l2_leaf_reg):
    """Return the complete CatBoost parameter dict for the three tuned values.

    Only ``learning_rate``, ``depth`` and ``l2_leaf_reg`` are tuned; every
    other setting is fixed here so the search and the final fit always share
    the same configuration.
    """
    return {
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'learning_rate': learning_rate,
        'iterations': 2000,
        'depth': depth,
        'random_strength': 0,
        'l2_leaf_reg': l2_leaf_reg,
        'task_type': 'GPU',
        'random_seed': 42,
        'verbose': False,
    }


if trial:

    def objective(trial):
        """Optuna objective: mean validation RMSE over 5 shuffled folds.

        ``trial`` is the optuna trial object (it shadows the module-level flag
        of the same name inside this function only).
        """
        params = build_catboost_params(
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            depth=trial.suggest_int('depth', 3, 10),
            l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        )

        cv = KFold(5, shuffle=True, random_state=0)
        scores = []
        for train_idx, val_idx in cv.split(X, y):
            # KFold yields positional indices, so rows are selected with .iloc.
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
            X_train_pool = Pool(X_train_fold, y_train_fold, cat_features=cat_cols)
            X_valid_pool = Pool(X_val_fold, y_val_fold, cat_features=cat_cols)

            model = CatBoostRegressor(**params)
            model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=False, early_stopping_rounds=200)

            val_pred = model.predict(X_valid_pool)
            scores.append(np.sqrt(mean_squared_error(y_val_fold, val_pred)))
        return np.mean(scores)

    sqlite_db = "sqlite:///catboost.db"
    study_name = "catboost"

    # The study is persisted in SQLite and reloaded if it already exists.
    study = optuna.create_study(direction="minimize", study_name = study_name, storage = sqlite_db,
                                load_if_exists = True)
    # Run the search once (the original cell called optimize twice, i.e. 200 trials).
    study.optimize(objective, n_trials=100)

    # Best Hyperparameters
    print(f"best optimized RMSE: {study.best_value:0.5f}") 
    print(f"best hyperparameters: {study.best_params}") 
    catboost_params = study.best_params

    # study.best_params only holds the tuned values; merge them with the fixed
    # settings so later cells receive a complete `best_params` dict.
    best_params = build_catboost_params(
        learning_rate=catboost_params['learning_rate'],
        depth=catboost_params['depth'],
        l2_leaf_reg=catboost_params['l2_leaf_reg'],
    )

else: 
    # Hyperparameters stored from an earlier search.
    best_params = build_catboost_params(
        learning_rate=0.10907108735803546,
        depth=3,
        l2_leaf_reg=0.002735573074939131,
    )

In [8]:
# Display the CatBoost parameters that the cross-validated fit below is built with.
best_params

{'loss_function': 'RMSE',
 'eval_metric': 'RMSE',
 'learning_rate': 0.10907108735803546,
 'iterations': 2000,
 'depth': 3,
 'random_strength': 0,
 'l2_leaf_reg': 0.002735573074939131,
 'task_type': 'GPU',
 'random_seed': 42,
 'verbose': False}

In [9]:
# 5-fold cross-validation with the selected hyperparameters. Each fold's model
# is scored on its validation split and also predicts the test set; the test
# predictions are averaged over the folds.
cv = KFold(5, shuffle=True, random_state=0)
n_folds = cv.get_n_splits()
cv_splits = cv.split(X, y)
scores = list()
y_preds = np.zeros(len(test))

# Order the test columns like the training features before building the pool.
X_test_pool = Pool(test[X.columns], cat_features=cat_cols)

for train_idx, val_idx in cv_splits:
    model = CatBoostRegressor(**best_params)
    # KFold yields positional indices, so rows are selected with .iloc.
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
    X_train_pool = Pool(X_train_fold, y_train_fold, cat_features=cat_cols)
    X_valid_pool = Pool(X_val_fold, y_val_fold, cat_features=cat_cols)
    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=False, early_stopping_rounds=200)

    val_pred = model.predict(X_valid_pool)
    score = np.sqrt(mean_squared_error(y_val_fold, val_pred))
    scores.append(score)
    # Divide by the actual number of folds so the average stays correct if
    # the split count above is changed.
    y_preds += model.predict(X_test_pool) / n_folds
print(f'Mean Score: {np.mean(scores)}')

Mean Score: 38.96467261942098


In [10]:
# Sanity check: compare the submission frame's shape with the prediction vector's
# shape; the row counts must match before the predictions are assigned.
submission.shape, y_preds.shape

((200000, 2), (200000,))

In [11]:
# Replace the placeholder prices with the fold-averaged predictions and
# write the submission file without the index column.
submission = submission.assign(Price=y_preds)
submission.to_csv('submission.csv', index=False)