In [31]:
import numpy as np
import plotly.express as px

df_tips = px.data.tips() # categorical data
df_tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [32]:
size_n = np.unique(df_tips['size'])
print(size_n)

[1 2 3 4 5 6]


In [33]:
from sklearn.model_selection import train_test_split

X = df_tips.drop('total_bill', axis=1)
y = df_tips['total_bill']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=1
)

In [34]:
from catboost import CatBoostRegressor, Pool

cat_features = [1, 2, 3, 4] # sex, smoker, day, time
train_dataset = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_features
)
eval_dataset = Pool(
    data=X_test,
    label=y_test,
    cat_features=cat_features
)

model = CatBoostRegressor(
    l2_leaf_reg = 0.01,
    iterations = 10,
    depth=3,
    eval_metric='RMSE',
    one_hot_max_size=3 # if less than 3 categories, apply one-hot encoding
)
model.fit(train_dataset, use_best_model=True, eval_set = eval_dataset)

0:	learn: 8.6647662	test: 9.0884054	best: 9.0884054 (0)	total: 1.01ms	remaining: 9.14ms
1:	learn: 8.5539401	test: 8.9938637	best: 8.9938637 (1)	total: 1.64ms	remaining: 6.58ms
2:	learn: 8.4200742	test: 8.8475760	best: 8.8475760 (2)	total: 1.89ms	remaining: 4.41ms
3:	learn: 8.3332551	test: 8.7623686	best: 8.7623686 (3)	total: 2.2ms	remaining: 3.3ms
4:	learn: 8.2293126	test: 8.6804390	best: 8.6804390 (4)	total: 2.46ms	remaining: 2.46ms
5:	learn: 8.1133363	test: 8.5654370	best: 8.5654370 (5)	total: 2.7ms	remaining: 1.8ms
6:	learn: 8.0278360	test: 8.4957680	best: 8.4957680 (6)	total: 2.92ms	remaining: 1.25ms
7:	learn: 7.9347924	test: 8.4034721	best: 8.4034721 (7)	total: 3.17ms	remaining: 791us
8:	learn: 7.8431808	test: 8.3006628	best: 8.3006628 (8)	total: 3.34ms	remaining: 370us
9:	learn: 7.7611528	test: 8.2219878	best: 8.2219878 (9)	total: 3.53ms	remaining: 0us

bestTest = 8.221987823
bestIteration = 9



<catboost.core.CatBoostRegressor at 0x335c83350>

In [35]:
print(f'best_iteration: {model.get_best_iteration()}, best_score: {model.get_best_score()}')

best_iteration: 9, best_score: {'learn': {'RMSE': 7.761152757557254}, 'validation': {'RMSE': 8.22198782325997}}


In [36]:
model.feature_importances_

array([47.37119581,  0.88319658,  4.1361321 ,  0.        ,  0.        ,
       47.60947551])

In [37]:
import pandas as pd

# Convert categorical columns to numerical using one-hot encoding
X_encoded = pd.get_dummies(X, columns=['sex', 'smoker', 'day', 'time'])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y,
    test_size=0.3,
    random_state=1
)

model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# Plot partial dependence for a specific feature
feature_idx = X_encoded.columns.get_loc('sex_Female')  # Example for 'sex_Female' column
model.plot_partial_dependence(
    features=feature_idx,
    data=Pool(data=X_train, label=y_train),
    plot=True
)

0:	learn: 8.6525777	test: 9.0382690	best: 9.0382690 (0)	total: 120us	remaining: 1.08ms
1:	learn: 8.5196705	test: 8.8869550	best: 8.8869550 (1)	total: 350us	remaining: 1.4ms
2:	learn: 8.4089188	test: 8.7898770	best: 8.7898770 (2)	total: 498us	remaining: 1.16ms
3:	learn: 8.2927753	test: 8.6916347	best: 8.6916347 (3)	total: 570us	remaining: 855us
4:	learn: 8.2028161	test: 8.5998895	best: 8.5998895 (4)	total: 686us	remaining: 686us
5:	learn: 8.1033265	test: 8.5021647	best: 8.5021647 (5)	total: 829us	remaining: 552us
6:	learn: 8.0040874	test: 8.4226692	best: 8.4226692 (6)	total: 893us	remaining: 383us
7:	learn: 7.9132607	test: 8.3519107	best: 8.3519107 (7)	total: 1.27ms	remaining: 318us
8:	learn: 7.8253016	test: 8.2638534	best: 8.2638534 (8)	total: 1.47ms	remaining: 162us
9:	learn: 7.7616153	test: 8.2043149	best: 8.2043149 (9)	total: 1.57ms	remaining: 0us

bestTest = 8.204314877
bestIteration = 9



(array([ 0.01367642, -0.03735586]),
 Figure({
     'data': [{'mode': 'lines+markers', 'type': 'scatter', 'y': array([ 0.01367642, -0.03735586])}],
     'layout': {'template': '...',
                'title': {'text': "Partial dependence plot for feature '2'"},
                'xaxis': {'showticklabels': False,
                          'tickmode': 'array',
                          'ticktext': [(-inf, 0.5000], (0.5000, +inf)],
                          'tickvals': [0, 1],
                          'title': {'text': 'Bins'}},
                'yaxis': {'side': 'left', 'title': {'text': 'Mean Prediction'}}}
 }))