In [50]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, metrics

In [51]:
dataset = "gs://vtxdemos-datasets-public/ecommerce/ecommerce_balanced.csv"
df = pd.read_csv(dataset)

In [52]:
df.head()

Unnamed: 0,will_buy_on_return_visit,latest_ecommerce_progress,bounces,time_on_site,pageviews,source,medium,channelGrouping,deviceCategory,country,random_value,split_set
0,1,3,0,875,11,google,organic,Organic Search,desktop,United States,0.780625,test
1,1,5,0,386,14,google,organic,Organic Search,mobile,Canada,0.154017,train
2,1,2,0,374,4,google,organic,Organic Search,mobile,Canada,0.554595,train
3,0,0,0,115,4,google,organic,Organic Search,mobile,Australia,0.719678,test
4,1,2,0,176,6,google,organic,Organic Search,desktop,United States,0.5767,train


In [53]:
train_df, test_df, val_df = [df[df['split_set'] == s].copy() for s in ['train', 'test', 'val']]
train_y, test_y, val_y  = [df['will_buy_on_return_visit'] for df in [train_df, test_df, val_df]]
# Now, drop the unnecessary columns
for df in [train_df, test_df, val_df]:
    df.drop(['split_set', 'random_value', 'will_buy_on_return_visit'], axis=1, inplace=True)  # Exclude 'will_buy_on_return_visit'

# Assign the feature DataFrames
train_x, test_x, val_x = train_df, test_df, val_df

# Identify categorical features
categorical_features_indices = np.where(train_x.dtypes != float)[0]

In [56]:
# Define the Model
model = CatBoostClassifier(
    allow_writing_files=False,
    train_dir="/tmp",
    custom_loss=[metrics.Accuracy()],
    random_seed=42,
)

# Fit Model (Train)
model.fit(
    train_x, train_y,
    cat_features=categorical_features_indices,
    eval_set=(val_x, val_y),
)

Learning rate set to 0.016657
0:	learn: 0.6826522	test: 0.6833859	best: 0.6833859 (0)	total: 10.6ms	remaining: 10.6s
1:	learn: 0.6751218	test: 0.6795291	best: 0.6795291 (1)	total: 15.3ms	remaining: 7.65s
2:	learn: 0.6623540	test: 0.6757978	best: 0.6757978 (2)	total: 20ms	remaining: 6.64s
3:	learn: 0.6450592	test: 0.6691197	best: 0.6691197 (3)	total: 23.8ms	remaining: 5.92s
4:	learn: 0.6312822	test: 0.6632218	best: 0.6632218 (4)	total: 29.4ms	remaining: 5.85s
5:	learn: 0.6206706	test: 0.6565087	best: 0.6565087 (5)	total: 33.8ms	remaining: 5.6s
6:	learn: 0.6104320	test: 0.6513378	best: 0.6513378 (6)	total: 38ms	remaining: 5.39s
7:	learn: 0.6042000	test: 0.6489105	best: 0.6489105 (7)	total: 41.3ms	remaining: 5.12s
8:	learn: 0.5916943	test: 0.6424353	best: 0.6424353 (8)	total: 46.9ms	remaining: 5.16s
9:	learn: 0.5811683	test: 0.6397641	best: 0.6397641 (9)	total: 52.9ms	remaining: 5.24s
10:	learn: 0.5706989	test: 0.6341967	best: 0.6341967 (10)	total: 58.2ms	remaining: 5.23s
11:	learn: 0.561

<catboost.core.CatBoostClassifier at 0x79d0fe535450>

In [57]:
predictions = model.predict(test_x)
print(predictions)

[1 0 0 0 1 1 0 0 1]


In [58]:
train_x.columns

Index(['latest_ecommerce_progress', 'bounces', 'time_on_site', 'pageviews',
       'source', 'medium', 'channelGrouping', 'deviceCategory', 'country'],
      dtype='object')

In [None]:
{
    "instances": [
        {
            "latest_ecommerce_progress": [0],
            "bounces": [1],
            "time_on_site": [0],
            "pageviews": [1],
            "source": ["google"],
            "medium": ["organic"],
            "channelGrouping": ["Organic Search"],
            "deviceCategory": ["mobile"],
            "country": ["Sri Lanka"]
        }
    ]
}