In [13]:
import numpy as np
import pandas as pd
import polars as pl
from category_encoders import CatBoostEncoder
from catboost import CatBoostRegressor , Pool, cv
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split


In [2]:
def train_test_split(dataframe, target_colnames, train_ratio = 0.8, seed = None):
    """Shuffle a polars DataFrame and split it into train/test features and targets.

    Note: this definition shadows ``sklearn.model_selection.train_test_split``
    imported at the top of the notebook, and its return order
    (``X_train, y_train, X_test, y_test``) differs from scikit-learn's.

    Parameters
    ----------
    dataframe : polars.DataFrame
        Full dataset containing both feature and target columns.
    target_colnames : str | list[str] | tuple[str, ...]
        Name(s) of the target column(s). A single string is treated as a
        one-element list.
    train_ratio : float, default 0.8
        Fraction of rows (after shuffling) assigned to the training split.
        Must lie in ``[0, 1]``.
    seed : int | None, default None
        Seed forwarded to ``DataFrame.sample`` so the shuffle is reproducible.
        ``None`` keeps the previous behaviour (a fresh random shuffle each call).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, y_train, X_test, y_test)``.

    Raises
    ------
    TypeError
        If ``target_colnames`` is not a string, list or tuple.
    ValueError
        If ``train_ratio`` is outside ``[0, 1]``.
    """
    # Validate *before* normalising: the original check ran after the string had
    # already been wrapped in a list, so it could never see a `str`.
    if isinstance(target_colnames, str):
        target_colnames = [target_colnames]
    elif isinstance(target_colnames, (list, tuple)):
        target_colnames = list(target_colnames)
    else:
        raise TypeError(
            "target_colnames must be a str, list or tuple, "
            f"got {type(target_colnames).__name__}"
        )

    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio}")

    # fraction=1.0 with shuffle=True returns every row in random order.
    shuffled_dataset = dataframe.sample(fraction=1.0, shuffle=True, seed=seed)
    upper_train_index = int(len(shuffled_dataset) * train_ratio)

    train_dataset = shuffled_dataset[:upper_train_index]
    test_dataset = shuffled_dataset[upper_train_index:]

    X_train, y_train = train_dataset.drop(target_colnames), train_dataset.select(target_colnames)
    X_test, y_test = test_dataset.drop(target_colnames), test_dataset.select(target_colnames)

    # Downstream consumers (CatBoost Pool) are fed pandas frames.
    return X_train.to_pandas(), y_train.to_pandas(), X_test.to_pandas(), y_test.to_pandas()


In [3]:
def ordered_encoding(data, categorical_colnames, target_colname, keep_not_encoded=False):
    """Ordered (running) target encoding of categorical columns with polars.

    For every row and every categorical column the encoded value is

        (sum of the target over *earlier* rows with the same category + global target mean)
        / (number of *earlier* rows with the same category + 1)

    i.e. a running target mean that excludes the current row and is smoothed
    towards the global mean with weight 1. The notebook compares the result with
    ``category_encoders.CatBoostEncoder``.

    Parameters
    ----------
    data : polars.DataFrame
        Frame holding the categorical columns and the target column.
    categorical_colnames : str | list[str] | tuple[str, ...]
        Column(s) to encode. A single string is treated as a one-element list.
    target_colname : str
        Name of the numeric target column.
    keep_not_encoded : bool, default False
        If False, the encoded values replace the original columns (column order
        of ``data`` is preserved). If True, the originals are kept and the
        encodings are appended as ``"<col>_encoded"``.

    Returns
    -------
    polars.DataFrame
        A new frame; ``data`` itself is not modified.
    """
    # Accept the same str-or-sequence input that train_test_split accepts.
    if isinstance(categorical_colnames, str):
        categorical_colnames = [categorical_colnames]
    else:
        categorical_colnames = list(categorical_colnames)

    # Nothing to encode: hand back an independent copy of the input.
    if not categorical_colnames:
        return data.clone()

    def alias_label(col_name):
        return f"{col_name}_encoded" if keep_not_encoded else col_name

    # Work on just the needed columns, with categories normalised to strings.
    working = data.select(categorical_colnames + [target_colname]).with_columns(
        pl.col(categorical_colnames).cast(pl.Utf8)
    )

    target = pl.col(target_colname)
    global_mean = target.mean()

    encodings = []
    for col_name in categorical_colnames:
        # Running sum within the category, minus the current row's own target.
        prior_sum = target.cum_sum().over(col_name) - target
        # NOTE(review): relies on `cum_count` being 1-based, so subtracting 1 gives
        # the number of earlier rows — confirm against the installed polars version.
        prior_count = target.cum_count().over(col_name) - 1
        # Each encoding is a single aliased expression: no intermediate
        # "<col>_cum_sum"/"<col>_cum_count" helper columns that could collide with
        # real column names, and the alias is applied after all arithmetic.
        encodings.append(
            ((prior_sum + global_mean) / (prior_count + 1)).alias(alias_label(col_name))
        )

    encoded = working.select(encodings)

    if not keep_not_encoded:
        # Swap the raw categoricals for their encodings, restoring the input order.
        return data.drop(categorical_colnames).hstack(encoded).select(data.columns)

    return data.hstack(encoded)

In [4]:
def generate_dataset(num_samples=20, seed=None):
    """Build a small synthetic dataset with two categorical and two numeric features.

    Parameters
    ----------
    num_samples : int, default 20
        Number of rows to generate.
    seed : int | None, default None
        If given, a dedicated ``np.random.RandomState(seed)`` is used so the
        output is reproducible. If None, values are drawn from NumPy's global
        random state (the previous behaviour), so ``np.random.seed`` still applies.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(features, targets, data)`` where ``features`` is ``data`` without the
        ``'Label'`` column, ``targets`` is the single-column ``'Label'`` frame
        and ``data`` is the full frame.
    """
    # The `np.random` module and `RandomState` expose the same choice/randint/uniform
    # API, so the draws below are identical in form either way.
    rng = np.random if seed is None else np.random.RandomState(seed)

    data = pd.DataFrame({
        'Gender': rng.choice(['Male', 'Female', 'Other'], num_samples),
        'Occupation': rng.choice(['Doctor', 'Engineer', 'Teacher', 'Artist'], num_samples),
        'Age': rng.randint(20, 60, num_samples),
        'Income': rng.randint(30000, 100000, num_samples),
        'Label': rng.uniform(0, 1, num_samples)
    })

    features = data.drop(columns=['Label'])
    targets = data[['Label']]

    return features, targets, data

In [5]:
# Seed NumPy's global RNG so the synthetic data is identical on every
# Restart & Run All (generate_dataset draws from np.random).
np.random.seed(42)
features, targets, data = generate_dataset()

In [6]:
# Categorical columns to encode; `col` is reused by later cells.
col = ["Gender", "Occupation"]

# Reference implementation: category_encoders' CatBoostEncoder, fitted and
# applied to the same data with the target supplied.
encoder = CatBoostEncoder(cols=col, return_df=True)
encoder.fit_transform(features, targets)

Unnamed: 0,Gender,Occupation,Age,Income
0,0.406547,0.406547,59,49194
1,0.473378,0.406547,54,44174
2,0.330796,0.473378,30,90604
3,0.406547,0.22609,29,92309
4,0.406547,0.525873,52,37804
5,0.405813,0.176304,38,90939
6,0.241639,0.406547,30,60455
7,0.512026,0.370098,52,70244
8,0.439356,0.548781,56,33125
9,0.373423,0.278804,26,93864


In [7]:
# Same data as a polars frame, run through the hand-written encoder so the
# output can be compared with the CatBoostEncoder table.
test_data = pl.from_pandas(data)
ordered_encoding(
    test_data,
    categorical_colnames=col,
    target_colname="Label",
    keep_not_encoded=False,
)

Gender,Occupation,Age,Income,Label
f64,f64,i64,i64,f64
0.406547,0.406547,59,49194,0.540208
0.473378,0.406547,54,44174,0.045633
0.330796,0.473378,30,90604,0.630863
0.406547,0.22609,29,92309,0.076732
0.406547,0.525873,52,37804,0.617505
…,…,…,…,…
0.352842,0.291306,43,65996,0.452719
0.36711,0.51282,42,91355,0.498406
0.383522,0.318208,56,62088,0.968013
0.448465,0.411038,32,88075,0.353439


In [12]:
# Notebook-local split helper: note the (X_train, y_train, X_test, y_test) order.
X_train, y_train, X_test, y_test = train_test_split(test_data, "Label")

# Wrap each split in a CatBoost Pool, flagging the categorical columns.
train_pool = Pool(data=X_train, label=y_train, cat_features=col)
test_pool = Pool(data=X_test, label=y_test, cat_features=col)

In [31]:
# CatBoost settings: a single iteration, depth 8, RMSE loss, verbose off.
params = dict(
    iterations=1,
    depth=8,
    loss_function="RMSE",
    verbose=False,
)

# 5-fold cross-validation on the training pool, without the interactive plot.
scores = cv(pool=train_pool, params=params, fold_count=5, plot=False)

Training on fold [0/5]

bestTest = 0.4698557627
bestIteration = 0

Training on fold [1/5]

bestTest = 0.413998732
bestIteration = 0

Training on fold [2/5]

bestTest = 0.6380394642
bestIteration = 0

Training on fold [3/5]

bestTest = 0.2823167964
bestIteration = 0

Training on fold [4/5]

bestTest = 0.4786517408
bestIteration = 0

