# Reducing category size
Hypothesis: fewer categorical values -> fewer split possibilities in tree-based learners -> less overfitting and quicker learning?

## Load data

In [1]:
import pandas as pd

In [2]:
# Read the training and test sets from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,A,I,A,B,B,BI,A,S,Q,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,A,I,A,A,E,BI,K,W,AD,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,A,K,A,A,E,BI,A,E,BM,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,A,K,A,C,E,BI,A,Y,AD,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,A,I,G,B,E,BI,C,G,Q,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1


In [3]:
# Collect the names of all training columns whose name begins with 'cat'.
cat_cols = list(filter(lambda col_name: col_name.startswith('cat'), train.columns))
cat_cols

['cat0',
 'cat1',
 'cat2',
 'cat3',
 'cat4',
 'cat5',
 'cat6',
 'cat7',
 'cat8',
 'cat9',
 'cat10',
 'cat11',
 'cat12',
 'cat13',
 'cat14',
 'cat15',
 'cat16',
 'cat17',
 'cat18']

In [4]:
# The 'target' column is the label; every column except 'id' and 'target' is a feature.
y_train = train['target']
x_train = train.drop(['id', 'target'], axis=1)

## Reduction of columns in one hot encoding by category size reduction

In [5]:
from categorical_transform import CategoricalTransform, OneHotTransform
from sklearn.pipeline import Pipeline
import pandas as pd

In [6]:
# Thresholds tried as CategoricalTransform's min_data_portion
# (presumably the minimum share of rows a category needs to be kept - confirm
# against categorical_transform).
min_cat_size = [0, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]


def _encoded_width(portion):
    """Return the column count of x_train after category reduction and one-hot encoding."""
    steps = [('categorical_transform', CategoricalTransform(cat_cols, min_data_portion=portion)),
             ('one hot', OneHotTransform())]
    return Pipeline(steps).fit_transform(x_train).shape[1]


# One column count per threshold, in the same order as min_cat_size.
data_size = [_encoded_width(portion) for portion in min_cat_size]
pd.DataFrame({'min_cat_size': min_cat_size, 'num_columns': data_size})

Unnamed: 0,min_cat_size,num_columns
0,0.0,634
1,0.0005,399
2,0.001,297
3,0.005,186
4,0.01,150
5,0.05,70
6,0.1,48


## Analysis Tools

In [7]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# Shared 5-fold splitter used by every grid search below. Fixing random_state makes the
# shuffled folds reproducible across runs and identical for every model, so the AUC
# tables of the different learners are computed on the same splits.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
def analyse_cat_size(pipe, x_train=x_train, y_train = y_train):
    """Grid-search the ``min_data_portion`` of a pipeline's ``trans`` step.

    Every value in ``min_cat_size`` is evaluated with the shared ``kfold``
    splitter and scored by ROC AUC.

    Parameters
    ----------
    pipe : estimator
        Pipeline whose category-reduction step is named ``'trans'`` and
        exposes a ``min_data_portion`` parameter.
    x_train, y_train :
        Features and labels to fit on. The defaults are the notebook-level
        ``x_train`` / ``y_train`` as they were when this cell was executed.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated candidate with columns ``cat_size``,
        ``fit_time`` (mean fit time over folds) and ``auc`` (mean test score
        over folds).
    """
    param_name = 'trans__min_data_portion'
    gscv = GridSearchCV(pipe, {param_name: min_cat_size}, cv=kfold, scoring='roc_auc', verbose=1)
    gscv.fit(x_train, y_train)
    results = gscv.cv_results_
    # Take the thresholds from the candidates the search actually evaluated rather than
    # from the global list, so each row's label is guaranteed to match its scores.
    evaluated_sizes = [candidate[param_name] for candidate in results['params']]
    return pd.DataFrame(dict(cat_size=evaluated_sizes,
                             fit_time=results['mean_fit_time'],
                             auc=results['mean_test_score']))

## Lightgbm default model performance with different min_cat_sizes

In [9]:
from lightgbm.sklearn import LGBMClassifier

In [10]:
# LightGBM classifier (only n_jobs is set) behind the category-reduction step.
# The step is named "trans" because analyse_cat_size tunes trans__min_data_portion.
lgbm_steps = [
    ("trans", CategoricalTransform(cat_cols)),
    ("lgbm", LGBMClassifier(n_jobs=-2)),
]
lightgbm_pipe = Pipeline(steps=lgbm_steps)

In [11]:
# Cross-validated AUC and fit time of the LightGBM pipeline for each threshold in min_cat_size.
analyse_cat_size(pipe=lightgbm_pipe)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


Unnamed: 0,cat_size,fit_time,auc
0,0.0,2.1657,0.8914
1,0.0005,2.191659,0.891332
2,0.001,2.181433,0.891659
3,0.005,2.140324,0.891567
4,0.01,2.161886,0.891188
5,0.05,2.042021,0.888094
6,0.1,1.974346,0.886271


## Catboost performance

In [12]:
from catboost import CatBoostClassifier
from categorical_transform import IntegerCategoricalTransform
# CatBoost is told which columns are categorical via cat_features; verbose=0 silences
# its training log.
cbc = CatBoostClassifier(verbose=0, thread_count=6, cat_features=cat_cols)
# The "trans" step name is what analyse_cat_size addresses as trans__min_data_portion.
catboost_pipe = Pipeline(steps=[
    ("trans", IntegerCategoricalTransform(cat_cols)),
    ("catboost", cbc),
])

In [13]:
# Cross-validated AUC and fit time of the CatBoost pipeline for each threshold in min_cat_size.
analyse_cat_size(pipe=catboost_pipe)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


Unnamed: 0,cat_size,fit_time,auc
0,0.0,233.908585,0.894194
1,0.0005,232.178789,0.89424
2,0.001,230.55374,0.894172
3,0.005,226.410832,0.893885
4,0.01,225.874341,0.893526
5,0.05,222.777288,0.891076
6,0.1,187.820731,0.889827


## Default xgboost, integer encoding

In [12]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from categorical_transform import IntegerCategoricalTransform

# XGBoost fed with the output of IntegerCategoricalTransform.
xgb = XGBClassifier(eval_metric='auc', use_label_encoder=False, n_jobs=-2)
# The "trans" step name is what analyse_cat_size addresses as trans__min_data_portion.
xgb_pipe = Pipeline(steps=[
    ('trans', IntegerCategoricalTransform(cat_cols=cat_cols)),
    ('xgboost', xgb),
])

In [13]:
# Cross-validated AUC and fit time of the integer-encoded XGBoost pipeline per threshold.
analyse_cat_size(pipe=xgb_pipe)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


Unnamed: 0,cat_size,fit_time,auc
0,0.0,21.552116,0.888785
1,0.0005,21.57933,0.888817
2,0.001,21.694252,0.8891
3,0.005,21.741728,0.889012
4,0.01,21.667129,0.888755
5,0.05,21.302458,0.886827
6,0.1,21.009021,0.885818


## Default xgboost, one-hot encoding

In [14]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from categorical_transform import CategoricalTransform, OneHotTransform

# Same XGBoost settings as the integer-encoded run, but with a one-hot step after the
# category reduction. Note: this rebinds the names xgb and xgb_pipe.
xgb = XGBClassifier(eval_metric='auc', use_label_encoder=False, n_jobs=-2)
xgb_onehot_steps = [
    ('trans', CategoricalTransform(cat_cols=cat_cols)),
    ('oht', OneHotTransform()),
    ('xgboost', xgb),
]
xgb_pipe = Pipeline(steps=xgb_onehot_steps)

In [None]:
# Cross-validated AUC and fit time of the one-hot XGBoost pipeline per threshold.
analyse_cat_size(pipe=xgb_pipe)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


One-hot encoding is slightly better than ordinal encoding. Reducing category size speeds up training significantly.

## Default random forest, integer encoding

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from categorical_transform import IntegerCategoricalTransform

# Random forest (only n_jobs is set) fed with the output of IntegerCategoricalTransform.
rf = RandomForestClassifier(n_jobs=-2)
# The "trans" step name is what analyse_cat_size addresses as trans__min_data_portion.
rf_pipe = Pipeline(steps=[
    ('trans', IntegerCategoricalTransform(cat_cols=cat_cols)),
    ('rf', rf),
])

In [None]:
# Cross-validated AUC and fit time of the integer-encoded random forest per threshold.
analyse_cat_size(pipe=rf_pipe)

## Random forest, one-hot encoding

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from categorical_transform import CategoricalTransform, OneHotTransform

# Same random forest settings as the integer-encoded run, but with a one-hot step after
# the category reduction. Note: this rebinds the names rf and rf_pipe.
rf = RandomForestClassifier(n_jobs=-2)
rf_onehot_steps = [
    ('trans', CategoricalTransform(cat_cols=cat_cols)),
    ('oht', OneHotTransform()),
    ('rf', rf),
]
rf_pipe = Pipeline(steps=rf_onehot_steps)

In [None]:
# Cross-validated AUC and fit time of the one-hot random forest per threshold.
analyse_cat_size(pipe=rf_pipe)

One-hot encoding is slightly better than ordinal encoding. Reducing category size speeds up training slightly.

## Categorical naive bayes

In [None]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.pipeline import Pipeline
from categorical_transform import NonNegativeIntegerCategoricalTransform

# Categorical naive Bayes with default settings, fed by
# NonNegativeIntegerCategoricalTransform.
catnb = CategoricalNB()
# The "trans" step name is what analyse_cat_size addresses as trans__min_data_portion.
catnb_pipe = Pipeline(steps=[
    ('trans', NonNegativeIntegerCategoricalTransform(cat_cols=cat_cols)),
    ('catnb', catnb),
])

In [None]:
# Only the categorical columns are passed to this pipeline; the remaining feature
# columns are left out.
analyse_cat_size(pipe=catnb_pipe, x_train=x_train[cat_cols])

## Simple MLP classifier

In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from categorical_transform import CategoricalTransform, OneHotTransform

# MLP with default settings on one-hot encoded, category-reduced features.
mlp = MLPClassifier()
# The "trans" step name is what analyse_cat_size addresses as trans__min_data_portion.
mlp_steps = [
    ('trans', CategoricalTransform(cat_cols=cat_cols)),
    ('oht', OneHotTransform()),
    ('mlp', mlp),
]
mlp_pipe = Pipeline(steps=mlp_steps)

In [10]:
# Cross-validated AUC and fit time of the MLP pipeline for each threshold in min_cat_size.
analyse_cat_size(pipe=mlp_pipe, x_train=x_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits




Unnamed: 0,cat_size,fit_time,auc
0,0.0,1424.134396,0.843667
1,0.0005,918.262694,0.845433
2,0.001,710.710654,0.851954
3,0.005,557.310637,0.865078
4,0.01,513.42423,0.869841
5,0.05,290.348982,0.879277
6,0.1,292.237727,0.879355


## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from categorical_transform import CategoricalTransform, OneHotTransform
from sklearn.pipeline import Pipeline

# Logistic regression on one-hot encoded, category-reduced features.
# The "trans" step name is what analyse_cat_size addresses as trans__min_data_portion.
# Fix: the ('lr', ...) tuple was missing its closing parenthesis, which made this cell
# a SyntaxError.
lr_pipe = Pipeline([('trans', CategoricalTransform(cat_cols=cat_cols)),
                    ('oht', OneHotTransform()),
                    ('lr', LogisticRegression(n_jobs=-2))])

In [None]:
# Cross-validated AUC and fit time of the logistic-regression pipeline per threshold.
analyse_cat_size(pipe=lr_pipe, x_train=x_train)