# Reducing category size
Fewer categorical values -> fewer split possibilities in tree-based learners -> less overfitting and quicker learning?

## Load data

In [1]:
import pandas as pd

In [2]:
# Read the train and test CSV files from the working directory,
# then preview the first rows of the training frame.
train, test = map(pd.read_csv, ('train.csv', 'test.csv'))
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,A,I,A,B,B,BI,A,S,Q,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,A,I,A,A,E,BI,K,W,AD,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,A,K,A,A,E,BI,A,E,BM,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,A,K,A,C,E,BI,A,Y,AD,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,A,I,G,B,E,BI,C,G,Q,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1


In [3]:
# Names of every column whose label starts with 'cat', in frame order.
cat_cols = train.columns[train.columns.str.startswith('cat')].tolist()
cat_cols

['cat0',
 'cat1',
 'cat2',
 'cat3',
 'cat4',
 'cat5',
 'cat6',
 'cat7',
 'cat8',
 'cat9',
 'cat10',
 'cat11',
 'cat12',
 'cat13',
 'cat14',
 'cat15',
 'cat16',
 'cat17',
 'cat18']

In [4]:
# Label vector.
y_train = train['target']
# Feature frame: everything except the row identifier and the label.
x_train = train.drop(['id', 'target'], axis=1)

## Reduction of columns in one hot encoding by category size reduction

In [5]:
from categorical_transform import CategoricalTransform, OneHotTransform
from sklearn.pipeline import Pipeline
import pandas as pd

NameError: name 'CategoricalTransform' is not defined

In [None]:
# Candidate values for CategoricalTransform's `min_data_portion`
# (presumably the minimum share of rows a category must cover to be kept --
# confirm in categorical_transform).
min_cat_size = [0, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]


def count_one_hot_columns(portion):
    """Return the column count of x_train after category reduction and one-hot encoding."""
    steps = [('categorical_transform', CategoricalTransform(cat_cols, min_data_portion=portion)),
             ('one hot', OneHotTransform())]
    return Pipeline(steps).fit_transform(x_train).shape[1]


# One column count per candidate threshold, in the same order as min_cat_size.
data_size = [count_one_hot_columns(portion) for portion in min_cat_size]
pd.DataFrame({'min_cat_size': min_cat_size, 'num_columns': data_size})

## Analysis Tools

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# Shared 5-fold splitter for every experiment below. The fixed random_state makes
# the shuffled folds identical on each call, so AUCs of different pipelines are
# measured on the same splits and the notebook is reproducible on re-run.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
def analyse_cat_size(pipe, x_train=x_train, y_train=y_train, cat_sizes=None):
    """Grid-search the ``trans`` step's ``min_data_portion`` and tabulate the outcome.

    Parameters
    ----------
    pipe : sklearn Pipeline
        Must contain a step named ``trans`` exposing a ``min_data_portion`` parameter.
    x_train, y_train
        Training features and target. The defaults are the notebook-level
        ``x_train`` / ``y_train`` as they were when this cell was executed.
    cat_sizes : sequence of float, optional
        Candidate ``min_data_portion`` values. Defaults to the notebook-level
        ``min_cat_size`` list.

    Returns
    -------
    pandas.DataFrame
        One row per candidate with columns ``cat_size``, ``fit_time``
        (mean fit time over the folds) and ``auc`` (mean test ROC AUC).
    """
    if cat_sizes is None:
        cat_sizes = min_cat_size
    param_name = 'trans__min_data_portion'
    gscv = GridSearchCV(pipe, {param_name: list(cat_sizes)}, cv=kfold, scoring='roc_auc', verbose=1)
    gscv.fit(x_train, y_train)
    results = gscv.cv_results_
    # Read the evaluated values back from cv_results_ so each row is guaranteed to
    # pair a threshold with its own timings/scores, instead of assuming the result
    # order matches an external list.
    return pd.DataFrame({
        'cat_size': [params[param_name] for params in results['params']],
        'fit_time': results['mean_fit_time'],
        'auc': results['mean_test_score'],
    })

## Lightgbm default model performance with different min_cat_sizes

In [None]:
from lightgbm.sklearn import LGBMClassifier

In [None]:
# LightGBM (only n_jobs set, otherwise defaults) behind the category-reduction step.
# The step is named "trans" because analyse_cat_size tunes trans__min_data_portion.
lightgbm_pipe = Pipeline(steps=[
    ("trans", CategoricalTransform(cat_cols)),
    ("lgbm", LGBMClassifier(n_jobs=-2)),
])

In [None]:
# Cross-validated fit time and ROC AUC per minimum category size, LightGBM pipeline.
analyse_cat_size(pipe=lightgbm_pipe)

## Catboost performance

In [None]:
from catboost import CatBoostClassifier
from categorical_transform import IntegerCategoricalTransform
# CatBoost is told which columns are categorical and runs silently on 6 threads.
cbc = CatBoostClassifier(verbose=0, thread_count=6, cat_features=cat_cols)
# Category reduction ("trans", tuned by analyse_cat_size) followed by the classifier.
catboost_pipe = Pipeline(steps=[
    ("trans", IntegerCategoricalTransform(cat_cols)),
    ("catboost", cbc),
])

In [None]:
# Cross-validated fit time and ROC AUC per minimum category size, CatBoost pipeline.
analyse_cat_size(pipe=catboost_pipe)

## Default xgboost, integer encoding

In [None]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from categorical_transform import IntegerCategoricalTransform

# XGBoost evaluated on AUC, fed integer-encoded categories.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='auc', n_jobs=-2)
# "trans" is the step whose min_data_portion analyse_cat_size sweeps.
xgb_pipe = Pipeline(steps=[
    ('trans', IntegerCategoricalTransform(cat_cols=cat_cols)),
    ('xgboost', xgb),
])

In [None]:
# Cross-validated fit time and ROC AUC per minimum category size, XGBoost with integer encoding.
analyse_cat_size(pipe=xgb_pipe)

## Default xgboost, one-hot encoding

In [None]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from categorical_transform import CategoricalTransform, OneHotTransform

# Same XGBoost settings as the integer-encoded run; only the encoding differs.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='auc', n_jobs=-2)
# Note: this rebinds xgb_pipe, replacing the integer-encoded pipeline defined earlier.
xgb_pipe = Pipeline(steps=[
    ('trans', CategoricalTransform(cat_cols=cat_cols)),
    ('oht', OneHotTransform()),
    ('xgboost', xgb),
])

In [None]:
# Cross-validated fit time and ROC AUC per minimum category size, XGBoost with one-hot encoding.
analyse_cat_size(pipe=xgb_pipe)

One-hot encoding performs slightly better than ordinal (integer) encoding. Reducing category size speeds up training significantly.

## Randomforest default integer encoding

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from categorical_transform import IntegerCategoricalTransform

# Random forest on integer-encoded categories. The forest is stochastic
# (bootstrap samples, random feature subsets), so random_state is pinned to make
# the reported AUCs reproducible and comparable with the one-hot run.
rf = RandomForestClassifier(n_jobs=-2, random_state=0)
# "trans" is the step whose min_data_portion analyse_cat_size sweeps.
rf_pipe = Pipeline(steps=[
    ('trans', IntegerCategoricalTransform(cat_cols=cat_cols)),
    ('rf', rf),
])

In [None]:
# Cross-validated fit time and ROC AUC per minimum category size, random forest with integer encoding.
analyse_cat_size(pipe=rf_pipe)

## Randomforest one hot encoding

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from categorical_transform import CategoricalTransform, OneHotTransform

# Random forest on one-hot encoded categories. random_state is pinned because the
# forest is stochastic; without it the small AUC differences between encodings
# could not be reproduced on re-run.
rf = RandomForestClassifier(n_jobs=-2, random_state=0)
# Note: this rebinds rf_pipe, replacing the integer-encoded pipeline defined earlier.
rf_pipe = Pipeline(steps=[
    ('trans', CategoricalTransform(cat_cols=cat_cols)),
    ('oht', OneHotTransform()),
    ('rf', rf),
])

In [None]:
# Cross-validated fit time and ROC AUC per minimum category size, random forest with one-hot encoding.
analyse_cat_size(pipe=rf_pipe)

One-hot encoding performs slightly better than ordinal (integer) encoding. Reducing category size speeds up training slightly.

## Categorical naive bayes

In [None]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.pipeline import Pipeline
from categorical_transform import NonNegativeIntegerCategoricalTransform

# Categorical naive Bayes with default settings.
catnb = CategoricalNB()
# "trans" is the step whose min_data_portion analyse_cat_size sweeps.
catnb_pipe = Pipeline(steps=[
    ('trans', NonNegativeIntegerCategoricalTransform(cat_cols=cat_cols)),
    ('catnb', catnb),
])

In [None]:
# Only the categorical columns are passed in for this model; the remaining
# feature columns of x_train are left out of the CategoricalNB run.
analyse_cat_size(pipe=catnb_pipe, x_train=x_train.loc[:, cat_cols])