In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from ml_tooling.transformers import RareFeatureEncoder, ToCategorical,Select
# Default (width, height) applied to every matplotlib figure drawn below.
FIGURE_SIZE = (10, 5)
plt.rcParams['figure.figsize'] = FIGURE_SIZE

## ML Tooling - Rare Feature Encoder
https://ml-tooling.readthedocs.io/en/stable/transformers.html#rarefeatureencoder

#### General 
- Values whose frequency is below the threshold are treated as rare (rare values = frequency < threshold) and are replaced with the `fill_rare` value.

- NaN values are not included; they should be transformed beforehand.

- The threshold can be an integer count (e.g. `200`) or a percentage given as a fraction (e.g. `0.02`).

- Can be used in cross validation and gridsearch.

- Can handle columns that are missing from either the train set or the test set.

- Column name needs to be identical.


## Data 
https://www.kaggle.com/c/petfinder-adoption-prediction/data



### Load the data

In [None]:
# Load the PetFinder training set; the CSV is expected in the working directory.
data = pd.read_csv('train_PetFinder.csv')
# Report (rows, columns). The format string needs a '{}' placeholder,
# otherwise str.format() silently drops the argument and only 'Shape' is printed.
print('Shape: {}'.format(data.shape))
# Preview the first two rows as the cell output.
data.head(2)

In [None]:
# Histogram of the Breed1 column, bucketed into 30 bins.
data['Breed1'].hist(bins=30)
plt.show()

In [None]:
# Absolute number of rows per distinct Breed1 value.
data['Breed1'].value_counts()

In [None]:
# Same counts expressed as a share of all rows (useful for picking a fractional threshold).
data['Breed1'].value_counts(normalize=True)

## Transform Breed1

### Without Rare Feature Encoder

In [None]:
# Baseline pipeline without rare-value handling:
# the 'select' step is given the Breed1 column name, then ToCategorical is applied.
breed1_steps = [
    ('select', Select('Breed1')),
    ('categorical', ToCategorical()),
]
breed1 = Pipeline(steps=breed1_steps)

In [None]:
# Fit the baseline pipeline on the full frame and preview the first rows of the result.
breed1_encoded = breed1.fit_transform(data)
breed1_encoded.head()

### With Rare Feature Encoder

#### Threshold as an absolute count

In [None]:
# Same pipeline with a RareFeatureEncoder step inserted before ToCategorical.
# The threshold is an integer here (200); fill_rare='Rare' is the label passed
# for rare values (presumably the replacement value; confirm in the ml_tooling docs).
breed1_rare_steps = [
    ('select', Select('Breed1')),
    ('rare', RareFeatureEncoder(threshold=200, fill_rare='Rare')),
    ('categorical', ToCategorical()),
]
breed1_rare = Pipeline(steps=breed1_rare_steps)

In [None]:
# Fit the count-threshold pipeline and display the transformed output.
breed1_rare_encoded = breed1_rare.fit_transform(data)
breed1_rare_encoded

#### Threshold as a percentage

In [None]:
# Variant of the rare-encoding pipeline where the threshold is a fraction (0.02)
# instead of an absolute count.
breed1_rare_percent_steps = [
    ('select', Select('Breed1')),
    ('rare', RareFeatureEncoder(threshold=0.02, fill_rare='Rare')),
    ('categorical', ToCategorical()),
]
breed1_rare_percent = Pipeline(steps=breed1_rare_percent_steps)

In [None]:
# Fit the fractional-threshold pipeline and display the transformed output.
breed1_rare_percent_encoded = breed1_rare_percent.fit_transform(data)
breed1_rare_percent_encoded