In [2]:
import pandas as pd
import numpy as np
import feature_engine
from sklearn.model_selection import train_test_split
import seaborn as sns

In [3]:
data = pd.read_csv(r'../../../data/train.csv')
data.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


In [4]:
soil_columns = [x for x in data.columns if x.startswith('Soil_Type')]

In [5]:
x = pd.DataFrame(data[soil_columns])
x.head()

Unnamed: 0,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
x.columns[np.where(x!=0)[1]]

Index(['Soil_Type29', 'Soil_Type29', 'Soil_Type12', 'Soil_Type30',
       'Soil_Type29', 'Soil_Type29', 'Soil_Type29', 'Soil_Type29',
       'Soil_Type29', 'Soil_Type29',
       ...
       'Soil_Type10', 'Soil_Type10', 'Soil_Type10', 'Soil_Type11',
       'Soil_Type11', 'Soil_Type4', 'Soil_Type4', 'Soil_Type4', 'Soil_Type4',
       'Soil_Type2'],
      dtype='object', length=15120)

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from feature_engine.encoding import RareLabelEncoder
# This function just makes sure that the object is fitted
from sklearn.utils.validation import check_is_fitted

In [25]:
class FromDummiesToCategories(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_operate, new_column_name):
        self.cols_to_operate = cols_to_operate
        self.new_column_name = new_column_name
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X1 = pd.DataFrame(X[self.columns])
        serie = X1.columns[np.where(X1!=0)[1]]
        X[self.new_column_name] = serie
        X.drop(self.columns, axis=1, inplace=True)
        return X

In [26]:
fdmtoc = FromDummiesToCategories('Soil_Type3', 'new_name')
fdmtoc

FromDummiesToCategories(cols_to_operate='Soil_Type3',
                        new_column_name='new_name')

In [10]:
x = fdmtoc.fit_transform(data)

In [11]:
x

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Cover_Type,new_name
0,1,2596,51,3,258,0,510,221,232,148,6279,1,0,0,0,5,Soil_Type29
1,2,2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,5,Soil_Type29
2,3,2804,139,9,268,65,3180,234,238,135,6121,1,0,0,0,2,Soil_Type12
3,4,2785,155,18,242,118,3090,238,238,122,6211,1,0,0,0,2,Soil_Type30
4,5,2595,45,2,153,-1,391,220,234,150,6172,1,0,0,0,5,Soil_Type29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15115,15116,2607,243,23,258,7,660,170,251,214,1282,0,0,1,0,3,Soil_Type4
15116,15117,2603,121,19,633,195,618,249,221,91,1325,0,0,1,0,3,Soil_Type4
15117,15118,2492,134,25,365,117,335,250,220,83,1187,0,0,1,0,3,Soil_Type4
15118,15119,2487,167,28,218,101,242,229,237,119,932,0,0,1,0,3,Soil_Type4


In [12]:
x['new_name'].value_counts(normalize=True)

Soil_Type10    0.141667
Soil_Type29    0.085384
Soil_Type3     0.063624
Soil_Type4     0.055754
Soil_Type23    0.050066
Soil_Type38    0.048148
Soil_Type30    0.047950
Soil_Type32    0.045635
Soil_Type39    0.043452
Soil_Type6     0.042989
Soil_Type2     0.041204
Soil_Type33    0.040741
Soil_Type17    0.040476
Soil_Type13    0.031481
Soil_Type40    0.030357
Soil_Type11    0.026852
Soil_Type1     0.023479
Soil_Type22    0.022817
Soil_Type31    0.021958
Soil_Type24    0.016997
Soil_Type12    0.015013
Soil_Type14    0.011177
Soil_Type5     0.010913
Soil_Type20    0.009193
Soil_Type16    0.007540
Soil_Type35    0.006746
Soil_Type18    0.003968
Soil_Type26    0.003571
Soil_Type19    0.003042
Soil_Type37    0.002249
Soil_Type34    0.001455
Soil_Type21    0.001058
Soil_Type27    0.000992
Soil_Type9     0.000661
Soil_Type36    0.000661
Soil_Type28    0.000595
Soil_Type8     0.000066
Soil_Type25    0.000066
Name: new_name, dtype: float64

In [13]:
from feature_engine.encoding import RareLabelEncoder

def load_titanic():
    data = pd.read_csv(
        'https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
    data = data.replace('?', np.nan)
    data['cabin'] = data['cabin'].astype(str).str[0]
    data['pclass'] = data['pclass'].astype('O')
    data['embarked'].fillna('C', inplace=True)
    return data

data = load_titanic()

# Separate into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['survived', 'name', 'ticket'], axis=1),
    data['survived'], test_size=0.3, random_state=0)

# set up the encoder
encoder = RareLabelEncoder(tol=0.03, n_categories=2, variables=['cabin', 'pclass', 'embarked'],
                           replace_with='Rare')

# fit the encoder
encoder.fit(X_train)

# transform the data
train_t = encoder.transform(X_train)
test_t = encoder.transform(X_test)

encoder.encoder_dict_

{'cabin': Index(['n', 'C', 'B', 'E', 'D'], dtype='object'),
 'pclass': Int64Index([3, 1, 2], dtype='int64'),
 'embarked': Index(['S', 'C', 'Q'], dtype='object')}

In [14]:
X_train['cabin'].value_counts(normalize=True)

n    0.766376
C    0.077511
B    0.045852
E    0.034934
D    0.034934
A    0.018559
F    0.016376
G    0.004367
T    0.001092
Name: cabin, dtype: float64

In [15]:
train_t['cabin'].value_counts(normalize=True)

n       0.766376
C       0.077511
B       0.045852
Rare    0.040393
E       0.034934
D       0.034934
Name: cabin, dtype: float64

In [16]:
test_t['cabin'].value_counts(normalize=True)

n       0.793893
B       0.058524
C       0.058524
D       0.035623
Rare    0.030534
E       0.022901
Name: cabin, dtype: float64