## Tree discretizer

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

from feature_engine.discretisation import DecisionTreeDiscretiser

### Regression

In [16]:
URL = 'http://jse.amstat.org/v19n3/decock/AmesHousing.xls'

data = pd.read_excel(URL)
data.columns = data.columns.str.replace(' ', '')

X_train, X_test, y_train, y_test =  train_test_split(
            data.drop(['PID', 'SalePrice'], axis=1),
            data['SalePrice'], test_size=0.3, random_state=0)

In [17]:
# Creating a discretizer
reg_disc = DecisionTreeDiscretiser(
        cv=3,
        scoring='neg_mean_squared_error',
        variables=['LotArea', 'GrLivArea'],
        regression=True
)

# Fitting the transformer
reg_disc.fit(X_train, y_train)

# Data transformation
pd.concat([
  X_train[['LotArea', 'GrLivArea']].add_suffix('_orig'),
  reg_disc.transform(X_train)[['LotArea', 'GrLivArea']].add_suffix('_disc')
], axis=1).sort_index(axis=1)

Unnamed: 0,GrLivArea_disc,GrLivArea_orig,LotArea_disc,LotArea_orig
1928,166900.364821,1411,212915.490506,12198
2497,124241.280510,941,174893.294910,8789
261,124241.280510,894,174893.294910,10650
1775,166900.364821,1369,174893.294910,9910
2587,124241.280510,1005,174893.294910,11050
...,...,...,...,...
763,148185.548077,1200,174893.294910,10800
835,124241.280510,1040,212915.490506,12464
1653,196873.870293,1725,212915.490506,11584
2607,124241.280510,1086,212915.490506,13400


In [27]:
reg_disc.binner_dict_['LotArea'].estimator.decision_path(X_train)



ValueError: could not convert string to float: 'RL'

In [45]:
X_train[['LotArea', 'GrLivArea']]

Unnamed: 0,LotArea,GrLivArea
1928,12198,1411
2497,8789,941
261,10650,894
1775,9910,1369
2587,11050,1005
...,...,...
763,10800,1200
835,12464,1040
1653,11584,1725
2607,13400,1086


### Classification

In [29]:
iris = load_iris()

In [30]:
# https://stackoverflow.com/questions/38105539/how-to-convert-a-scikit-learn-dataset-to-a-pandas-dataset
data = pd.DataFrame(
    data= np.c_[iris['data'], iris['target']],
    columns= iris['feature_names'] + ['target']
)
data.columns = data.columns.str[:-5]
data.columns = data.columns.str.replace(' ', '_')

In [31]:
X, y = data.iloc[:, :4], data.iloc[:, 4:]
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3, random_state=0)
X_cols = X.columns.tolist()

In [32]:
# Creating a discretizer
class_disc = DecisionTreeDiscretiser(
        cv=3,
        scoring='accuracy',
        variables=X_cols,
        regression=False
)

# Fitting the transformer
class_disc.fit(X_train, y_train)

In [33]:
pd.concat([
  X_train[X_cols].add_suffix('_orig'),
  class_disc.transform(X_train)[X_cols].add_suffix('_disc')
], axis=1).sort_index(axis=1)

Unnamed: 0,petal_length_disc,petal_length_orig,petal_width_disc,petal_width_orig,sepal_length_disc,sepal_length_orig,sepal_width_disc,sepal_width_orig
60,0.911765,3.5,0.885714,1.0,0.125000,5.0,1.000000,2.0
116,0.027027,5.5,0.027778,1.8,0.296296,6.5,0.250000,3.0
144,0.027027,5.7,0.027778,2.5,0.296296,6.7,0.200000,3.3
119,0.027027,5.0,0.885714,1.5,0.296296,6.0,0.500000,2.2
108,0.027027,5.8,0.027778,1.8,0.296296,6.7,0.434783,2.5
...,...,...,...,...,...,...,...,...
9,0.000000,1.5,0.000000,0.1,0.125000,4.9,0.250000,3.1
103,0.027027,5.6,0.027778,1.8,0.296296,6.3,0.800000,2.9
67,0.911765,4.1,0.885714,1.0,0.296296,5.8,0.434783,2.7
117,0.027027,6.7,0.027778,2.2,0.296296,7.7,0.000000,3.8


In [28]:
np.unique(y_train)

array([0., 1., 2.])

In [38]:
class_disc.binner_dict_['sepal_length'].best_estimator_.predict_proba(X_test[['sepal_length']])

array([[0.01851852, 0.2962963 , 0.68518519],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.125     , 0.8125    , 0.0625    ],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.83333333, 0.125     , 0.04166667],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.83333333, 0.125     , 0.04166667],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.83333333, 0.125     , 0.04166667],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.125     , 0.8125    , 0.0625    ],
       [1.        , 0.        , 0.        ],
       [0.83333333, 0.125     , 0.04166667],
       [0.125     , 0.8125    , 0.0625    ],
       [0.125     , 0.8125    , 0.0625    ],
       [1.

## GBMDiscretizer

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from skgbm.preprocessing import GBMDiscretizer
# Select the GBM model you want
# from lightgbm import LGBMClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from catboost import CatBoostClassifier

# Preparing data
iris = load_iris()
# https://stackoverflow.com/questions/38105539/how-to-convert-a-scikit-learn-dataset-to-a-pandas-dataset
data = pd.DataFrame(
    data= np.c_[iris['data'], iris['target']],
    columns= iris['feature_names'] + ['target']
)
data.columns = data.columns.str[:-5]
data.columns = data.columns.str.replace(' ', '_')

# Data splitting
X, y = data.iloc[:, :4], data.iloc[:, 4:]
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3, random_state=0)
X_cols = X.columns.tolist()

# Fitting the discretizer & transforming the data
gbm_discretizer = GBMDiscretizer(XGBClassifier(), X_cols, one_hot=False)
gbm_discretizer.fit_transform(X_train, y_train)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
60,2,0,1,1
116,15,8,12,9
144,17,11,12,10
119,10,0,11,6
108,17,3,12,9
...,...,...,...,...
9,1,9,0,0
103,13,7,12,9
67,8,5,6,1
117,21,16,12,10


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from skgbm.preprocessing import GBMDiscretizer
# Select the GBM model you want
# from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingRegressor
# from catboost import CatBoostClassifier

# Preparing data
URL = 'http://jse.amstat.org/v19n3/decock/AmesHousing.xls'

data = pd.read_excel(URL)
data.columns = data.columns.str.replace(' ', '')
X_cols = ['LotArea', 'GrLivArea']

X_train, X_test, y_train, y_test =  train_test_split(
            data.drop(['PID', 'SalePrice'], axis=1),
            data['SalePrice'], test_size=0.3, random_state=0)

# Fitting the discretizer & transforming the data
gbm_discretizer = GBMDiscretizer(GradientBoostingRegressor(), X_cols, one_hot=False)
gbm_discretizer.fit_transform(X_train, y_train)[X_cols]

Unnamed: 0,LotArea,GrLivArea
1928,128,51
2497,74,19
261,100,19
1775,86,51
2587,108,25
...,...,...
763,104,39
835,131,28
1653,117,86
2607,137,30


In [13]:
X_train

Unnamed: 0,Order,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1928,1929,20,RL,70.0,12198,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2007,COD,Normal
2497,2498,20,RL,,8789,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
261,262,20,RL,75.0,10650,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1775,1776,20,RL,75.0,9910,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2007,New,Partial
2587,2588,20,RL,65.0,11050,Pave,,Reg,Lvl,AllPub,...,288,0,,,,0,7,2006,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,764,90,RL,60.0,10800,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,3,2009,WD,Alloca
835,836,20,RL,82.0,12464,Pave,,IR2,Low,AllPub,...,0,0,,GdPrv,,0,11,2009,WD,Normal
1653,1654,60,RL,80.0,11584,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,11,2007,WD,Normal
2607,2608,80,RL,85.0,13400,Pave,,Reg,Lvl,AllPub,...,0,0,,GdWo,,0,6,2006,WD,Normal


In [11]:
data.columns

Index(['Order', 'PID', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea',
       'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemod/Add', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 