## Tree discretizer

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

from feature_engine.discretisation import DecisionTreeDiscretiser

### Regression

In [2]:
URL = 'http://jse.amstat.org/v19n3/decock/AmesHousing.xls'

data = pd.read_excel(URL)
data.columns = data.columns.str.replace(' ', '')

X_train, X_test, y_train, y_test =  train_test_split(
            data.drop(['PID', 'SalePrice'], axis=1),
            data['SalePrice'], test_size=0.3, random_state=0)

In [3]:
# Creating a discretizer
reg_disc = DecisionTreeDiscretiser(
        cv=3,
        scoring='neg_mean_squared_error',
        variables=['LotArea', 'GrLivArea'],
        regression=True
)

# Fitting the transformer
reg_disc.fit(X_train, y_train)

# Data transformation
pd.concat([
  X_train[['LotArea', 'GrLivArea']].add_suffix('_orig'),
  reg_disc.transform(X_train)[['LotArea', 'GrLivArea']].add_suffix('_disc')
], axis=1).sort_index(axis=1)

Unnamed: 0,GrLivArea_disc,GrLivArea_orig,LotArea_disc,LotArea_orig
1928,166900.364821,1411,212915.490506,12198
2497,124241.280510,941,174893.294910,8789
261,124241.280510,894,174893.294910,10650
1775,166900.364821,1369,174893.294910,9910
2587,124241.280510,1005,174893.294910,11050
...,...,...,...,...
763,148185.548077,1200,174893.294910,10800
835,124241.280510,1040,212915.490506,12464
1653,196873.870293,1725,212915.490506,11584
2607,124241.280510,1086,212915.490506,13400


In [45]:
X_train[['LotArea', 'GrLivArea']]

Unnamed: 0,LotArea,GrLivArea
1928,12198,1411
2497,8789,941
261,10650,894
1775,9910,1369
2587,11050,1005
...,...,...
763,10800,1200
835,12464,1040
1653,11584,1725
2607,13400,1086


### Classification

In [29]:
iris = load_iris()

In [30]:
# https://stackoverflow.com/questions/38105539/how-to-convert-a-scikit-learn-dataset-to-a-pandas-dataset
data = pd.DataFrame(
    data= np.c_[iris['data'], iris['target']],
    columns= iris['feature_names'] + ['target']
)
data.columns = data.columns.str[:-5]
data.columns = data.columns.str.replace(' ', '_')

In [31]:
X, y = data.iloc[:, :4], data.iloc[:, 4:]
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3, random_state=0)
X_cols = X.columns.tolist()

In [32]:
# Creating a discretizer
class_disc = DecisionTreeDiscretiser(
        cv=3,
        scoring='accuracy',
        variables=X_cols,
        regression=False
)

# Fitting the transformer
class_disc.fit(X_train, y_train)

In [33]:
pd.concat([
  X_train[X_cols].add_suffix('_orig'),
  class_disc.transform(X_train)[X_cols].add_suffix('_disc')
], axis=1).sort_index(axis=1)

Unnamed: 0,petal_length_disc,petal_length_orig,petal_width_disc,petal_width_orig,sepal_length_disc,sepal_length_orig,sepal_width_disc,sepal_width_orig
60,0.911765,3.5,0.885714,1.0,0.125000,5.0,1.000000,2.0
116,0.027027,5.5,0.027778,1.8,0.296296,6.5,0.250000,3.0
144,0.027027,5.7,0.027778,2.5,0.296296,6.7,0.200000,3.3
119,0.027027,5.0,0.885714,1.5,0.296296,6.0,0.500000,2.2
108,0.027027,5.8,0.027778,1.8,0.296296,6.7,0.434783,2.5
...,...,...,...,...,...,...,...,...
9,0.000000,1.5,0.000000,0.1,0.125000,4.9,0.250000,3.1
103,0.027027,5.6,0.027778,1.8,0.296296,6.3,0.800000,2.9
67,0.911765,4.1,0.885714,1.0,0.296296,5.8,0.434783,2.7
117,0.027027,6.7,0.027778,2.2,0.296296,7.7,0.000000,3.8


In [28]:
np.unique(y_train)

array([0., 1., 2.])

In [38]:
class_disc.binner_dict_['sepal_length'].best_estimator_.predict_proba(X_test[['sepal_length']])

array([[0.01851852, 0.2962963 , 0.68518519],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.125     , 0.8125    , 0.0625    ],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.83333333, 0.125     , 0.04166667],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.83333333, 0.125     , 0.04166667],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.83333333, 0.125     , 0.04166667],
       [0.01851852, 0.2962963 , 0.68518519],
       [0.125     , 0.8125    , 0.0625    ],
       [1.        , 0.        , 0.        ],
       [0.83333333, 0.125     , 0.04166667],
       [0.125     , 0.8125    , 0.0625    ],
       [0.125     , 0.8125    , 0.0625    ],
       [1.

## GBMDiscretizer

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from skgbm.preprocessing import GBMDiscretizer
# Select the GBM model you want
# from lightgbm import LGBMClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from catboost import CatBoostClassifier

# Preparing data
iris = load_iris()
# https://stackoverflow.com/questions/38105539/how-to-convert-a-scikit-learn-dataset-to-a-pandas-dataset
data = pd.DataFrame(
    data= np.c_[iris['data'], iris['target']],
    columns= iris['feature_names'] + ['target']
)
data.columns = data.columns.str[:-5]
data.columns = data.columns.str.replace(' ', '_')

# Data splitting
X, y = data.iloc[:, :4], data.iloc[:, 4:]
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3, random_state=0)
X_cols = X.columns.tolist()

# Fitting the discretizer & transforming the data
gbm_discretizer = GBMDiscretizer(XGBClassifier(), X_cols, one_hot=False)
gbm_discretizer.fit_transform(X_train, y_train)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
60,2,0,1,1
116,15,8,12,9
144,17,11,12,10
119,10,0,11,6
108,17,3,12,9
...,...,...,...,...
9,1,9,0,0
103,13,7,12,9
67,8,5,6,1
117,21,16,12,10


In [3]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

from skgbm.preprocessing import GBMDiscretizer
# Select the GBM model you want
# from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingRegressor
# from catboost import CatBoostClassifier

# Preparing data
diabetes = load_diabetes()
# https://stackoverflow.com/questions/38105539/how-to-convert-a-scikit-learn-dataset-to-a-pandas-dataset
data = pd.DataFrame(
    data= np.c_[diabetes['data'], diabetes['target']],
    columns= diabetes['feature_names'] + ['target']
)
data.columns = data.columns.str[:-5]
data.columns = data.columns.str.replace(' ', '_')

# Data splitting
X, y = data.iloc[:, :4], data.iloc[:, 4:]
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3, random_state=0)
X_cols = X.columns.tolist()

# Fitting the discretizer & transforming the data
gbm_discretizer = GBMDiscretizer(GradientBoostingRegressor(), X_cols, one_hot=False)
gbm_discretizer.fit_transform(X_train, y_train)

ValueError: y should be a 1d array, got an array of shape (309, 7) instead.

In [4]:
data

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,t
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930,220.0
