# Discretizing features with gradient boosted tree ensemble

In [13]:
import pandas as pd
from feature_engine.discretisation import DecisionTreeDiscretiser
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
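## Compare
Two discretisers are compared in this notebook:
* feature-engine `DecisionTreeDiscretiser`: https://feature-engine.trainindata.com/en/1.0.x/discretisation/DecisionTreeDiscretiser.html
* scikit-learn `KBinsDiscretizer`: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html#sklearn.preprocessing.KBinsDiscretizer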
    

## Dataset

In [4]:
!pip install xlrd

Defaulting to user installation because normal site-packages is not writeable
Collecting xlrd
  Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
     |████████████████████████████████| 96 kB 776 kB/s            
[?25hInstalling collected packages: xlrd
Successfully installed xlrd-2.0.1
You should consider upgrading via the '/usr/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [5]:
# Location of the Ames Housing workbook (.xls), read directly over HTTP
URL = 'http://jse.amstat.org/v19n3/decock/AmesHousing.xls'

# Load the workbook into a DataFrame
house_prices = pd.read_excel(io=URL)

In [16]:
# Inspect the available column names; the cells below pick the features
# and the 'SalePrice' target by name.
house_prices.columns

Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
      

In [14]:
# Build the feature matrix (row identifiers and the target removed) and the target vector
ames_features = house_prices.drop(columns=['Order', 'PID', 'SalePrice'])
ames_target = house_prices['SalePrice']

# Hold out 30% of the rows as the test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    ames_features,
    ames_target,
    test_size=0.3,
    random_state=0,
)

In [17]:
# Continuous variables to be discretised with a decision tree
tree_binned_vars = ['Lot Area', 'Gr Liv Area']

# Configure the discretiser: one regression tree per variable,
# selected with 3-fold cross-validation on negative MSE
disc = DecisionTreeDiscretiser(
    variables=tree_binned_vars,
    regression=True,
    scoring='neg_mean_squared_error',
    cv=3,
)

# Learn the trees from the training split only
disc.fit(X_train, y_train)

# Apply the fitted discretiser to both splits
train_t, test_t = (disc.transform(split) for split in (X_train, X_test))

# Show the fitted per-variable estimators
disc.binner_dict_

{'Lot Area': GridSearchCV(cv=3, estimator=DecisionTreeRegressor(),
              param_grid={'max_depth': [1, 2, 3, 4]},
              scoring='neg_mean_squared_error'),
 'Gr Liv Area': GridSearchCV(cv=3, estimator=DecisionTreeRegressor(),
              param_grid={'max_depth': [1, 2, 3, 4]},
              scoring='neg_mean_squared_error')}

In [20]:
# Peek at the discretised 'Lot Area' column of the transformed training set
train_t['Lot Area']

1928    212915.490506
2497    174893.294910
261     174893.294910
1775    174893.294910
2587    174893.294910
            ...      
763     174893.294910
835     212915.490506
1653    212915.490506
2607    212915.490506
2732    174893.294910
Name: Lot Area, Length: 2051, dtype: float64

In [8]:
# Toy example of scikit-learn's KBinsDiscretizer for comparison.
# KBinsDiscretizer is imported in the import cell at the top of the notebook.
# 4 samples x 4 features
X = [[-2, 1, -4,   -1],
     [-1, 2, -3, -0.5],
     [ 0, 3, -2,  0.5],
     [ 1, 4, -1,    2]]

# 3 bins per feature with the 'uniform' strategy, one-hot encoded
est = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='uniform')
est.fit(X)

# Each of the 4 features expands into 3 indicator columns -> 4 * 3 = 12 columns
Xt = est.transform(X)
Xt.shape

(4, 12)