## Tree discretizer

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

from feature_engine.discretisation import DecisionTreeDiscretiser

### Regression

In [12]:
# Location of the Ames housing Excel workbook (AmesHousing.xls) that the
# next cell loads with pd.read_excel.
URL = 'http://jse.amstat.org/v19n3/decock/AmesHousing.xls'

In [13]:
# Download the workbook and strip spaces from the column names so they can
# be referenced as plain identifiers (e.g. 'LotArea', 'GrLivArea').
data = pd.read_excel(URL)
data.columns = data.columns.str.replace(' ', '')

# Hold out 30% of the rows as a test set. 'PID' and the target 'SalePrice'
# are removed from the predictors; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['PID', 'SalePrice']),
    data['SalePrice'],
    test_size=0.3,
    random_state=0,
)

In [43]:
# Set up the discretisation transformer. With regression=True a decision
# tree regressor is fit per listed variable against the target; cv and
# scoring control the cross-validated search used to tune each tree.
reg_disc = DecisionTreeDiscretiser(
        cv=3,
        scoring='neg_mean_squared_error',
        variables=['LotArea', 'GrLivArea'],
        regression=True
)

# Fit the transformer on the training split only, so the test split never
# influences the learned bin boundaries.
reg_disc.fit(X_train, y_train)

# Transform both splits with the fitted `reg_disc`. (This previously called
# `disc`, a name that is never defined in this notebook, so the cell raised
# NameError on a fresh kernel or silently used a stale object.)
train_t = reg_disc.transform(X_train)
test_t = reg_disc.transform(X_test)

# Show the discretised variables: each value is replaced by the tree's
# prediction for the bin it falls into.
train_t[['LotArea', 'GrLivArea']]

Unnamed: 0,LotArea,GrLivArea
1928,212915.490506,166900.364821
2497,174893.294910,124241.280510
261,174893.294910,124241.280510
1775,174893.294910,166900.364821
2587,174893.294910,124241.280510
...,...,...
763,174893.294910,148185.548077
835,212915.490506,124241.280510
1653,212915.490506,196873.870293
2607,212915.490506,124241.280510


### Classification

In [29]:
# Load the iris dataset via scikit-learn; the returned object is indexed by
# key below ('data', 'target', 'feature_names').
iris = load_iris()

In [30]:
# Build one DataFrame holding the iris measurements plus a trailing
# 'target' column with the class label.
# Recipe from:
# https://stackoverflow.com/questions/38105539/how-to-convert-a-scikit-learn-dataset-to-a-pandas-dataset
data = pd.DataFrame(
    # column_stack appends the 1-D target vector as an extra column
    np.column_stack((iris['data'], iris['target'])),
    columns=[*iris['feature_names'], 'target'],
)

In [39]:
# Predictors: the first four columns (the iris measurements).
X = data.iloc[:, :4]
# Target: everything from column 4 onward. The slice keeps `y` as a
# one-column DataFrame rather than a Series.
y = data.iloc[:, 4:]

In [41]:
# Display the predictor frame to confirm it holds only the four measurement columns.
X

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
