# Variable Discretization

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

**Discretization, or binning**, is the process of **transforming continuous variables into discrete
variables** by creating a set of contiguous intervals, also called bins, that span the range of
the variable values. Discretization is used to **change the distribution of skewed variables**
and to **minimize the influence of outliers**, and hence improve the performance of some
machine learning models.

In [None]:
# Read the Boston housing data from the local CSV file and preview its first five rows.
boston = pd.read_csv(filepath_or_buffer="data/boston.csv")
boston.head(n=5)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from helpers.datasets import make_wave

# Sample data from the project's make_wave helper; only column 0 of X is plotted below.
X, y = make_wave(n_samples=100)

# 1000 evenly spaced inputs over [-3, 3), shaped as a single-column matrix, on which both
# fitted models are evaluated.
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)

fig, ax = plt.subplots()

# Fit each model on the raw feature and draw its prediction curve.
reg = DecisionTreeRegressor(min_samples_split=3).fit(X, y)
ax.plot(line, reg.predict(line), label="decision tree")

reg = LinearRegression().fit(X, y)
ax.plot(line, reg.predict(line), label="linear regression")

# Overlay the training points as black dots.
ax.plot(X[:, 0], y, 'o', c='k')
ax.set_ylabel("Regression output")
ax.set_xlabel("Input feature")
ax.legend(loc="best")
plt.show()

## Dividing the variable into intervals of equal width

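**In equal-width discretization, the variable values are sorted into intervals of the same
width.** The width is determined by the value range of the variable and the number of bins:

    Width = (Max(X) - Min(X)) / Bins

where `Bins` is the number of intervals to create.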

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Work on a copy so the loaded frame stays untouched.
data = boston.copy()

# Hold out 30% of the rows for testing; MEDV is the target, every other column is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='MEDV'),
    data['MEDV'],
    test_size=0.3,
    random_state=0,
)

In [None]:
# Number of equal-width intervals to create.
n_bins = 10

# Raw value range of LSTAT in the train set (maximum minus minimum).
lstat_range = X_train['LSTAT'].max() - X_train['LSTAT'].min()

# Round the minimum down and the maximum up so the outer limits are whole numbers that
# enclose every train value.
min_value = int(np.floor(X_train['LSTAT'].min()))
max_value = int(np.ceil(X_train['LSTAT'].max()))

# Interval width: the (rounded) value range divided by the number of bins. The width is kept
# as a float; truncating it to an integer would change the number of intervals and would give
# a zero step whenever the range is smaller than the number of bins.
inter_width = (max_value - min_value) / n_bins

print(min_value, max_value, inter_width)

# n_bins + 1 evenly spaced limits produce exactly n_bins contiguous intervals.
intervals = np.linspace(min_value, max_value, n_bins + 1).tolist()
print(intervals)

# Discretize LSTAT and capture the result in a new column; include_lowest=True closes the
# first interval on the left so the minimum limit itself is assigned to a bin.
X_train['lstat_disc'] = pd.cut(x=X_train['LSTAT'], bins=intervals, include_lowest=True)

# Show the original and the discretized variable side by side for the first 10 observations.
print(X_train[['LSTAT', 'lstat_disc']].head(10))

In [None]:
# Let's calculate the number of observations per interval.
# observed=False states the categorical-grouping behaviour explicitly instead of relying on
# the pandas default: intervals without observations stay in the result with a count of 0.
print(X_train.groupby('lstat_disc', observed=False)['LSTAT'].count())

In [None]:
# Sort LSTAT of the test set into the interval limits stored in `intervals`.
X_test['lstat_disc'] = pd.cut(X_test['LSTAT'], intervals, include_lowest=True)

In [None]:
# Fraction of observations that fall in each LSTAT interval, for train and test.
train_frac = X_train['lstat_disc'].value_counts() / len(X_train)
test_frac = X_test['lstat_disc'].value_counts() / len(X_test)

# value_counts() orders by frequency; sort by the interval index so the bars run from the
# lowest to the highest bin.
bin_fracs = pd.concat([train_frac, test_frac], axis=1).sort_index()
bin_fracs.columns = ['train', 'test']

bin_fracs.plot.bar()
plt.xticks(rotation=45)
# The bars are proportions (counts divided by the size of each set), not raw counts.
plt.ylabel('Fraction of observations per bin')
plt.show()

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Start again from a fresh copy of the data and the same 70/30 split.
data = boston.copy()

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='MEDV'), data['MEDV'], test_size=0.3, random_state=0
)

# Equal-width discretizer: 'uniform' strategy, 10 bins, output encoded as ordinal bin codes.
disc = KBinsDiscretizer(strategy='uniform', n_bins=10, encode='ordinal')

In [None]:
# Fit the discretizer on the three selected variables of the train set only.
disc.fit(X_train[['LSTAT', 'DIS', 'RM']])

In [None]:
# Transform the three variables of the train and test sets with the fitted discretizer.
train_t, test_t = (
    disc.transform(subset[['LSTAT', 'DIS', 'RM']]) for subset in (X_train, X_test)
)

In [None]:
# We can inspect the bin boundaries learned by the transformer.
# There is one array of edges per variable passed to fit().
disc.bin_edges_

- Equal Width doesn’t improve the value spread
- It can handle outliers
- Can be combined with categorical encodings

## Sorting the variable values in intervals of equal frequency

In [None]:
# Fresh copy of the data, split 70/30 with a fixed seed.
data = boston.copy()

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='MEDV'),
    data['MEDV'],
    random_state=0,
    test_size=0.3,
)

In [None]:
# Equal-frequency discretizer ('quantile' strategy, 10 bins, ordinal codes), fitted on the
# three selected variables of the train set.
disc = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile').fit(
    X_train[['LSTAT', 'DIS', 'RM']]
)

# Apply the fitted discretizer to both sets.
train_t = disc.transform(X_train[['LSTAT', 'DIS', 'RM']])
test_t = disc.transform(X_test[['LSTAT', 'DIS', 'RM']])

In [None]:
# Display the bin boundaries held by the fitted discretizer.
disc.bin_edges_

In [None]:
# Wrap the transformed arrays in dataframes under their own names, so the original
# X_train / X_test splits are not overwritten by the binned values.
binned_cols = ['LSTAT', 'DIS', 'RM']
train_binned = pd.DataFrame(train_t, columns=binned_cols)
test_binned = pd.DataFrame(test_t, columns=binned_cols)

# Fraction of observations per LSTAT bin in each set.
train_frac = train_binned['LSTAT'].value_counts() / len(train_binned)
test_frac = test_binned['LSTAT'].value_counts() / len(test_binned)

# value_counts() orders by frequency; sort by bin code so the bars follow the bin order.
bin_fracs = pd.concat([train_frac, test_frac], axis=1).sort_index()
bin_fracs.columns = ['train', 'test']

bin_fracs.plot.bar()
plt.xticks(rotation=45)
# The bars are proportions (counts divided by the size of each set), not raw counts.
plt.ylabel('Fraction of observations per bin')
plt.show()

- Equal Frequency does improve the value spread
- It can handle outliers
- Can be combined with categorical encoding

## Performing discretization followed by categorical encoding

In [None]:
# Reset to an untouched copy of the data before this section.
data = boston.copy()

# Same 70/30 split as before: MEDV is the target, the remaining columns are the features.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='MEDV'), data['MEDV'], test_size=0.3, random_state=0
)

In [None]:
# Equal-frequency discretizer whose output is one-hot encoded instead of ordinal.
disc = KBinsDiscretizer(strategy='quantile', encode='onehot', n_bins=10)
disc.fit(X_train[['LSTAT', 'DIS', 'RM']])

# Transform both sets with the discretizer fitted on the train set.
train_t, test_t = [
    disc.transform(part[['LSTAT', 'DIS', 'RM']]) for part in (X_train, X_test)
]

In [None]:
# Display the bin boundaries held by the fitted discretizer.
disc.bin_edges_

In [None]:
# Display the transformed train set returned by the one-hot encoding discretizer.
train_t

## Allocating the variable values in arbitrary intervals

In [None]:
# Fresh copy of the data for the arbitrary-interval example.
data = boston.copy()

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='MEDV'),
    data['MEDV'],
    test_size=0.3,
    random_state=0,
)

In [None]:
# Histogram of LSTAT with 30 bins, used to choose sensible interval limits by eye.
data['LSTAT'].hist(bins=30)
plt.show()

In [None]:
# Arbitrary interval limits; the upper limit is infinity so that any bigger value still falls
# in the last interval. np.inf is used because the np.Inf alias was removed in NumPy 2.0.
intervals = [0, 10, 20, 30, np.inf]

# One string label per interval, describing its limits.
labels = ['0-10', '10-20', '20-30', '>30']

# Discretize LSTAT twice: once with the string labels and once with the interval limits
# themselves as bin names (labels=None).
data['lstat_labels'] = pd.cut(data['LSTAT'], bins=intervals, labels=labels, include_lowest=True)
data['lstat_intervals'] = pd.cut(data['LSTAT'], bins=intervals, labels=None, include_lowest=True)

data[['LSTAT','lstat_labels', 'lstat_intervals']].head(10)

In [None]:
# Number of observations in each arbitrary interval.
data['lstat_intervals'].value_counts()

## Using decision trees for discretization

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fresh copy of the data and the usual 70/30 split.
data = boston.copy()

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='MEDV'), data['MEDV'], test_size=0.3, random_state=0
)

# Regression tree of maximum depth 3 (seeded for reproducibility), trained on LSTAT alone to
# predict the MEDV target.
tree_model = DecisionTreeRegressor(max_depth=3, random_state=0)
tree_model.fit(X_train[['LSTAT']], y_train)

# Replace each LSTAT value by the tree's prediction for it.
X_train['lstat_tree'] = tree_model.predict(X_train[['LSTAT']])

# The distinct predictions are the end leaves, that is, the bins the tree created.
X_train['lstat_tree'].unique()

In [None]:
# Discretize LSTAT in the test set with the tree fitted on the train set.
X_test['lstat_tree'] = tree_model.predict(X_test[['LSTAT']])

In [None]:
# Mean MEDV of the test observations in each tree-derived bin.
test_with_target = pd.concat([X_test, y_test], axis=1)
bin_means = test_with_target.groupby(['lstat_tree'])['MEDV'].mean()

bin_means.plot()
plt.title('Monotonic relationship between discretised LSTAT and target')
plt.ylabel('MEDV')
plt.show()

In [None]:
# Fraction of observations that fall in each tree-derived LSTAT bin, for train and test.
train_frac = X_train['lstat_tree'].value_counts() / len(X_train)
test_frac = X_test['lstat_tree'].value_counts() / len(X_test)

# value_counts() orders by frequency; sort by the bin value so the bars run from the lowest
# to the highest tree prediction.
bin_fracs = pd.concat([train_frac, test_frac], axis=1).sort_index()
bin_fracs.columns = ['train', 'test']

bin_fracs.plot.bar()
plt.xticks(rotation=45)
# The bars are proportions (counts divided by the size of each set), not raw counts.
plt.ylabel('Fraction of observations per bin')
plt.show()

In [None]:
from feature_engine.discretisation import DecisionTreeDiscretiser

# Candidate tree depths passed to the discretiser as its parameter grid.
depth_grid = {'max_depth': [1, 2, 3, 4]}

# Tree-based discretiser for three variables, configured for regression with cv=10 and
# negative mean squared error as the scoring metric.
treeDisc = DecisionTreeDiscretiser(
    variables=['LSTAT', 'RM', 'DIS'],
    regression=True,
    param_grid=depth_grid,
    scoring='neg_mean_squared_error',
    cv=10,
)

treeDisc.fit(X_train, y_train)

# Best parameters found for the LSTAT tree.
treeDisc.binner_dict_['LSTAT'].best_params_

In [None]:
# Transform the train and test sets with the fitted discretiser.
train_t, test_t = (treeDisc.transform(subset) for subset in (X_train, X_test))

- Decision tree discretization does not improve the value spread.
- It handles outliers well, as trees are robust to outliers.
- It creates a monotonic relationship between the discretized variable and the target.

## Example: Wave regression dataset 

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
# Regenerate the wave data.
X, y = make_wave(n_samples=100)

# Equal-width discretizer with 10 bins, fitted on the wave feature.
kb = KBinsDiscretizer(n_bins=10, strategy='uniform').fit(X)
print("bin edges: \n", kb.bin_edges_)

In [None]:
# Transform the feature with the fitted discretizer and display the returned object.
X_binned = kb.transform(X)
X_binned

In [None]:
# First 10 raw values, followed by the first 10 rows of the binned output converted to a
# dense array with toarray().
print(X[:10])
X_binned.toarray()[:10]

In [None]:
# Same equal-width binning, now with encode='onehot-dense'; fit and transform in one call.
kb = KBinsDiscretizer(n_bins=10, strategy='uniform', encode='onehot-dense')
X_binned = kb.fit_transform(X)

In [None]:
# Bin the evaluation grid with the same fitted discretizer used for the training feature.
line_binned = kb.transform(line)

fig, ax = plt.subplots()

# Fit each model on the binned feature and draw its predictions over the grid.
reg = LinearRegression().fit(X_binned, y)
ax.plot(line, reg.predict(line_binned), label='linear regression binned')

reg = DecisionTreeRegressor(min_samples_split=3).fit(X_binned, y)
ax.plot(line, reg.predict(line_binned), label='decision tree binned')

# Training points as black dots, plus faint vertical lines at the bin edges.
ax.plot(X[:, 0], y, 'o', c='k')
ax.vlines(kb.bin_edges_[0], -3, 3, linewidth=1, alpha=.2)
ax.legend(loc="best")
ax.set_ylabel("Regression output")
ax.set_xlabel("Input feature")
plt.show()