<a href="https://colab.research.google.com/github/kyook17/BADM576/blob/main/Discretization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Discretization of numeric variables


* Equal Width

* Equal Frequency

* Model-based
> - K-means based
> - Decision Tree based

The `sklearn.preprocessing` module provides `KBinsDiscretizer` to create bins.

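`KBinsDiscretizer` takes three main arguments:

1. `n_bins`: the number of bins to create for each column

2. `encode`: how the bins are returned, `ordinal` (one integer code per bin) vs `onehot` (one indicator column per bin)

3. `strategy`: how the bin edges are chosen
> - `uniform`  = Equal Width
> - `quantile` = Equal Frequency
> - `kmeans` = bin edges based on k-means clustering of each column's values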

In [None]:
import pandas as pd
import numpy as np


from sklearn.preprocessing import KBinsDiscretizer

from sklearn.model_selection import train_test_split


In [None]:
# Medical-cost (insurance) dataset, read directly from its raw CSV on GitHub
DATA_URL = "https://raw.githubusercontent.com/ashish-cell/BADM-211-FA21/main/Data/medical_cost/insurance.csv"
df = pd.read_csv(DATA_URL)

In [None]:
# `charges` is the target; every other column is a feature
TARGET = "charges"

y = df[TARGET]
X = df.drop(columns=[TARGET])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Names of the numeric feature columns, picked by numeric dtype on the training frame itself.
# Selecting on "number" (instead of "anything that is not object dtype") keeps text columns
# out even when pandas does not store them as `object`, so only columns that
# KBinsDiscretizer can actually bin are returned.
num_vars = train_X.select_dtypes(include="number").columns.tolist()

In [None]:
# Show which columns were picked up as numeric
num_vars

['age', 'bmi', 'children']

In [None]:
# Peek at the numeric training features before any binning is applied
train_X[num_vars].head()

Unnamed: 0,age,bmi,children
560,46,19.95,2
1285,47,24.32,0
1142,52,24.86,0
969,39,34.32,5
486,54,21.47,3


## Equal width discretization

In [None]:
# Equal-width binning: 4 bins per column, returned as ordinal bin codes rather than one-hot columns
disc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')

In [None]:
# Learn the bin edges of every numeric column from the training data only
disc.fit(train_X[num_vars])

In [None]:
# Learned edges, one array per column (age, bmi, children).
# Within a column the gap between consecutive edges is constant: that is what "equal width" means.
disc.bin_edges_

array([array([18. , 29.5, 41. , 52.5, 64. ]),
       array([15.96  , 25.2525, 34.545 , 43.8375, 53.13  ]),
       array([0.  , 1.25, 2.5 , 3.75, 5.  ])], dtype=object)

In [None]:
# Replace every training value with the ordinal code of the bin it falls into
width_codes = disc.transform(train_X[num_vars])

# Wrap the transformed values in a DataFrame to get the column names back.
# Row labels restart at 0 here, so they no longer match train_X's original index.
train_d = pd.DataFrame(width_codes, columns=num_vars)

train_d.head()

Unnamed: 0,age,bmi,children
0,2.0,0.0,1.0
1,2.0,0.0,0.0
2,2.0,0.0,0.0
3,1.0,1.0,3.0
4,3.0,0.0,2.0


In [None]:
# Rows per bin, column by column. Equal-width bins can be filled very unevenly
# (see how few rows land in the top bmi bin).
for name, binned in train_d.items():
    print(binned.value_counts())
    print("**" * 50)

0.0    330
2.0    268
3.0    246
1.0    226
Name: age, dtype: int64
****************************************************************************************************
1.0    588
2.0    246
0.0    215
3.0     21
Name: bmi, dtype: int64
****************************************************************************************************
0.0    715
1.0    191
2.0    127
3.0     37
Name: children, dtype: int64
****************************************************************************************************


## Equal frequency discretization

In [None]:
# Equal-frequency binning: request 3 bins per column, with edges placed at quantiles
disc = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')

In [None]:
# Learn the quantile-based bin edges from the training data only
disc.fit(train_X[num_vars])



In [None]:
# Edges now sit at quantiles, so the bin widths differ within a column.
# Notice that `children` ended up with only two bins instead of the requested three:
# so many rows have 0 children that the lowest quantile edge coincides with the minimum,
# and the resulting zero-width bin is dropped.
disc.bin_edges_

array([array([18., 31., 48., 64.]),
       array([15.96      , 27.74      , 33.10666667, 53.13      ]),
       array([0., 1., 5.])], dtype=object)

In [None]:
# Replace every training value with the code of the quantile bin it falls into
quantile_codes = disc.transform(train_X[num_vars])

# Back to a DataFrame with named columns (row labels restart at 0)
train_d = pd.DataFrame(quantile_codes, columns=num_vars)

train_d.head()

Unnamed: 0,age,bmi,children
0,1.0,0.0,1.0
1,1.0,0.0,0.0
2,2.0,0.0,0.0
3,1.0,2.0,1.0
4,2.0,0.0,1.0


In [None]:
# Rows per bin, column by column. With quantile edges the counts for age and bmi
# are close to equal; children only has two bins to fill.
for name, binned in train_d.items():
    print(binned.value_counts())
    print("**" * 50)

1.0    363
2.0    360
0.0    347
Name: age, dtype: int64
****************************************************************************************************
2.0    357
1.0    357
0.0    356
Name: bmi, dtype: int64
****************************************************************************************************
1.0    614
0.0    456
Name: children, dtype: int64
****************************************************************************************************


## K-means discretization

In [None]:
# K-means binning: 4 bins per column, with edges derived from clustering each column's values
disc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='kmeans')

In [None]:
# Learn the k-means-based bin edges from the training data only
disc.fit(train_X[num_vars])

In [None]:
# Learned edges, one array per column (age, bmi, children).
# Compare the spacing with the equal-width and equal-frequency edges above: here it follows the data's clusters.
disc.bin_edges_

array([array([18.        , 29.04276637, 41.35503541, 52.57723577, 64.        ]),
       array([15.96      , 25.95856016, 31.74899531, 37.80593632, 53.13      ]),
       array([0.        , 1.18111888, 2.5       , 3.7027027 , 5.        ])],
      dtype=object)

In [None]:
# Replace every training value with the code of the k-means bin it falls into
kmeans_codes = disc.transform(train_X[num_vars])

# Back to a DataFrame with named columns (row labels restart at 0)
train_d = pd.DataFrame(kmeans_codes, columns=num_vars)

train_d.head()

Unnamed: 0,age,bmi,children
0,2.0,0.0,1.0
1,2.0,0.0,0.0
2,2.0,0.0,0.0
3,1.0,2.0,3.0
4,3.0,0.0,2.0


In [None]:
# Rows per bin, column by column, for the k-means bins
for name, binned in train_d.items():
    print(binned.value_counts())
    print("**" * 50)

0.0    330
1.0    248
2.0    246
3.0    246
Name: age, dtype: int64
****************************************************************************************************
1.0    378
2.0    308
0.0    257
3.0    127
Name: bmi, dtype: int64
****************************************************************************************************
0.0    715
1.0    191
2.0    127
3.0     37
Name: children, dtype: int64
****************************************************************************************************


# Create your own bins

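Use `pd.cut` when you want to control the binning yourself: pass either the number of equal-width bins or your own bin edges, and optionally a label for each bin.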

[pandas.cut](https://pandas.pydata.org/docs/reference/api/pandas.cut.html)

In [None]:
# An integer `bins` asks for that many equal-width intervals over the range of the data.
# The lowest edge is pushed slightly below the minimum (0.994 < 1) so the minimum falls inside the first bin.
pd.cut(np.array([1, 7, 5, 4, 6, 3]), bins = 3)

[(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.994, 3.0]]
Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]]

In [None]:
# retbins=True returns a tuple: the binned values plus the array of bin edges that was used
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)

([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.994, 3.0]]
 Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]],
 array([0.994, 3.   , 5.   , 7.   ]))

In [None]:
# `labels` names the bins from lowest to highest, replacing the interval notation in the result
pd.cut(np.array([1, 7, 5, 4, 6, 3]),
       3, labels=["bad", "medium", "good"])

['bad', 'good', 'medium', 'medium', 'good', 'bad']
Categories (3, object): ['bad' < 'medium' < 'good']