# Chapter 4

# 4.4.5. Binning and discretization

Equal-width binning

In [1]:
import numpy as np
import pandas as pd

# Synthetic glucose readings: 100 draws from a normal distribution (mean 120, sd 20),
# seeded for reproducibility and confined to the 50-200 range
np.random.seed(0)
glucose_levels = np.random.normal(loc=120, scale=20, size=100).clip(50, 200)

# Six evenly spaced edges across [50, 200] delimit five equal-width bins
bin_edges = np.linspace(50, 200, num=6)

# Build the frame and attach each reading's integer bin index in a single chain.
# labels=False returns positional bin codes instead of interval labels, and
# include_lowest=True closes the first interval on the left so a value of exactly 50 is binned.
df = pd.DataFrame({'Glucose': glucose_levels}).assign(
    Glucose_Bin=lambda frame: pd.cut(
        frame['Glucose'], bins=bin_edges, labels=False, include_lowest=True
    )
)

print(df.head())


      Glucose  Glucose_Bin
0  155.281047            3
1  128.003144            2
2  139.574760            2
3  164.817864            3
4  157.351160            3


Equal-frequency binning

In [2]:
# Equal-frequency binning: cut the glucose readings at their quintiles (q=5) so the
# bins are defined by rank rather than by value range; labels=False yields integer codes
quintile_codes = pd.qcut(df['Glucose'], q=5, labels=False)
df['Glucose_Bin_EqualFreq'] = quintile_codes

print(df.head())


      Glucose  Glucose_Bin  Glucose_Bin_EqualFreq
0  155.281047            3                      4
1  128.003144            2                      3
2  139.574760            2                      4
3  164.817864            3                      4
4  157.351160            3                      4


K-means binning

In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

# Generate synthetic data (seeded so the draw is reproducible)
np.random.seed(42)
data_size = 500

# Create a skewed distribution (e.g., exponential distribution).
# Note: this rebinds `df`, replacing the glucose frame built in the earlier cells.
X = np.random.exponential(scale=2, size=data_size)
df = pd.DataFrame({'Value': X})

# Apply K-Means binning: bins are derived from 1-D k-means clusters of the values.
# subsample=None fits on every row rather than on a random subsample.
k = 5  # Number of bins/clusters
est = KBinsDiscretizer(n_bins=k, encode='ordinal', strategy='kmeans', random_state=42, subsample=None)

# The transformer expects 2-D input, so select the column as a one-column DataFrame
X_reshaped = df[['Value']]

# Fit and transform the data. The result has shape (n_samples, 1), so flatten it
# to 1-D before storing it as a column of integer bin indices.
df['KMeans_Bin'] = est.fit_transform(X_reshaped).astype(int).ravel()

# Display the first few rows
print(df.head())

# Print the bin edges. These are NOT the cluster centers: for strategy='kmeans',
# bin_edges_ holds the column minimum, the midpoints between adjacent sorted
# cluster centers, and the column maximum.
print("Bin Edges (boundaries between K-Means clusters):")
print(est.bin_edges_[0])


      Value  KMeans_Bin
0  0.938536           0
1  6.020243           3
2  2.633491           1
3  1.825885           1
4  0.339250           0
Cluster Centers (Bin Edges):
[0.01014887 1.23546742 2.89844556 5.02610747 7.56704349 9.91365722]
