In [50]:
import numpy as np

def real_to_categ(x, bins=None, num_bins=5):
    """Discretize every real-valued feature column into integer categories.

    Parameters
    ----------
    x : array-like of shape (n, m)
        Feature matrix with n examples (rows) and m features (columns).
    bins : array-like of shape (m, num_bins), optional
        Monotonic bin edges for each feature, as accepted by ``np.digitize``.
        When omitted, each feature gets ``num_bins`` evenly spaced edges
        running from its minimum to its maximum (both endpoints included).
    num_bins : int, default 5
        Number of edges per feature; must be greater than 1.

    Returns
    -------
    cat_x : ndarray of int, shape (n, m)
        Category index of every entry, always within ``[0, num_bins - 1]``.
        Values below the first edge are assigned category 0; values at or
        above the last edge are assigned ``num_bins - 1``.
    bins : ndarray of shape (m, num_bins)
        The edges that were used, so they can be reapplied to other data.

    Raises
    ------
    AssertionError
        If ``num_bins <= 1`` or ``bins`` does not have shape (m, num_bins).
    """
    assert num_bins > 1  # Ensure a valid number of bins

    # Accept lists / nested sequences as well as ndarrays.
    x = np.asarray(x)
    n, m = x.shape  # Number of examples and features

    if bins is None:
        # Derive the edges from the data: evenly spaced between each
        # feature's observed minimum and maximum.
        min_vals = np.min(x, axis=0)
        max_vals = np.max(x, axis=0)

        bins = np.zeros((m, num_bins))
        for i in range(m):
            bins[i] = np.linspace(min_vals[i], max_vals[i], num_bins, endpoint=True)
    else:
        bins = np.asarray(bins)

    # Check the shape of the bins
    assert bins.shape == (m, num_bins), 'Invalid bins shape.'

    # np.digitize takes one 1-D edge vector at a time, so go feature by feature.
    cat_x = np.zeros((n, m), dtype=int)
    for i in range(m):
        cat_x[:, i] = np.digitize(x[:, i], bins[i]) - 1  # Subtract 1 to start categories at 0

    # With caller-supplied edges a value can fall below the first edge, where
    # digitize returns 0 and the shift above would yield -1. Clamp those into
    # the first category so every result is a valid index.
    np.maximum(cat_x, 0, out=cat_x)

    return cat_x, bins

# Demo: categorize a reproducible random matrix (10 examples x 3 features).
np.random.seed(0)  # pin the global RNG so the outputs shown below are repeatable
x_example = np.random.random_sample((10, 3))
cat_x_example, bins_example = real_to_categ(x=x_example, bins=None, num_bins=5)

In [51]:
# Show the example input: 10 rows (examples) by 3 columns (features).
x_example

array([[0.5488135 , 0.71518937, 0.60276338],
       [0.54488318, 0.4236548 , 0.64589411],
       [0.43758721, 0.891773  , 0.96366276],
       [0.38344152, 0.79172504, 0.52889492],
       [0.56804456, 0.92559664, 0.07103606],
       [0.0871293 , 0.0202184 , 0.83261985],
       [0.77815675, 0.87001215, 0.97861834],
       [0.79915856, 0.46147936, 0.78052918],
       [0.11827443, 0.63992102, 0.14335329],
       [0.94466892, 0.52184832, 0.41466194]])

In [52]:
# Category index for every example/feature pair. Because the edges run from each
# column's minimum to its maximum inclusive, only the column maximum lands in the
# top category (4); every other value falls in 0-3.
cat_x_example

array([[2, 3, 2],
       [2, 1, 2],
       [1, 3, 3],
       [1, 3, 2],
       [2, 4, 0],
       [0, 0, 3],
       [3, 3, 4],
       [3, 1, 3],
       [0, 2, 0],
       [4, 2, 1]])

In [53]:
# Bin edges returned alongside the categories: one row per feature, 5 evenly
# spaced values from that feature's minimum to its maximum.
bins_example

array([[0.0871293 , 0.3015142 , 0.51589911, 0.73028401, 0.94466892],
       [0.0202184 , 0.24656296, 0.47290752, 0.69925208, 0.92559664],
       [0.07103606, 0.29793163, 0.5248272 , 0.75172277, 0.97861834]])

In [49]:
# NOTE(review): execution count 49 predates the In [50] cell that defines
# real_to_categ, and the edges printed below stop short of each feature's maximum,
# unlike the In [53] output. This looks like stale output, presumably from an
# earlier version of the function -- re-run or delete this cell.
bins_example

array([[0.0871293 , 0.25863722, 0.43014515, 0.60165307, 0.77316099],
       [0.0202184 , 0.20129405, 0.38236969, 0.56344534, 0.74452099],
       [0.07103606, 0.25255252, 0.43406897, 0.61558543, 0.79710189]])

In [56]:
# Demo feature matrix: 10 samples (rows) by 5 features (columns)
x = np.random.rand(10, 5)

# Lower edge per feature: the column-wise minimum
min_vals = x.min(axis=0)

# Upper edge per feature: the column-wise 67th percentile
percentiles_67 = np.percentile(x, 67, axis=0)

# Stack both edge vectors as rows, giving shape (2, n_features);
# transpose to get one [min, 67th percentile] row per feature
bins = np.vstack((min_vals, percentiles_67))

In [63]:
# Show the 10 x 5 demo feature matrix.
x

array([[0.0243132 , 0.34261098, 0.62223106, 0.27906795, 0.20974995],
       [0.11570323, 0.57714024, 0.69527001, 0.67195714, 0.94886102],
       [0.00270321, 0.64719665, 0.60039224, 0.58873961, 0.96277032],
       [0.01687167, 0.69648243, 0.81367865, 0.5098072 , 0.33396487],
       [0.79084016, 0.09724293, 0.44203564, 0.51995237, 0.69395641],
       [0.09088573, 0.2277595 , 0.41030156, 0.62329467, 0.88696078],
       [0.61882617, 0.13346147, 0.98058013, 0.87178573, 0.50272076],
       [0.92234798, 0.54138079, 0.92330607, 0.82989737, 0.96828641],
       [0.91978281, 0.03603382, 0.174772  , 0.38913468, 0.9521427 ],
       [0.30002892, 0.16046764, 0.88630467, 0.44639442, 0.90787559]])

In [60]:
# Column-wise minimum of x: one value per feature.
min_vals

array([0.00270321, 0.03603382, 0.174772  , 0.27906795, 0.20974995])

In [64]:
# Column-wise 67th percentile of x: one value per feature.
percentiles_67

array([0.62398659, 0.54245358, 0.81585743, 0.62475455, 0.94895947])

In [66]:
# bins is stored as (2, n_features); transposing shows one
# [min, 67th percentile] pair per feature.
bins.T

array([[0.00270321, 0.62398659],
       [0.03603382, 0.54245358],
       [0.174772  , 0.81585743],
       [0.27906795, 0.62475455],
       [0.20974995, 0.94895947]])