In [1]:
import sys

import numpy as np
import pandas as pd

from sklearn.datasets import make_regression


In [2]:
NUM_FEATURES = 100
NUM_BINS = 45
NUM_SAMPLES = 1_000
# Generate synthetic regression data
X, y = make_regression(n_samples=NUM_SAMPLES, n_features=NUM_FEATURES, noise=0.1, random_state=1)

# Convert to pandas DataFrame
df = pd.DataFrame(data=X, columns=[f'Feature_{i}' for i in range(1, NUM_FEATURES+1)])
df['Target'] = y

df_data = df.drop('Target', axis=1).astype('float32')

print(df_data.head())

   Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  \
0   0.308937  -1.358117   1.799673   0.370344  -1.089044  -0.611431   
1  -0.714677  -0.911399   0.704543   0.072666  -0.146416  -1.417644   
2  -2.414273   0.270416  -1.890207  -0.984467  -0.509132  -0.236473   
3   1.537630  -0.936710   0.922793  -0.003897  -1.335670  -0.665940   
4  -0.484542   1.470649   1.364323  -0.467492   1.580360  -0.009018   

   Feature_7  Feature_8  Feature_9  Feature_10  ...  Feature_91  Feature_92  \
0  -0.335119   1.147323  -1.054796    0.139363  ...   -0.450311   -0.015725   
1  -0.267344   0.963182   1.159643   -0.782204  ...    1.002065   -1.594992   
2   0.924821  -1.554528   0.220635   -1.079696  ...   -0.240047    1.455053   
3   0.716759  -0.155484   1.408668    2.234313  ...    0.171679   -2.075227   
4   0.412203   0.587010  -0.461107   -0.079273  ...   -0.292875   -1.347707   

   Feature_93  Feature_94  Feature_95  Feature_96  Feature_97  Feature_98  \
0   -0.080910   -0.80

In [3]:
df_data.dtypes

Feature_1      float32
Feature_2      float32
Feature_3      float32
Feature_4      float32
Feature_5      float32
                ...   
Feature_96     float32
Feature_97     float32
Feature_98     float32
Feature_99     float32
Feature_100    float32
Length: 100, dtype: object

In [4]:
# Define the number of bins
num_bins = NUM_BINS

# Initialize an empty list to store the bin boundaries for each column
bin_boundaries = {}
quantiles = np.linspace(0, 1, num_bins + 1)

# Loop through each column and compute the bin boundaries
for fn in df_data.columns:  # Iterate over columns
    column_data = df_data[fn].astype(np.float32)  # Extract the current column
    boundaries = column_data.quantile(quantiles) # Compute the bin boundaries for the current column
    bin_boundaries[fn] = boundaries.astype(np.float32)  # Add the bin boundaries to     bin_boundaries.append(boundaries)  # Add the bin boundaries to the list

# print(f"\nBin boundaries:")
# # Print the bin boundaries for each column
# for i, boundaries in bin_boundaries.items():
#     print(f"Column {i} bin boundaries:\n{boundaries}")


In [5]:
%%time
# Loop through each column and perform piecewise linear encoding

encode_data_list = []
idxs_list = []
encoded_value_list = []
for fn in df_data.columns:  # Iterate over columns
    column_data = df_data[fn].values.astype(np.float32)  # Extract the current column
    column_bin_boundaries = np.array(bin_boundaries[fn])  # Get the bin boundaries for the current column

    # Initialize a matrix of all ones to store the encoded data
    encoded_data = np.ones([column_data.shape[0], num_bins])
    # print(f"encoded_data0:\n{encoded_data}")
    
    # Use np.digitize to find the bin indices for each data point
    bin_indices = np.digitize(column_data, column_bin_boundaries) - 1
    # print(f"column_bin_boundaries:\n{column_bin_boundaries}")
    # print(f"zip(columndata, bin_indices):\n{list(zip(column_data, bin_indices))}")

    # compute numerator, adjust for edge case at max value
    # find the bin min for each data point
    bin_min = column_bin_boundaries[bin_indices]

    # for maximum data point, set bin min to second to last bin boundary
    bin_min[bin_indices == num_bins] = column_bin_boundaries[-2]

    # compute the bin numerator for each data point
    bin_numerator = column_data - bin_min
    # print(f"\nbin_numerator:\n{bin_numerator}")
    
    # Calculate the bin widths based on the bin boundaries
    bin_widths = np.diff(column_bin_boundaries)

    # adjust for edge case of last bin
    idxs = bin_indices
    idxs[idxs == num_bins] = num_bins - 1
    # for maximum data point, set bin width to last bin boundary

    bin_denominator = bin_widths[idxs]
    # print(f"\nbin_widths:\n{bin_widths}")
   
    # Calculate the encoded value of each data point within the selected bin
    encoded_values = bin_numerator / bin_denominator
   
    # Create a mask to store the encoded value in the corresponding column of encoded_data
    mask = np.zeros_like(encoded_data, dtype=bool)
    mask[np.arange(encoded_data.shape[0]), bin_indices] = True

    # Store the encoded value in the corresponding column of encoded_data
    encoded_data[mask] = encoded_values

    # Create mask to set all values after the column-specific bin index to 0
    mask = np.tile(np.arange(encoded_data.shape[1]), (encoded_data.shape[0], 1))
    mask = mask > bin_indices.reshape(-1, 1)
    encoded_data[mask] = 0

    encode_data_list.append(encoded_data)
    idxs_list.append(bin_indices)
    encoded_value_list.append(encoded_values)

# encoded_data now contains the piecewise linear encoding for each column
encoded_data1 = np.array(encode_data_list).astype(np.float32).transpose(1, 0, 2)
idxs = np.vstack(idxs_list).T
# print(f"Encoded data1: {encoded_data.shape}\n{encoded_data1}")
# print(f"idxs: {idxs.shape}\n{idxs}")
print(f"encoded_data1 shape: {encoded_data1.shape}")



encoded_data1 shape: (1000, 100, 45)
CPU times: user 66.6 ms, sys: 15.8 ms, total: 82.4 ms
Wall time: 80.6 ms


import os
os.environ["NUMBA_DEBUG"] = "1"

In [6]:
%%time
from ple_transformer import MyTransformerNP as MyTransformer

CPU times: user 112 ms, sys: 4.03 ms, total: 116 ms
Wall time: 116 ms


In [7]:
%%time
# Create an instance of the transformer
transformer = MyTransformer(num_bins=NUM_BINS)

# Fit the transformer to the data
transformer.fit(df_data)


CPU times: user 75.6 ms, sys: 0 ns, total: 75.6 ms
Wall time: 74.1 ms


In [8]:

%%time
# Transform the data using the fitted transformer
encoded_data2 = transformer.transform(df_data)


CPU times: user 52.5 ms, sys: 10.8 ms, total: 63.3 ms
Wall time: 62.1 ms


In [9]:
encoded_data1.shape, encoded_data2.shape,

((1000, 100, 45), (1000, 100, 45))

In [10]:
encoded_data1.dtype, encoded_data2.dtype

(dtype('float32'), dtype('float32'))

In [11]:
%%time
# print(f"sklearn Encoded data: {encoded_data2.shape}\n{encoded_data2}")

if not np.all(np.isclose(encoded_data1, encoded_data2)):
    print("ERROR: Encoded data does not match, max diff = ", np.max(np.abs(encoded_data1 - encoded_data2)))
else:
    print("SUCCESS: Encoded data matches!")

assert np.all(np.isclose(encoded_data1, encoded_data2)), f"Encoded data does not match"
    

ERROR: Encoded data does not match, max diff =  1.475215e-06


AssertionError: Encoded data does not match