In [1]:
import sys

import numpy as np
import pandas as pd

from sklearn.datasets import make_regression


In [2]:
# Sizes of the synthetic problem and of the encoding.
NUM_FEATURES = 5
NUM_BINS = 5
NUM_SAMPLES = 5

# Synthetic regression data; random_state is fixed so reruns give the same values.
X, y = make_regression(
    n_samples=NUM_SAMPLES,
    n_features=NUM_FEATURES,
    noise=0.1,
    random_state=1,
)

# Feature matrix as a DataFrame (columns Feature_1 .. Feature_<NUM_FEATURES>) with the
# regression target appended as the last column.
feature_names = [f'Feature_{i}' for i in range(1, NUM_FEATURES + 1)]
df = pd.DataFrame(X, columns=feature_names).assign(Target=y)

# Feature-only frame consumed by the encoders below; values are kept as float64
# (a cast to float32 was tried here and left disabled).
df_data = df.drop(columns='Target')

print(df_data.head())

   Feature_1  Feature_2  Feature_3  Feature_4  Feature_5
0   0.502494   0.901591   1.144724   0.900856  -1.100619
1  -0.384054  -0.322417  -2.060141   1.133769   1.462108
2  -1.072969  -0.528172  -0.611756   0.865408   1.624345
3   0.042214  -0.877858  -0.172428   0.582815  -1.099891
4   0.319039  -0.761207   1.744812  -0.249370  -2.301539


In [3]:
# Inspect the dtype of each feature column before computing bin boundaries.
df_data.dtypes

Feature_1    float64
Feature_2    float64
Feature_3    float64
Feature_4    float64
Feature_5    float64
dtype: object

In [4]:
# Number of bins per feature; every feature gets num_bins + 1 boundaries.
num_bins = NUM_BINS

# Evenly spaced quantile levels from 0 to 1 inclusive (0, 1/num_bins, ..., 1).
quantiles = np.linspace(0, 1, num_bins + 1)

# Dict mapping each column name to a Series of that column's values at the
# quantile levels above; these serve as the bin boundaries for the encoding.
bin_boundaries = {
    fn: df_data[fn].quantile(quantiles)
    for fn in df_data.columns
}

# print(f"\nBin boundaries:")
# # Print the bin boundaries for each column
# for i, boundaries in bin_boundaries.items():
#     print(f"Column {i} bin boundaries:\n{boundaries}")


In [5]:
%%time
# Reference (pure NumPy) piecewise linear encoding of every feature column.
# The result, encoded_data1, is the baseline the transformer output is compared to below.


def _ple_encode_column(values, boundaries):
    """Piecewise-linear encode a single feature column.

    Parameters
    ----------
    values : array-like, shape (n_samples,)
        Raw values of one feature.
    boundaries : array-like, shape (n_bins + 1,)
        Ascending bin edges for that feature.

    Returns
    -------
    encoded : ndarray, shape (n_samples, n_bins), float64
        For each sample: 1.0 in every bin before the one containing the value,
        the fractional position inside the containing bin, and 0.0 afterwards.
    bin_idx : ndarray, shape (n_samples,), int
        Index (0 .. n_bins - 1) of the bin each value was assigned to.
    fraction : ndarray, shape (n_samples,), float64
        (value - left edge) / bin width for the assigned bin.
    """
    values = np.asarray(values)
    boundaries = np.asarray(boundaries)
    n_bins = boundaries.shape[0] - 1

    # np.digitize(...) - 1 yields 0 .. n_bins - 1 for values inside
    # [boundaries[0], boundaries[-1]), n_bins for values >= boundaries[-1], and -1 for
    # values below boundaries[0]. Clipping assigns those edge cases to the last / first
    # bin explicitly. (The previous version relied on `idxs = bin_indices` being an
    # alias so that an in-place edit of `idxs` also fixed `bin_indices`, and a -1 index
    # silently wrapped around to the last boundary.)
    bin_idx = np.clip(np.digitize(values, boundaries) - 1, 0, n_bins - 1)

    # Position of each value inside its bin, as a fraction of the bin width.
    # The column maximum sits on the last edge and therefore encodes to exactly 1.0.
    bin_widths = np.diff(boundaries)
    fraction = (values - boundaries[bin_idx]) / bin_widths[bin_idx]

    # Bins strictly before the assigned one are 1.0, bins after it are 0.0 ...
    encoded = (np.arange(n_bins) < bin_idx[:, None]).astype(float)
    # ... and the assigned bin itself holds the fractional position.
    encoded[np.arange(values.shape[0]), bin_idx] = fraction

    return encoded, bin_idx, fraction


encode_data_list = []    # per feature: (n_samples, n_bins) encoded matrix
idxs_list = []           # per feature: assigned bin index of every sample
encoded_value_list = []  # per feature: fractional position inside the assigned bin
for fn in df_data.columns:
    encoded_data, bin_indices, encoded_values = _ple_encode_column(
        df_data[fn].values,
        np.array(bin_boundaries[fn]),
    )
    encode_data_list.append(encoded_data)
    idxs_list.append(bin_indices)
    encoded_value_list.append(encoded_values)

# Stack the per-feature matrices and reorder the axes to (sample, feature, bin).
encoded_data1 = np.array(encode_data_list).transpose(1, 0, 2)
# Assigned bin indices arranged as (sample, feature).
idxs = np.vstack(idxs_list).T
print(f"encoded_data1 shape: {encoded_data1.shape}")



encoded_data1 shape: (5, 5, 5)
CPU times: user 1.91 ms, sys: 330 µs, total: 2.24 ms
Wall time: 2.62 ms


# Debug toggle: sets the NUMBA_DEBUG environment variable to "1" for this process.
# NOTE(review): presumably this has to run before `ple_transformer` (and with it Numba)
# is first imported in the next cell for the setting to take effect; confirm.
# No execution count is recorded for this cell, so it was likely not run when the
# outputs saved in this notebook were produced.
import os
os.environ["NUMBA_DEBUG"] = "1"

In [6]:
%%time
from ple_transformer import MyTransformer3 as MyTransformer

CPU times: user 2.7 s, sys: 0 ns, total: 2.7 s
Wall time: 2.7 s


In [7]:
%%time
# Create the transformer under test (imported above as MyTransformer) with the same
# bin count, NUM_BINS, that the NumPy reference encoding uses.
transformer = MyTransformer(num_bins=NUM_BINS)

# Fit on the same feature frame the reference bin boundaries were computed from.
transformer.fit(df_data)


CPU times: user 6.5 ms, sys: 0 ns, total: 6.5 ms
Wall time: 5.91 ms


In [8]:

%%time
# Encode the features with the fitted transformer.
# NOTE(review): encoded_data2 is only inspected for shape and dtype further down; the
# value comparison against the reference uses encoded_data3 instead.
encoded_data2 = transformer.transform(df_data)


CPU times: user 2.85 ms, sys: 0 ns, total: 2.85 ms
Wall time: 2.35 ms


In [9]:
%%time
# Second, independent instance configured exactly like `transformer`.
# NOTE(review): presumably repeated to time a second fit/transform and check that it
# reproduces the first one; confirm, or fold both runs into one helper.
transformer2 = MyTransformer(num_bins=NUM_BINS)

# Fit the second instance on the same feature frame.
transformer2.fit(df_data)


CPU times: user 10.3 ms, sys: 0 ns, total: 10.3 ms
Wall time: 8.51 ms


In [10]:
%%time
# Encode the features with the second fitted instance; encoded_data3 is the array that
# gets compared against the NumPy reference (encoded_data1) in the following cells.
encoded_data3 = transformer2.transform(df_data)

CPU times: user 2.01 ms, sys: 0 ns, total: 2.01 ms
Wall time: 1.72 ms


In [11]:
%%time
# Compare the NumPy reference encoding with the transformer output.
# The printed label says "==", but the check is np.allclose, i.e. equality within
# NumPy's default floating-point tolerances rather than exact equality.
matches = np.allclose(encoded_data1, encoded_data3)

print(f"encoded_data1 == encoded_data3: {matches}")

encoded_data1 == encoded_data3: False
CPU times: user 218 µs, sys: 0 ns, total: 218 µs
Wall time: 224 µs


In [12]:
# Shapes of the reference encoding and of both transformer outputs, side by side.
encoded_data1.shape, encoded_data2.shape, encoded_data3.shape

((5, 5, 5), (5, 5, 5), (5, 5, 5))

In [13]:
# Element dtypes of the three arrays, to rule out a precision difference between them.
encoded_data1.dtype, encoded_data2.dtype, encoded_data3.dtype

(dtype('float64'), dtype('float64'), dtype('float64'))

In [14]:
# Exact element-wise comparison; False entries mark where the two encodings disagree.
encoded_data1 == encoded_data3

array([[[ True,  True,  True,  True, False],
        [ True,  True,  True,  True, False],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True]],

       [[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True, False],
        [ True,  True,  True,  True,  True]],

       [[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True, False]],

       [[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True]],

       [[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
  

In [15]:
# Reference encoding of the first sample: one row per feature, one column per bin.
encoded_data1[0]

array([[1.        , 1.        , 1.        , 1.        , 1.        ],
       [1.        , 1.        , 1.        , 1.        , 1.        ],
       [1.        , 1.        , 1.        , 0.86815725, 0.        ],
       [1.        , 1.        , 1.        , 0.31346296, 0.        ],
       [1.        , 0.99818492, 0.        , 0.        , 0.        ]])

In [16]:
# Transformer output at index 0, for side-by-side comparison with encoded_data1[0].
encoded_data3[0]

array([[1.        , 1.        , 1.        , 1.        , 2.38171479],
       [1.        , 1.        , 1.        , 1.        , 1.37607427],
       [1.        , 1.        , 1.        , 0.86815725, 0.        ],
       [1.        , 1.        , 1.        , 0.31346296, 0.        ],
       [1.        , 0.99818492, 0.        , 0.        , 0.        ]])