In [1]:
import sys

import numpy as np
import pandas as pd

from sklearn.datasets import make_regression


In [2]:
NUM_FEATURES = 75
RANDOM_SEED = 42  # fixed so the synthetic data is identical on every run

# Generate synthetic regression data. Without an explicit random_state the
# samples differ on every kernel restart, making the outputs below
# impossible to reproduce.
X, y = make_regression(
    n_samples=20000,
    n_features=NUM_FEATURES,
    noise=0.1,
    random_state=RANDOM_SEED,
)

# Convert to pandas DataFrame (columns are named Feature_1 .. Feature_<NUM_FEATURES>)
df = pd.DataFrame(data=X, columns=[f'Feature_{i}' for i in range(1, NUM_FEATURES+1)])
df['Target'] = y

# Feature columns only, cast to float32
df_data = df.drop('Target', axis=1).astype('float32')

print(df_data.head())

   Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  \
0  -1.764389   0.460320   0.433077   0.024548   0.474093   0.627299   
1   0.250289   2.106588   0.697111   1.164758   0.886688  -0.403761   
2  -0.799830  -0.115744   0.792541  -1.197304  -0.254514   0.022244   
3  -0.776549  -1.911951  -0.380405  -1.463214   2.097378   0.470284   
4  -0.762134   0.248931   0.115716   0.990664  -0.711780  -0.376125   

   Feature_7  Feature_8  Feature_9  Feature_10  ...  Feature_66  Feature_67  \
0  -0.002908  -0.205944   0.514504    1.163743  ...    0.063879   -0.898047   
1  -0.135772  -1.953848  -1.247171   -0.177695  ...    0.896385    0.561336   
2  -0.062386  -0.163041   2.274531   -0.697315  ...   -0.903302   -1.660583   
3  -0.750628  -0.721114   1.192973    0.442310  ...   -2.253413   -0.626872   
4   0.049101  -1.995285  -0.072153   -0.675385  ...    0.786182   -0.116393   

   Feature_68  Feature_69  Feature_70  Feature_71  Feature_72  Feature_73  \
0    0.532052   -0.23

In [3]:
# Check the dtype of every feature column after the float32 cast above.
df_data.dtypes

Feature_1     float32
Feature_2     float32
Feature_3     float32
Feature_4     float32
Feature_5     float32
               ...   
Feature_71    float32
Feature_72    float32
Feature_73    float32
Feature_74    float32
Feature_75    float32
Length: 75, dtype: object

In [4]:
# Number of bins per feature
num_bins = 75

# num_bins + 1 evenly spaced quantile levels from 0 to 1 (inclusive);
# consecutive levels delimit one bin.
quantiles = np.linspace(0, 1, num_bins + 1)

# Map each column name to a Series holding that column's values at the
# quantile levels above, i.e. the bin edges for that column.
bin_boundaries = {
    col_name: col_values.quantile(quantiles)
    for col_name, col_values in df_data.items()
}

# print(f"\nBin boundaries:")
# # Print the bin boundaries for each column
# for i, boundaries in bin_boundaries.items():
#     print(f"Column {i} bin boundaries:\n{boundaries}")


In [5]:
%%time
# Piecewise linear encoding of every feature column.
#
# For a value x that falls in bin t, bounded by edges b[t] and b[t+1], the
# encoded row holds:
#   * 1.0 in every bin before t,
#   * (x - b[t]) / (b[t+1] - b[t]) in bin t,
#   * 0.0 in every bin after t.
#
# Requires `df_data`, `bin_boundaries` and `num_bins` from the cells above.
# NOTE: if a column has repeated quantile edges, the matching bin width is 0
# and the division below yields inf/nan for values in that bin.

encode_data_list = []
idxs_list = []
encoded_value_list = []
for fn in df_data.columns:  # Iterate over columns
    column_data = df_data[fn].values  # Values of the current column
    column_bin_boundaries = np.array(bin_boundaries[fn])  # num_bins + 1 edges

    # np.digitize returns i with edges[i-1] <= x < edges[i], so subtracting 1
    # gives the 0-based bin. The column maximum (and anything above it) lands
    # on index num_bins, and anything below the first edge lands on -1;
    # clipping assigns both to the nearest real bin instead of relying on
    # in-place patching (upper side) or negative-index wraparound (lower side).
    bin_indices = np.clip(
        np.digitize(column_data, column_bin_boundaries) - 1, 0, num_bins - 1
    )

    # Lower edge and width of the bin each data point belongs to
    bin_min = column_bin_boundaries[bin_indices]
    bin_widths = np.diff(column_bin_boundaries)

    # Relative position of each data point inside its bin
    bin_numerator = column_data - bin_min
    bin_denominator = bin_widths[bin_indices]
    encoded_values = bin_numerator / bin_denominator

    # Start from all ones, write the fractional value into each row's own bin,
    # then zero every bin to the right of it. Direct integer indexing and a
    # broadcast comparison avoid building dense index/mask matrices.
    row_positions = np.arange(column_data.shape[0])
    encoded_data = np.ones([column_data.shape[0], num_bins])
    encoded_data[row_positions, bin_indices] = encoded_values
    encoded_data[np.arange(num_bins) > bin_indices[:, None]] = 0

    encode_data_list.append(encoded_data)
    idxs_list.append(bin_indices)
    encoded_value_list.append(encoded_values)

# Stack the per-column results: (columns, rows, bins) -> (rows, columns, bins)
encoded_data1 = np.array(encode_data_list).transpose(1, 0, 2)
# Bin index of every value, shape (rows, columns)
idxs = np.vstack(idxs_list).T
print(f"encoded_data1 shape: {encoded_data1.shape}")



encoded_data1 shape: (20000, 75, 75)
CPU times: user 616 ms, sys: 820 ms, total: 1.44 s
Wall time: 1.43 s


In [10]:
from ple_transformer import MyTransformer1 as MyTransformer

In [11]:
%%time
# Create an instance of the transformer. Reuse `num_bins` from the manual
# encoding above rather than repeating the literal, so both encodings are
# guaranteed to use the same number of bins when they are compared.
transformer = MyTransformer(num_bins=num_bins)

# Fit the transformer to the data
transformer.fit(df_data)


CPU times: user 206 ms, sys: 0 ns, total: 206 ms
Wall time: 204 ms


In [12]:

%%time
# Transform the data using the fitted transformer; the result is compared
# against the manual encoding (encoded_data1) in a later cell.
encoded_data2 = transformer.transform(df_data)


CPU times: user 724 ms, sys: 205 ms, total: 929 ms
Wall time: 926 ms


In [13]:
%%time
# Sanity check: the manual loop-based encoding (encoded_data1) and the
# transformer output (encoded_data2) should agree element-wise within
# np.allclose's default tolerances.
# NOTE: np.allclose broadcasts its inputs, so a True result does not by
# itself prove that the two arrays have the same shape.
# Optional debug output:
# print(f"sklearn Encoded data: {encoded_data2.shape}\n{encoded_data2}")

print(f"encoded_data1 == encoded_data2: {np.allclose(encoded_data1, encoded_data2)}")

encoded_data1 == encoded_data2: True
CPU times: user 964 ms, sys: 169 ms, total: 1.13 s
Wall time: 1.13 s


In [14]:
# Inspect the dtype of the transformer output (the input frame df_data was
# cast to float32 earlier).
encoded_data2.dtype

dtype('float64')