In [1]:
import sys

import numpy as np
import pandas as pd

from sklearn.datasets import make_regression


In [2]:
NUM_FEATURES = 75
# Fixed seed so Restart & Run All regenerates exactly the same dataset
RANDOM_SEED = 42

# Generate synthetic regression data: 20000 rows, NUM_FEATURES columns
X, y = make_regression(
    n_samples=20000,
    n_features=NUM_FEATURES,
    noise=0.1,
    random_state=RANDOM_SEED,
)

# Wrap the feature matrix in a DataFrame with 1-based column names
# (Feature_1 .. Feature_<NUM_FEATURES>) and attach the target column
feature_names = [f'Feature_{i}' for i in range(1, NUM_FEATURES + 1)]
df = pd.DataFrame(data=X, columns=feature_names)
df['Target'] = y

# Features only; the optional float32 downcast is intentionally left disabled
df_data = df.drop('Target', axis=1)  #.astype('float32')

print(df_data.head())

   Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  \
0  -0.114686  -0.688847  -0.810012   0.186895  -0.078368   0.446053   
1   0.931153   0.651449   0.342267   0.172851   1.198334  -1.492716   
2  -1.296885   2.090927   0.638129   0.679499  -1.673971  -0.416401   
3   0.836807  -0.493522  -0.156829   0.176909  -0.835537  -2.640528   
4  -1.692138   0.557319  -0.418471  -1.271486  -0.775110  -1.262629   

   Feature_7  Feature_8  Feature_9  Feature_10  ...  Feature_66  Feature_67  \
0   0.636313   0.431090  -0.100530   -0.781681  ...   -0.891808   -0.520644   
1   0.714839   1.110767  -0.021322   -0.921828  ...   -0.321143   -1.631833   
2  -0.102578   1.027223  -0.562475   -0.540435  ...   -0.066197   -0.419002   
3  -0.548480  -0.419266  -1.090254   -0.809320  ...   -1.292386    2.323236   
4   0.425924   0.606922   1.211993   -0.825758  ...   -1.101874    0.843631   

   Feature_68  Feature_69  Feature_70  Feature_71  Feature_72  Feature_73  \
0    2.671463    1.27

In [3]:
# Show the dtype of every feature column. No cast is applied to df_data
# (the astype('float32') in the previous cell is commented out).
df_data.dtypes

Feature_1     float64
Feature_2     float64
Feature_3     float64
Feature_4     float64
Feature_5     float64
               ...   
Feature_71    float64
Feature_72    float64
Feature_73    float64
Feature_74    float64
Feature_75    float64
Length: 75, dtype: object

In [4]:
# Number of quantile bins used for every feature
num_bins = 75

# num_bins + 1 evenly spaced probabilities in [0, 1]; the matching quantiles
# are the edges of num_bins bins
quantiles = np.linspace(0, 1, num_bins + 1)

# Map each column name to a Series holding that column's quantile bin edges
bin_boundaries = {
    fn: df_data[fn].quantile(quantiles)
    for fn in df_data.columns
}


In [5]:
%%time
# Loop through each column and perform piecewise linear encoding

encode_data_list = []
idxs_list = []
encoded_value_list = []
for fn in df_data.columns:  # Iterate over columns
    column_data = df_data[fn].values  # Extract the current column
    column_bin_boundaries = np.array(bin_boundaries[fn])  # Get the bin boundaries for the current column

    # Initialize a matrix of all ones to store the encoded data
    encoded_data = np.ones([column_data.shape[0], num_bins])
    # print(f"encoded_data0:\n{encoded_data}")
    
    # Use np.digitize to find the bin indices for each data point
    bin_indices = np.digitize(column_data, column_bin_boundaries) - 1
    # print(f"column_bin_boundaries:\n{column_bin_boundaries}")
    # print(f"zip(columndata, bin_indices):\n{list(zip(column_data, bin_indices))}")

    # compute numerator, adjust for edge case at max value
    # find the bin min for each data point
    bin_min = column_bin_boundaries[bin_indices]

    # for maximum data point, set bin min to second to last bin boundary
    bin_min[bin_indices == num_bins] = column_bin_boundaries[-2]

    # compute the bin numerator for each data point
    bin_numerator = column_data - bin_min
    # print(f"\nbin_numerator:\n{bin_numerator}")
    
    # Calculate the bin widths based on the bin boundaries
    bin_widths = np.diff(column_bin_boundaries)

    # adjust for edge case of last bin
    idxs = bin_indices
    idxs[idxs == num_bins] = num_bins - 1
    # for maximum data point, set bin width to last bin boundary

    bin_denominator = bin_widths[idxs]
    # print(f"\nbin_widths:\n{bin_widths}")
   
    # Calculate the encoded value of each data point within the selected bin
    encoded_values = bin_numerator / bin_denominator
   
    # Create a mask to store the encoded value in the corresponding column of encoded_data
    mask = np.zeros_like(encoded_data, dtype=bool)
    mask[np.arange(encoded_data.shape[0]), bin_indices] = True

    # Store the encoded value in the corresponding column of encoded_data
    encoded_data[mask] = encoded_values

    # Create mask to set all values after the column-specific bin index to 0
    mask = np.tile(np.arange(encoded_data.shape[1]), (encoded_data.shape[0], 1))
    mask = mask > bin_indices.reshape(-1, 1)
    encoded_data[mask] = 0

    encode_data_list.append(encoded_data)
    idxs_list.append(bin_indices)
    encoded_value_list.append(encoded_values)

# encoded_data now contains the piecewise linear encoding for each column
encoded_data1 = np.array(encode_data_list).transpose(1, 0, 2)
idxs = np.vstack(idxs_list).T
# print(f"Encoded data1: {encoded_data.shape}\n{encoded_data1}")
# print(f"idxs: {idxs.shape}\n{idxs}")
print(f"encoded_data1 shape: {encoded_data1.shape}")



encoded_data1 shape: (20000, 75, 75)
CPU times: user 767 ms, sys: 60.5 ms, total: 828 ms
Wall time: 826 ms


# Sets the NUMBA_DEBUG environment variable for this kernel process.
# NOTE(review): this snippet carries no `In [n]:` prompt in this export, so it
# may not have been executed in the recorded run.
# NOTE(review): presumably meant to turn on numba's debug output for the
# ple_transformer import below; if so it must run before numba is first
# imported. Confirm, and remove if it is leftover debugging.
import os
os.environ["NUMBA_DEBUG"] = "1"

In [6]:
%%time
# Import the project-local transformer under test; %%time reports how long
# the import itself takes. MyTransformer3 is aliased to MyTransformer, which
# is the name the cells below use.
from ple_transformer import MyTransformer3 as MyTransformer

CPU times: user 2.79 s, sys: 0 ns, total: 2.79 s
Wall time: 2.8 s


In [7]:
%%time
# Create an instance of the transformer
transformer = MyTransformer(num_bins=75)

# Fit the transformer to the data
transformer.fit(df_data)


CPU times: user 146 ms, sys: 0 ns, total: 146 ms
Wall time: 144 ms


In [8]:

%%time
# First timed transform with the fitted transformer; the result is kept as
# encoded_data2.
encoded_data2 = transformer.transform(df_data)


CPU times: user 1.16 s, sys: 437 ms, total: 1.59 s
Wall time: 1.59 s


In [9]:
%%time
# Create an instance of the transformer
transformer2 = MyTransformer(num_bins=75)

# Fit the transformer to the data
transformer2.fit(df_data)


CPU times: user 188 ms, sys: 0 ns, total: 188 ms
Wall time: 185 ms


In [10]:
%%time
# Second timed transform, using the second fitted instance; the result is
# kept as encoded_data3 and compared against the reference encoding below.
# NOTE(review): presumably repeated to time a run after any one-time warm-up
# cost paid by the first transform -- confirm.
encoded_data3 = transformer2.transform(df_data)

CPU times: user 823 ms, sys: 345 ms, total: 1.17 s
Wall time: 1.16 s


In [11]:
%%time
# print(f"sklearn Encoded data: {encoded_data2.shape}\n{encoded_data2}")

print(f"encoded_data1 == encoded_data3: {np.allclose(encoded_data1, encoded_data3)}")

encoded_data1 == encoded_data3: True
CPU times: user 811 ms, sys: 120 ms, total: 931 ms
Wall time: 928 ms


In [12]:
# Show the dtypes of the reference encoding and of both transformer outputs
# side by side.
encoded_data1.dtype, encoded_data2.dtype, encoded_data3.dtype

(dtype('float64'), dtype('float64'), dtype('float64'))