In [1]:
import sys

import numpy as np
import pandas as pd

from sklearn.datasets import make_regression

from ple_transformer import PiecewiseLinearEncoderNumpy, PiecewiseLinearEncoderNumbaV0, PiecewiseLinearEncoderNumbaV1, PiecewiseLinearEncoderCython

In [2]:
NUM_FEATURES = 100
NUM_BINS = 45
NUM_SAMPLES = 10_000
# Generate synthetic regression data
X, y = make_regression(n_samples=NUM_SAMPLES, n_features=NUM_FEATURES, noise=0.1, random_state=1)

# Convert to pandas DataFrame
df = pd.DataFrame(data=X, columns=[f'Feature_{i}' for i in range(1, NUM_FEATURES+1)])
df['Target'] = y

df_data = df.drop('Target', axis=1).astype('float32')

print(df_data.head())

   Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  \
0   2.669134   0.935506   0.119068   0.397455  -0.797867   1.063694   
1  -0.280565  -0.737132  -0.073916   0.266521   0.950513  -0.318143   
2  -1.545541  -0.813349   2.552004  -1.885060   1.019922  -0.546040   
3  -0.176935   1.082176   1.856537  -0.444410   0.462096   0.063075   
4   0.730443  -2.839458   0.602569  -0.249875  -1.140106  -0.024486   

   Feature_7  Feature_8  Feature_9  Feature_10  ...  Feature_91  Feature_92  \
0   0.223034  -0.394869   0.549681    0.203443  ...   -0.977882   -0.327025   
1   2.210178  -0.486361   1.207417   -0.132859  ...   -1.732534    0.704593   
2  -0.089909  -1.040497   0.124978    0.926639  ...   -0.860770    1.445409   
3   1.173178  -0.860313   0.186985   -0.392830  ...   -0.649400   -1.150807   
4   0.068322   0.137987   1.836735   -0.455747  ...    0.026673    1.308732   

   Feature_93  Feature_94  Feature_95  Feature_96  Feature_97  Feature_98  \
0   -0.995281   -0.82

In [3]:
%%time
encoded_values =[]

transformer_list = [
    PiecewiseLinearEncoderNumpy(num_bins=NUM_BINS),
    PiecewiseLinearEncoderNumbaV0(num_bins=NUM_BINS),
    PiecewiseLinearEncoderNumbaV1(num_bins=NUM_BINS),
    PiecewiseLinearEncoderCython(num_bins=NUM_BINS),
]

for transformer in transformer_list:
    short_name = transformer.__class__.__name__.replace('PiecewiseLinearEncoder', '')
    print(f'Running {short_name}')
    transformer.fit(df_data)
    encoded_values.append((short_name, transformer.fit_transform(df_data)))



Running Numpy
Running NumbaV0
Running NumbaV1
Running Cython
CPU times: user 7.35 s, sys: 590 ms, total: 7.94 s
Wall time: 7.32 s


In [4]:

numpy_encoded_value = encoded_values[0][1]
# loop through the encoded values to compare 2nd through the last values with the first one in the list
for i in range(1, len(encoded_values)):
    transformer_name = encoded_values[i][0]
    encoded_value = encoded_values[i][1]
    equal_flag = np.array_equal(encoded_value, numpy_encoded_value)
    print(f'transformer {transformer_name} and transformer Numpy? {equal_flag}')
    if not equal_flag:
        print(f'\tnumber of values: {np.size(encoded_value):,d}, number of mismatches: {np.sum(encoded_value != numpy_encoded_value):,d}')
        print(f'\tpercent of mismatches: {np.sum(encoded_value != numpy_encoded_value) / np.size(encoded_value):.2%}')
        print(f'\tmax absolute difference: {np.max(np.abs(encoded_value - numpy_encoded_value))}')
        

transformer NumbaV0 and transformer Numpy? True
transformer NumbaV1 and transformer Numpy? True
transformer Cython and transformer Numpy? False
	number of values: 45,000,000, number of mismatches: 775,155
	percent of mismatches: 1.72%
	max absolute difference: 6.016343832015991e-07
