## Part I: Fit and pickle a transformer

In [1]:
import pandas as pd
import numpy as np
from my_transformers import (NumbaColumnTransformer, PandasColumnTransformer, encoder_numba, encoder_pandas)
from sklearn.preprocessing import FunctionTransformer
import pickle

### generate random data

In [2]:
# Number of rows in the synthetic frame (10 million).
n = 10**7
# Fixed seed so that every Restart & Run All generates the same frame and the
# transformers are therefore always fitted on identical data.
SEED = 42
rng = np.random.default_rng(SEED)
# Three integer columns with values drawn uniformly from [0, 100).
data = pd.DataFrame(
    rng.integers(0, 100, size=(n, 3)),
    columns=['var1', 'var2', 'var3'],
)

### define and fit transformers

In [3]:
# Transformer built from encoder_numba, with func_arg set to 'var1'.
tx_numba = NumbaColumnTransformer(
    encoder_numba,
    func_arg='var1',
)
# Counterpart built from encoder_pandas (passed as the `func` keyword).
tx_pandas = PandasColumnTransformer(
    func=encoder_pandas,
)

In [4]:
# Fit both transformers on the same generated frame. Each name is rebound to
# whatever fit() returns.
# NOTE(review): presumably fit() returns the fitted transformer itself
# (scikit-learn convention) — confirm in my_transformers.
tx_numba = tx_numba.fit(data)
tx_pandas = tx_pandas.fit(data)

### quick performance measurement

In [5]:
%%timeit -r1 -n1
# -r1 -n1: one run of one loop, i.e. a single wall-clock measurement of
# transform() over the whole frame; the result is discarded.
# NOTE(review): a single run would also include any one-time warm-up cost
# (e.g. JIT compilation) unless fit() already triggered it — confirm.
_ = tx_numba.transform(data)

11.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [6]:
%%timeit -r1 -n1
# -r1 -n1: one run of one loop, i.e. a single wall-clock measurement of
# transform() over the whole frame; the result is discarded.
_ = tx_pandas.transform(data)

1min 38s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### save the numba transformer

In [7]:
# Persist the fitted transformer to disk using pickle's default protocol.
with open("./tx_numba.pkl", "wb") as pkl_out:
    pickle.dump(tx_numba, pkl_out)

## Part II: Check consistency of pickles

### restart the kernel, load the saved transformer from disk and dump it again

In [1]:
import pickle

In [2]:
# Read the pickled transformer back from ./tx_numba.pkl.
# pickle.load can execute arbitrary code while unpickling, so only use it on
# files from a trusted source.
with open("./tx_numba.pkl", "rb") as pkl_in:
    tx_numba_reloaded = pickle.load(pkl_in)

In [3]:
# Pickle the reloaded transformer again, to a second file, so the two dumps
# can be compared.
with open("./tx_numba2.pkl", "wb") as pkl_out:
    pickle.dump(tx_numba_reloaded, pkl_out)

In [4]:
# Load the raw bytes of both pickle files (despite the `_str` suffix these
# are bytes objects, since the files are opened in binary mode).
with open("./tx_numba.pkl", mode='rb') as first_pickle:
    tx_numba_str = first_pickle.read()
with open("./tx_numba2.pkl", mode='rb') as second_pickle:
    tx_numba2_str = second_pickle.read()

### check if the binary pickles are the same

In [5]:
# Byte-for-byte comparison of the two pickle files: True only when the dump
# made after reloading is identical to the original dump.
# NOTE(review): byte equality is stricter than object equality — differing
# bytes alone do not show that the reloaded transformer behaves differently.
tx_numba_str==tx_numba2_str

False