# Featurino demo

In [1]:
import sys
# add the parent directory to the module search path
# (presumably so the local `featurino` and `demo` packages resolve - TODO confirm)
sys.path.append('../')

from featurino.pipeline import FeaturinoPipeline
# demo.iris_features holds the code for the features used in this demo;
# check it out to learn how to write your own Featurino subclasses
from demo.iris_features import Lengths, Widths
import pandas as pd

First, we load the dataset we're going to work with.

In [2]:
# Load the raw data and turn the positional index into an explicit `id`
# column, which is used to uniquely identify rows.
iris = (
    pd.read_csv('iris.csv')
    .reset_index()
    .rename(columns={'index': 'id'})
)

Now we are ready to create a Featurino pipeline.

In [3]:
# The `id` column created earlier is passed as the merge_on parameter.
pipeline = FeaturinoPipeline(
    main_df=iris,
    merge_on=['id'],
    data_dir_path='features',
)

Now let's generate features using our Featurino subclasses

In [4]:
custom_param = pd.DataFrame({'some_data': [1, 2, 3]})

# Keyword arguments given after the Featurino type go directly to that
# featurino's __init__. Every .pipe call calculates features; once we are
# out of Featurinos, features_df returns the final dataframe.
(
    pipeline
    .pipe(Lengths, some_custom_param=custom_param)
    .pipe(Widths)
    .features_df()
)

   some_data
0          1
1          2
2          3 will be used to calculate Lengths features
Lengths: Features has been calculated.
Widths: Features has been calculated.


Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,target,lengths__sepal_squared,lengths__petal_squared,widths__sepal_plus_petal
0,0,5.1,3.5,1.4,0.2,0,26.01,1.96,3.7
1,1,4.9,3.0,1.4,0.2,0,24.01,1.96,3.2
2,2,4.7,3.2,1.3,0.2,0,22.09,1.69,3.4
3,3,4.6,3.1,1.5,0.2,0,21.16,2.25,3.3
4,4,5.0,3.6,1.4,0.2,0,25.00,1.96,3.8
...,...,...,...,...,...,...,...,...,...
145,145,6.7,3.0,5.2,2.3,2,44.89,27.04,5.3
146,146,6.3,2.5,5.0,1.9,2,39.69,25.00,4.4
147,147,6.5,3.0,5.2,2.0,2,42.25,27.04,5.0
148,148,6.2,3.4,5.4,2.3,2,38.44,29.16,5.7


As you can see, our custom param was successfully used and printed. Now all the features used in the pipeline are cached on disk. Featurinos store their features in separate files.

Once the features have been calculated, we can pass only the featurino types. That's very convenient and easy to read.

In [5]:
# Only the Featurino types are passed here; no init parameters are given.
(
    pipeline
    .pipe(Lengths)
    .pipe(Widths)
    .features_df()
)

Lengths: Loaded from disk.
Widths: Loaded from disk.


Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,target,lengths__sepal_squared,lengths__petal_squared,widths__sepal_plus_petal
0,0,5.1,3.5,1.4,0.2,0,26.01,1.96,3.7
1,1,4.9,3.0,1.4,0.2,0,24.01,1.96,3.2
2,2,4.7,3.2,1.3,0.2,0,22.09,1.69,3.4
3,3,4.6,3.1,1.5,0.2,0,21.16,2.25,3.3
4,4,5.0,3.6,1.4,0.2,0,25.00,1.96,3.8
...,...,...,...,...,...,...,...,...,...
145,145,6.7,3.0,5.2,2.3,2,44.89,27.04,5.3
146,146,6.3,2.5,5.0,1.9,2,39.69,25.00,4.4
147,147,6.5,3.0,5.2,2.0,2,42.25,27.04,5.0
148,148,6.2,3.4,5.4,2.3,2,38.44,29.16,5.7


Even though the features for a featurino may already be cached, we can pass the force_reload parameter to recalculate them according to the new logic in the featurino.
In the next example the Lengths features will be recalculated, while the Widths features will be loaded from disk.

In [6]:
# force_reload is passed for Lengths only, together with the custom
# parameter used by the earlier Lengths call; Widths is piped without it.
(
    pipeline
    .pipe(Lengths, force_reload=True, some_custom_param=custom_param)
    .pipe(Widths)
    .features_df()
)

   some_data
0          1
1          2
2          3 will be used to calculate Lengths features
Lengths: Features has been calculated.
Widths: Loaded from disk.


Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,target,lengths__sepal_squared,lengths__petal_squared,widths__sepal_plus_petal
0,0,5.1,3.5,1.4,0.2,0,26.01,1.96,3.7
1,1,4.9,3.0,1.4,0.2,0,24.01,1.96,3.2
2,2,4.7,3.2,1.3,0.2,0,22.09,1.69,3.4
3,3,4.6,3.1,1.5,0.2,0,21.16,2.25,3.3
4,4,5.0,3.6,1.4,0.2,0,25.00,1.96,3.8
...,...,...,...,...,...,...,...,...,...
145,145,6.7,3.0,5.2,2.3,2,44.89,27.04,5.3
146,146,6.3,2.5,5.0,1.9,2,39.69,25.00,4.4
147,147,6.5,3.0,5.2,2.0,2,42.25,27.04,5.0
148,148,6.2,3.4,5.4,2.3,2,38.44,29.16,5.7


Note: if your featurino expects additional params and you pass force_reload without them, the code will crash.

If you want to reload many featurinos at once, you can call the set_force_reload method on the pipeline:

In [7]:
# set_force_reload is called on the pipeline itself, so no individual
# .pipe call below passes force_reload.
(
    pipeline
    .set_force_reload(True)
    .pipe(Lengths, some_custom_param=custom_param)
    .pipe(Widths)
    .features_df()
)

   some_data
0          1
1          2
2          3 will be used to calculate Lengths features
Lengths: Features has been calculated.
Widths: Features has been calculated.


Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,target,lengths__sepal_squared,lengths__petal_squared,widths__sepal_plus_petal
0,0,5.1,3.5,1.4,0.2,0,26.01,1.96,3.7
1,1,4.9,3.0,1.4,0.2,0,24.01,1.96,3.2
2,2,4.7,3.2,1.3,0.2,0,22.09,1.69,3.4
3,3,4.6,3.1,1.5,0.2,0,21.16,2.25,3.3
4,4,5.0,3.6,1.4,0.2,0,25.00,1.96,3.8
...,...,...,...,...,...,...,...,...,...
145,145,6.7,3.0,5.2,2.3,2,44.89,27.04,5.3
146,146,6.3,2.5,5.0,1.9,2,39.69,25.00,4.4
147,147,6.5,3.0,5.2,2.0,2,42.25,27.04,5.0
148,148,6.2,3.4,5.4,2.3,2,38.44,29.16,5.7
