In [39]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
from df import blocks as blocks

**Only a few pre-defined blocks are supported for now, but more can be added easily, including blocks designed specifically for ML feature engineering.**
Goal: use terms familiar to data scientists to build the Beam pipeline.

# Simple sequential Beam model

In [41]:
# Sample inputs for the embedding demos below (one string per element).
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

In [42]:
create = blocks.CreateBlock(values = sentences)

In [43]:
create.operation

<Create(PTransform) label=[Create] at 0x2a3613e50>

In [44]:
create

CreateBlock(block_type='Create', block_id=UUID('f448a157-d566-497d-9b23-c6ee6171c49d'), source_ids=[], target_ids=[], operation=<Create(PTransform) label=[Create] at 0x2a3613e50>, o=None, values=['This framework generates embeddings for each input sentence', 'Sentences are passed as a list of string.', 'The quick brown fox jumps over the lazy dog.'])

In [45]:
embed = blocks.SentenceEmbeddingBlock()

In [46]:
embed.model_name

'all-MiniLM-L6-v2'

In [47]:
embed.operation

<ParDo(PTransform) label=[Map(<lambda at blocks.py:136>)] at 0x2a36130a0>

In [48]:
model = blocks.BlockAssembler.Sequential([create, embed])

In [49]:
model.compile()

In [50]:
len(model.blocks)

2

In [51]:
model.show_graph()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/Users/xqhu/homebrew/bin/dot
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [52]:
model.block_data(model.blocks[0])

Unnamed: 0,0
0,This framework generates embeddings for each i...
1,Sentences are passed as a list of string.
2,The quick brown fox jumps over the lazy dog.


In [53]:
model.block_data(model.blocks[1])

Unnamed: 0,0
0,"[-0.013717398, -0.042851556, -0.01562865, 0.01..."
1,"[0.0564525, 0.05500244, 0.031379532, 0.0339484..."
2,"[0.043933515, 0.05893439, 0.04817844, 0.077548..."


In [54]:
model.blocks[1].json(exclude={"operation", "o"})

'{"block_type": "SentenceEmbedding", "block_id": "694fef2e-015b-4d68-b1d8-cc68999c9176", "source_ids": ["f448a157-d566-497d-9b23-c6ee6171c49d"], "target_ids": [], "model_name": "all-MiniLM-L6-v2"}'

# Support the JSON config

In [55]:
model_in_json = model.to_json(indent=2)

In [56]:
print(model_in_json)

{
  "blocks": [
    {
      "block_type": "Create",
      "block_id": "f448a157-d566-497d-9b23-c6ee6171c49d",
      "source_ids": [],
      "target_ids": [
        "694fef2e-015b-4d68-b1d8-cc68999c9176"
      ],
      "values": [
        "This framework generates embeddings for each input sentence",
        "Sentences are passed as a list of string.",
        "The quick brown fox jumps over the lazy dog."
      ]
    },
    {
      "block_type": "SentenceEmbedding",
      "block_id": "694fef2e-015b-4d68-b1d8-cc68999c9176",
      "source_ids": [
        "f448a157-d566-497d-9b23-c6ee6171c49d"
      ],
      "target_ids": [],
      "model_name": "all-MiniLM-L6-v2"
    }
  ],
  "model_type": "SEQUENTIAL"
}


In [57]:
new_model = blocks.BlockAssembler.from_json(model_in_json)

In [58]:
new_model.compile()

In [59]:
new_model.show_graph()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/Users/xqhu/homebrew/bin/dot
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [60]:
new_model.block_data(new_model.blocks[1])

Unnamed: 0,0
0,"[-0.013717398, -0.042851556, -0.01562865, 0.01..."
1,"[0.0564525, 0.05500244, 0.031379532, 0.0339484..."
2,"[0.043933515, 0.05893439, 0.04817844, 0.077548..."


# Use the Functional API to build more complicated models

In [61]:
# First branch: the sample sentences feed a sentence-embedding block.
# Calling a block with a list of upstream blocks registers them as its sources.
create_1 = blocks.CreateBlock(values=sentences)
embed_1 = blocks.SentenceEmbeddingBlock()([create_1])

In [62]:
# Second branch: a single target sentence, embedded the same way as the first branch.
target_sentences = ['This framework is for testing']
create_2 = blocks.CreateBlock(values=target_sentences)
embed_2 = blocks.SentenceEmbeddingBlock()([create_2])

In [63]:
cross = blocks.CrossJoinBlock()([embed_1, embed_2])

In [64]:
cross

CrossJoinBlock(block_type='CrossJoin', block_id=UUID('23d22d17-984d-4b39-8142-72423ec7f2e9'), source_ids=[UUID('c10b1f90-d82e-4c53-91d7-e526e87be9a1'), UUID('d3218632-a0a8-42f4-88f3-47353456b1fe')], target_ids=[], operation=None, o=None)

In [65]:
sim = blocks.CosSimilarityBlock()(cross)

In [66]:
sim

CosSimilarityBlock(block_type='CosSimilarity', block_id=UUID('34de5a7f-7959-4a1f-8231-fc7b8c169079'), source_ids=[UUID('23d22d17-984d-4b39-8142-72423ec7f2e9')], target_ids=[], operation=<ParDo(PTransform) label=[Map(<lambda at blocks.py:194>)] at 0x2a3dc0280>, o=None)

In [67]:
model_1 = blocks.BlockAssembler([create_1, create_2])

In [68]:
model_1.compile()

In [69]:
model_1.show_graph()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/Users/xqhu/homebrew/bin/dot
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [70]:
model_1.block_data(sim)

Unnamed: 0,0
0,0.342632
1,0.162461
2,0.051704


In [71]:
# You can also use the JSON config here for functional models
model_1_config = model_1.to_json(indent=2)

In [72]:
print(model_1_config)

{
  "blocks": [
    {
      "block_type": "Create",
      "block_id": "d839639d-a6c7-4c48-b460-817007c4de00",
      "source_ids": [],
      "target_ids": [
        "c10b1f90-d82e-4c53-91d7-e526e87be9a1"
      ],
      "values": [
        "This framework generates embeddings for each input sentence",
        "Sentences are passed as a list of string.",
        "The quick brown fox jumps over the lazy dog."
      ]
    },
    {
      "block_type": "SentenceEmbedding",
      "block_id": "c10b1f90-d82e-4c53-91d7-e526e87be9a1",
      "source_ids": [
        "d839639d-a6c7-4c48-b460-817007c4de00"
      ],
      "target_ids": [
        "23d22d17-984d-4b39-8142-72423ec7f2e9",
        "23d22d17-984d-4b39-8142-72423ec7f2e9",
        "23d22d17-984d-4b39-8142-72423ec7f2e9"
      ],
      "model_name": "all-MiniLM-L6-v2"
    },
    {
      "block_type": "CrossJoin",
      "block_id": "23d22d17-984d-4b39-8142-72423ec7f2e9",
      "source_ids": [
        "c10b1f90-d82e-4c53-91d7-e526e87be9a1",
      

In [73]:
new_model_1 = blocks.BlockAssembler.from_json(model_1_config)

In [74]:
new_model_1.compile()

In [75]:
new_model_1.show_graph()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/Users/xqhu/homebrew/bin/dot
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Process a CSV file using Beam

In [76]:
# Source block: read the beers CSV through ReadCSVBlock.
# NOTE(review): header=0 presumably marks the first row as column names,
# as in pandas.read_csv — confirm against ReadCSVBlock.
csv_b = blocks.ReadCSVBlock(path="../tests/beers.csv", header=0)

In [77]:
# use two callback functions to process data using DataTransformBlock
def _fill_missing(data):
    """fill missing with a constant for the ibu field"""
    if data.ibu is None:
        return data._replace(ibu = -1.0)
    return data

dt_b = blocks.DataTransformBlock(callbacks=[_fill_missing])

In [78]:
# build the model
dt_model = blocks.BlockAssembler.Sequential([csv_b, dt_b])

In [79]:
dt_model.show_graph()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/Users/xqhu/homebrew/bin/dot
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [80]:
pdf = dt_model.block_data(dt_b)

In [81]:
pdf.head()

Unnamed: 0,row_id,abv,ibu,id,name,style,brewery_id,ounces
0,0,0.05,-1.0,1436,Pub Beer,American Pale Lager,408,12.0
1,1,0.066,-1.0,2265,Devil's Cup,American Pale Ale (APA),177,12.0
2,2,0.071,-1.0,2264,Rise of the Phoenix,American IPA,177,12.0
3,3,0.09,-1.0,2263,Sinister,American Double / Imperial IPA,177,12.0
4,4,0.075,-1.0,2262,Sex and Candy,American IPA,177,12.0


In [82]:
dt_model_json = dt_model.to_json(indent = 2)

In [83]:
print(dt_model_json)

{
  "blocks": [
    {
      "block_type": "ReadCSV",
      "block_id": "87c32666-9190-409d-8b5e-1908d270db9c",
      "source_ids": [],
      "target_ids": [
        "e30d6937-5bdb-49aa-ab97-1d9c6fe7a219"
      ],
      "path": "../tests/beers.csv",
      "header": 0
    },
    {
      "block_type": "DataTransform",
      "block_id": "e30d6937-5bdb-49aa-ab97-1d9c6fe7a219",
      "source_ids": [
        "87c32666-9190-409d-8b5e-1908d270db9c"
      ],
      "target_ids": [],
      "callbacks": [
        {
          "_fill_missing": "def _fill_missing(data):\n    \"\"\"fill missing with a constant for the ibu field\"\"\"\n    if data.ibu is None:\n        return data._replace(ibu = -1.0)\n    return data\n"
        }
      ]
    }
  ],
  "model_type": "SEQUENTIAL"
}


In [84]:
dt_model_1 = blocks.BlockAssembler.from_json(dt_model_json)

In [85]:
dt_model_1.blocks

[ReadCSVBlock(block_type='ReadCSV', block_id=UUID('87c32666-9190-409d-8b5e-1908d270db9c'), source_ids=[], target_ids=[UUID('e30d6937-5bdb-49aa-ab97-1d9c6fe7a219')], operation=<_ReadFromPandas(PTransform) label=[_ReadFromPandas] at 0x2ad0b3a60>, o=None, path='../tests/beers.csv', header=0),
 DataTransformBlock(block_type='DataTransform', block_id=UUID('e30d6937-5bdb-49aa-ab97-1d9c6fe7a219'), source_ids=[UUID('87c32666-9190-409d-8b5e-1908d270db9c')], target_ids=[], operation=<ParDo(PTransform) label=[Map(_fill_missing)] at 0x2ac73ca30>, o=None, callbacks=[{'_fill_missing': 'def _fill_missing(data):\n    """fill missing with a constant for the ibu field"""\n    if data.ibu is None:\n        return data._replace(ibu = -1.0)\n    return data\n'}])]

In [86]:
dt_model_1.block_data(dt_model_1.blocks[1])

Unnamed: 0,row_id,abv,ibu,id,name,style,brewery_id,ounces
0,0,0.050,-1.0,1436,Pub Beer,American Pale Lager,408,12.0
1,1,0.066,-1.0,2265,Devil's Cup,American Pale Ale (APA),177,12.0
2,2,0.071,-1.0,2264,Rise of the Phoenix,American IPA,177,12.0
3,3,0.090,-1.0,2263,Sinister,American Double / Imperial IPA,177,12.0
4,4,0.075,-1.0,2262,Sex and Candy,American IPA,177,12.0
...,...,...,...,...,...,...,...,...
2405,2405,0.067,45.0,928,Belgorado,Belgian IPA,424,12.0
2406,2406,0.052,-1.0,807,Rail Yard Ale,American Amber / Red Ale,424,12.0
2407,2407,0.055,-1.0,620,B3K Black Lager,Schwarzbier,424,12.0
2408,2408,0.055,40.0,145,Silverback Pale Ale,American Pale Ale (APA),424,12.0
