In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from df import blocks as blocks

**Only a few pre-defined blocks are supported for now, but more can be added easily, including blocks aimed specifically at ML feature engineering.**

# Simple sequential Beam model

In [3]:
# Example inputs: each string in this list is treated as one sentence.
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]

In [4]:
# Wrap the example sentences in a Create block; it has no upstream sources.
create = blocks.CreateBlock(values=sentences)

In [5]:
# Show the PTransform object that backs the Create block.
create.operation

<Create(PTransform) label=[Create] at 0x29214e5b0>

In [6]:
# Display the block's repr: type, generated block_id, source/target ids and stored values.
create

CreateBlock(block_type='Create', block_id=UUID('a543dc34-e452-4a28-b63f-f3b1b0eec0ee'), source_ids=[], target_ids=[], operation=<Create(PTransform) label=[Create] at 0x29214e5b0>, o=None, values=['This framework generates embeddings for each input sentence', 'Sentences are passed as a list of string.', 'The quick brown fox jumps over the lazy dog.'])

In [7]:
# Build a sentence-embedding block with its default settings (no arguments passed).
embed = blocks.SentenceEmbeddingBlock()

In [8]:
# Check which embedding model name the block is configured with.
embed.model_name

'all-MiniLM-L6-v2'

In [9]:
# Show the PTransform object that backs the embedding block.
embed.operation

<ParDo(PTransform) label=[Map(<lambda at blocks.py:100>)] at 0x292183100>

In [10]:
# Chain the blocks in list order, so `create` feeds `embed`.
model = blocks.BlockAssembler.Sequential([create, embed])

In [11]:
# Compile the assembled model before inspecting it.
# NOTE(review): presumably this builds the underlying pipeline — confirm in blocks.py.
model.compile()

In [12]:
# Number of blocks held by the model.
len(model.blocks)

2

In [13]:
# Draw the model graph.
# NOTE(review): the output prints the path of a `dot` binary, so Graphviz is presumably required.
model.show_graph()

/Users/xqhu/homebrew/bin/dot


In [14]:
# Inspect the data at the first block (the Create step): the raw sentences.
model.block_data(model.blocks[0])

Unnamed: 0,0
0,This framework generates embeddings for each i...
1,Sentences are passed as a list of string.
2,The quick brown fox jumps over the lazy dog.


In [15]:
# Inspect the data at the second block (the SentenceEmbedding step): one vector per sentence.
model.block_data(model.blocks[1])

Unnamed: 0,0
0,"[-0.013717398, -0.042851556, -0.01562865, 0.01..."
1,"[0.0564525, 0.05500244, 0.031379532, 0.0339484..."
2,"[0.043933515, 0.05893439, 0.04817844, 0.077548..."


In [16]:
# Serialize the second block to JSON, leaving out the `operation` and `o` fields.
model.blocks[1].json(exclude={"operation", "o"})

'{"block_type": "SentenceEmbedding", "block_id": "efb33448-a2b7-4d5a-a253-cd9c87d533be", "source_ids": ["a543dc34-e452-4a28-b63f-f3b1b0eec0ee"], "target_ids": [], "model_name": "all-MiniLM-L6-v2"}'

# JSON config support

In [17]:
# Serialize the whole model to a JSON string, indented by two spaces.
model_in_json = model.to_json(indent=2)

In [18]:
# print() so the JSON is shown with real line breaks rather than as an escaped string repr.
print(model_in_json)

{
  "blocks": [
    {
      "block_type": "Create",
      "block_id": "a543dc34-e452-4a28-b63f-f3b1b0eec0ee",
      "source_ids": [],
      "target_ids": [
        "efb33448-a2b7-4d5a-a253-cd9c87d533be"
      ],
      "values": [
        "This framework generates embeddings for each input sentence",
        "Sentences are passed as a list of string.",
        "The quick brown fox jumps over the lazy dog."
      ]
    },
    {
      "block_type": "SentenceEmbedding",
      "block_id": "efb33448-a2b7-4d5a-a253-cd9c87d533be",
      "source_ids": [
        "a543dc34-e452-4a28-b63f-f3b1b0eec0ee"
      ],
      "target_ids": [],
      "model_name": "all-MiniLM-L6-v2"
    }
  ],
  "model_type": "SEQUENTIAL"
}


In [19]:
# Rebuild a model from the JSON config string.
new_model = blocks.BlockAssembler.from_json(model_in_json)

In [20]:
# Compile the rebuilt model before inspecting it.
# NOTE(review): presumably this builds the underlying pipeline — confirm in blocks.py.
new_model.compile()

In [21]:
# Draw the graph of the model rebuilt from JSON.
new_model.show_graph()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/Users/xqhu/homebrew/bin/dot
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
# Inspect the embeddings from the rebuilt model; the displayed values match the
# original model's embedding output (as far as the truncated display shows).
new_model.block_data(new_model.blocks[1])

Unnamed: 0,0
0,"[-0.013717398, -0.042851556, -0.01562865, 0.01..."
1,"[0.0564525, 0.05500244, 0.031379532, 0.0339484..."
2,"[0.043933515, 0.05893439, 0.04817844, 0.077548..."


# Use the Functional API to build more complex models

In [23]:
# First branch: the example sentences feed a sentence-embedding block.
# Calling a block with a list of blocks registers them as its sources.
create_1 = blocks.CreateBlock(values=sentences)
embed_1 = blocks.SentenceEmbeddingBlock()([create_1])

In [24]:
# Second branch: a single target sentence, embedded the same way as the first branch.
target_sentences = ["This framework is for testing"]
create_2 = blocks.CreateBlock(values=target_sentences)
embed_2 = blocks.SentenceEmbeddingBlock()([create_2])

In [25]:
# Join the two embedding branches: both embedding blocks become sources of the cross-join block.
cross = blocks.CrossJoinBlock()([embed_1, embed_2])

In [26]:
# Display the cross-join block's repr; source_ids lists the two embedding blocks.
cross

CrossJoinBlock(block_type='CrossJoin', block_id=UUID('152c33af-5555-47b5-9bf5-25a09eb9e45e'), source_ids=[UUID('be9aca49-c82a-4253-b743-d62fd9d950ca'), UUID('70a52cb5-f7d6-434c-8a38-5238eeee0751')], target_ids=[], operation=None, o=None)

In [27]:
# Compute cosine similarity on the cross-joined embeddings.
# NOTE(review): `cross` is passed bare here, whereas the other block calls in this
# notebook pass a list of sources — confirm both forms are intended to be supported.
sim = blocks.CosSimilarityBlock()(cross)

In [28]:
# Display the similarity block's repr; its single source is the cross-join block.
sim

CosSimilarityBlock(block_type='CosSimilarity', block_id=UUID('f39645de-9e93-44a3-9fd1-c5b2c3b880d9'), source_ids=[UUID('152c33af-5555-47b5-9bf5-25a09eb9e45e')], target_ids=[], operation=<ParDo(PTransform) label=[Map(<lambda at blocks.py:128>)] at 0x2935ddd90>, o=None)

In [29]:
# Assemble the functional model from its two Create (source) blocks.
# NOTE(review): the JSON config printed further down also lists downstream blocks,
# so they are presumably discovered from these sources — confirm in blocks.py.
model_1 = blocks.BlockAssembler([create_1, create_2])

In [30]:
# Compile the functional model before inspecting it.
# NOTE(review): presumably this builds the underlying pipeline — confirm in blocks.py.
model_1.compile()

In [31]:
# Draw the graph of the functional (two-branch) model.
model_1.show_graph()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/Users/xqhu/homebrew/bin/dot
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [32]:
# Inspect the output of the similarity block: one score per input sentence
# against the single target sentence.
model_1.block_data(sim)

  a = torch.tensor(a)


Unnamed: 0,0
0,0.342632
1,0.162461
2,0.051704


In [33]:
# Functional models can be serialized to a JSON config too (indented by two spaces here).
model_1_config = model_1.to_json(indent=2)

In [34]:
# print() so the JSON is shown with real line breaks rather than as an escaped string repr.
print(model_1_config)

{
  "blocks": [
    {
      "block_type": "Create",
      "block_id": "15c687b1-c759-4d2f-befc-837706e6f16c",
      "source_ids": [],
      "target_ids": [
        "be9aca49-c82a-4253-b743-d62fd9d950ca"
      ],
      "values": [
        "This framework generates embeddings for each input sentence",
        "Sentences are passed as a list of string.",
        "The quick brown fox jumps over the lazy dog."
      ]
    },
    {
      "block_type": "SentenceEmbedding",
      "block_id": "be9aca49-c82a-4253-b743-d62fd9d950ca",
      "source_ids": [
        "15c687b1-c759-4d2f-befc-837706e6f16c"
      ],
      "target_ids": [
        "152c33af-5555-47b5-9bf5-25a09eb9e45e",
        "152c33af-5555-47b5-9bf5-25a09eb9e45e",
        "152c33af-5555-47b5-9bf5-25a09eb9e45e"
      ],
      "model_name": "all-MiniLM-L6-v2"
    },
    {
      "block_type": "CrossJoin",
      "block_id": "152c33af-5555-47b5-9bf5-25a09eb9e45e",
      "source_ids": [
        "be9aca49-c82a-4253-b743-d62fd9d950ca",
      

In [35]:
# Rebuild the functional model from its JSON config string.
new_model_1 = blocks.BlockAssembler.from_json(model_1_config)

In [36]:
# Compile the rebuilt functional model before inspecting it.
# NOTE(review): presumably this builds the underlying pipeline — confirm in blocks.py.
new_model_1.compile()

In [37]:
# Draw the graph of the functional model rebuilt from JSON.
new_model_1.show_graph()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/Users/xqhu/homebrew/bin/dot
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
