In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from df import blocks as blocks

**Only few pre-defined blocks are supported now but we can add more easily and add even more blocks specifically ML feature engineering**

# Simple sequential beam model

In [3]:
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.']

In [4]:
create = blocks.CreateBlock(values = sentences)

In [5]:
create.operation

<Create(PTransform) label=[Create] at 0x294a94550>

In [6]:
create

CreateBlock(block_type='Create', block_id=UUID('af6a023f-07ae-4e97-b16e-aafc4a89e8a7'), source_ids=[], target_ids=[], operation=<Create(PTransform) label=[Create] at 0x294a94550>, o=None, values=['This framework generates embeddings for each input sentence', 'Sentences are passed as a list of string.', 'The quick brown fox jumps over the lazy dog.'])

In [7]:
embed = blocks.SentenceEmbeddingBlock()

In [8]:
embed.model_name

'all-MiniLM-L6-v2'

In [9]:
embed.operation

<ParDo(PTransform) label=[Map(<lambda at blocks.py:94>)] at 0x294a94e80>

In [10]:
model = blocks.BlockAssembler.Sequential([create, embed])

In [11]:
model.compile()

In [12]:
len(model.blocks)

2

In [13]:
model.show_graph()

/Users/xqhu/homebrew/bin/dot


In [14]:
model.block_data(model.blocks[0])

Unnamed: 0,0
0,This framework generates embeddings for each i...
1,Sentences are passed as a list of string.
2,The quick brown fox jumps over the lazy dog.


In [15]:
model.block_data(model.blocks[1])

Unnamed: 0,0
0,"[-0.013717398, -0.042851556, -0.01562865, 0.01..."
1,"[0.0564525, 0.05500244, 0.031379532, 0.0339484..."
2,"[0.043933515, 0.05893439, 0.04817844, 0.077548..."


In [16]:
model.blocks[1].json(exclude={"operation", "o"})

'{"block_type": "SentenceEmbedding", "block_id": "dc2dc2bf-e407-4811-8714-30ca6b3c3518", "source_ids": ["af6a023f-07ae-4e97-b16e-aafc4a89e8a7"], "target_ids": [], "model_name": "all-MiniLM-L6-v2"}'

# Support the json config

In [17]:
model_in_json = model.to_json(indent=2)

In [18]:
print(model_in_json)

{
  "blocks": [
    {
      "block_type": "Create",
      "block_id": "af6a023f-07ae-4e97-b16e-aafc4a89e8a7",
      "source_ids": [],
      "target_ids": [
        "dc2dc2bf-e407-4811-8714-30ca6b3c3518"
      ],
      "values": [
        "This framework generates embeddings for each input sentence",
        "Sentences are passed as a list of string.",
        "The quick brown fox jumps over the lazy dog."
      ]
    },
    {
      "block_type": "SentenceEmbedding",
      "block_id": "dc2dc2bf-e407-4811-8714-30ca6b3c3518",
      "source_ids": [
        "af6a023f-07ae-4e97-b16e-aafc4a89e8a7"
      ],
      "target_ids": [],
      "model_name": "all-MiniLM-L6-v2"
    }
  ],
  "model_type": "SEQUENTIAL"
}


In [19]:
new_model = blocks.BlockAssembler.from_json(model_in_json)

In [20]:
new_model.compile()

In [21]:
new_model.show_graph()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/Users/xqhu/homebrew/bin/dot
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
new_model.block_data(new_model.blocks[1])

Unnamed: 0,0
0,"[-0.013717398, -0.042851556, -0.01562865, 0.01..."
1,"[0.0564525, 0.05500244, 0.031379532, 0.0339484..."
2,"[0.043933515, 0.05893439, 0.04817844, 0.077548..."


# Use Functional API to build more complicated models

In [23]:
create_1 = blocks.CreateBlock(values = sentences)
embed_1 = blocks.SentenceEmbeddingBlock()([create_1])

In [24]:
target_sentences = ['This framework is for testing']
create_2 = blocks.CreateBlock(values = target_sentences)
embed_2 = blocks.SentenceEmbeddingBlock()([create_2])

In [25]:
cross = blocks.CrossJoinBlock()([embed_1, embed_2])

In [26]:
cross

CrossJoinBlock(block_type='CrossJoin', block_id=UUID('e9c0f4fd-36bd-408f-a276-69f0242fd84a'), source_ids=[UUID('c31692e1-91c8-4a21-864c-51d4fb2135a0'), UUID('8cff6a1d-93e7-4326-98b4-465648c18e47')], target_ids=[], operation=None, o=None)

In [27]:
sim = blocks.CosSimilarityBlock()(cross)

In [28]:
sim

CosSimilarityBlock(block_type='CosSimilarity', block_id=UUID('ed01ae94-58f0-4e75-bd53-07a471e8cd31'), source_ids=[UUID('e9c0f4fd-36bd-408f-a276-69f0242fd84a')], target_ids=[], operation=<ParDo(PTransform) label=[Map(<lambda at blocks.py:120>)] at 0x295ed9820>, o=None)

In [29]:
model_1 = blocks.BlockAssembler([create_1, create_2])

In [30]:
model_1.compile()

In [31]:
model_1.show_graph()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/Users/xqhu/homebrew/bin/dot
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [32]:
model_1.block_data(sim)

  a = torch.tensor(a)


Unnamed: 0,0
0,0.342632
1,0.162461
2,0.051704


In [33]:
# You can also use the json config here for functional models
model_1_config = model_1.to_json(indent=2)

In [34]:
print(model_1_config)

{
  "blocks": [
    {
      "block_type": "Create",
      "block_id": "6ec7ea12-a47d-4804-b09f-5ccdeb0a1750",
      "source_ids": [],
      "target_ids": [
        "c31692e1-91c8-4a21-864c-51d4fb2135a0"
      ],
      "values": [
        "This framework generates embeddings for each input sentence",
        "Sentences are passed as a list of string.",
        "The quick brown fox jumps over the lazy dog."
      ]
    },
    {
      "block_type": "SentenceEmbedding",
      "block_id": "c31692e1-91c8-4a21-864c-51d4fb2135a0",
      "source_ids": [
        "6ec7ea12-a47d-4804-b09f-5ccdeb0a1750"
      ],
      "target_ids": [
        "e9c0f4fd-36bd-408f-a276-69f0242fd84a",
        "e9c0f4fd-36bd-408f-a276-69f0242fd84a",
        "e9c0f4fd-36bd-408f-a276-69f0242fd84a"
      ],
      "model_name": "all-MiniLM-L6-v2"
    },
    {
      "block_type": "CrossJoin",
      "block_id": "e9c0f4fd-36bd-408f-a276-69f0242fd84a",
      "source_ids": [
        "c31692e1-91c8-4a21-864c-51d4fb2135a0",
      

In [35]:
new_model_1 = blocks.BlockAssembler.from_json(model_1_config)

In [36]:
new_model_1.compile()

In [37]:
new_model_1.show_graph()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/Users/xqhu/homebrew/bin/dot
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
