In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from df import blocks as blocks

# Simple sequential Beam model

In [3]:
# Example input sentences used as the values of the Create blocks below.
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]

In [4]:
# Source block: holds the example sentences as its values.
create = blocks.CreateBlock(values=sentences)

In [5]:
# Inspect the block's `operation` attribute (shown below as a Create PTransform).
create.operation

<Create(PTransform) label=[Create] at 0x106fd8580>

In [6]:
# Display the block's repr to see all of its fields (ids, operation, values).
create

CreateBlock(block_type='Create', block_id=UUID('0e6e0620-f26d-4fd7-98f5-93189251a47b'), source_ids=[], target_ids=[], operation=<Create(PTransform) label=[Create] at 0x106fd8580>, o=None, values=['This framework generates embeddings for each input sentence', 'Sentences are passed as a list of string.', 'The quick brown fox jumps over the lazy dog.'])

In [7]:
# Build an embedding block with its default settings (no arguments passed).
embed = blocks.SentenceEmbeddingBlock()

In [8]:
# Check which model name the block uses when none is given.
embed.model_name

'all-MiniLM-L6-v2'

In [9]:
# Inspect the embedding block's `operation` attribute (shown below as a ParDo PTransform).
embed.operation

<ParDo(PTransform) label=[Map(<lambda at blocks.py:85>)] at 0x2ab875280>

In [10]:
# Chain the two blocks in list order: `create` feeds `embed`
# (the JSON dump further down shows embed's source_ids pointing at create).
model = blocks.BlockAssembler.Sequential([create, embed])

In [11]:
# compile() is called before any block data or graph is requested below.
# NOTE(review): presumably this builds the underlying pipeline from the blocks — confirm in blocks.py.
model.compile()

In [12]:
# Sanity check: the sequential model should hold both blocks.
len(model.blocks)

2

In [13]:
# Render the model's block graph.
# NOTE(review): the printed path suggests this relies on a Graphviz `dot` binary — confirm.
model.show_graph()

/Users/xqhu/homebrew/bin/dot


In [14]:
# Preview the data of the first block (the Create block): the input sentences.
model.block_data(model.blocks[0])

Unnamed: 0,0
0,This framework generates embeddings for each i...
1,Sentences are passed as a list of string.
2,The quick brown fox jumps over the lazy dog.


In [15]:
# Preview the data of the second block (the embedding block): one vector per sentence.
model.block_data(model.blocks[1])

Unnamed: 0,0
0,"[-0.013717398, -0.042851556, -0.01562865, 0.01..."
1,"[0.0564525, 0.05500244, 0.031379532, 0.0339484..."
2,"[0.043933515, 0.05893439, 0.04817844, 0.077548..."


In [16]:
# Serialize a single block to JSON, leaving out the "operation" and "o" fields.
# NOTE(review): presumably those two fields are excluded because they are not JSON-serializable — confirm.
model.blocks[1].json(exclude={"operation", "o"})

'{"block_type": "SentenceEmbedding", "block_id": "745323cb-a6f1-4fe1-a62c-98ac8cf65d8a", "source_ids": ["0e6e0620-f26d-4fd7-98f5-93189251a47b"], "target_ids": [], "model_name": "all-MiniLM-L6-v2"}'

# Support the JSON config

In [17]:
# Serialize the whole model (all blocks and their links) with 2-space indentation.
model_in_json = model.to_json(indent=2)

In [18]:
# Use print() so the indented JSON is shown line by line.
print(model_in_json)

{
  "blocks": [
    {
      "block_type": "Create",
      "block_id": "0e6e0620-f26d-4fd7-98f5-93189251a47b",
      "source_ids": [],
      "target_ids": [
        "745323cb-a6f1-4fe1-a62c-98ac8cf65d8a"
      ],
      "values": [
        "This framework generates embeddings for each input sentence",
        "Sentences are passed as a list of string.",
        "The quick brown fox jumps over the lazy dog."
      ]
    },
    {
      "block_type": "SentenceEmbedding",
      "block_id": "745323cb-a6f1-4fe1-a62c-98ac8cf65d8a",
      "source_ids": [
        "0e6e0620-f26d-4fd7-98f5-93189251a47b"
      ],
      "target_ids": [],
      "model_name": "all-MiniLM-L6-v2"
    }
  ]
}


In [19]:
# Rebuild a model from the JSON config produced above.
new_model = blocks.BlockAssembler.from_json(model_in_json)

In [20]:
# The rebuilt model is compiled before use, same as the original model.
new_model.compile()

In [21]:
# Render the rebuilt model's block graph.
new_model.show_graph()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/Users/xqhu/homebrew/bin/dot
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
# Round-trip check: the embeddings displayed here match those shown for the original model's second block.
new_model.block_data(new_model.blocks[1])

Unnamed: 0,0
0,"[-0.013717398, -0.042851556, -0.01562865, 0.01..."
1,"[0.0564525, 0.05500244, 0.031379532, 0.0339484..."
2,"[0.043933515, 0.05893439, 0.04817844, 0.077548..."


# Use the Functional API to build more complicated models

In [23]:
# First input branch: a Create block over the example sentences.
create_1 = blocks.CreateBlock(values=sentences)

In [24]:
# Functional style: the block instance is called on a list of upstream blocks.
# The repr in the next cell shows one entry in source_ids after this call.
embed_1 = blocks.SentenceEmbeddingBlock()([create_1])

In [25]:
# Display the block to confirm that source_ids is populated.
embed_1

SentenceEmbeddingBlock(block_type='SentenceEmbedding', block_id=UUID('a6c0b9c7-595d-4943-bac0-7371d5fd472b'), source_ids=[UUID('aab03a32-752c-41b0-9cad-ccc821de723b')], target_ids=[], operation=<ParDo(PTransform) label=[Map(<lambda at blocks.py:85>)] at 0x2ae198430>, o=None, model_name='all-MiniLM-L6-v2')

In [26]:
# A second, single-sentence input list for another branch of the model.
target_sentences = ["This framework is for testing"]

In [27]:
# Second input branch: a Create block over the target sentence ...
create_2 = blocks.CreateBlock(values=target_sentences)
# ... wired into its own embedding block via the functional call.
embed_2 = blocks.SentenceEmbeddingBlock()([create_2])

In [28]:
# Assemble a model from the two Create blocks.
# NOTE(review): embed_1 / embed_2 are not passed here; presumably the assembler
# reaches them by following the links set up by the functional calls — confirm.
model_1 = blocks.BlockAssembler([create_1, create_2])

In [29]:
# Compile the two-branch model before drawing its graph.
model_1.compile()

In [30]:
# Render the block graph of the two-branch model.
model_1.show_graph()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/Users/xqhu/homebrew/bin/dot
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
