In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from df import blocks as blocks

# Simple sequential beam model

In [3]:
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.']

In [4]:
create = blocks.CreateBlock(values = sentences)

In [5]:
create.operation

<Create(PTransform) label=[Create] at 0x1039d2190>

In [6]:
create

CreateBlock(block_type='Create', block_id=UUID('8bca9fb6-8c7d-4756-b928-cbbf5b215f4b'), source_ids=[], target_ids=[], operation=<Create(PTransform) label=[Create] at 0x1039d2190>, o=None, values=['This framework generates embeddings for each input sentence', 'Sentences are passed as a list of string.', 'The quick brown fox jumps over the lazy dog.'])

In [7]:
embed = blocks.SentenceEmbeddingBlock()

In [8]:
embed.model_name

'all-MiniLM-L6-v2'

In [9]:
embed.operation

<ParDo(PTransform) label=[Map(<lambda at blocks.py:88>)] at 0x16e2729d0>

In [10]:
model = blocks.BlockAssembler.Sequential([create, embed])

In [11]:
model.compile()

In [12]:
len(model.blocks)

2

In [13]:
model.show_graph()

/Users/xqhu/homebrew/bin/dot


In [14]:
model.block_data(model.blocks[0])

Unnamed: 0,0
0,This framework generates embeddings for each i...
1,Sentences are passed as a list of string.
2,The quick brown fox jumps over the lazy dog.


In [15]:
model.block_data(model.blocks[1])

Unnamed: 0,0
0,"[-0.013717398, -0.042851556, -0.01562865, 0.01..."
1,"[0.0564525, 0.05500244, 0.031379532, 0.0339484..."
2,"[0.043933515, 0.05893439, 0.04817844, 0.077548..."


In [16]:
model.blocks[1].json(exclude={"operation", "o"})

'{"block_type": "SentenceEmbedding", "block_id": "d99fdef9-1f7d-4f50-8cff-2e4bdfff16cd", "source_ids": ["8bca9fb6-8c7d-4756-b928-cbbf5b215f4b"], "target_ids": [], "model_name": "all-MiniLM-L6-v2"}'

# Support the json config

In [17]:
model_in_json = model.to_json(indent=2)

In [18]:
print(model_in_json)

{
  "blocks": [
    {
      "block_type": "Create",
      "block_id": "8bca9fb6-8c7d-4756-b928-cbbf5b215f4b",
      "source_ids": [],
      "target_ids": [
        "d99fdef9-1f7d-4f50-8cff-2e4bdfff16cd"
      ],
      "values": [
        "This framework generates embeddings for each input sentence",
        "Sentences are passed as a list of string.",
        "The quick brown fox jumps over the lazy dog."
      ]
    },
    {
      "block_type": "SentenceEmbedding",
      "block_id": "d99fdef9-1f7d-4f50-8cff-2e4bdfff16cd",
      "source_ids": [
        "8bca9fb6-8c7d-4756-b928-cbbf5b215f4b"
      ],
      "target_ids": [],
      "model_name": "all-MiniLM-L6-v2"
    }
  ]
}


In [19]:
new_model = blocks.BlockAssembler.from_json(model_in_json)

In [20]:
new_model.compile()

In [21]:
new_model.show_graph()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/Users/xqhu/homebrew/bin/dot
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
new_model.block_data(new_model.blocks[1])

Unnamed: 0,0
0,"[-0.013717398, -0.042851556, -0.01562865, 0.01..."
1,"[0.0564525, 0.05500244, 0.031379532, 0.0339484..."
2,"[0.043933515, 0.05893439, 0.04817844, 0.077548..."


# Use Functional API to build more complicated models

In [23]:
create_1 = blocks.CreateBlock(values = sentences)
embed_1 = blocks.SentenceEmbeddingBlock()([create_1])

In [24]:
target_sentences = ['This framework is for testing']
create_2 = blocks.CreateBlock(values = target_sentences)
embed_2 = blocks.SentenceEmbeddingBlock()([create_2])

In [25]:
cross = blocks.CrossJoinBlock()([embed_1, embed_2])

In [26]:
cross

CrossJoinBlock(block_type='CrossJoin', block_id=UUID('8a062255-cfa7-4988-bd3b-47caa380a727'), source_ids=[UUID('7e4f9fa9-34d5-44f3-8cc0-2d2403c75fc0'), UUID('22a44793-231f-4a07-87c8-42b36bab2638')], target_ids=[], operation=None, o=None)

In [27]:
sim = blocks.CosSimilarityBlock()(cross)

In [28]:
sim

CosSimilarityBlock(block_type='CosSimilarity', block_id=UUID('12337bf1-bcbe-451e-be80-910e757f78ce'), source_ids=[UUID('8a062255-cfa7-4988-bd3b-47caa380a727')], target_ids=[], operation=<ParDo(PTransform) label=[Map(<lambda at blocks.py:114>)] at 0x1778575e0>, o=None)

In [29]:
model_1 = blocks.BlockAssembler([create_1, create_2])

In [30]:
model_1.compile()

In [31]:
model_1.show_graph()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/Users/xqhu/homebrew/bin/dot
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [32]:
model_1.block_data(sim)

  a = torch.tensor(a)


Unnamed: 0,0
0,0.342632
1,0.162461
2,0.051704
