In [1]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

In [2]:
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.']
sentences_1 = ['This framework generates embeddings for each input sentence',]

In [3]:
sentence_embeddings = model.encode(sentences)

In [4]:
sentence_embeddings.shape

(3, 384)

In [5]:
type(sentence_embeddings)

numpy.ndarray

In [6]:
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
import apache_beam.runners.interactive.interactive_beam as ib

In [7]:
from apache_beam.options.pipeline_options import PipelineOptions

In [8]:
p = beam.Pipeline(InteractiveRunner(), options=PipelineOptions())

In [9]:
output = p | beam.Create(sentences) | beam.Map(lambda x: [model.encode(x)])



In [10]:
type(output)

apache_beam.pvalue.PCollection

In [11]:
output1 = p | beam.Create(sentences_1) | beam.Map(lambda x: [model.encode(x)])

In [13]:
ib.show_graph(p)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
res = ib.collect(output)

In [None]:
res.shape

In [None]:
res

In [None]:
res1 = ib.collect(output1)

In [None]:
res1

In [None]:
from sentence_transformers import util

# Any user-defined function.
# cross join is used as an example.
def cross_join(left, rights):
    for x in rights:
        yield (left, x)
    
comb_result = (
    output
    | 'ApplyCrossJoin' >> beam.FlatMap(
        cross_join, rights=beam.pvalue.AsIter(output1))
    | "Cosine" >> beam.Map(lambda x: float(util.cos_sim(x[0], x[1])[0][0])))

In [None]:
sim_res = ib.collect(comb_result)

In [None]:
sim_res[0]

In [None]:
type(comb_result)

In [None]:
from pydantic import BaseModel, Field
from typing import List, Any

class Block(BaseModel):
    source: List["Block"] = []
    target: List["Block"] = []
    operation: beam.ParDo
    o: beam.pvalue.PCollection = None
    
    class Config:
        arbitrary_types_allowed = True

In [None]:
class SentenceEmbeddingBlock(Block):
    operation: beam.ParDo = Field(default=beam.Map(lambda x: [model.encode(x)]))

In [None]:
# block for beam.Create
from pydantic import BaseModel, ValidationError, root_validator

class CreateBlock(Block):
    operation: beam.Create
    values: List[Any]

    @root_validator(pre=True)
    def _set_fields(cls, values: dict) -> dict:
        values["operation"] = beam.Create(values["values"])
        return values

In [None]:
block_p = beam.Pipeline(InteractiveRunner(), options=PipelineOptions())

In [None]:
embed = SentenceEmbeddingBlock()

In [None]:
embed.o

In [None]:
create = CreateBlock(values = sentences)

In [None]:
create.operation

In [None]:
create.o

In [None]:
class BlockAssembler:
    def __init__(self, blocks: List[Block], p: beam.pipeline.Pipeline):
        self.blocks = blocks
        self.p = p
    @classmethod
    def Sequential(cls, blocks: List[Block], p: beam.pipeline.Pipeline):
        # connect all the blocks using the list order
        for i, block in enumerate(blocks):
            if i > 0:
                block.source = [blocks[i-1]]
            if i < (len(blocks)-1):
                block.target = [blocks[i+1]]
        return cls(blocks, p)

    def compile(self):
        # sequential
        o = self.p
        for block in self.blocks:
            block.o = o | block.operation
            o = block.o
        return 
    
    def show_graph(self):
        ib.show_graph(self.p)

In [None]:
blocks = BlockAssembler.Sequential([create, embed], p=block_p)

In [None]:
blocks.compile()

In [None]:
blocks.blocks[0].target

In [None]:
blocks.show_graph()