In [4]:
# !pip install -U sentence-transformers

In [1]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

In [55]:
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.']
sentences_1 = ['This framework generates embeddings for each input sentence',]

In [2]:
sentence_embeddings = model.encode(sentences)

In [3]:
sentence_embeddings.shape

(3, 384)

In [5]:
type(sentence_embeddings)

numpy.ndarray

In [32]:
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
import apache_beam.runners.interactive.interactive_beam as ib

In [33]:
from apache_beam.options.pipeline_options import PipelineOptions

In [58]:
p = beam.Pipeline(InteractiveRunner(), options=PipelineOptions())

In [59]:
output = p | beam.Create(sentences) | beam.Map(lambda x: [model.encode(x)])



In [60]:
output1 = p | beam.Create(sentences_1) | beam.Map(lambda x: [model.encode(x)])

In [61]:
ib.show_graph(p)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/usr/local/bin/dot
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [62]:
res = ib.collect(output)



In [63]:
res.shape

(3, 1)

In [64]:
res

Unnamed: 0,0
0,"[-0.013717369, -0.042851534, -0.015628567, 0.0..."
1,"[0.0564525, 0.055002406, 0.03137959, 0.0339485..."
2,"[0.04393355, 0.058934387, 0.048178356, 0.07754..."


In [65]:
res1 = ib.collect(output1)



In [66]:
res1

Unnamed: 0,0
0,"[-0.013717369, -0.042851534, -0.015628567, 0.0..."


In [95]:
from sentence_transformers import util

# Any user-defined function.
# cross join is used as an example.
def cross_join(left, rights):
    for x in rights:
        yield (left, x)
    
comb_result = (
    output
    | 'ApplyCrossJoin' >> beam.FlatMap(
        cross_join, rights=beam.pvalue.AsIter(output1))
    | "Cosine" >> beam.Map(lambda x: float(util.cos_sim(x[0], x[1])[0][0])))

In [96]:
sim_res = ib.collect(comb_result)



In [97]:
sim_res[0]

0    1.000000
1    0.538079
2    0.118056
Name: 0, dtype: float64

In [98]:
type(comb_result)

apache_beam.pvalue.PCollection

In [116]:
from pydantic import BaseModel, Field
from typing import List, Any

class Block(BaseModel):
    source: List["Block"] = []
    target: List["Block"] = []
    operation: beam.ParDo
    
    class Config:
        arbitrary_types_allowed = True

In [119]:
class SentenceEmbeddingBlock(Block):
    operation: beam.ParDo = Field(default=beam.Map(lambda x: [model.encode(x)]))

In [120]:
embed = SentenceEmbeddingBlock()

In [121]:
embed

SentenceEmbeddingBlock(source=[], target=[], operation=<ParDo(PTransform) label=[Map(<lambda at 2263565550.py:2>)] at 0x16c88f610>)

In [138]:
class BlockAssembler:
    def __init__(self, blocks: List[Block]):
        self.blocks = blocks
    @classmethod
    def Sequential(cls, blocks: List[Block]):
        # connect all the blocks using the list order
        for i, block in enumerate(blocks):
            if i > 0:
                block.source = [blocks[i-1]]
            if i < (len(blocks)-1):
                block.target = [blocks[i+1]]
        return cls(blocks)

In [139]:
blocks = BlockAssembler.Sequential([embed])

In [140]:
blocks.blocks

[SentenceEmbeddingBlock(source=[], target=[], operation=<ParDo(PTransform) label=[Map(<lambda at 2263565550.py:2>)] at 0x16c88f610>)]

In [128]:
# TODO: IO blocks for beam.Create?


In [127]:
# TODO: compile blocks to get the final Beam pipeline

In [132]:
test = [1, 2, 3, 4]
for i, t in enumerate(test[1:]):
    print(i, t)

0 2
1 3
2 4
