/
embedding.py
71 lines (53 loc) · 1.91 KB
/
embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from typing import Union
from jina import DocumentArray, DocumentArrayMemmap
from .helper import AnyDNN, get_framework
def embed(
    docs: Union[DocumentArray, DocumentArrayMemmap],
    embed_model: AnyDNN,
    device: str = 'cpu',
    batch_size: int = 256,
) -> None:
    """Fill the embedding of Documents inplace by using `embed_model`

    :param docs: the Documents to be embedded
    :param embed_model: the embedding model written in Keras/Pytorch/Paddle
    :param device: the computational device for `embed_model`, can be either
        `cpu` or `cuda`.
    :param batch_size: number of Documents in a batch for embedding
    :raises ValueError: if the framework of `embed_model` is not supported
    """
    fm = get_framework(embed_model)
    # Explicit dispatch instead of a `globals()[f'...{fm}']` lookup: an
    # unsupported framework now fails with a descriptive error instead of a
    # bare KeyError, and the mapping survives renames/refactors visibly.
    dispatch = {
        'keras': _set_embeddings_keras,
        'torch': _set_embeddings_torch,
        'paddle': _set_embeddings_paddle,
    }
    try:
        set_fn = dispatch[fm]
    except KeyError:
        raise ValueError(f'unsupported framework: {fm!r}') from None
    set_fn(docs, embed_model, device, batch_size)
def _set_embeddings_keras(
    docs: Union[DocumentArray, DocumentArrayMemmap],
    embed_model: AnyDNN,
    device: str = 'cpu',
    batch_size: int = 256,
):
    """Compute and store embeddings for `docs` in place with a Keras model.

    :param docs: Documents whose `.embeddings` are filled from their `.blobs`
    :param embed_model: a Keras model mapping blobs to embedding vectors
    :param device: computational device, either `cpu` or `cuda`
    :param batch_size: number of Documents embedded per forward pass
    """
    from .tuner.keras import get_device

    # `get_device` returns a context manager that pins ops to the device.
    with get_device(device):
        for batch in docs.batch(batch_size):
            batch.embeddings = embed_model(batch.blobs).numpy()
def _set_embeddings_torch(
    docs: Union[DocumentArray, DocumentArrayMemmap],
    embed_model: AnyDNN,
    device: str = 'cpu',
    batch_size: int = 256,
):
    """Compute and store embeddings for `docs` in place with a PyTorch model.

    :param docs: Documents whose `.embeddings` are filled from their `.blobs`
    :param embed_model: a PyTorch module mapping blobs to embedding vectors
    :param device: computational device, either `cpu` or `cuda`
    :param batch_size: number of Documents embedded per forward pass
    """
    import torch

    from .tuner.pytorch import get_device

    device = get_device(device)
    embed_model = embed_model.to(device)
    # NOTE(review): the model is not switched to `eval()` here, so dropout /
    # batch-norm layers stay in whatever mode the caller left them -- confirm
    # this is intended for inference.
    with torch.inference_mode():
        for b in docs.batch(batch_size):
            tensor = torch.tensor(b.blobs, device=device)
            # `.detach()` dropped: tensors produced under `inference_mode`
            # already carry no autograd graph, so it was a no-op.
            b.embeddings = embed_model(tensor).cpu().numpy()
def _set_embeddings_paddle(
    docs: Union[DocumentArray, DocumentArrayMemmap],
    embed_model: AnyDNN,
    device: str = 'cpu',
    batch_size: int = 256,
):
    """Compute and store embeddings for `docs` in place with a Paddle model.

    :param docs: Documents whose `.embeddings` are filled from their `.blobs`
    :param embed_model: a Paddle model mapping blobs to embedding vectors
    :param device: computational device, either `cpu` or `cuda`
    :param batch_size: number of Documents embedded per forward pass
    """
    import paddle

    from .tuner.paddle import get_device

    # Sets paddle's global device; the return value is intentionally unused.
    get_device(device)
    # `paddle.no_grad()` added for consistency with the torch path: pure
    # inference needs no autograd bookkeeping.
    with paddle.no_grad():
        for b in docs.batch(batch_size):
            # `paddle.to_tensor` is the documented public API for converting
            # numpy arrays (vs. the internal `paddle.Tensor` constructor).
            b.embeddings = embed_model(paddle.to_tensor(b.blobs)).numpy()