diff --git a/docs/basics/embed.png b/docs/basics/embed.png
new file mode 100644
index 000000000..7ebf91fec
Binary files /dev/null and b/docs/basics/embed.png differ
diff --git a/docs/basics/fit.md b/docs/basics/fit.md
index b1ab4641b..70220f916 100644
--- a/docs/basics/fit.md
+++ b/docs/basics/fit.md
@@ -33,6 +33,25 @@ Depending on your framework, `display` may require different argument for render
 
 More information can be {ref}`found here`.
 
+## Embed documents
+
+You can use the `finetuner.embed()` method to compute the embeddings of a `DocumentArray` or `DocumentArrayMemmap` in place.
+
+```python
+import finetuner
+from jina import DocumentArray
+
+docs = DocumentArray(...)
+
+finetuner.embed(docs, model)
+
+print(docs.embeddings)
+```
+
+Note that `model` above must be an {term}`Embedding model`.
+
+
+
 ## Example
 
 ```python
@@ -59,9 +78,6 @@ model, summary = finetuner.fit(
 )
 
 finetuner.display(model, input_size=(100,), input_dtype='long')
-
-finetuner.save(model, './saved-model')
-summary.plot('fit.png')
 ```
 
 ```console
@@ -81,7 +97,33 @@ Green layers can be used as embedding layers,
 whose name can be used as layer_name in to_embedding_model(...).
 ```
 
+```python
+finetuner.save(model, './saved-model')
+summary.plot('fit.png')
+```
+
 ```{figure} fit-plot.png
 :align: center
 :width: 80%
+```
+
+```python
+from jina import DocumentArray
+from finetuner.toydata import generate_qa_match
+all_q = DocumentArray(generate_qa_match())
+finetuner.embed(all_q, model)
+print(all_q.embeddings.shape)
+```
+
+```console
+(481, 32)
+```
+
+```python
+all_q.visualize('embed.png', method='tsne')
+```
+
+```{figure} embed.png
+:align: center
+:width: 80%
 ```
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
index 5cbd20229..196b4b7c4 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -136,7 +136,7 @@ Perfect! Now `embed_model` and `train_data` are already provided by you, simply
 ```python
 import finetuner
 
-tuned_model, _ = finetuner.fit(
+tuned_model, summary = finetuner.fit(
     embed_model,
     train_data=train_data
 )
@@ -159,7 +159,7 @@ emphasize-lines: 6
 ---
 import finetuner
 
-tuned_model, _ = finetuner.fit(
+tuned_model, summary = finetuner.fit(
     embed_model,
     train_data=unlabeled_data,
     interactive=True
@@ -183,7 +183,7 @@ emphasize-lines: 6, 7
 ---
 import finetuner
 
-tuned_model, _ = finetuner.fit(
+tuned_model, summary = finetuner.fit(
     general_model,
     train_data=labeled_data,
     to_embedding_model=True,
@@ -208,7 +208,7 @@ emphasize-lines: 6, 7
 ---
 import finetuner
 
-tuned_model, _ = finetuner.fit(
+tuned_model, summary = finetuner.fit(
     general_model,
     train_data=labeled_data,
     interactive=True,
diff --git a/finetuner/__init__.py b/finetuner/__init__.py
index f455ed50d..8e3a74efa 100644
--- a/finetuner/__init__.py
+++ b/finetuner/__init__.py
@@ -67,7 +67,7 @@ def fit(
     optimizer: str = 'adam',
     optimizer_kwargs: Optional[Dict] = None,
     device: str = 'cpu',
-) -> Tuple['AnyDNN', 'Summary']:
+) -> Tuple['AnyDNN', None]:
     ...
 
 
@@ -91,7 +91,7 @@ def fit(
     output_dim: Optional[int] = None,
     freeze: bool = False,
     device: str = 'cpu',
-) -> Tuple['AnyDNN', 'Summary']:
+) -> Tuple['AnyDNN', None]:
     ...
 
 
@@ -116,3 +116,4 @@ def fit( # level them up to the top-level from .tuner import save from .tailor import display +from .embedding import embed diff --git a/finetuner/embedding.py b/finetuner/embedding.py index 0887fb6e2..bc0282160 100644 --- a/finetuner/embedding.py +++ b/finetuner/embedding.py @@ -5,7 +5,7 @@ from .helper import AnyDNN, get_framework -def set_embeddings( +def embed( docs: Union[DocumentArray, DocumentArrayMemmap], embed_model: AnyDNN, device: str = 'cpu', diff --git a/finetuner/labeler/executor.py b/finetuner/labeler/executor.py index f9291a7b6..c39446a65 100644 --- a/finetuner/labeler/executor.py +++ b/finetuner/labeler/executor.py @@ -4,7 +4,7 @@ from jina import Executor, DocumentArray, requests, DocumentArrayMemmap from jina.helper import cached_property -from ..embedding import set_embeddings +from ..embedding import embed from ..tuner import fit, save @@ -42,8 +42,8 @@ def embed(self, docs: DocumentArray, parameters: Dict, **kwargs): min(len(self._all_data), int(parameters.get('sample_size', 1000))) ) - set_embeddings(docs, self._embed_model) - set_embeddings(_catalog, self._embed_model) + embed(docs, self._embed_model) + embed(_catalog, self._embed_model) docs.match( _catalog, diff --git a/tests/unit/test_embedding.py b/tests/unit/test_embedding.py index 356428dc7..f5dd06ba6 100644 --- a/tests/unit/test_embedding.py +++ b/tests/unit/test_embedding.py @@ -4,7 +4,7 @@ import torch from jina import DocumentArray, DocumentArrayMemmap -from finetuner.embedding import set_embeddings +from finetuner.embedding import embed from finetuner.toydata import generate_fashion_match embed_models = { @@ -41,11 +41,11 @@ def test_set_embeddings(framework, tmpdir): # works for DA embed_model = embed_models[framework]() docs = DocumentArray(generate_fashion_match(num_total=100)) - set_embeddings(docs, embed_model) + embed(docs, embed_model) assert docs.embeddings.shape == (100, 32) # works for DAM dam = DocumentArrayMemmap(tmpdir) dam.extend(generate_fashion_match(num_total=42)) - set_embeddings(dam, embed_model) + embed(dam, embed_model) assert dam.embeddings.shape == (42, 32) diff --git a/tests/unit/tuner/keras/test_gpu.py b/tests/unit/tuner/keras/test_gpu.py index e5e1b166f..aeee56a27 100644 --- a/tests/unit/tuner/keras/test_gpu.py +++ b/tests/unit/tuner/keras/test_gpu.py @@ -3,7 +3,7 @@ from jina import DocumentArray, DocumentArrayMemmap from finetuner.tuner.keras import KerasTuner -from finetuner.embedding import set_embeddings +from finetuner.embedding import embed from finetuner.toydata import generate_fashion_match all_test_losses = [ @@ -47,11 +47,11 @@ def test_set_embeddings_gpu(tmpdir): ] ) docs = DocumentArray(generate_fashion_match(num_total=100)) - set_embeddings(docs, embed_model, 'cuda') + embed(docs, embed_model, 'cuda') assert docs.embeddings.shape == (100, 32) # works for DAM dam = DocumentArrayMemmap(tmpdir) dam.extend(generate_fashion_match(num_total=42)) - set_embeddings(dam, embed_model, 'cuda') + embed(dam, embed_model, 'cuda') assert dam.embeddings.shape == (42, 32) diff --git a/tests/unit/tuner/paddle/test_gpu.py b/tests/unit/tuner/paddle/test_gpu.py index 04499ba26..776b5c928 100644 --- a/tests/unit/tuner/paddle/test_gpu.py +++ b/tests/unit/tuner/paddle/test_gpu.py @@ -2,7 +2,7 @@ import paddle.nn as nn from jina import DocumentArray, DocumentArrayMemmap -from finetuner.embedding import set_embeddings +from finetuner.embedding import embed from finetuner.toydata import generate_fashion_match from finetuner.tuner.paddle import PaddleTuner @@ -45,11 +45,11 
@@ def test_set_embeddings_gpu(tmpdir): nn.Linear(in_features=128, out_features=32), ) docs = DocumentArray(generate_fashion_match(num_total=100)) - set_embeddings(docs, embed_model, 'cuda') + embed(docs, embed_model, 'cuda') assert docs.embeddings.shape == (100, 32) # works for DAM dam = DocumentArrayMemmap(tmpdir) dam.extend(generate_fashion_match(num_total=42)) - set_embeddings(dam, embed_model, 'cuda') + embed(dam, embed_model, 'cuda') assert dam.embeddings.shape == (42, 32) diff --git a/tests/unit/tuner/torch/test_gpu.py b/tests/unit/tuner/torch/test_gpu.py index 1f79540d6..31e75a36f 100644 --- a/tests/unit/tuner/torch/test_gpu.py +++ b/tests/unit/tuner/torch/test_gpu.py @@ -3,7 +3,7 @@ import torch.nn as nn from jina import DocumentArray, DocumentArrayMemmap -from finetuner.embedding import set_embeddings +from finetuner.embedding import embed from finetuner.toydata import generate_fashion_match from finetuner.tuner.pytorch import PytorchTuner @@ -49,11 +49,11 @@ def test_set_embeddings_gpu(tmpdir): nn.Linear(in_features=128, out_features=32), ) docs = DocumentArray(generate_fashion_match(num_total=100)) - set_embeddings(docs, embed_model, 'cuda') + embed(docs, embed_model, 'cuda') assert docs.embeddings.shape == (100, 32) # works for DAM dam = DocumentArrayMemmap(tmpdir) dam.extend(generate_fashion_match(num_total=42)) - set_embeddings(dam, embed_model, 'cuda') + embed(dam, embed_model, 'cuda') assert dam.embeddings.shape == (42, 32)
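
For reference, a minimal usage sketch of the renamed `finetuner.embed()` API, mirroring `tests/unit/test_embedding.py`. The PyTorch model layout and the 28×28 input size are assumptions inferred from the toy fashion data used in the tests, not part of this patch:

```python
# Sketch only: assumes PyTorch and the finetuner.toydata generators
# exercised by the unit tests above.
import torch.nn as nn
from jina import DocumentArray

import finetuner
from finetuner.toydata import generate_fashion_match

# Toy embedding model; the 28x28 input size is an assumption based on
# the Fashion-MNIST-style toy data in the tests.
embed_model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(in_features=28 * 28, out_features=128),
    nn.ReLU(),
    nn.Linear(in_features=128, out_features=32),
)

docs = DocumentArray(generate_fashion_match(num_total=100))
finetuner.embed(docs, embed_model)  # fills docs.embeddings in place

print(docs.embeddings.shape)  # (100, 32), matching the test assertions
```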
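
The `device` parameter is unchanged by the rename; the GPU tests above pass `'cuda'` positionally. Continuing from the sketch above, the same call also accepts an on-disk `DocumentArrayMemmap` (the path below is hypothetical):

```python
# GPU + DocumentArrayMemmap variant, as exercised in
# tests/unit/tuner/*/test_gpu.py; requires a CUDA-capable device.
from jina import DocumentArrayMemmap

dam = DocumentArrayMemmap('./my-dam')  # hypothetical on-disk location
dam.extend(generate_fashion_match(num_total=42))
finetuner.embed(dam, embed_model, 'cuda')

print(dam.embeddings.shape)  # (42, 32), matching the GPU tests
```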