diff --git a/docs/basics/data-format.md b/docs/basics/data-format.md
index 6301921e8..b038f4e6f 100644
--- a/docs/basics/data-format.md
+++ b/docs/basics/data-format.md
@@ -4,8 +4,16 @@
 Finetuner uses Jina [`Document`](https://docs.jina.ai/fundamentals/document/) as the primitive data type. In particular,
 [`DocumentArray`](https://docs.jina.ai/fundamentals/document/documentarray-api/) and
 [`DocumentArrayMemap`](https://docs.jina.ai/fundamentals/document/documentarraymemmap-api/) are the input data type
-for Tailor and Tuner. This means, your training dataset and evaluation dataset should be stored in `DocumentArray`
-or `DocumentArrayMemap`, where each training or evaluation instance is a `Document` object.
+in the high-level `finetuner.fit()` API. This means, your training dataset and evaluation dataset should be stored in `DocumentArray`
+or `DocumentArrayMemap`, where each training or evaluation instance is a `Document` object:
+
+```python
+import finetuner
+
+finetuner.fit(model,
+              train_data=...,
+              eval_data=...)
+```

 This chapter introduces how to construct a `Document` in a way that Finetuner will accept.

@@ -137,19 +145,6 @@ Yes. Labels should reflect the groundtruth as-is. If a Document contains only po
 However, if all match labels from all Documents are the same, then Finetuner cannot learn anything useful.
 ```

-### Catalog
-
-In search, queries and search results are often distinct sets.
-Specifying a `catalog` helps you keep this distinction during finetuning.
-When using `finetuner.fit(train_data=...,eval_data=..., catalog=...)`, `train_data` and `eval_data` specify the potential queries and the `catalog` specifies the potential results.
-This distinction is mainly used
-
-- in the Labeler, when new sets of unlabeled results are generated and
-- during evaluation, for the NDCG calculation.
-
-A `catalog` is either a `DocumentArray` or a `DocumentArrayMemmap`.
-If no `catalog` is specified, the Finetuner will implicitly use `train_data` as catalog.
-
 ## Data source

 After organizing the labeled `Document` into `DocumentArray` or `DocumentArrayMemmap`, you can feed them
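A note on the docs hunk above: the added snippet deliberately leaves `model`, `train_data`, and `eval_data` elided. For completeness, a minimal runnable sketch of the same call follows; the toy 28-d blobs, the small PyTorch model, and the `tags={'finetuner': {'label': ...}}` match-label convention are assumptions drawn from the rest of data-format.md, not from this patch.

```python
# Illustrative only, not part of this patch: build a tiny labeled dataset
# and feed it to the high-level fit() API shown in the docs hunk above.
import numpy as np
import torch
from jina import Document, DocumentArray

import finetuner


def labeled_doc() -> Document:
    d = Document(blob=np.random.random(28).astype(np.float32))
    # each match carries its ground-truth label under tags['finetuner']['label']
    d.matches.append(
        Document(
            blob=np.random.random(28).astype(np.float32),
            tags={'finetuner': {'label': 1}},
        )
    )
    d.matches.append(
        Document(
            blob=np.random.random(28).astype(np.float32),
            tags={'finetuner': {'label': -1}},
        )
    )
    return d


train_data = DocumentArray([labeled_doc() for _ in range(100)])

embed_model = torch.nn.Sequential(
    torch.nn.Linear(in_features=28, out_features=32),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=32, out_features=32),
)

finetuner.fit(embed_model, train_data=train_data, eval_data=train_data)
```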
diff --git a/finetuner/__init__.py b/finetuner/__init__.py
index f1241bae6..87a04c304 100644
--- a/finetuner/__init__.py
+++ b/finetuner/__init__.py
@@ -9,7 +9,8 @@
 from typing import Dict, Optional, overload, TYPE_CHECKING, Tuple

 if TYPE_CHECKING:
-    from .helper import AnyDNN, DocumentArrayLike, TunerReturnType
+    from .helper import AnyDNN, DocumentArrayLike
+    from .tuner.summary import SummaryCollection


 # fit interface generated from Tuner
@@ -25,7 +26,7 @@ def fit(
     optimizer: str = 'adam',
     optimizer_kwargs: Optional[Dict] = None,
     device: str = 'cpu',
-) -> 'TunerReturnType':
+) -> 'SummaryCollection':
     ...


@@ -48,7 +49,7 @@ def fit(
     output_dim: Optional[int] = None,
     freeze: bool = False,
     device: str = 'cpu',
-) -> 'TunerReturnType':
+) -> 'SummaryCollection':
     ...


@@ -96,7 +97,7 @@ def fit(
 def fit(
     model: 'AnyDNN', train_data: 'DocumentArrayLike', *args, **kwargs
-) -> Optional['TunerReturnType']:
+) -> Optional['SummaryCollection']:
     if kwargs.get('to_embedding_model', False):
         from .tailor import to_embedding_model

diff --git a/finetuner/helper.py b/finetuner/helper.py
index 550fbd087..63f470d00 100644
--- a/finetuner/helper.py
+++ b/finetuner/helper.py
@@ -35,9 +35,6 @@
 LayerInfoType = List[
     Dict[str, Any]
 ]  #: The type of embedding layer information used in Tailor
-TunerReturnType = Dict[
-    str, Dict[str, Any]
-]  #: The type of loss, metric information Tuner returns


 def get_framework(dnn_model: AnyDNN) -> str:
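With the `TunerReturnType` dict alias removed from `helper.py`, the public `fit()` overloads now annotate their return value as `SummaryCollection` from `finetuner.tuner.summary`, imported under `TYPE_CHECKING` only so type checkers resolve it without any runtime import cost. Downstream code can mirror the same lazily-imported annotation; a minimal sketch, assuming only the import path this patch introduces (the `train` wrapper is hypothetical):

```python
# Hypothetical downstream usage of the new annotation. The guarded import
# keeps finetuner.tuner out of the runtime import graph, as in the patch.
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from finetuner.tuner.summary import SummaryCollection


def train(model, data) -> 'SummaryCollection':
    import finetuner

    return finetuner.fit(model, train_data=data)
```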
diff --git a/finetuner/labeler/__init__.py b/finetuner/labeler/__init__.py
index 518e76181..88dfc035e 100644
--- a/finetuner/labeler/__init__.py
+++ b/finetuner/labeler/__init__.py
@@ -4,7 +4,7 @@
 from typing import Optional

 import jina.helper
-from jina import Flow, DocumentArrayMemmap
+from jina import Flow
 from jina.logging.predefined import default_logger

 from .executor import FTExecutor, DataIterator
@@ -14,7 +14,6 @@
 def fit(
     embed_model: AnyDNN,
     train_data: DocumentArrayLike,
-    catalog: Optional[DocumentArrayLike] = None,
     clear_labels_on_start: bool = False,
     port_expose: Optional[int] = None,
     runtime_backend: str = 'thread',
@@ -22,7 +21,6 @@
     **kwargs,
 ) -> None:
     dam_path = tempfile.mkdtemp()
-    catalog_dam_path = init_catalog(dam_path, catalog, train_data)

     class MyExecutor(FTExecutor):
         def get_embed_model(self):
@@ -39,14 +37,13 @@ def get_embed_model(self):
             uses=DataIterator,
             uses_with={
                 'dam_path': dam_path,
-                'catalog_dam_path': catalog_dam_path,
                 'clear_labels_on_start': clear_labels_on_start,
             },
         )
         .add(
             uses=MyExecutor,
             uses_with={
-                'catalog_dam_path': catalog_dam_path,
+                'dam_path': dam_path,
                 'loss': loss,
             },
         )
@@ -91,22 +88,8 @@ def open_frontend_in_browser(req):
     f.post(
         '/feed',
         train_data,
-        request_size=128,
+        request_size=10,
         show_progress=True,
         on_done=open_frontend_in_browser,
     )
     f.block()
-
-
-def init_catalog(
-    dam_path: str, catalog: DocumentArrayLike, train_data: DocumentArrayLike
-):
-    if isinstance(catalog, DocumentArrayMemmap):
-        catalog_dam_path = catalog.path
-    else:
-        catalog_dam_path = dam_path + '/catalog'
-        catalog_memmap = DocumentArrayMemmap(catalog_dam_path)
-    if catalog is None:
-        catalog = train_data() if callable(train_data) else train_data
-    catalog_memmap.extend(catalog)
-    return catalog_dam_path
diff --git a/finetuner/labeler/executor.py b/finetuner/labeler/executor.py
index 045f71de5..94ee3820f 100644
--- a/finetuner/labeler/executor.py
+++ b/finetuner/labeler/executor.py
@@ -11,13 +11,13 @@ class FTExecutor(Executor):
     def __init__(
         self,
-        catalog_dam_path: str,
+        dam_path: str,
         metric: str = 'cosine',
         loss: str = 'CosineSiameseLoss',
         **kwargs,
     ):
         super().__init__(**kwargs)
-        self._catalog = DocumentArrayMemmap(catalog_dam_path)
+        self._all_data = DocumentArrayMemmap(dam_path)
         self._metric = metric
         self._loss = loss
@@ -33,9 +33,9 @@ def _embed_model(self):
     def embed(self, docs: DocumentArray, parameters: Dict, **kwargs):
         if not docs:
             return
-        self._catalog.reload()
-        da = self._catalog.sample(
-            min(len(self._catalog), int(parameters.get('sample_size', 1000)))
+        self._all_data.reload()
+        da = self._all_data.sample(
+            min(len(self._all_data), int(parameters.get('sample_size', 1000)))
         )

         f_type = get_framework(self._embed_model)
@@ -77,7 +77,6 @@ def fit(self, docs: DocumentArray, parameters: Dict, **kwargs):
         fit(
             self._embed_model,
             docs,
-            self._catalog,
             epochs=int(parameters.get('epochs', 10)),
             loss=self._loss,
         )
@@ -93,14 +92,12 @@ class DataIterator(Executor):
     def __init__(
         self,
         dam_path: str,
-        catalog_dam_path: str,
         labeled_dam_path: Optional[str] = None,
         clear_labels_on_start: bool = False,
         **kwargs,
     ):
         super().__init__(**kwargs)
         self._all_data = DocumentArrayMemmap(dam_path)
-        self._catalog = DocumentArrayMemmap(catalog_dam_path)
         if not labeled_dam_path:
             labeled_dam_path = dam_path + '/labeled'
         self._labeled_dam = DocumentArrayMemmap(labeled_dam_path)
@@ -108,25 +105,20 @@ def __init__(
             self._labeled_dam.clear()

     @requests(on='/feed')
-    def store_data(self, docs: DocumentArray, parameters: Dict, **kwargs):
-        if parameters.get('type', 'query') == 'query':
-            self._all_data.extend(docs)
-        else:
-            self._catalog.extend(docs)
+    def store_data(self, docs: DocumentArray, **kwargs):
+        self._all_data.extend(docs)

     @requests(on='/next')
     def take_batch(self, parameters: Dict, **kwargs):
-        count = int(parameters.get('new_examples', 5))
+        st = int(parameters.get('start', 0))
+        ed = int(parameters.get('end', 1))
         self._all_data.reload()
-        count = min(max(count, 0), len(self._all_data))
-        return self._all_data.sample(k=count)
+        return self._all_data[st:ed]

     @requests(on='/fit')
     def add_fit_data(self, docs: DocumentArray, **kwargs):
-        for d in docs.traverse_flat(['r']):
+        for d in docs.traverse_flat(['r', 'm']):
             d.content = self._all_data[d.id].content
-        for d in docs.traverse_flat(['m']):
-            d.content = self._catalog[d.id].content
         self._labeled_dam.extend(docs)
         return self._labeled_dam
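The `/next` contract changes shape here: `take_batch` now returns the deterministic window `self._all_data[st:ed]` instead of a random sample of `new_examples`. A client-side sketch of the new paging parameters follows; the host, port, and HTTP protocol are assumptions (the labeler prints its real endpoint on startup), and the kwargs follow jina 2.x naming.

```python
# Hypothetical client call against the labeler Flow started by labeler.fit().
from jina import Client

# host/port/protocol are placeholders, not values from this patch
client = Client(host='localhost', port_expose=12345, protocol='http')

# page through the stored data with the new start/end window
resp = client.post(
    '/next',
    parameters={'start': 0, 'end': 10},
    return_results=True,
)
```

Repeated calls with a sliding `start`/`end` window walk the memmap in order, which is what lets the frontend page predictably instead of re-sampling on every request.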
diff --git a/finetuner/labeler/ui/js/components/image-match-card.vue.js b/finetuner/labeler/ui/js/components/image-match-card.vue.js
index dd1a71574..414353ab2 100644
--- a/finetuner/labeler/ui/js/components/image-match-card.vue.js
+++ b/finetuner/labeler/ui/js/components/image-match-card.vue.js
@@ -10,7 +10,7 @@ const imageMatchCard = {
   template: `
-Select all images similar to the image on right
+Select all images similar to the image on right
-Select all images similar to the image on right
+Select all meshes similar to the image on right