diff --git a/docs/basics/data-format.md b/docs/basics/data-format.md index 6301921e8..b038f4e6f 100644 --- a/docs/basics/data-format.md +++ b/docs/basics/data-format.md @@ -4,8 +4,16 @@ Finetuner uses Jina [`Document`](https://docs.jina.ai/fundamentals/document/) as the primitive data type. In particular, [`DocumentArray`](https://docs.jina.ai/fundamentals/document/documentarray-api/) and [`DocumentArrayMemap`](https://docs.jina.ai/fundamentals/document/documentarraymemmap-api/) are the input data type -for Tailor and Tuner. This means, your training dataset and evaluation dataset should be stored in `DocumentArray` -or `DocumentArrayMemap`, where each training or evaluation instance is a `Document` object. +in the high-level `finetuner.fit()` API. This means, your training dataset and evaluation dataset should be stored in `DocumentArray` +or `DocumentArrayMemap`, where each training or evaluation instance is a `Document` object: + +```python +import finetuner + +finetuner.fit(model, + train_data=..., + eval_data=...) +``` This chapter introduces how to construct a `Document` in a way that Finetuner will accept. @@ -137,19 +145,6 @@ Yes. Labels should reflect the groundtruth as-is. If a Document contains only po However, if all match labels from all Documents are the same, then Finetuner cannot learn anything useful. ``` -### Catalog - -In search, queries and search results are often distinct sets. -Specifying a `catalog` helps you keep this distinction during finetuning. -When using `finetuner.fit(train_data=...,eval_data=..., catalog=...)`, `train_data` and `eval_data` specify the potential queries and the `catalog` specifies the potential results. -This distinction is mainly used - -- in the Labeler, when new sets of unlabeled results are generated and -- during evaluation, for the NDCG calculation. - -A `catalog` is either a `DocumentArray` or a `DocumentArrayMemmap`. -If no `catalog` is specified, the Finetuner will implicitly use `train_data` as catalog. - ## Data source After organizing the labeled `Document` into `DocumentArray` or `DocumentArrayMemmap`, you can feed them diff --git a/finetuner/__init__.py b/finetuner/__init__.py index f1241bae6..87a04c304 100644 --- a/finetuner/__init__.py +++ b/finetuner/__init__.py @@ -9,7 +9,8 @@ from typing import Dict, Optional, overload, TYPE_CHECKING, Tuple if TYPE_CHECKING: - from .helper import AnyDNN, DocumentArrayLike, TunerReturnType + from .helper import AnyDNN, DocumentArrayLike + from .tuner.summary import SummaryCollection # fit interface generated from Tuner @@ -25,7 +26,7 @@ def fit( optimizer: str = 'adam', optimizer_kwargs: Optional[Dict] = None, device: str = 'cpu', -) -> 'TunerReturnType': +) -> 'SummaryCollection': ... @@ -48,7 +49,7 @@ def fit( output_dim: Optional[int] = None, freeze: bool = False, device: str = 'cpu', -) -> 'TunerReturnType': +) -> 'SummaryCollection': ... @@ -96,7 +97,7 @@ def fit( def fit( model: 'AnyDNN', train_data: 'DocumentArrayLike', *args, **kwargs -) -> Optional['TunerReturnType']: +) -> Optional['SummaryCollection']: if kwargs.get('to_embedding_model', False): from .tailor import to_embedding_model diff --git a/finetuner/helper.py b/finetuner/helper.py index 550fbd087..63f470d00 100644 --- a/finetuner/helper.py +++ b/finetuner/helper.py @@ -35,9 +35,6 @@ LayerInfoType = List[ Dict[str, Any] ] #: The type of embedding layer information used in Tailor -TunerReturnType = Dict[ - str, Dict[str, Any] -] #: The type of loss, metric information Tuner returns def get_framework(dnn_model: AnyDNN) -> str: diff --git a/finetuner/labeler/__init__.py b/finetuner/labeler/__init__.py index 518e76181..88dfc035e 100644 --- a/finetuner/labeler/__init__.py +++ b/finetuner/labeler/__init__.py @@ -4,7 +4,7 @@ from typing import Optional import jina.helper -from jina import Flow, DocumentArrayMemmap +from jina import Flow from jina.logging.predefined import default_logger from .executor import FTExecutor, DataIterator @@ -14,7 +14,6 @@ def fit( embed_model: AnyDNN, train_data: DocumentArrayLike, - catalog: Optional[DocumentArrayLike] = None, clear_labels_on_start: bool = False, port_expose: Optional[int] = None, runtime_backend: str = 'thread', @@ -22,7 +21,6 @@ def fit( **kwargs, ) -> None: dam_path = tempfile.mkdtemp() - catalog_dam_path = init_catalog(dam_path, catalog, train_data) class MyExecutor(FTExecutor): def get_embed_model(self): @@ -39,14 +37,13 @@ def get_embed_model(self): uses=DataIterator, uses_with={ 'dam_path': dam_path, - 'catalog_dam_path': catalog_dam_path, 'clear_labels_on_start': clear_labels_on_start, }, ) .add( uses=MyExecutor, uses_with={ - 'catalog_dam_path': catalog_dam_path, + 'dam_path': dam_path, 'loss': loss, }, ) @@ -91,22 +88,8 @@ def open_frontend_in_browser(req): f.post( '/feed', train_data, - request_size=128, + request_size=10, show_progress=True, on_done=open_frontend_in_browser, ) f.block() - - -def init_catalog( - dam_path: str, catalog: DocumentArrayLike, train_data: DocumentArrayLike -): - if isinstance(catalog, DocumentArrayMemmap): - catalog_dam_path = catalog.path - else: - catalog_dam_path = dam_path + '/catalog' - catalog_memmap = DocumentArrayMemmap(catalog_dam_path) - if catalog is None: - catalog = train_data() if callable(train_data) else train_data - catalog_memmap.extend(catalog) - return catalog_dam_path diff --git a/finetuner/labeler/executor.py b/finetuner/labeler/executor.py index 045f71de5..94ee3820f 100644 --- a/finetuner/labeler/executor.py +++ b/finetuner/labeler/executor.py @@ -11,13 +11,13 @@ class FTExecutor(Executor): def __init__( self, - catalog_dam_path: str, + dam_path: str, metric: str = 'cosine', loss: str = 'CosineSiameseLoss', **kwargs, ): super().__init__(**kwargs) - self._catalog = DocumentArrayMemmap(catalog_dam_path) + self._all_data = DocumentArrayMemmap(dam_path) self._metric = metric self._loss = loss @@ -33,9 +33,9 @@ def _embed_model(self): def embed(self, docs: DocumentArray, parameters: Dict, **kwargs): if not docs: return - self._catalog.reload() - da = self._catalog.sample( - min(len(self._catalog), int(parameters.get('sample_size', 1000))) + self._all_data.reload() + da = self._all_data.sample( + min(len(self._all_data), int(parameters.get('sample_size', 1000))) ) f_type = get_framework(self._embed_model) @@ -77,7 +77,6 @@ def fit(self, docs: DocumentArray, parameters: Dict, **kwargs): fit( self._embed_model, docs, - self._catalog, epochs=int(parameters.get('epochs', 10)), loss=self._loss, ) @@ -93,14 +92,12 @@ class DataIterator(Executor): def __init__( self, dam_path: str, - catalog_dam_path: str, labeled_dam_path: Optional[str] = None, clear_labels_on_start: bool = False, **kwargs, ): super().__init__(**kwargs) self._all_data = DocumentArrayMemmap(dam_path) - self._catalog = DocumentArrayMemmap(catalog_dam_path) if not labeled_dam_path: labeled_dam_path = dam_path + '/labeled' self._labeled_dam = DocumentArrayMemmap(labeled_dam_path) @@ -108,25 +105,20 @@ def __init__( self._labeled_dam.clear() @requests(on='/feed') - def store_data(self, docs: DocumentArray, parameters: Dict, **kwargs): - if parameters.get('type', 'query') == 'query': - self._all_data.extend(docs) - else: - self._catalog.extend(docs) + def store_data(self, docs: DocumentArray, **kwargs): + self._all_data.extend(docs) @requests(on='/next') def take_batch(self, parameters: Dict, **kwargs): - count = int(parameters.get('new_examples', 5)) + st = int(parameters.get('start', 0)) + ed = int(parameters.get('end', 1)) self._all_data.reload() - count = min(max(count, 0), len(self._all_data)) - return self._all_data.sample(k=count) + return self._all_data[st:ed] @requests(on='/fit') def add_fit_data(self, docs: DocumentArray, **kwargs): - for d in docs.traverse_flat(['r']): + for d in docs.traverse_flat(['r', 'm']): d.content = self._all_data[d.id].content - for d in docs.traverse_flat(['m']): - d.content = self._catalog[d.id].content self._labeled_dam.extend(docs) return self._labeled_dam diff --git a/finetuner/labeler/ui/js/components/image-match-card.vue.js b/finetuner/labeler/ui/js/components/image-match-card.vue.js index dd1a71574..414353ab2 100644 --- a/finetuner/labeler/ui/js/components/image-match-card.vue.js +++ b/finetuner/labeler/ui/js/components/image-match-card.vue.js @@ -10,7 +10,7 @@ const imageMatchCard = { template: `
-

Select all images similar to the image on right

+

Select all images similar to the image on right

diff --git a/finetuner/labeler/ui/js/components/mesh-match-card.vue.js b/finetuner/labeler/ui/js/components/mesh-match-card.vue.js index 0d58fca73..ab216a524 100644 --- a/finetuner/labeler/ui/js/components/mesh-match-card.vue.js +++ b/finetuner/labeler/ui/js/components/mesh-match-card.vue.js @@ -10,7 +10,7 @@ const meshMatchCard = { template: `
-

Select all images similar to the image on right

+

Select all meshes similar to the image on right

DocumentArrayLike: - return generate_qa_match_catalog(pre_init_generator=False, **kwargs)[0] - - -def generate_qa_match_catalog( +def generate_qa_match( num_total: int = 481, num_neg: int = 0, pos_value: int = 1, @@ -68,8 +64,7 @@ def generate_qa_match_catalog( to_ndarray: bool = True, max_seq_len: int = 100, is_testset: Optional[bool] = None, - pre_init_generator: bool = True, -) -> Tuple[DocumentArrayLike, DocumentArray]: +) -> Generator[Document, None, None]: """Get a generator of QA data with synthetic negative matches. :param num_total: the total number of documents to return @@ -81,6 +76,7 @@ def generate_qa_match_catalog( :param is_testset: If to generate test data, if set to None, will all data return :return: """ + num_doc = 0 all_docs = DocumentArray(_download_qa_data(is_testset=is_testset)) if to_ndarray: @@ -90,85 +86,57 @@ def generate_qa_match_catalog( + all_docs.get_attributes('tags__wrong_answer') ) vocab = _build_vocab(all_texts, min_freq=2) - catalog = DocumentArray() - text_to_id = {} - - def get_document(text, label): - doc = Document(text=text, tags={__default_tag_key__: {'label': label}}) - if text in text_to_id: - doc.id = text_to_id[text] - else: - text_to_id[text] = doc.id - catalog.append(doc) - return doc - - def generator(): - num_doc = 0 - for doc in all_docs: - d = Document(doc, copy=True) - d.text = d.tags['question'] - m_p = get_document(d.tags['answer'], pos_value) - m_n = get_document(d.tags['wrong_answer'], neg_value) - if to_ndarray: - d.blob = np.array( - _text_to_int_sequence(d.text, vocab, max_seq_len), np.long - ) - m_p.blob = np.array( - _text_to_int_sequence(m_p.text, vocab, max_seq_len), np.long - ) - m_n.blob = np.array( - _text_to_int_sequence(m_n.text, vocab, max_seq_len), np.long - ) - if num_neg > 0: - d.matches.append(m_p) - d.matches.append(m_n) - cur_num_neg = 1 - if num_neg > 1: - sampled_docs = all_docs.sample(num_neg, seed=num_doc) - for n_d in sampled_docs: - if n_d.id != d.id: - new_nd = get_document(n_d.tags['answer'], neg_value) - if to_ndarray: - new_nd.blob = np.array( - _text_to_int_sequence( - new_nd.text, vocab, max_seq_len - ), - np.long, - ) - d.matches.append(new_nd) - cur_num_neg += 1 - if cur_num_neg >= num_neg: - break - num_doc += 1 - yield d - - if num_doc >= num_total: - break - - # prefil catalog - [_ for _ in generator()] + for d in all_docs: + d.text = d.tags['question'] + m_p = Document( + text=d.tags['answer'], tags={__default_tag_key__: {'label': pos_value}} + ) + m_n = Document( + text=d.tags['wrong_answer'], + tags={__default_tag_key__: {'label': neg_value}}, + ) + if to_ndarray: + d.blob = np.array( + _text_to_int_sequence(d.text, vocab, max_seq_len), np.long + ) + m_p.blob = np.array( + _text_to_int_sequence(m_p.text, vocab, max_seq_len), np.long + ) + m_n.blob = np.array( + _text_to_int_sequence(m_n.text, vocab, max_seq_len), np.long + ) - if pre_init_generator: - return generator(), catalog - else: - return generator, catalog + if num_neg > 0: + d.matches.append(m_p) + d.matches.append(m_n) + cur_num_neg = 1 + if num_neg > 1: + sampled_docs = all_docs.sample(num_neg) + for n_d in sampled_docs: + if n_d.id != d.id: + new_nd = Document( + text=n_d.tags['answer'], + tags={__default_tag_key__: {'label': neg_value}}, + ) + if to_ndarray: + new_nd.blob = np.array( + _text_to_int_sequence(new_nd.text, vocab, max_seq_len), + np.long, + ) + d.matches.append(new_nd) + cur_num_neg += 1 + if cur_num_neg >= num_neg: + break + num_doc += 1 + yield d + + if num_doc >= num_total: + break def generate_fashion_match( - num_total=100, num_catalog=5000, **kwargs -) -> DocumentArrayLike: - return generate_fashion_match_catalog( - num_total=num_total, - num_catalog=num_catalog, - pre_init_generator=False, - **kwargs, - )[0] - - -def generate_fashion_match_catalog( num_total: int = 60000, - num_catalog: int = 60000, num_pos: int = 0, num_neg: int = 0, pos_value: int = 1, @@ -177,8 +145,7 @@ def generate_fashion_match_catalog( channels: int = 0, channel_axis: int = -1, is_testset: bool = False, - pre_init_generator: bool = True, -) -> Tuple[DocumentArrayLike, DocumentArray]: +) -> Generator[Document, None, None]: """Get a Generator of fashion-mnist Documents with synthetic matches. :param num_total: the total number of documents to return @@ -201,57 +168,44 @@ def generate_fashion_match_catalog( is_testset=is_testset, ) - catalog = DocumentArray(_orginal_fashion_doc) - if len(catalog) > num_catalog: - catalog = catalog.sample(num_catalog) + n_d = 0 if num_pos > 0 or num_neg > 0: # need to build synthetic matches - # copy_all_docs = copy.deepcopy(catalog) - rv = catalog.split('class') - - def generator(): - n_d = 0 - for od in catalog: - new_doc = Document(od, copy=True) - pos_label = new_doc.tags['class'] - pos_samples = rv[pos_label].sample(num_pos) - pos_samples = [Document(d, copy=True) for d in pos_samples] - for d in pos_samples: - d.tags[__default_tag_key__] = {'label': pos_value} - - neg_samples = DocumentArray() - while len(neg_samples) < num_neg: - neg_samples.extend( - Document(d, copy=True) - for d in catalog.sample(num_neg) - if d.tags['class'] != pos_label - ) - neg_samples = neg_samples[:num_neg] - - for d in neg_samples: - d.tags[__default_tag_key__] = {'label': neg_value} - - new_doc.matches.extend(pos_samples) - new_doc.matches.extend(neg_samples) - n_d += 1 - yield new_doc - if n_d >= num_total: - break - - else: + all_docs = DocumentArray(_orginal_fashion_doc) + + copy_all_docs = copy.deepcopy(all_docs) + rv = copy_all_docs.split('class') + + for od in all_docs: + pos_label = od.tags['class'] + pos_samples = rv[pos_label].sample(num_pos) + for d in pos_samples: + d.tags[__default_tag_key__] = {'label': pos_value} + + neg_samples = DocumentArray() + while len(neg_samples) < num_neg: + neg_samples.extend( + d + for d in copy_all_docs.sample(num_neg) + if d.tags['class'] != pos_label + ) + neg_samples = neg_samples[:num_neg] - def generator(): - n_d = 0 - for d in catalog: - n_d += 1 - yield d - if n_d >= num_total: - break + for d in neg_samples: + d.tags[__default_tag_key__] = {'label': neg_value} - if pre_init_generator: - return generator(), catalog + od.matches.extend(pos_samples) + od.matches.extend(neg_samples) + n_d += 1 + yield od + if n_d >= num_total: + break else: - return generator, catalog + for d in _orginal_fashion_doc: + n_d += 1 + yield d + if n_d >= num_total: + break def _download_qa_data( diff --git a/finetuner/tuner/__init__.py b/finetuner/tuner/__init__.py index af1bb7684..6af00cbac 100644 --- a/finetuner/tuner/__init__.py +++ b/finetuner/tuner/__init__.py @@ -1,10 +1,10 @@ from typing import Optional, TYPE_CHECKING, Type, Dict -from ..helper import AnyDNN, DocumentArrayLike, TunerReturnType, get_framework -from jina import DocumentArray +from ..helper import AnyDNN, DocumentArrayLike, get_framework if TYPE_CHECKING: from .base import BaseTuner + from .summary import SummaryCollection def get_tuner_class(dnn_model: AnyDNN) -> Type['BaseTuner']: @@ -27,7 +27,6 @@ def get_tuner_class(dnn_model: AnyDNN) -> Type['BaseTuner']: def fit( embed_model: AnyDNN, train_data: DocumentArrayLike, - catalog: DocumentArrayLike = None, eval_data: Optional[DocumentArrayLike] = None, epochs: int = 10, batch_size: int = 256, @@ -37,12 +36,13 @@ def fit( optimizer_kwargs: Optional[Dict] = None, device: str = 'cpu', **kwargs, -) -> TunerReturnType: +) -> 'SummaryCollection': """Finetune the model on the training data. + :param embed_model: an embedding model :param train_data: Data on which to train the model :param eval_data: Data on which to evaluate the model at the end of each epoch - :param epoch: Number of epochs to train the model + :param epochs: Number of epochs to train the model :param batch_size: The batch size to use for training and evaluation :param learning_rate: Learning rate to use in training :param optimizer: Which optimizer to use in training. Supported @@ -67,15 +67,8 @@ def fit( ``"cpu"`` and ``"cuda"`` (for GPU) """ ft = get_tuner_class(embed_model) - if catalog is None: - train_data = DocumentArray(train_data() if callable(train_data) else train_data) - catalog = DocumentArray() - catalog.extend(train_data.traverse_flat(['r', 'm'])) - if eval_data is not None: - eval_data = DocumentArray(eval_data() if callable(eval_data) else eval_data) - catalog.extend(eval_data.traverse_flat(['r', 'm'])) - - return ft(embed_model, catalog=catalog, loss=loss).fit( + + return ft(embed_model, loss=loss).fit( train_data, eval_data, epochs=epochs, diff --git a/finetuner/tuner/base.py b/finetuner/tuner/base.py index 3a659a631..e34a98fdb 100644 --- a/finetuner/tuner/base.py +++ b/finetuner/tuner/base.py @@ -8,11 +8,8 @@ Dict, ) -from jina.logging.logger import JinaLogger -from jina import DocumentArrayMemmap, DocumentArray - from ..helper import AnyDNN, AnyDataLoader, AnyOptimizer, DocumentArrayLike -from . import evaluation +from .summary import SummaryCollection class BaseLoss: @@ -23,7 +20,6 @@ class BaseTuner(abc.ABC): def __init__( self, embed_model: Optional[AnyDNN] = None, - catalog: DocumentArrayLike = None, loss: Union[AnyDNN, str] = 'CosineSiameseLoss', **kwargs, ): @@ -38,8 +34,6 @@ def __init__( self._loss = self._get_loss(loss) self._train_data_len = 0 self._eval_data_len = 0 - self._catalog = catalog - self.logger = JinaLogger(self.__class__.__name__) def _get_optimizer_kwargs(self, optimizer: str, custom_kwargs: Optional[Dict]): """Merges user-provided optimizer kwargs with default ones.""" @@ -107,7 +101,7 @@ def fit( batch_size: int = 256, *args, **kwargs, - ) -> Dict: + ) -> SummaryCollection: """Fit the :py:attr:`.embed_model` on labeled data. Note that fitting changes the weights in :py:attr:`.embed_model` in-place. This allows one to consecutively @@ -146,32 +140,11 @@ def _eval( """Evaluate the model on given labeled data""" ... - def get_metrics(self, docs: DocumentArrayLike): - docs = DocumentArray(docs()) if callable(docs) else docs - self.get_embeddings(docs) - self.get_embeddings(self._catalog) - if isinstance(self._catalog, DocumentArrayMemmap): - self._catalog.prune() - to_be_scored_docs = evaluation.prepare_eval_docs(docs, self._catalog, limit=10) - return { - 'hits': evaluation.get_hits_at_n(to_be_scored_docs), - 'ndcg': evaluation.get_ndcg_at_n(to_be_scored_docs), - } - - @abc.abstractmethod - def get_embeddings(self, docs: DocumentArrayLike): - """Calculates and adds the embeddings for the given Documents. - - :param docs: The documents to get embeddings from. - """ - class BaseDataset: def __init__( self, inputs: DocumentArrayLike, - catalog: DocumentArrayLike, ): super().__init__() self._inputs = inputs() if callable(inputs) else inputs - self._catalog = catalog() if callable(catalog) else catalog diff --git a/finetuner/tuner/dataset/__init__.py b/finetuner/tuner/dataset/__init__.py index c1c85b856..e77b81d75 100644 --- a/finetuner/tuner/dataset/__init__.py +++ b/finetuner/tuner/dataset/__init__.py @@ -1,16 +1,16 @@ import itertools -from ... import __default_tag_key__ + import numpy as np +from ... import __default_tag_key__ + class SiameseMixin: def __iter__(self): for d in self._inputs: d_blob = d.blob for m in d.matches: - yield (d_blob, self._catalog[m.id].blob), np.float32( - m.tags[__default_tag_key__]['label'] - ) + yield (d_blob, m.blob), np.float32(m.tags[__default_tag_key__]['label']) class TripletMixin: @@ -21,9 +21,9 @@ def __iter__(self): negatives = [] for m in d.matches: if m.tags[__default_tag_key__]['label'] > 0: - positives.append(self._catalog[m.id].blob) + positives.append(m.blob) else: - negatives.append(self._catalog[m.id].blob) + negatives.append(m.blob) for p, n in itertools.product(positives, negatives): yield (anchor, p, n), np.float32(0) diff --git a/finetuner/tuner/evaluation.py b/finetuner/tuner/evaluation.py deleted file mode 100644 index 49d8aad27..000000000 --- a/finetuner/tuner/evaluation.py +++ /dev/null @@ -1,53 +0,0 @@ -import numpy as np -from jina import Document, DocumentArray -from .. import __default_tag_key__ - - -def prepare_eval_docs(docs, catalog, limit=10, sample_size=100, seed=42): - sampled_docs = docs.sample(min(sample_size, len(docs)), seed) - to_be_scored_docs = DocumentArray() - for doc in sampled_docs: - d = Document( - id=doc.id, - embedding=doc.embedding, - tags={ - 'positive_ids': [ - m.id - for m in doc.matches - if m.tags[__default_tag_key__]['label'] > 0 - ] - }, - ) - to_be_scored_docs.append(d) - to_be_scored_docs.match(catalog, limit=limit) - return to_be_scored_docs - - -def get_hits_at_n(to_be_scored_docs, n=-1): - hits = 0 - for doc in to_be_scored_docs: - positive_ids = doc.tags['positive_ids'] - for match in doc.matches[:n]: - if match.id in positive_ids: - hits += 1 - return hits - - -def get_ndcg_at_n(to_be_scored_docs, n=-1): - ndcg = 0 - for doc in to_be_scored_docs: - dcg = 0 - positive_ids = doc.tags['positive_ids'] - first_n = doc.matches[:n] - for position, match in enumerate(first_n): - if match.id in positive_ids: - dcg += 1 / np.log(position + 2) - - max_positives = min(len(positive_ids), len(first_n)) - idcg = max(_get_idcg(max_positives), 1e-10) - ndcg += dcg / idcg - return ndcg - - -def _get_idcg(n): - return sum(1 / np.log(position + 2) for position in range(n)) diff --git a/finetuner/tuner/keras/__init__.py b/finetuner/tuner/keras/__init__.py index fb2991745..e1eb06cd4 100644 --- a/finetuner/tuner/keras/__init__.py +++ b/finetuner/tuner/keras/__init__.py @@ -1,6 +1,5 @@ from typing import Dict, Optional, Union, List -import numpy as np import tensorflow as tf from jina.logging.profile import ProgressBar from tensorflow import keras @@ -9,8 +8,7 @@ from . import losses, datasets from ..base import BaseTuner, BaseLoss from ..dataset.helper import get_dataset -from ..logger import LogGenerator -from ..stats import TunerStats +from ..summary import ScalarSummary, SummaryCollection from ...helper import DocumentArrayLike, AnyDataLoader @@ -31,7 +29,7 @@ def _get_data_loader( input_shape = self.embed_model.input_shape[1:] tf_data = tf.data.Dataset.from_generator( - lambda: ds(inputs, self._catalog), + lambda: ds(inputs), output_signature=( tuple( tf.TensorSpec(shape=input_shape, dtype=tf.float32) @@ -66,16 +64,13 @@ def _get_optimizer( def _train( self, data: AnyDataLoader, optimizer: Optimizer, description: str - ) -> List[float]: + ) -> ScalarSummary: """Train the model on given labeled data""" - losses = [] - - log_generator = LogGenerator('T', losses) - + _summary = ScalarSummary('Train Loss') with ProgressBar( description, - message_on_done=log_generator, + message_on_done=_summary.__str__, final_line_feed=False, total_length=self._train_data_len, ) as p: @@ -90,36 +85,39 @@ def _train( zip(grads, self._embed_model.trainable_weights) ) - losses.append(loss.numpy()) + _summary += loss.numpy() - p.update(message=log_generator()) + p.update(message=str(_summary)) self._train_data_len += 1 - return losses + return _summary def _eval( - self, data: AnyDataLoader, description: str = 'Evaluating', train_log: str = '' - ) -> List[float]: + self, + data: AnyDataLoader, + description: str = 'Evaluating', + train_loss: Optional[ScalarSummary] = None, + ) -> ScalarSummary: """Evaluate the model on given labeled data""" - losses = [] - - log_generator = LogGenerator('E', losses, train_log) + _summary = ScalarSummary('Eval Loss') with ProgressBar( - description, message_on_done=log_generator, total_length=self._eval_data_len + description, + message_on_done=lambda: f'{train_loss} | {_summary}', + total_length=self._eval_data_len, ) as p: self._eval_data_len = 0 for inputs, label in data: embeddings = [self._embed_model(inpt) for inpt in inputs] loss = self._loss([*embeddings, label]) - losses.append(loss.numpy()) + _summary += loss.numpy() - p.update(message=log_generator()) + p.update(message=str(_summary)) self._eval_data_len += 1 - return losses + return _summary def fit( self, @@ -132,7 +130,7 @@ def fit( optimizer_kwargs: Optional[Dict] = None, device: str = 'cpu', **kwargs, - ) -> TunerStats: + ) -> SummaryCollection: """Finetune the model on the training data. :param train_data: Data on which to train the model @@ -181,7 +179,8 @@ def fit( _optimizer = self._get_optimizer(optimizer, optimizer_kwargs, learning_rate) - stats = TunerStats() + m_train_loss = ScalarSummary('train') + m_eval_loss = ScalarSummary('eval') with self.device: for epoch in range(epochs): @@ -190,26 +189,13 @@ def fit( _optimizer, description=f'Epoch {epoch + 1}/{epochs}', ) - stats.add_train_loss(lt) + m_train_loss += lt if eval_data: - le = self._eval(_eval_data, train_log=LogGenerator("T", lt)()) - stats.add_eval_loss(le) - stats.add_eval_metric(self.get_metrics(eval_data)) + le = self._eval(_eval_data, train_loss=m_train_loss) + m_eval_loss += le - stats.print_last() - return stats - - def get_embeddings(self, docs: DocumentArrayLike): - """Calculates and adds the embeddings for the given Documents. - - :param docs: The documents to get embeddings from. - """ - blobs = docs.blobs - with self.device: - embeddings = self.embed_model(blobs) - for doc, embed in zip(docs, embeddings): - doc.embedding = np.array(embed) + return SummaryCollection(m_train_loss, m_eval_loss) def save(self, *args, **kwargs): """Save the embedding model. diff --git a/finetuner/tuner/logger.py b/finetuner/tuner/logger.py deleted file mode 100644 index 0030d89f3..000000000 --- a/finetuner/tuner/logger.py +++ /dev/null @@ -1,29 +0,0 @@ -import numpy as np - - -class LogGenerator: - def __init__(self, name, losses, prefix: str = ''): - self._losses = losses - self._prefix = prefix - self._name = name - - def __call__(self): - if self._prefix: - prefix = f'{self._prefix} | ' - else: - prefix = '' - return f'{prefix}{self._name}: {self.get_statistic()}' - - def get_statistic(self): - return f'Loss={self.mean_loss():>8}' - - def mean_loss(self): - return LogGenerator.get_log_value(self._losses) - - @staticmethod - def get_log_value(data): - mean = np.mean(data) - if mean < 1e5: - return f'{mean:.2f}' - else: - return f'{mean:.2e}' diff --git a/finetuner/tuner/paddle/__init__.py b/finetuner/tuner/paddle/__init__.py index 7a6a4b09a..e3b691986 100644 --- a/finetuner/tuner/paddle/__init__.py +++ b/finetuner/tuner/paddle/__init__.py @@ -1,6 +1,5 @@ from typing import Dict, Optional, Union, List -import numpy as np import paddle from jina.logging.profile import ProgressBar from paddle.io import DataLoader @@ -8,10 +7,9 @@ from . import losses, datasets from ..base import BaseTuner, BaseLoss -from ...helper import DocumentArrayLike, AnyDataLoader from ..dataset.helper import get_dataset -from ..logger import LogGenerator -from ..stats import TunerStats +from ..summary import ScalarSummary, SummaryCollection +from ...helper import DocumentArrayLike, AnyDataLoader class PaddleTuner(BaseTuner): @@ -28,7 +26,7 @@ def _get_data_loader( """Get the paddle ``DataLoader`` from the input data.""" ds = get_dataset(datasets, self.arity) return DataLoader( - dataset=ds(inputs=inputs, catalog=self._catalog), + dataset=ds(inputs=inputs), batch_size=batch_size, shuffle=shuffle, ) @@ -62,44 +60,45 @@ def _get_optimizer( ) def _eval( - self, data: AnyDataLoader, description: str = 'Evaluating', train_log: str = '' - ) -> List[float]: + self, + data: AnyDataLoader, + description: str = 'Evaluating', + train_loss: Optional[ScalarSummary] = None, + ) -> ScalarSummary: """Evaluate the model on given labeled data""" self._embed_model.eval() - losses = [] - - log_generator = LogGenerator('E', losses, train_log) + _summary = ScalarSummary('Eval Loss') with ProgressBar( - description, message_on_done=log_generator, total_length=self._eval_data_len + description, + message_on_done=lambda: f'{train_loss} | {_summary}', + total_length=self._eval_data_len, ) as p: self._eval_data_len = 0 for inputs, label in data: embeddings = [self._embed_model(inpt) for inpt in inputs] loss = self._loss(embeddings, label) - losses.append(loss.item()) + _summary += loss.item() - p.update(message=log_generator()) + p.update(message=str(_summary)) self._eval_data_len += 1 - return losses + return _summary def _train( self, data: AnyDataLoader, optimizer: Optimizer, description: str - ) -> List[float]: + ) -> ScalarSummary: """Train the model on given labeled data""" self._embed_model.train() - losses = [] - - log_generator = LogGenerator('T', losses) + _summary = ScalarSummary('Train Loss') with ProgressBar( description, - message_on_done=log_generator, + message_on_done=_summary.__str__, final_line_feed=False, total_length=self._train_data_len, ) as p: @@ -114,11 +113,11 @@ def _train( loss.backward() optimizer.step() - losses.append(loss.item()) + _summary += loss.item() - p.update(message=log_generator()) + p.update(message=str(_summary)) self._train_data_len += 1 - return losses + return _summary def fit( self, @@ -131,7 +130,7 @@ def fit( optimizer_kwargs: Optional[Dict] = None, device: str = 'cpu', **kwargs, - ) -> TunerStats: + ) -> SummaryCollection: """Finetune the model on the training data. :param train_data: Data on which to train the model @@ -170,7 +169,8 @@ def fit( _optimizer = self._get_optimizer(optimizer, optimizer_kwargs, learning_rate) - stats = TunerStats() + m_train_loss = ScalarSummary('train') + m_eval_loss = ScalarSummary('eval') for epoch in range(epochs): _data = self._get_data_loader( @@ -181,29 +181,17 @@ def fit( _optimizer, description=f'Epoch {epoch + 1}/{epochs}', ) - stats.add_train_loss(lt) + m_train_loss += lt if eval_data: _data = self._get_data_loader( inputs=eval_data, batch_size=batch_size, shuffle=False ) - le = self._eval(_data, train_log=LogGenerator('T', lt)()) - stats.add_eval_loss(le) - stats.add_eval_metric(self.get_metrics(eval_data)) - - stats.print_last() - return stats + le = self._eval(_data, train_loss=m_train_loss) + m_eval_loss += le - def get_embeddings(self, docs: DocumentArrayLike): - """Calculates and adds the embeddings for the given Documents. - - :param docs: The documents to get embeddings from. - """ - blobs = docs.blobs - embeddings = self.embed_model(paddle.Tensor(blobs)) - for doc, embed in zip(docs, embeddings): - doc.embedding = np.array(embed) + return SummaryCollection(m_train_loss, m_eval_loss) def save(self, *args, **kwargs): """Save the embedding model. diff --git a/finetuner/tuner/pytorch/__init__.py b/finetuner/tuner/pytorch/__init__.py index 2dc3ed629..8f44389f1 100644 --- a/finetuner/tuner/pytorch/__init__.py +++ b/finetuner/tuner/pytorch/__init__.py @@ -8,9 +8,8 @@ from . import losses, datasets from ..base import BaseTuner, BaseLoss from ..dataset.helper import get_dataset -from ..logger import LogGenerator +from ..summary import ScalarSummary, SummaryCollection from ...helper import DocumentArrayLike, AnyDataLoader -from ..stats import TunerStats class PytorchTuner(BaseTuner): @@ -27,7 +26,7 @@ def _get_data_loader( """Get pytorch ``DataLoader`` data loader from the input data.""" ds = get_dataset(datasets, self.arity) return DataLoader( - dataset=ds(inputs=inputs, catalog=self._catalog), + dataset=ds(inputs=inputs), batch_size=batch_size, shuffle=shuffle, ) @@ -65,17 +64,21 @@ def _get_optimizer( ) def _eval( - self, data: AnyDataLoader, description: str = 'Evaluating', train_log: str = '' - ) -> List[float]: + self, + data: AnyDataLoader, + description: str = 'Evaluating', + train_loss: Optional[ScalarSummary] = None, + ) -> ScalarSummary: """Evaluate the model on given labeled data""" self._embed_model.eval() - losses = [] - log_generator = LogGenerator('E', losses, train_log) + _summary = ScalarSummary('Eval Loss') with ProgressBar( - description, message_on_done=log_generator, total_length=self._eval_data_len + description, + message_on_done=lambda: f'{train_loss} | {_summary}', + total_length=self._eval_data_len, ) as p: self._eval_data_len = 0 for inputs, label in data: @@ -87,26 +90,24 @@ def _eval( embeddings = [self._embed_model(inpt) for inpt in inputs] loss = self._loss(embeddings, label) - losses.append(loss.item()) + _summary += loss.item() - p.update(message=log_generator()) + p.update(message=str(_summary)) self._eval_data_len += 1 - return losses + return _summary def _train( self, data: AnyDataLoader, optimizer: Optimizer, description: str - ) -> List[float]: + ) -> ScalarSummary: """Train the model on given labeled data""" self._embed_model.train() - losses = [] - - log_generator = LogGenerator('T', losses) + _summary = ScalarSummary('Train Loss') with ProgressBar( description, - message_on_done=log_generator, + message_on_done=_summary.__str__, final_line_feed=False, total_length=self._train_data_len, ) as p: @@ -124,11 +125,11 @@ def _train( loss.backward() optimizer.step() - losses.append(loss.item()) + _summary += loss.item() - p.update(message=log_generator()) + p.update(message=str(_summary)) self._train_data_len += 1 - return losses + return _summary def fit( self, @@ -141,7 +142,7 @@ def fit( optimizer_kwargs: Optional[Dict] = None, device: str = 'cpu', **kwargs, - ) -> TunerStats: + ) -> SummaryCollection: """Finetune the model on the training data. :param train_data: Data on which to train the model @@ -183,7 +184,8 @@ def fit( # Get optimizer _optimizer = self._get_optimizer(optimizer, optimizer_kwargs, learning_rate) - stats = TunerStats() + m_train_loss = ScalarSummary('train') + m_eval_loss = ScalarSummary('eval') for epoch in range(epochs): _data = self._get_data_loader( @@ -194,32 +196,17 @@ def fit( _optimizer, description=f'Epoch {epoch + 1}/{epochs}', ) - stats.add_train_loss(lt) + m_train_loss += lt if eval_data: _data = self._get_data_loader( inputs=eval_data, batch_size=batch_size, shuffle=False ) - le = self._eval(_data, train_log=LogGenerator('T', lt)()) - stats.add_eval_loss(le) - stats.add_eval_metric(self.get_metrics(eval_data)) - - stats.print_last() - return stats - - def get_embeddings(self, docs: DocumentArrayLike): - """Calculates and adds the embeddings for the given Documents. - - :param docs: The documents to get embeddings from. - """ - blobs = docs.blobs + le = self._eval(_data, train_loss=m_train_loss) + m_eval_loss += le - tensor = torch.tensor(blobs, device=self.device) - with torch.inference_mode(): - embeddings = self.embed_model(tensor) - for doc, embed in zip(docs, embeddings): - doc.embedding = embed.cpu().numpy() + return SummaryCollection(m_train_loss, m_eval_loss) def save(self, *args, **kwargs): """Save the embedding model. diff --git a/finetuner/tuner/stats.py b/finetuner/tuner/stats.py deleted file mode 100644 index 18eea78e0..000000000 --- a/finetuner/tuner/stats.py +++ /dev/null @@ -1,43 +0,0 @@ -import json -from typing import Dict, List - - -class TunerStats: - def __init__( - self, - loss_train: List = None, - loss_eval: List = None, - metrics_eval: List[Dict] = None, - ): - self._loss_train = loss_train if loss_train is not None else [] - self._loss_eval = loss_eval if loss_eval is not None else [] - self._metrics_eval = metrics_eval if metrics_eval is not None else [] - - def save(self, file: str): - with open(file, 'w') as output: - json.dump( - { - 'loss_train': [float(loss) for loss in self._loss_train], - 'loss_eval': [float(loss) for loss in self._loss_eval], - 'metrics_eval': self._metrics_eval, - }, - output, - ) - - def add_train_loss(self, losses: List): - self._loss_train.extend(losses) - - def add_eval_loss(self, losses: List): - self._loss_eval.extend(losses) - - def add_eval_metric(self, metric: Dict): - self._metrics_eval.append(metric) - - def print_last(self): - if self._metrics_eval: - eval_string = TunerStats.get_metrics_string(self._metrics_eval[-1]) - print(f'Evaluation metrics: {eval_string}') - - @staticmethod - def get_metrics_string(metrics: Dict): - return f'hits: {metrics.get("hits", 0):>3}, NDCG: {metrics.get("ndcg", 0):.2f}' diff --git a/finetuner/tuner/summary.py b/finetuner/tuner/summary.py new file mode 100644 index 000000000..326b0fa9c --- /dev/null +++ b/finetuner/tuner/summary.py @@ -0,0 +1,59 @@ +import json +from typing import List, Union, Optional, Dict + +import numpy as np + +NumericType = Union[ + int, float, complex, np.number +] #: The type of numerics including numpy data type + + +class ScalarSummary: + def __init__(self, name: str = '', data: Optional[List[NumericType]] = None): + """Create a record for storing a list of scalar values e.g. losses/metrics + + :param name: the name of that record + :param data: the data record to initialize from + """ + + self._name = name or '' + self._record = data or [] + + def __iadd__(self, other: Union[List[NumericType], float, 'ScalarSummary']): + if isinstance(other, list): + self._record += other + elif isinstance(other, ScalarSummary): + self._record += other._record + else: + self._record.append(other) + return self + + def __str__(self): + if self._record: + return ( + f'{self._name}: {np.mean([float(loss) for loss in self._record]):.2f}' + ) + else: + return f'{self._name} has no record' + + def floats(self) -> List[NumericType]: + """Return all numbers as a list of Python native float """ + return [float(v) for v in self._record] + + +class SummaryCollection: + def __init__(self, *records: ScalarSummary): + """Create a collection of summaries. """ + self._records = records + + def save(self, filepath: str): + """Store all summary into a JSON file""" + with open(filepath, 'w') as fp: + json.dump( + self.dict(), + fp, + ) + + def dict(self) -> Dict[str, List[NumericType]]: + """Return all summaries as a Dictionary, where key is the name and value is the record""" + return {r._name: r.floats() for r in self._records} diff --git a/tests/integration/fit/test_fit_lstm.py b/tests/integration/fit/test_fit_lstm.py index c7e44a12f..b14160434 100644 --- a/tests/integration/fit/test_fit_lstm.py +++ b/tests/integration/fit/test_fit_lstm.py @@ -1,9 +1,11 @@ +import json + import paddle import tensorflow as tf import torch from finetuner import fit -from finetuner.toydata import generate_qa_match_catalog +from finetuner.toydata import generate_qa_match all_test_losses = [ 'CosineSiameseLoss', @@ -50,19 +52,15 @@ def test_fit_all(tmpdir): for kb, b in embed_models.items(): for h in all_test_losses: - train_data, train_catalog = generate_qa_match_catalog( - num_total=300, num_neg=5, max_seq_len=10, pre_init_generator=False - ) - eval_data, eval_catalog = generate_qa_match_catalog( - num_total=300, num_neg=5, max_seq_len=10, pre_init_generator=False - ) - train_catalog.extend(eval_catalog) result = fit( b(), loss=h, - train_data=train_data, - eval_data=eval_data, - catalog=train_catalog, + train_data=lambda: generate_qa_match( + num_total=300, num_neg=5, max_seq_len=10 + ), + eval_data=lambda: generate_qa_match( + num_total=300, num_neg=5, max_seq_len=10 + ), epochs=2, ) result.save(tmpdir / f'result-{kb}-{h}.json') diff --git a/tests/integration/fit/test_fit_mlp.py b/tests/integration/fit/test_fit_mlp.py index fafbcd152..469e8c1f1 100644 --- a/tests/integration/fit/test_fit_mlp.py +++ b/tests/integration/fit/test_fit_mlp.py @@ -1,9 +1,11 @@ +import json + import paddle import tensorflow as tf import torch import finetuner -from finetuner.toydata import generate_fashion_match_catalog +from finetuner.toydata import generate_fashion_match all_test_losses = [ 'CosineSiameseLoss', @@ -44,28 +46,15 @@ def test_fit_all(tmpdir): for kb, b in embed_models.items(): for h in all_test_losses: - train_data, train_catalog = generate_fashion_match_catalog( - num_neg=10, - num_pos=10, - num_total=300, - num_catalog=3000, - pre_init_generator=False, - ) - eval_data, eval_catalog = generate_fashion_match_catalog( - num_neg=10, - num_pos=10, - num_total=300, - num_catalog=3000, - is_testset=True, - pre_init_generator=False, - ) - train_catalog.extend(eval_catalog) result = finetuner.fit( b(), loss=h, - train_data=train_data, - eval_data=eval_data, - catalog=train_catalog, + train_data=lambda: generate_fashion_match( + num_neg=10, num_pos=10, num_total=300 + ), + eval_data=lambda: generate_fashion_match( + num_neg=10, num_pos=10, num_total=300, is_testset=True + ), epochs=2, ) result.save(tmpdir / f'result-{kb}-{h}.json') diff --git a/tests/integration/keras/test_keras_trainer.py b/tests/integration/keras/test_keras_trainer.py index 74b8beaed..7635f0377 100644 --- a/tests/integration/keras/test_keras_trainer.py +++ b/tests/integration/keras/test_keras_trainer.py @@ -3,9 +3,9 @@ import tensorflow as tf from tensorflow import keras -from finetuner.tuner import fit, save -from finetuner.toydata import generate_fashion_match_catalog -from finetuner.toydata import generate_qa_match_catalog +from finetuner.tuner.keras import KerasTuner +from finetuner.toydata import generate_fashion_match +from finetuner.toydata import generate_qa_match all_test_losses = [ 'CosineSiameseLoss', @@ -29,33 +29,20 @@ def test_simple_sequential_model(tmpdir, params, loss): ] ) + kt = KerasTuner(user_model, loss=loss) + # fit and save the checkpoint - train_data, train_catalog = generate_fashion_match_catalog( - num_neg=10, - num_pos=10, - num_total=params['num_train'], - num_catalog=params['num_train'] * 10, - pre_init_generator=False, - ) - eval_data, eval_catalog = generate_fashion_match_catalog( - num_neg=10, - num_pos=10, - num_total=params['num_eval'], - num_catalog=params['num_eval'] * 10, - is_testset=True, - pre_init_generator=False, - ) - train_catalog.extend(eval_catalog) - fit( - user_model, - loss=loss, - train_data=train_data, - eval_data=eval_data, - catalog=train_catalog, + kt.fit( + train_data=lambda: generate_fashion_match( + num_pos=10, num_neg=10, num_total=params['num_train'] + ), + eval_data=lambda: generate_fashion_match( + num_pos=10, num_neg=10, num_total=params['num_eval'], is_testset=True + ), epochs=params['epochs'], batch_size=params['batch_size'], ) - save(user_model, tmpdir / 'trained.kt') + kt.save(tmpdir / 'trained.kt') embedding_model = keras.models.load_model(tmpdir / 'trained.kt') r = embedding_model.predict( @@ -76,33 +63,26 @@ def test_simple_lstm_model(tmpdir, params, loss): ] ) - # fit and save the checkpoint - train_data, train_catalog = generate_qa_match_catalog( - num_total=params['num_train'], - max_seq_len=params['max_seq_len'], - num_neg=5, - is_testset=False, - pre_init_generator=False, - ) - eval_data, eval_catalog = generate_qa_match_catalog( - num_total=params['num_train'], - max_seq_len=params['max_seq_len'], - num_neg=5, - is_testset=True, - pre_init_generator=False, - ) - train_catalog.extend(eval_catalog) + kt = KerasTuner(user_model, loss=loss) - fit( - user_model, - loss=loss, - train_data=train_data, - eval_data=eval_data, - catalog=train_catalog, + # fit and save the checkpoint + kt.fit( + train_data=lambda: generate_qa_match( + num_total=params['num_train'], + max_seq_len=params['max_seq_len'], + num_neg=5, + is_testset=False, + ), + eval_data=lambda: generate_qa_match( + num_total=params['num_eval'], + max_seq_len=params['max_seq_len'], + num_neg=5, + is_testset=True, + ), epochs=params['epochs'], batch_size=params['batch_size'], ) - save(user_model, tmpdir / 'trained.kt') + kt.save(tmpdir / 'trained.kt') embedding_model = keras.models.load_model(tmpdir / 'trained.kt') r = embedding_model.predict( diff --git a/tests/integration/keras/test_tail_and_tune.py b/tests/integration/keras/test_tail_and_tune.py index 2f77e7506..88f4937b0 100644 --- a/tests/integration/keras/test_tail_and_tune.py +++ b/tests/integration/keras/test_tail_and_tune.py @@ -28,4 +28,4 @@ def test_tail_and_tune(embed_model, create_easy_data): output_dim=16, layer_name='dense_2', ) - assert rv._loss_train + assert rv.dict() diff --git a/tests/integration/labeler/test_tune_lstm.py b/tests/integration/labeler/test_tune_lstm.py index 7e9f4dc2f..9701c4d21 100644 --- a/tests/integration/labeler/test_tune_lstm.py +++ b/tests/integration/labeler/test_tune_lstm.py @@ -11,7 +11,7 @@ import paddle import torch -from finetuner.toydata import generate_qa_match_catalog +from finetuner.toydata import generate_qa_match class LastCellPT(torch.nn.Module): @@ -54,11 +54,10 @@ def _run(framework_name, loss, port_expose): paddle.nn.Linear(in_features=2 * 64, out_features=32), ), } - train_data, catalog = generate_qa_match_catalog(num_total=10, num_neg=0) + fit( embed_models[framework_name](), - train_data, - catalog=catalog, + generate_qa_match(num_total=10, num_neg=0), loss=loss, interactive=True, port_expose=port_expose, @@ -72,7 +71,6 @@ def _run(framework_name, loss, port_expose): 'EuclideanTripletLoss', ] - # 'keras' does not work under this test setup # Exception ... ust be from the same graph as Tensor ... # TODO: add keras backend back to the test @@ -97,7 +95,8 @@ def test_all_frameworks(framework, loss, tmpdir): json={ 'data': [], 'parameters': { - 'new_examples': 1, + 'start': 0, + 'end': 1, 'topk': 5, 'sample_size': 10, }, @@ -115,7 +114,7 @@ def test_all_frameworks(framework, loss, tmpdir): f'http://localhost:{port}/next', json={ 'data': [], - 'parameters': {'new_examples': 1, 'topk': 5, 'sample_size': 10}, + 'parameters': {'start': 0, 'end': 1, 'topk': 5, 'sample_size': 10}, }, ) assert req.status_code == 200 diff --git a/tests/integration/labeler/test_tune_mlp.py b/tests/integration/labeler/test_tune_mlp.py index 07f9a817d..f4175a039 100644 --- a/tests/integration/labeler/test_tune_mlp.py +++ b/tests/integration/labeler/test_tune_mlp.py @@ -5,7 +5,7 @@ import pytest import requests -from finetuner.toydata import generate_fashion_match_catalog +from finetuner.toydata import generate_fashion_match from jina.helper import random_port os.environ['JINA_LOG_LEVEL'] = 'DEBUG' @@ -52,14 +52,10 @@ def _run(framework_name, loss, port_expose): paddle.nn.Linear(in_features=128, out_features=32), ), } - data, catalog = generate_fashion_match_catalog( - num_total=10, num_catalog=100, num_pos=0, num_neg=0 - ) fit( embed_models[framework_name](), - data, - catalog=catalog, + generate_fashion_match(num_total=10, num_pos=0, num_neg=0), loss=loss, interactive=True, port_expose=port_expose, @@ -90,7 +86,8 @@ def test_all_frameworks(framework, loss): json={ 'data': [], 'parameters': { - 'new_examples': 1, + 'start': 0, + 'end': 1, 'topk': 5, 'sample_size': 10, }, @@ -108,7 +105,7 @@ def test_all_frameworks(framework, loss): f'http://localhost:{port}/next', json={ 'data': [], - 'parameters': {'new_examples': 1, 'topk': 5, 'sample_size': 10}, + 'parameters': {'start': 0, 'end': 1, 'topk': 5, 'sample_size': 10}, }, ) assert req.status_code == 200 diff --git a/tests/integration/paddle/test_paddle_trainer.py b/tests/integration/paddle/test_paddle_trainer.py index 905cdffaa..6f9693dd9 100644 --- a/tests/integration/paddle/test_paddle_trainer.py +++ b/tests/integration/paddle/test_paddle_trainer.py @@ -3,9 +3,9 @@ import pytest from paddle import nn -from finetuner.tuner import fit, save -from finetuner.toydata import generate_fashion_match_catalog -from finetuner.toydata import generate_qa_match_catalog +from finetuner.tuner.paddle import PaddleTuner +from finetuner.toydata import generate_fashion_match +from finetuner.toydata import generate_qa_match @pytest.mark.parametrize( @@ -28,31 +28,21 @@ def test_simple_sequential_model(tmpdir, params, loss): nn.Linear(in_features=params['feature_dim'], out_features=params['output_dim']), ) + pt = PaddleTuner(user_model, loss=loss) model_path = tmpdir / 'trained.pd' # fit and save the checkpoint - train_data, train_catalog = generate_fashion_match_catalog( - num_neg=10, num_pos=10, num_total=params['num_train'], pre_init_generator=False - ) - eval_data, eval_catalog = generate_fashion_match_catalog( - num_neg=10, - num_pos=10, - num_total=params['num_eval'], - is_testset=True, - pre_init_generator=False, - ) - train_catalog.extend(eval_catalog) - - fit( - user_model, - loss=loss, - train_data=train_data, - eval_data=eval_data, - catalog=train_catalog, + pt.fit( + train_data=lambda: generate_fashion_match( + num_pos=10, num_neg=10, num_total=params['num_train'] + ), + eval_data=lambda: generate_fashion_match( + num_pos=10, num_neg=10, num_total=params['num_eval'], is_testset=True + ), epochs=params['epochs'], batch_size=params['batch_size'], ) - save(user_model, model_path) + pt.save(model_path) user_model.set_state_dict(paddle.load(model_path)) user_model.eval() @@ -94,33 +84,26 @@ def forward(self, x): ) model_path = tmpdir / 'trained.pd' - # fit and save the checkpoint - train_data, train_catalog = generate_qa_match_catalog( - num_total=params['num_train'], - max_seq_len=params['max_seq_len'], - num_neg=5, - is_testset=False, - pre_init_generator=False, - ) - eval_data, eval_catalog = generate_qa_match_catalog( - num_total=params['num_train'], - max_seq_len=params['max_seq_len'], - num_neg=5, - is_testset=True, - pre_init_generator=False, - ) - train_catalog.extend(eval_catalog) + pt = PaddleTuner(user_model, loss=loss) - fit( - user_model, - loss=loss, - train_data=train_data, - eval_data=eval_data, - catalog=train_catalog, + # fit and save the checkpoint + pt.fit( + train_data=lambda: generate_qa_match( + num_total=params['num_train'], + max_seq_len=params['max_seq_len'], + num_neg=5, + is_testset=False, + ), + eval_data=lambda: generate_qa_match( + num_total=params['num_eval'], + max_seq_len=params['max_seq_len'], + num_neg=5, + is_testset=True, + ), epochs=params['epochs'], batch_size=params['batch_size'], ) - save(user_model, model_path) + pt.save(model_path) # load the checkpoint and ensure the dim user_model.set_state_dict(paddle.load(model_path)) diff --git a/tests/integration/paddle/test_tail_and_tune.py b/tests/integration/paddle/test_tail_and_tune.py index b7bb80539..e57201fe5 100644 --- a/tests/integration/paddle/test_tail_and_tune.py +++ b/tests/integration/paddle/test_tail_and_tune.py @@ -29,4 +29,4 @@ def test_tail_and_tune(embed_model, create_easy_data): output_dim=16, layer_name='linear_4', ) - assert rv._loss_train + assert rv.dict() diff --git a/tests/integration/torch/test_tail_and_tune.py b/tests/integration/torch/test_tail_and_tune.py index 0a0cfead0..ef7f6ce74 100644 --- a/tests/integration/torch/test_tail_and_tune.py +++ b/tests/integration/torch/test_tail_and_tune.py @@ -29,4 +29,4 @@ def test_tail_and_tune(embed_model, create_easy_data): output_dim=16, layer_name='linear_4', ) - assert rv._loss_train + assert rv.dict() diff --git a/tests/integration/torch/test_torch_trainer.py b/tests/integration/torch/test_torch_trainer.py index 5a1f64378..ca81c0118 100644 --- a/tests/integration/torch/test_torch_trainer.py +++ b/tests/integration/torch/test_torch_trainer.py @@ -5,9 +5,9 @@ import torch import torch.nn as nn -from finetuner.tuner import fit, save -from finetuner.toydata import generate_fashion_match_catalog -from finetuner.toydata import generate_qa_match_catalog +from finetuner.tuner.pytorch import PytorchTuner +from finetuner.toydata import generate_fashion_match +from finetuner.toydata import generate_qa_match @pytest.mark.parametrize( @@ -31,29 +31,20 @@ def test_simple_sequential_model(tmpdir, params, loss): ) model_path = os.path.join(tmpdir, 'trained.pth') - # fit and save the checkpoint - train_data, train_catalog = generate_fashion_match_catalog( - num_neg=10, num_pos=10, num_total=params['num_train'], pre_init_generator=False - ) - eval_data, eval_catalog = generate_fashion_match_catalog( - num_neg=10, - num_pos=10, - num_total=params['num_eval'], - is_testset=True, - pre_init_generator=False, - ) - train_catalog.extend(eval_catalog) + pt = PytorchTuner(user_model, loss=loss) - fit( - user_model, - loss=loss, - train_data=train_data, - eval_data=eval_data, - catalog=train_catalog, + # fit and save the checkpoint + pt.fit( + train_data=lambda: generate_fashion_match( + num_pos=10, num_neg=10, num_total=params['num_train'] + ), + eval_data=lambda: generate_fashion_match( + num_pos=10, num_neg=10, num_total=params['num_eval'], is_testset=True + ), epochs=params['epochs'], batch_size=params['batch_size'], ) - save(user_model, model_path) + pt.save(model_path) # load the checkpoint and ensure the dim user_model.load_state_dict(torch.load(model_path)) @@ -97,33 +88,26 @@ def forward(self, x): ) model_path = os.path.join(tmpdir, 'trained.pth') - # fit and save the checkpoint - train_data, train_catalog = generate_qa_match_catalog( - num_total=params['num_train'], - max_seq_len=params['max_seq_len'], - num_neg=5, - is_testset=False, - pre_init_generator=False, - ) - eval_data, eval_catalog = generate_qa_match_catalog( - num_total=params['num_train'], - max_seq_len=params['max_seq_len'], - num_neg=5, - is_testset=True, - pre_init_generator=False, - ) - train_catalog.extend(eval_catalog) + pt = PytorchTuner(user_model, loss=loss) - fit( - user_model, - loss=loss, - train_data=train_data, - eval_data=eval_data, - catalog=train_catalog, + # fit and save the checkpoint + pt.fit( + train_data=lambda: generate_qa_match( + num_total=params['num_train'], + max_seq_len=params['max_seq_len'], + num_neg=5, + is_testset=False, + ), + eval_data=lambda: generate_qa_match( + num_total=params['num_eval'], + max_seq_len=params['max_seq_len'], + num_neg=5, + is_testset=True, + ), epochs=params['epochs'], batch_size=params['batch_size'], ) - save(user_model, model_path) + pt.save(model_path) # load the checkpoint and ensure the dim user_model.load_state_dict(torch.load(model_path)) diff --git a/tests/unit/toydata/test_data_gen.py b/tests/unit/toydata/test_data_gen.py index 1cec95db8..05e28ba20 100644 --- a/tests/unit/toydata/test_data_gen.py +++ b/tests/unit/toydata/test_data_gen.py @@ -8,7 +8,7 @@ def test_qa_data_generator(): - for d in generate_qa_match_catalog()[0]: + for d in generate_qa_match(): assert d.tags['question'] assert d.tags['answer'] assert d.tags['wrong_answer'] @@ -16,33 +16,31 @@ def test_qa_data_generator(): def test_train_test_generator(): - fmdg_train, _ = generate_fashion_match_catalog(is_testset=True) - fmdg_test, _ = generate_fashion_match_catalog(is_testset=False) + fmdg_train = generate_fashion_match(is_testset=True) + fmdg_test = generate_fashion_match(is_testset=False) for d1, d2 in zip(fmdg_train, fmdg_test): assert np.any(np.not_equal(d1.blob, d2.blob)) break def test_train_test_qa_generator(): - fmdg_train = generate_qa_match_catalog(is_testset=True)[0] - fmdg_test = generate_qa_match_catalog(is_testset=False)[0] + fmdg_train = generate_qa_match(is_testset=True) + fmdg_test = generate_qa_match(is_testset=False) for d1, d2 in zip(fmdg_train, fmdg_test): assert d1.id != d2.id assert np.any(np.not_equal(d1.blob, d2.blob)) def test_doc_generator(): - for d in generate_fashion_match_catalog()[0]: + for d in generate_fashion_match(): assert d.tags['class'] break @pytest.mark.parametrize('channels', [0, 1, 3]) -@pytest.mark.parametrize('upsampling', [1, 2]) +@pytest.mark.parametrize('upsampling', [1, 2, 4]) def test_doc_generator_channel(channels, upsampling): - for d in generate_fashion_match_catalog(channels=channels, upsampling=upsampling)[ - 0 - ]: + for d in generate_fashion_match(channels=channels, upsampling=upsampling): if channels == 0: assert d.blob.ndim == 2 else: @@ -59,9 +57,9 @@ def test_doc_generator_channel(channels, upsampling): @pytest.mark.parametrize('pos_value, neg_value', [(1, 0), (1, -1)]) @pytest.mark.parametrize('num_pos, num_neg', [(5, 7), (10, 10)]) def test_fashion_matches_generator(num_pos, num_neg, pos_value, neg_value): - for d in generate_fashion_match_catalog( + for d in generate_fashion_match( num_pos=num_pos, num_neg=num_neg, pos_value=pos_value, neg_value=neg_value - )[0]: + ): assert len(d.matches) == num_pos + num_neg all_labels = [int(d.tags[__default_tag_key__]['label']) for d in d.matches] assert all_labels.count(pos_value) == num_pos @@ -75,17 +73,13 @@ def test_fashion_matches_generator(num_pos, num_neg, pos_value, neg_value): def test_fashion_documentarray(): - da = DocumentArray( - generate_fashion_match_catalog( - num_total=10, num_catalog=1000, num_pos=2, num_neg=3 - )[0] - ) + da = DocumentArray(generate_fashion_match(num_total=10, num_pos=2, num_neg=3)) assert len(da) == 10 assert len(da[0].matches) == 5 def test_qa_documentarray(): - da = DocumentArray(generate_qa_match_catalog(num_total=10, num_neg=3)[0]) + da = DocumentArray(generate_qa_match(num_total=10, num_neg=3)) assert len(da) == 10 assert len(da[0].matches) == 4 @@ -94,9 +88,9 @@ def test_qa_documentarray(): @pytest.mark.parametrize('num_neg', [1, 2, 10]) @pytest.mark.parametrize('to_ndarray', [True, False]) def test_generate_qa_doc_match(pos_value, neg_value, num_neg, to_ndarray): - for d in generate_qa_match_catalog( + for d in generate_qa_match( num_neg=num_neg, pos_value=pos_value, neg_value=neg_value, to_ndarray=to_ndarray - )[0]: + ): assert len(d.matches) == 1 + num_neg all_labels = [int(d.tags[__default_tag_key__]['label']) for d in d.matches] assert all_labels.count(pos_value) == 1 @@ -111,7 +105,7 @@ def test_generate_qa_doc_match(pos_value, neg_value, num_neg, to_ndarray): @pytest.mark.parametrize('max_length', [1, 10, 100]) def test_qa_sequence_same_length(max_length): num_neg = 5 - for s in generate_qa_match_catalog(num_neg=num_neg, max_seq_len=max_length)[0]: + for s in generate_qa_match(num_neg=num_neg, max_seq_len=max_length): assert s.blob.shape[0] == max_length assert len(s.matches) == num_neg + 1 for m in s.matches: diff --git a/tests/unit/toydata/test_dataset.py b/tests/unit/toydata/test_dataset.py index 462abdd8b..5cea052c3 100644 --- a/tests/unit/toydata/test_dataset.py +++ b/tests/unit/toydata/test_dataset.py @@ -1,26 +1,23 @@ import numpy as np import pytest -from finetuner.toydata import generate_fashion_match_catalog +from finetuner.toydata import generate_fashion_match from finetuner.tuner.base import BaseDataset from finetuner.tuner.dataset import SiameseMixin, TripletMixin @pytest.mark.parametrize( - 'pre_init_generator', - [True, False], + 'data_src', + [ + generate_fashion_match(num_pos=10, num_neg=10, num_total=100), + lambda: generate_fashion_match(num_pos=10, num_neg=10, num_total=100), + ], ) -def test_siamese_dataset(pre_init_generator): +def test_siamese_dataset(data_src): class SD(SiameseMixin, BaseDataset): ... - data, catalog = generate_fashion_match_catalog( - num_pos=10, - num_neg=10, - num_total=100, - pre_init_generator=pre_init_generator, - ) - sd = SD(data, catalog) + sd = SD(data_src) for d in sd: assert len(d) == 2 assert len(d[0]) == 2 @@ -31,21 +28,17 @@ class SD(SiameseMixin, BaseDataset): @pytest.mark.parametrize( - 'pre_init_generator', - [True, False], + 'data_src', + [ + generate_fashion_match(num_pos=10, num_neg=10, num_total=100), + lambda: generate_fashion_match(num_pos=10, num_neg=10, num_total=100), + ], ) -def test_triplet_dataset(pre_init_generator): +def test_triplet_dataset(data_src): class SD(TripletMixin, BaseDataset): ... - data, catalog = generate_fashion_match_catalog( - num_pos=10, - num_neg=10, - num_total=100, - pre_init_generator=pre_init_generator, - ) - - sd = SD(data, catalog) + sd = SD(data_src) for d in sd: assert len(d) == 2 assert len(d[0]) == 3 diff --git a/tests/unit/tuner/keras/test_gpu.py b/tests/unit/tuner/keras/test_gpu.py index b200c35cf..644726949 100644 --- a/tests/unit/tuner/keras/test_gpu.py +++ b/tests/unit/tuner/keras/test_gpu.py @@ -24,11 +24,10 @@ def tf_gpu_config(): @pytest.mark.parametrize('loss', all_test_losses) def test_gpu_keras(generate_random_triplets, loss, caplog): data = generate_random_triplets(4, 4) - catalog = DocumentArray(data.traverse_flat(['m'])) embed_model = tf.keras.models.Sequential() embed_model.add(tf.keras.layers.InputLayer(input_shape=(4,))) embed_model.add(tf.keras.layers.Dense(4)) - tuner = KerasTuner(embed_model, catalog, loss) + tuner = KerasTuner(embed_model, loss) tuner.fit(data, data, epochs=2, batch_size=4, device='cuda') diff --git a/tests/unit/tuner/paddle/test_gpu.py b/tests/unit/tuner/paddle/test_gpu.py index 2931000d2..fbe8e644e 100644 --- a/tests/unit/tuner/paddle/test_gpu.py +++ b/tests/unit/tuner/paddle/test_gpu.py @@ -16,13 +16,12 @@ def test_gpu_paddle(generate_random_triplets, loss): data = generate_random_triplets(4, 4) - catalog = DocumentArray(data.traverse_flat(['m'])) embed_model = nn.Sequential( nn.Linear(in_features=4, out_features=4), ) - tuner = PaddleTuner(embed_model, catalog=catalog, loss=loss) + tuner = PaddleTuner(embed_model, loss=loss) tuner.fit(data, data, epochs=2, batch_size=4, device='cuda') diff --git a/tests/unit/tuner/torch/test_gpu.py b/tests/unit/tuner/torch/test_gpu.py index 0927e7739..48e831054 100644 --- a/tests/unit/tuner/torch/test_gpu.py +++ b/tests/unit/tuner/torch/test_gpu.py @@ -17,13 +17,12 @@ def test_gpu_pytorch(generate_random_triplets, loss): data = generate_random_triplets(4, 4) - catalog = DocumentArray(data.traverse_flat(['m'])) embed_model = torch.nn.Sequential( torch.nn.Linear(in_features=4, out_features=4), ) - tuner = PytorchTuner(embed_model, catalog, loss) + tuner = PytorchTuner(embed_model, loss) # Run quick training - mainly makes sure no errors appear, and that the model # is moved to GPU