From 0be69a45c0550dd44cf7bae88c70c76b22d6c47f Mon Sep 17 00:00:00 2001 From: Maximilian Werk Date: Tue, 19 Oct 2021 11:02:28 +0200 Subject: [PATCH] Introduce catalog + ndcg (#120) * feat: add ndcg calc * feat: added hits metric and refactoring * feat: fix ndcg * feat: gpu support for eval * feat: add gpu for eval * fix: sample size * feat: labeler working * feat: paddle and torch * fix: train data callable * test: fixed * feat: fmnist with catalog * feat: used dam for catalog * refactor: qa toy data * fix: gpu tests * test: fix test size * test: fix wrong arg * test: speed up test data generation * feat: removed train metrics * fix: next only called when needed * feat: restored old toy data generation behavior --- docs/get-started/covid-qa.md | 2 +- docs/get-started/fashion-mnist.md | 2 +- finetuner/labeler/__init__.py | 23 +- finetuner/labeler/executor.py | 30 ++- finetuner/labeler/ui/js/main.js | 8 +- finetuner/toydata.py | 210 +++++++++++------- finetuner/tuner/__init__.py | 11 +- finetuner/tuner/base.py | 25 +++ finetuner/tuner/dataset/__init__.py | 8 +- finetuner/tuner/evaluation.py | 53 +++++ finetuner/tuner/keras/__init__.py | 35 +-- finetuner/tuner/paddle/__init__.py | 24 +- finetuner/tuner/pytorch/__init__.py | 25 ++- finetuner/tuner/stats.py | 43 ++++ tests/integration/fit/test_fit_lstm.py | 31 +-- tests/integration/fit/test_fit_mlp.py | 40 ++-- tests/integration/keras/test_keras_trainer.py | 78 ++++--- tests/integration/keras/test_overfit.py | 5 +- tests/integration/keras/test_tail_and_tune.py | 2 +- tests/integration/labeler/test_tune_lstm.py | 13 +- tests/integration/labeler/test_tune_mlp.py | 13 +- tests/integration/paddle/test_overfit.py | 5 +- .../integration/paddle/test_paddle_trainer.py | 73 +++--- .../integration/paddle/test_tail_and_tune.py | 2 +- tests/integration/torch/test_overfit.py | 6 +- tests/integration/torch/test_tail_and_tune.py | 2 +- tests/integration/torch/test_torch_trainer.py | 74 +++--- tests/unit/toydata/test_data_gen.py | 36 +-- tests/unit/toydata/test_dataset.py | 37 +-- tests/unit/tuner/keras/test_gpu.py | 5 +- tests/unit/tuner/paddle/test_gpu.py | 6 +- tests/unit/tuner/torch/test_gpu.py | 6 +- 32 files changed, 613 insertions(+), 320 deletions(-) create mode 100644 finetuner/tuner/evaluation.py create mode 100644 finetuner/tuner/stats.py diff --git a/docs/get-started/covid-qa.md b/docs/get-started/covid-qa.md index 06eff69db..dd80e33f0 100644 --- a/docs/get-started/covid-qa.md +++ b/docs/get-started/covid-qa.md @@ -86,7 +86,7 @@ import finetuner finetuner.fit( embed_model, - train_data=generate_qa_match, + train_data=generate_qa_match(), interactive=True) ``` diff --git a/docs/get-started/fashion-mnist.md b/docs/get-started/fashion-mnist.md index c8a6f1117..bfb67d177 100644 --- a/docs/get-started/fashion-mnist.md +++ b/docs/get-started/fashion-mnist.md @@ -77,7 +77,7 @@ import finetuner finetuner.fit( embed_model, - train_data=generate_fashion_match, + train_data=generate_fashion_match(), interactive=True) ``` diff --git a/finetuner/labeler/__init__.py b/finetuner/labeler/__init__.py index 88dfc035e..518e76181 100644 --- a/finetuner/labeler/__init__.py +++ b/finetuner/labeler/__init__.py @@ -4,7 +4,7 @@ from typing import Optional import jina.helper -from jina import Flow +from jina import Flow, DocumentArrayMemmap from jina.logging.predefined import default_logger from .executor import FTExecutor, DataIterator @@ -14,6 +14,7 @@ def fit( embed_model: AnyDNN, train_data: DocumentArrayLike, + catalog: Optional[DocumentArrayLike] = None, clear_labels_on_start: bool = False, port_expose: Optional[int] = None, runtime_backend: str = 'thread', @@ -21,6 +22,7 @@ def fit( **kwargs, ) -> None: dam_path = tempfile.mkdtemp() + catalog_dam_path = init_catalog(dam_path, catalog, train_data) class MyExecutor(FTExecutor): def get_embed_model(self): @@ -37,13 +39,14 @@ def get_embed_model(self): uses=DataIterator, uses_with={ 'dam_path': dam_path, + 'catalog_dam_path': catalog_dam_path, 'clear_labels_on_start': clear_labels_on_start, }, ) .add( uses=MyExecutor, uses_with={ - 'dam_path': dam_path, + 'catalog_dam_path': catalog_dam_path, 'loss': loss, }, ) @@ -88,8 +91,22 @@ def open_frontend_in_browser(req): f.post( '/feed', train_data, - request_size=10, + request_size=128, show_progress=True, on_done=open_frontend_in_browser, ) f.block() + + +def init_catalog( + dam_path: str, catalog: DocumentArrayLike, train_data: DocumentArrayLike +): + if isinstance(catalog, DocumentArrayMemmap): + catalog_dam_path = catalog.path + else: + catalog_dam_path = dam_path + '/catalog' + catalog_memmap = DocumentArrayMemmap(catalog_dam_path) + if catalog is None: + catalog = train_data() if callable(train_data) else train_data + catalog_memmap.extend(catalog) + return catalog_dam_path diff --git a/finetuner/labeler/executor.py b/finetuner/labeler/executor.py index f75336444..671d87670 100644 --- a/finetuner/labeler/executor.py +++ b/finetuner/labeler/executor.py @@ -11,13 +11,13 @@ class FTExecutor(Executor): def __init__( self, - dam_path: str, + catalog_dam_path: str, metric: str = 'cosine', loss: str = 'CosineSiameseLoss', **kwargs, ): super().__init__(**kwargs) - self._all_data = DocumentArrayMemmap(dam_path) + self._catalog = DocumentArrayMemmap(catalog_dam_path) self._metric = metric self._loss = loss @@ -33,9 +33,9 @@ def _embed_model(self): def embed(self, docs: DocumentArray, parameters: Dict, **kwargs): if not docs: return - self._all_data.reload() - da = self._all_data.sample( - min(len(self._all_data), int(parameters.get('sample_size', 1000))) + self._catalog.reload() + da = self._catalog.sample( + min(len(self._catalog), int(parameters.get('sample_size', 1000))) ) f_type = get_framework(self._embed_model) @@ -76,6 +76,7 @@ def fit(self, docs, parameters: Dict, **kwargs): fit( self._embed_model, docs, + self._catalog, epochs=int(parameters.get('epochs', 10)), loss=self._loss, ) @@ -91,12 +92,14 @@ class DataIterator(Executor): def __init__( self, dam_path: str, + catalog_dam_path: str, labeled_dam_path: Optional[str] = None, clear_labels_on_start: bool = False, **kwargs, ): super().__init__(**kwargs) self._all_data = DocumentArrayMemmap(dam_path) + self._catalog = DocumentArrayMemmap(catalog_dam_path) if not labeled_dam_path: labeled_dam_path = dam_path + '/labeled' self._labeled_dam = DocumentArrayMemmap(labeled_dam_path) @@ -104,20 +107,25 @@ def __init__( self._labeled_dam.clear() @requests(on='/feed') - def store_data(self, docs: DocumentArray, **kwargs): - self._all_data.extend(docs) + def store_data(self, docs: DocumentArray, parameters: Dict, **kwargs): + if parameters.get('type', 'query') == 'query': + self._all_data.extend(docs) + else: + self._catalog.extend(docs) @requests(on='/next') def take_batch(self, parameters: Dict, **kwargs): - st = int(parameters.get('start', 0)) - ed = int(parameters.get('end', 1)) + count = int(parameters.get('new_examples', 5)) self._all_data.reload() - return self._all_data[st:ed] + count = min(max(count, 0), len(self._all_data)) + return self._all_data.sample(k=count) @requests(on='/fit') def add_fit_data(self, docs: DocumentArray, **kwargs): - for d in docs.traverse_flat(['r', 'm']): + for d in docs.traverse_flat(['r']): d.content = self._all_data[d.id].content + for d in docs.traverse_flat(['m']): + d.content = self._catalog[d.id].content self._labeled_dam.extend(docs) return self._labeled_dam diff --git a/finetuner/labeler/ui/js/main.js b/finetuner/labeler/ui/js/main.js index 39553aa75..8293ef056 100644 --- a/finetuner/labeler/ui/js/main.js +++ b/finetuner/labeler/ui/js/main.js @@ -156,11 +156,12 @@ const app = new Vue({ }, next_batch: function () { let end_idx = app.labeler_config.start_idx + (app.labeler_config.example_per_view - app.cur_batch.length) - if (end_idx === app.labeler_config.start_idx) { + if (end_idx <= app.labeler_config.start_idx) { return } let start_idx = app.labeler_config.start_idx app.labeler_config.start_idx = end_idx + let new_examples = end_idx - start_idx app.is_busy = true app.is_conn_broken = false $.ajax({ @@ -169,8 +170,7 @@ const app = new Vue({ data: JSON.stringify({ data: [], parameters: { - 'start': start_idx, - 'end': end_idx, + 'new_examples': new_examples, 'topk': app.labeler_config.topk_per_example, 'sample_size': app.advanced_config.sample_size.value } @@ -243,4 +243,4 @@ const app = new Vue({ Vue.nextTick(function () { app.next_batch() -}) \ No newline at end of file +}) diff --git a/finetuner/toydata.py b/finetuner/toydata.py index c8662592d..f7da9a95c 100644 --- a/finetuner/toydata.py +++ b/finetuner/toydata.py @@ -56,7 +56,11 @@ def _text_to_int_sequence(text, vocab, max_len=None): return vec -def generate_qa_match( +def generate_qa_match(**kwargs): + return generate_qa_match_catalog(pre_init_generator=False, **kwargs)[0] + + +def generate_qa_match_catalog( num_total: int = 481, num_neg: int = 0, pos_value: int = 1, @@ -64,6 +68,7 @@ def generate_qa_match( to_ndarray: bool = True, max_seq_len: int = 100, is_testset: Optional[bool] = None, + pre_init_generator: bool = True, ) -> Generator[Document, None, None]: """Get a generator of QA data with synthetic negative matches. @@ -76,7 +81,6 @@ def generate_qa_match( :param is_testset: If to generate test data, if set to None, will all data return :return: """ - num_doc = 0 all_docs = DocumentArray(_download_qa_data(is_testset=is_testset)) if to_ndarray: @@ -86,57 +90,83 @@ def generate_qa_match( + all_docs.get_attributes('tags__wrong_answer') ) vocab = _build_vocab(all_texts, min_freq=2) + catalog = DocumentArray() + text_to_id = {} + + def get_document(text, label): + doc = Document(text=text, tags={__default_tag_key__: {'label': label}}) + if text in text_to_id: + doc.id = text_to_id[text] + else: + text_to_id[text] = doc.id + catalog.append(doc) + return doc + + def generator(): + num_doc = 0 + for doc in all_docs: + d = Document(doc, copy=True) + d.text = d.tags['question'] + m_p = get_document(d.tags['answer'], pos_value) + m_n = get_document(d.tags['wrong_answer'], neg_value) + if to_ndarray: + d.blob = np.array( + _text_to_int_sequence(d.text, vocab, max_seq_len), np.long + ) + m_p.blob = np.array( + _text_to_int_sequence(m_p.text, vocab, max_seq_len), np.long + ) + m_n.blob = np.array( + _text_to_int_sequence(m_n.text, vocab, max_seq_len), np.long + ) - for d in all_docs: - d.text = d.tags['question'] - m_p = Document( - text=d.tags['answer'], tags={__default_tag_key__: {'label': pos_value}} - ) - m_n = Document( - text=d.tags['wrong_answer'], - tags={__default_tag_key__: {'label': neg_value}}, - ) - if to_ndarray: - d.blob = np.array( - _text_to_int_sequence(d.text, vocab, max_seq_len), np.long - ) - m_p.blob = np.array( - _text_to_int_sequence(m_p.text, vocab, max_seq_len), np.long - ) - m_n.blob = np.array( - _text_to_int_sequence(m_n.text, vocab, max_seq_len), np.long - ) + if num_neg > 0: + d.matches.append(m_p) + d.matches.append(m_n) + cur_num_neg = 1 + if num_neg > 1: + sampled_docs = all_docs.sample(num_neg, seed=num_doc) + for n_d in sampled_docs: + if n_d.id != d.id: + new_nd = get_document(n_d.tags['answer'], neg_value) + if to_ndarray: + new_nd.blob = np.array( + _text_to_int_sequence( + new_nd.text, vocab, max_seq_len + ), + np.long, + ) + d.matches.append(new_nd) + cur_num_neg += 1 + if cur_num_neg >= num_neg: + break + num_doc += 1 + yield d + + if num_doc >= num_total: + break - if num_neg > 0: - d.matches.append(m_p) - d.matches.append(m_n) - cur_num_neg = 1 - if num_neg > 1: - sampled_docs = all_docs.sample(num_neg) - for n_d in sampled_docs: - if n_d.id != d.id: - new_nd = Document( - text=n_d.tags['answer'], - tags={__default_tag_key__: {'label': neg_value}}, - ) - if to_ndarray: - new_nd.blob = np.array( - _text_to_int_sequence(new_nd.text, vocab, max_seq_len), - np.long, - ) - d.matches.append(new_nd) - cur_num_neg += 1 - if cur_num_neg >= num_neg: - break - num_doc += 1 - yield d - - if num_doc >= num_total: - break - - -def generate_fashion_match( + # prefil catalog + [_ for _ in generator()] + + if pre_init_generator: + return generator(), catalog + else: + return generator, catalog + + +def generate_fashion_match(num_total=100, num_catalog=5000, **kwargs): + return generate_fashion_match_catalog( + num_total=num_total, + num_catalog=num_catalog, + pre_init_generator=False, + **kwargs, + )[0] + + +def generate_fashion_match_catalog( num_total: int = 60000, + num_catalog: int = 60000, num_pos: int = 0, num_neg: int = 0, pos_value: int = 1, @@ -145,6 +175,7 @@ def generate_fashion_match( channels: int = 0, channel_axis: int = -1, is_testset: bool = False, + pre_init_generator: bool = True, ) -> Generator[Document, None, None]: """Get a Generator of fashion-mnist Documents with synthetic matches. @@ -168,44 +199,57 @@ def generate_fashion_match( is_testset=is_testset, ) - n_d = 0 + catalog = DocumentArray(_orginal_fashion_doc) + if len(catalog) > num_catalog: + catalog = catalog.sample(num_catalog) if num_pos > 0 or num_neg > 0: # need to build synthetic matches - all_docs = DocumentArray(_orginal_fashion_doc) - - copy_all_docs = copy.deepcopy(all_docs) - rv = copy_all_docs.split('class') - - for od in all_docs: - pos_label = od.tags['class'] - pos_samples = rv[pos_label].sample(num_pos) - for d in pos_samples: - d.tags[__default_tag_key__] = {'label': pos_value} - - neg_samples = DocumentArray() - while len(neg_samples) < num_neg: - neg_samples.extend( - d - for d in copy_all_docs.sample(num_neg) - if d.tags['class'] != pos_label - ) - neg_samples = neg_samples[:num_neg] + # copy_all_docs = copy.deepcopy(catalog) + rv = catalog.split('class') + + def generator(): + n_d = 0 + for od in catalog: + new_doc = Document(od, copy=True) + pos_label = new_doc.tags['class'] + pos_samples = rv[pos_label].sample(num_pos) + pos_samples = [Document(d, copy=True) for d in pos_samples] + for d in pos_samples: + d.tags[__default_tag_key__] = {'label': pos_value} + + neg_samples = DocumentArray() + while len(neg_samples) < num_neg: + neg_samples.extend( + Document(d, copy=True) + for d in catalog.sample(num_neg) + if d.tags['class'] != pos_label + ) + neg_samples = neg_samples[:num_neg] + + for d in neg_samples: + d.tags[__default_tag_key__] = {'label': neg_value} + + new_doc.matches.extend(pos_samples) + new_doc.matches.extend(neg_samples) + n_d += 1 + yield new_doc + if n_d >= num_total: + break - for d in neg_samples: - d.tags[__default_tag_key__] = {'label': neg_value} + else: - od.matches.extend(pos_samples) - od.matches.extend(neg_samples) - n_d += 1 - yield od - if n_d >= num_total: - break + def generator(): + n_d = 0 + for d in catalog: + n_d += 1 + yield d + if n_d >= num_total: + break + + if pre_init_generator: + return generator(), catalog else: - for d in _orginal_fashion_doc: - n_d += 1 - yield d - if n_d >= num_total: - break + return generator, catalog def _download_qa_data( diff --git a/finetuner/tuner/__init__.py b/finetuner/tuner/__init__.py index a7c7a0abf..d5ef27dc9 100644 --- a/finetuner/tuner/__init__.py +++ b/finetuner/tuner/__init__.py @@ -1,6 +1,7 @@ from typing import Optional, TYPE_CHECKING, Type, Dict from ..helper import AnyDNN, DocumentArrayLike, TunerReturnType, get_framework +from jina import DocumentArray if TYPE_CHECKING: from .base import BaseTuner @@ -26,6 +27,7 @@ def get_tuner_class(dnn_model: AnyDNN) -> Type['BaseTuner']: def fit( embed_model: AnyDNN, train_data: DocumentArrayLike, + catalog: DocumentArrayLike = None, eval_data: Optional[DocumentArrayLike] = None, epochs: int = 10, batch_size: int = 256, @@ -37,8 +39,15 @@ def fit( **kwargs, ) -> TunerReturnType: ft = get_tuner_class(embed_model) + if catalog is None: + train_data = DocumentArray(train_data() if callable(train_data) else train_data) + catalog = DocumentArray() + catalog.extend(train_data.traverse_flat(['r', 'm'])) + if eval_data is not None: + eval_data = DocumentArray(eval_data() if callable(eval_data) else eval_data) + catalog.extend(eval_data.traverse_flat(['r', 'm'])) - return ft(embed_model, loss=loss).fit( + return ft(embed_model, catalog=catalog, loss=loss).fit( train_data, eval_data, epochs=epochs, diff --git a/finetuner/tuner/base.py b/finetuner/tuner/base.py index 110c1246f..5e4058cbb 100644 --- a/finetuner/tuner/base.py +++ b/finetuner/tuner/base.py @@ -8,7 +8,11 @@ Dict, ) +from jina.logging.logger import JinaLogger +from jina import DocumentArrayMemmap, DocumentArray + from ..helper import AnyDNN, AnyDataLoader, AnyOptimizer, DocumentArrayLike +from . import evaluation class BaseLoss: @@ -19,6 +23,7 @@ class BaseTuner(abc.ABC): def __init__( self, embed_model: Optional[AnyDNN] = None, + catalog: DocumentArrayLike = None, loss: Union[AnyDNN, str] = 'CosineSiameseLoss', **kwargs, ): @@ -26,6 +31,8 @@ def __init__( self._loss = self._get_loss(loss) self._train_data_len = 0 self._eval_data_len = 0 + self._catalog = catalog + self.logger = JinaLogger(self.__class__.__name__) def _get_optimizer_kwargs(self, optimizer: str, custom_kwargs: Optional[Dict]): """Merges user-provided optimizer kwargs with default ones.""" @@ -132,11 +139,29 @@ def _eval( """Evaluate the model on given labeled data""" ... + def get_metrics(self, docs: DocumentArrayLike): + docs = DocumentArray(docs()) if callable(docs) else docs + self.get_embeddings(docs) + self.get_embeddings(self._catalog) + if isinstance(self._catalog, DocumentArrayMemmap): + self._catalog.prune() + to_be_scored_docs = evaluation.prepare_eval_docs(docs, self._catalog, limit=10) + return { + 'hits': evaluation.get_hits_at_n(to_be_scored_docs), + 'ndcg': evaluation.get_ndcg_at_n(to_be_scored_docs), + } + + @abc.abstractmethod + def get_embeddings(self, docs: DocumentArrayLike): + """Calculates and adds the embeddings for the given Documents.""" + class BaseDataset: def __init__( self, inputs: DocumentArrayLike, + catalog: DocumentArrayLike, ): super().__init__() self._inputs = inputs() if callable(inputs) else inputs + self._catalog = catalog() if callable(catalog) else catalog diff --git a/finetuner/tuner/dataset/__init__.py b/finetuner/tuner/dataset/__init__.py index b56dc01f5..c1c85b856 100644 --- a/finetuner/tuner/dataset/__init__.py +++ b/finetuner/tuner/dataset/__init__.py @@ -8,7 +8,9 @@ def __iter__(self): for d in self._inputs: d_blob = d.blob for m in d.matches: - yield (d_blob, m.blob), np.float32(m.tags[__default_tag_key__]['label']) + yield (d_blob, self._catalog[m.id].blob), np.float32( + m.tags[__default_tag_key__]['label'] + ) class TripletMixin: @@ -19,9 +21,9 @@ def __iter__(self): negatives = [] for m in d.matches: if m.tags[__default_tag_key__]['label'] > 0: - positives.append(m.blob) + positives.append(self._catalog[m.id].blob) else: - negatives.append(m.blob) + negatives.append(self._catalog[m.id].blob) for p, n in itertools.product(positives, negatives): yield (anchor, p, n), np.float32(0) diff --git a/finetuner/tuner/evaluation.py b/finetuner/tuner/evaluation.py new file mode 100644 index 000000000..49d8aad27 --- /dev/null +++ b/finetuner/tuner/evaluation.py @@ -0,0 +1,53 @@ +import numpy as np +from jina import Document, DocumentArray +from .. import __default_tag_key__ + + +def prepare_eval_docs(docs, catalog, limit=10, sample_size=100, seed=42): + sampled_docs = docs.sample(min(sample_size, len(docs)), seed) + to_be_scored_docs = DocumentArray() + for doc in sampled_docs: + d = Document( + id=doc.id, + embedding=doc.embedding, + tags={ + 'positive_ids': [ + m.id + for m in doc.matches + if m.tags[__default_tag_key__]['label'] > 0 + ] + }, + ) + to_be_scored_docs.append(d) + to_be_scored_docs.match(catalog, limit=limit) + return to_be_scored_docs + + +def get_hits_at_n(to_be_scored_docs, n=-1): + hits = 0 + for doc in to_be_scored_docs: + positive_ids = doc.tags['positive_ids'] + for match in doc.matches[:n]: + if match.id in positive_ids: + hits += 1 + return hits + + +def get_ndcg_at_n(to_be_scored_docs, n=-1): + ndcg = 0 + for doc in to_be_scored_docs: + dcg = 0 + positive_ids = doc.tags['positive_ids'] + first_n = doc.matches[:n] + for position, match in enumerate(first_n): + if match.id in positive_ids: + dcg += 1 / np.log(position + 2) + + max_positives = min(len(positive_ids), len(first_n)) + idcg = max(_get_idcg(max_positives), 1e-10) + ndcg += dcg / idcg + return ndcg + + +def _get_idcg(n): + return sum(1 / np.log(position + 2) for position in range(n)) diff --git a/finetuner/tuner/keras/__init__.py b/finetuner/tuner/keras/__init__.py index 892a32a3c..06d326a03 100644 --- a/finetuner/tuner/keras/__init__.py +++ b/finetuner/tuner/keras/__init__.py @@ -1,15 +1,16 @@ from typing import Dict, Optional, Union +import numpy as np import tensorflow as tf from jina.logging.profile import ProgressBar from tensorflow import keras -from tensorflow.keras.layers import Layer from tensorflow.keras.optimizers import Optimizer from . import losses, datasets from ..base import BaseTuner, BaseLoss from ..dataset.helper import get_dataset from ..logger import LogGenerator +from ..stats import TunerStats from ...helper import DocumentArrayLike @@ -26,7 +27,7 @@ def _get_data_loader(self, inputs, batch_size: int, shuffle: bool): input_shape = self.embed_model.input_shape[1:] tf_data = tf.data.Dataset.from_generator( - lambda: ds(inputs), + lambda: ds(inputs, self._catalog), output_signature=( tuple( tf.TensorSpec(shape=input_shape, dtype=tf.float32) @@ -118,7 +119,7 @@ def fit( optimizer_kwargs: Optional[Dict] = None, device: str = 'cpu', **kwargs, - ): + ) -> TunerStats: _train_data = self._get_data_loader( inputs=train_data, batch_size=batch_size, shuffle=False @@ -135,29 +136,35 @@ def fit( device = '/CPU:0' else: raise ValueError(f'Device {device} not recognized') - device = tf.device(device) + self.device = tf.device(device) _optimizer = self._get_optimizer(optimizer, optimizer_kwargs, learning_rate) - losses_train = [] - losses_eval = [] + stats = TunerStats() - with device: + with self.device: for epoch in range(epochs): lt = self._train( _train_data, _optimizer, description=f'Epoch {epoch + 1}/{epochs}', ) - losses_train.extend(lt) + stats.add_train_loss(lt) if eval_data: - le = self._eval(_eval_data, train_log=LogGenerator('T', lt)()) - losses_eval.extend(le) - - return { - 'loss': {'train': losses_train, 'eval': losses_eval}, - } + le = self._eval(_eval_data, train_log=LogGenerator("T", lt)()) + stats.add_eval_loss(le) + stats.add_eval_metric(self.get_metrics(eval_data)) + + stats.print_last() + return stats + + def get_embeddings(self, data: DocumentArrayLike): + blobs = data.blobs + with self.device: + embeddings = self.embed_model(blobs) + for doc, embed in zip(data, embeddings): + doc.embedding = np.array(embed) def save(self, *args, **kwargs): self.embed_model.save(*args, **kwargs) diff --git a/finetuner/tuner/paddle/__init__.py b/finetuner/tuner/paddle/__init__.py index b0df4edae..2a7500c11 100644 --- a/finetuner/tuner/paddle/__init__.py +++ b/finetuner/tuner/paddle/__init__.py @@ -1,8 +1,8 @@ from typing import Dict, Optional, Union +import numpy as np import paddle from jina.logging.profile import ProgressBar -from paddle import nn from paddle.io import DataLoader from paddle.optimizer import Optimizer @@ -11,6 +11,7 @@ from ...helper import DocumentArrayLike from ..dataset.helper import get_dataset from ..logger import LogGenerator +from ..stats import TunerStats class PaddleTuner(BaseTuner): @@ -23,7 +24,7 @@ def _get_loss(self, loss: Union[BaseLoss, str]): def _get_data_loader(self, inputs, batch_size: int, shuffle: bool): ds = get_dataset(datasets, self.arity) return DataLoader( - dataset=ds(inputs=inputs), + dataset=ds(inputs=inputs, catalog=self._catalog), batch_size=batch_size, shuffle=shuffle, ) @@ -117,7 +118,7 @@ def fit( optimizer_kwargs: Optional[Dict] = None, device: str = 'cpu', **kwargs, - ): + ) -> TunerStats: if device == 'cuda': paddle.set_device('gpu:0') @@ -128,8 +129,7 @@ def fit( _optimizer = self._get_optimizer(optimizer, optimizer_kwargs, learning_rate) - losses_train = [] - losses_eval = [] + stats = TunerStats() for epoch in range(epochs): _data = self._get_data_loader( @@ -140,7 +140,7 @@ def fit( _optimizer, description=f'Epoch {epoch + 1}/{epochs}', ) - losses_train.extend(lt) + stats.add_train_loss(lt) if eval_data: _data = self._get_data_loader( @@ -148,9 +148,17 @@ def fit( ) le = self._eval(_data, train_log=LogGenerator('T', lt)()) - losses_eval.extend(le) + stats.add_eval_loss(le) + stats.add_eval_metric(self.get_metrics(eval_data)) - return {'loss': {'train': losses_train, 'eval': losses_eval}} + stats.print_last() + return stats + + def get_embeddings(self, data: DocumentArrayLike): + blobs = data.blobs + embeddings = self.embed_model(paddle.Tensor(blobs)) + for doc, embed in zip(data, embeddings): + doc.embedding = np.array(embed) def save(self, *args, **kwargs): paddle.save(self.embed_model.state_dict(), *args, **kwargs) diff --git a/finetuner/tuner/pytorch/__init__.py b/finetuner/tuner/pytorch/__init__.py index f20ad0f49..a514b2092 100644 --- a/finetuner/tuner/pytorch/__init__.py +++ b/finetuner/tuner/pytorch/__init__.py @@ -10,6 +10,7 @@ from ..dataset.helper import get_dataset from ..logger import LogGenerator from ...helper import DocumentArrayLike +from ..stats import TunerStats class PytorchTuner(BaseTuner): @@ -22,7 +23,7 @@ def _get_loss(self, loss: Union[BaseLoss, str]): def _get_data_loader(self, inputs, batch_size: int, shuffle: bool): ds = get_dataset(datasets, self.arity) return DataLoader( - dataset=ds(inputs=inputs), + dataset=ds(inputs=inputs, catalog=self._catalog), batch_size=batch_size, shuffle=shuffle, ) @@ -127,7 +128,7 @@ def fit( optimizer_kwargs: Optional[Dict] = None, device: str = 'cpu', **kwargs, - ): + ) -> TunerStats: if device == 'cpu': self.device = torch.device('cpu') elif device == 'cuda': @@ -141,8 +142,7 @@ def fit( # Get optimizer _optimizer = self._get_optimizer(optimizer, optimizer_kwargs, learning_rate) - losses_train = [] - losses_eval = [] + stats = TunerStats() for epoch in range(epochs): _data = self._get_data_loader( @@ -153,7 +153,7 @@ def fit( _optimizer, description=f'Epoch {epoch + 1}/{epochs}', ) - losses_train.extend(lt) + stats.add_train_loss(lt) if eval_data: _data = self._get_data_loader( @@ -161,9 +161,20 @@ def fit( ) le = self._eval(_data, train_log=LogGenerator('T', lt)()) - losses_eval.extend(le) + stats.add_eval_loss(le) + stats.add_eval_metric(self.get_metrics(eval_data)) - return {'loss': {'train': losses_train, 'eval': losses_eval}} + stats.print_last() + return stats + + def get_embeddings(self, data: DocumentArrayLike): + blobs = data.blobs + + tensor = torch.tensor(blobs, device=self.device) + with torch.inference_mode(): + embeddings = self.embed_model(tensor) + for doc, embed in zip(data, embeddings): + doc.embedding = embed.cpu().numpy() def save(self, *args, **kwargs): torch.save(self.embed_model.state_dict(), *args, **kwargs) diff --git a/finetuner/tuner/stats.py b/finetuner/tuner/stats.py new file mode 100644 index 000000000..18eea78e0 --- /dev/null +++ b/finetuner/tuner/stats.py @@ -0,0 +1,43 @@ +import json +from typing import Dict, List + + +class TunerStats: + def __init__( + self, + loss_train: List = None, + loss_eval: List = None, + metrics_eval: List[Dict] = None, + ): + self._loss_train = loss_train if loss_train is not None else [] + self._loss_eval = loss_eval if loss_eval is not None else [] + self._metrics_eval = metrics_eval if metrics_eval is not None else [] + + def save(self, file: str): + with open(file, 'w') as output: + json.dump( + { + 'loss_train': [float(loss) for loss in self._loss_train], + 'loss_eval': [float(loss) for loss in self._loss_eval], + 'metrics_eval': self._metrics_eval, + }, + output, + ) + + def add_train_loss(self, losses: List): + self._loss_train.extend(losses) + + def add_eval_loss(self, losses: List): + self._loss_eval.extend(losses) + + def add_eval_metric(self, metric: Dict): + self._metrics_eval.append(metric) + + def print_last(self): + if self._metrics_eval: + eval_string = TunerStats.get_metrics_string(self._metrics_eval[-1]) + print(f'Evaluation metrics: {eval_string}') + + @staticmethod + def get_metrics_string(metrics: Dict): + return f'hits: {metrics.get("hits", 0):>3}, NDCG: {metrics.get("ndcg", 0):.2f}' diff --git a/tests/integration/fit/test_fit_lstm.py b/tests/integration/fit/test_fit_lstm.py index 5cfe186ec..c7e44a12f 100644 --- a/tests/integration/fit/test_fit_lstm.py +++ b/tests/integration/fit/test_fit_lstm.py @@ -1,11 +1,9 @@ -import json - import paddle import tensorflow as tf import torch from finetuner import fit -from finetuner.toydata import generate_qa_match +from finetuner.toydata import generate_qa_match_catalog all_test_losses = [ 'CosineSiameseLoss', @@ -52,24 +50,19 @@ def test_fit_all(tmpdir): for kb, b in embed_models.items(): for h in all_test_losses: + train_data, train_catalog = generate_qa_match_catalog( + num_total=300, num_neg=5, max_seq_len=10, pre_init_generator=False + ) + eval_data, eval_catalog = generate_qa_match_catalog( + num_total=300, num_neg=5, max_seq_len=10, pre_init_generator=False + ) + train_catalog.extend(eval_catalog) result = fit( b(), loss=h, - train_data=lambda: generate_qa_match( - num_total=300, num_neg=5, max_seq_len=10 - ), - eval_data=lambda: generate_qa_match( - num_total=300, num_neg=5, max_seq_len=10 - ), + train_data=train_data, + eval_data=eval_data, + catalog=train_catalog, epochs=2, ) - - # convert from numpy to python native float for json dump - result = { - 'loss': { - 'train': [float(v) for v in result['loss']['train']], - 'eval': [float(v) for v in result['loss']['eval']], - }, - } - with open(tmpdir / f'result-{kb}-{h}.json', 'w') as fp: - json.dump(result, fp) + result.save(tmpdir / f'result-{kb}-{h}.json') diff --git a/tests/integration/fit/test_fit_mlp.py b/tests/integration/fit/test_fit_mlp.py index c584392d5..fafbcd152 100644 --- a/tests/integration/fit/test_fit_mlp.py +++ b/tests/integration/fit/test_fit_mlp.py @@ -1,11 +1,9 @@ -import json - import paddle import tensorflow as tf import torch import finetuner -from finetuner.toydata import generate_fashion_match +from finetuner.toydata import generate_fashion_match_catalog all_test_losses = [ 'CosineSiameseLoss', @@ -46,24 +44,28 @@ def test_fit_all(tmpdir): for kb, b in embed_models.items(): for h in all_test_losses: + train_data, train_catalog = generate_fashion_match_catalog( + num_neg=10, + num_pos=10, + num_total=300, + num_catalog=3000, + pre_init_generator=False, + ) + eval_data, eval_catalog = generate_fashion_match_catalog( + num_neg=10, + num_pos=10, + num_total=300, + num_catalog=3000, + is_testset=True, + pre_init_generator=False, + ) + train_catalog.extend(eval_catalog) result = finetuner.fit( b(), loss=h, - train_data=lambda: generate_fashion_match( - num_neg=10, num_pos=10, num_total=300 - ), - eval_data=lambda: generate_fashion_match( - num_neg=10, num_pos=10, num_total=300, is_testset=True - ), + train_data=train_data, + eval_data=eval_data, + catalog=train_catalog, epochs=2, ) - - # convert from numpy to python native float for json dump - result = { - 'loss': { - 'train': [float(v) for v in result['loss']['train']], - 'eval': [float(v) for v in result['loss']['eval']], - }, - } - with open(tmpdir / f'result-{kb}-{h}.json', 'w') as fp: - json.dump(result, fp) + result.save(tmpdir / f'result-{kb}-{h}.json') diff --git a/tests/integration/keras/test_keras_trainer.py b/tests/integration/keras/test_keras_trainer.py index 7635f0377..74b8beaed 100644 --- a/tests/integration/keras/test_keras_trainer.py +++ b/tests/integration/keras/test_keras_trainer.py @@ -3,9 +3,9 @@ import tensorflow as tf from tensorflow import keras -from finetuner.tuner.keras import KerasTuner -from finetuner.toydata import generate_fashion_match -from finetuner.toydata import generate_qa_match +from finetuner.tuner import fit, save +from finetuner.toydata import generate_fashion_match_catalog +from finetuner.toydata import generate_qa_match_catalog all_test_losses = [ 'CosineSiameseLoss', @@ -29,20 +29,33 @@ def test_simple_sequential_model(tmpdir, params, loss): ] ) - kt = KerasTuner(user_model, loss=loss) - # fit and save the checkpoint - kt.fit( - train_data=lambda: generate_fashion_match( - num_pos=10, num_neg=10, num_total=params['num_train'] - ), - eval_data=lambda: generate_fashion_match( - num_pos=10, num_neg=10, num_total=params['num_eval'], is_testset=True - ), + train_data, train_catalog = generate_fashion_match_catalog( + num_neg=10, + num_pos=10, + num_total=params['num_train'], + num_catalog=params['num_train'] * 10, + pre_init_generator=False, + ) + eval_data, eval_catalog = generate_fashion_match_catalog( + num_neg=10, + num_pos=10, + num_total=params['num_eval'], + num_catalog=params['num_eval'] * 10, + is_testset=True, + pre_init_generator=False, + ) + train_catalog.extend(eval_catalog) + fit( + user_model, + loss=loss, + train_data=train_data, + eval_data=eval_data, + catalog=train_catalog, epochs=params['epochs'], batch_size=params['batch_size'], ) - kt.save(tmpdir / 'trained.kt') + save(user_model, tmpdir / 'trained.kt') embedding_model = keras.models.load_model(tmpdir / 'trained.kt') r = embedding_model.predict( @@ -63,26 +76,33 @@ def test_simple_lstm_model(tmpdir, params, loss): ] ) - kt = KerasTuner(user_model, loss=loss) - # fit and save the checkpoint - kt.fit( - train_data=lambda: generate_qa_match( - num_total=params['num_train'], - max_seq_len=params['max_seq_len'], - num_neg=5, - is_testset=False, - ), - eval_data=lambda: generate_qa_match( - num_total=params['num_eval'], - max_seq_len=params['max_seq_len'], - num_neg=5, - is_testset=True, - ), + train_data, train_catalog = generate_qa_match_catalog( + num_total=params['num_train'], + max_seq_len=params['max_seq_len'], + num_neg=5, + is_testset=False, + pre_init_generator=False, + ) + eval_data, eval_catalog = generate_qa_match_catalog( + num_total=params['num_train'], + max_seq_len=params['max_seq_len'], + num_neg=5, + is_testset=True, + pre_init_generator=False, + ) + train_catalog.extend(eval_catalog) + + fit( + user_model, + loss=loss, + train_data=train_data, + eval_data=eval_data, + catalog=train_catalog, epochs=params['epochs'], batch_size=params['batch_size'], ) - kt.save(tmpdir / 'trained.kt') + save(user_model, tmpdir / 'trained.kt') embedding_model = keras.models.load_model(tmpdir / 'trained.kt') r = embedding_model.predict( diff --git a/tests/integration/keras/test_overfit.py b/tests/integration/keras/test_overfit.py index d3421445a..830acb6e8 100644 --- a/tests/integration/keras/test_overfit.py +++ b/tests/integration/keras/test_overfit.py @@ -2,7 +2,7 @@ import tensorflow as tf from scipy.spatial.distance import pdist, squareform -from finetuner.tuner.keras import KerasTuner +from finetuner.tuner import fit @pytest.mark.parametrize( @@ -45,8 +45,7 @@ def test_overfit_keras( ) # Train - pt = KerasTuner(embed_model, loss=loss) - pt.fit(train_data=data, epochs=n_epochs, batch_size=batch_size) + fit(embed_model, loss=loss, train_data=data, epochs=n_epochs, batch_size=batch_size) # Compute embedding for original vectors vec_embedings = embed_model(vecs).numpy() diff --git a/tests/integration/keras/test_tail_and_tune.py b/tests/integration/keras/test_tail_and_tune.py index f27a4fd56..2f77e7506 100644 --- a/tests/integration/keras/test_tail_and_tune.py +++ b/tests/integration/keras/test_tail_and_tune.py @@ -28,4 +28,4 @@ def test_tail_and_tune(embed_model, create_easy_data): output_dim=16, layer_name='dense_2', ) - assert rv['loss']['train'] + assert rv._loss_train diff --git a/tests/integration/labeler/test_tune_lstm.py b/tests/integration/labeler/test_tune_lstm.py index 9701c4d21..7e9f4dc2f 100644 --- a/tests/integration/labeler/test_tune_lstm.py +++ b/tests/integration/labeler/test_tune_lstm.py @@ -11,7 +11,7 @@ import paddle import torch -from finetuner.toydata import generate_qa_match +from finetuner.toydata import generate_qa_match_catalog class LastCellPT(torch.nn.Module): @@ -54,10 +54,11 @@ def _run(framework_name, loss, port_expose): paddle.nn.Linear(in_features=2 * 64, out_features=32), ), } - + train_data, catalog = generate_qa_match_catalog(num_total=10, num_neg=0) fit( embed_models[framework_name](), - generate_qa_match(num_total=10, num_neg=0), + train_data, + catalog=catalog, loss=loss, interactive=True, port_expose=port_expose, @@ -71,6 +72,7 @@ def _run(framework_name, loss, port_expose): 'EuclideanTripletLoss', ] + # 'keras' does not work under this test setup # Exception ... ust be from the same graph as Tensor ... # TODO: add keras backend back to the test @@ -95,8 +97,7 @@ def test_all_frameworks(framework, loss, tmpdir): json={ 'data': [], 'parameters': { - 'start': 0, - 'end': 1, + 'new_examples': 1, 'topk': 5, 'sample_size': 10, }, @@ -114,7 +115,7 @@ def test_all_frameworks(framework, loss, tmpdir): f'http://localhost:{port}/next', json={ 'data': [], - 'parameters': {'start': 0, 'end': 1, 'topk': 5, 'sample_size': 10}, + 'parameters': {'new_examples': 1, 'topk': 5, 'sample_size': 10}, }, ) assert req.status_code == 200 diff --git a/tests/integration/labeler/test_tune_mlp.py b/tests/integration/labeler/test_tune_mlp.py index f4175a039..07f9a817d 100644 --- a/tests/integration/labeler/test_tune_mlp.py +++ b/tests/integration/labeler/test_tune_mlp.py @@ -5,7 +5,7 @@ import pytest import requests -from finetuner.toydata import generate_fashion_match +from finetuner.toydata import generate_fashion_match_catalog from jina.helper import random_port os.environ['JINA_LOG_LEVEL'] = 'DEBUG' @@ -52,10 +52,14 @@ def _run(framework_name, loss, port_expose): paddle.nn.Linear(in_features=128, out_features=32), ), } + data, catalog = generate_fashion_match_catalog( + num_total=10, num_catalog=100, num_pos=0, num_neg=0 + ) fit( embed_models[framework_name](), - generate_fashion_match(num_total=10, num_pos=0, num_neg=0), + data, + catalog=catalog, loss=loss, interactive=True, port_expose=port_expose, @@ -86,8 +90,7 @@ def test_all_frameworks(framework, loss): json={ 'data': [], 'parameters': { - 'start': 0, - 'end': 1, + 'new_examples': 1, 'topk': 5, 'sample_size': 10, }, @@ -105,7 +108,7 @@ def test_all_frameworks(framework, loss): f'http://localhost:{port}/next', json={ 'data': [], - 'parameters': {'start': 0, 'end': 1, 'topk': 5, 'sample_size': 10}, + 'parameters': {'new_examples': 1, 'topk': 5, 'sample_size': 10}, }, ) assert req.status_code == 200 diff --git a/tests/integration/paddle/test_overfit.py b/tests/integration/paddle/test_overfit.py index 11b397738..5813eebf4 100644 --- a/tests/integration/paddle/test_overfit.py +++ b/tests/integration/paddle/test_overfit.py @@ -3,7 +3,7 @@ from paddle import nn from scipy.spatial.distance import pdist, squareform -from finetuner.tuner.paddle import PaddleTuner +from finetuner.tuner import fit @pytest.mark.parametrize( @@ -47,8 +47,7 @@ def test_overfit_paddle( ) # Train - pt = PaddleTuner(embed_model, loss=loss) - pt.fit(train_data=data, epochs=n_epochs, batch_size=batch_size) + fit(embed_model, loss=loss, train_data=data, epochs=n_epochs, batch_size=batch_size) # Compute embedding for original vectors vec_embedings = embed_model(paddle.Tensor(vecs)).numpy() diff --git a/tests/integration/paddle/test_paddle_trainer.py b/tests/integration/paddle/test_paddle_trainer.py index 6f9693dd9..905cdffaa 100644 --- a/tests/integration/paddle/test_paddle_trainer.py +++ b/tests/integration/paddle/test_paddle_trainer.py @@ -3,9 +3,9 @@ import pytest from paddle import nn -from finetuner.tuner.paddle import PaddleTuner -from finetuner.toydata import generate_fashion_match -from finetuner.toydata import generate_qa_match +from finetuner.tuner import fit, save +from finetuner.toydata import generate_fashion_match_catalog +from finetuner.toydata import generate_qa_match_catalog @pytest.mark.parametrize( @@ -28,21 +28,31 @@ def test_simple_sequential_model(tmpdir, params, loss): nn.Linear(in_features=params['feature_dim'], out_features=params['output_dim']), ) - pt = PaddleTuner(user_model, loss=loss) model_path = tmpdir / 'trained.pd' # fit and save the checkpoint - pt.fit( - train_data=lambda: generate_fashion_match( - num_pos=10, num_neg=10, num_total=params['num_train'] - ), - eval_data=lambda: generate_fashion_match( - num_pos=10, num_neg=10, num_total=params['num_eval'], is_testset=True - ), + train_data, train_catalog = generate_fashion_match_catalog( + num_neg=10, num_pos=10, num_total=params['num_train'], pre_init_generator=False + ) + eval_data, eval_catalog = generate_fashion_match_catalog( + num_neg=10, + num_pos=10, + num_total=params['num_eval'], + is_testset=True, + pre_init_generator=False, + ) + train_catalog.extend(eval_catalog) + + fit( + user_model, + loss=loss, + train_data=train_data, + eval_data=eval_data, + catalog=train_catalog, epochs=params['epochs'], batch_size=params['batch_size'], ) - pt.save(model_path) + save(user_model, model_path) user_model.set_state_dict(paddle.load(model_path)) user_model.eval() @@ -84,26 +94,33 @@ def forward(self, x): ) model_path = tmpdir / 'trained.pd' - pt = PaddleTuner(user_model, loss=loss) - # fit and save the checkpoint - pt.fit( - train_data=lambda: generate_qa_match( - num_total=params['num_train'], - max_seq_len=params['max_seq_len'], - num_neg=5, - is_testset=False, - ), - eval_data=lambda: generate_qa_match( - num_total=params['num_eval'], - max_seq_len=params['max_seq_len'], - num_neg=5, - is_testset=True, - ), + train_data, train_catalog = generate_qa_match_catalog( + num_total=params['num_train'], + max_seq_len=params['max_seq_len'], + num_neg=5, + is_testset=False, + pre_init_generator=False, + ) + eval_data, eval_catalog = generate_qa_match_catalog( + num_total=params['num_train'], + max_seq_len=params['max_seq_len'], + num_neg=5, + is_testset=True, + pre_init_generator=False, + ) + train_catalog.extend(eval_catalog) + + fit( + user_model, + loss=loss, + train_data=train_data, + eval_data=eval_data, + catalog=train_catalog, epochs=params['epochs'], batch_size=params['batch_size'], ) - pt.save(model_path) + save(user_model, model_path) # load the checkpoint and ensure the dim user_model.set_state_dict(paddle.load(model_path)) diff --git a/tests/integration/paddle/test_tail_and_tune.py b/tests/integration/paddle/test_tail_and_tune.py index 5a9f01036..b7bb80539 100644 --- a/tests/integration/paddle/test_tail_and_tune.py +++ b/tests/integration/paddle/test_tail_and_tune.py @@ -29,4 +29,4 @@ def test_tail_and_tune(embed_model, create_easy_data): output_dim=16, layer_name='linear_4', ) - assert rv['loss']['train'] + assert rv._loss_train diff --git a/tests/integration/torch/test_overfit.py b/tests/integration/torch/test_overfit.py index 49b6a50bf..21f228cf9 100644 --- a/tests/integration/torch/test_overfit.py +++ b/tests/integration/torch/test_overfit.py @@ -2,7 +2,7 @@ import torch from scipy.spatial.distance import pdist, squareform -from finetuner.tuner.pytorch import PytorchTuner +from finetuner.tuner import fit @pytest.mark.parametrize( @@ -44,10 +44,8 @@ def test_overfit_pytorch( torch.nn.ReLU(), torch.nn.Linear(in_features=64, out_features=32), ) - # Train - pt = PytorchTuner(embed_model, loss=loss) - pt.fit(train_data=data, epochs=n_epochs, batch_size=batch_size) + fit(embed_model, loss=loss, train_data=data, epochs=n_epochs, batch_size=batch_size) # Compute embedding for original vectors with torch.inference_mode(): diff --git a/tests/integration/torch/test_tail_and_tune.py b/tests/integration/torch/test_tail_and_tune.py index 8fabc095e..0a0cfead0 100644 --- a/tests/integration/torch/test_tail_and_tune.py +++ b/tests/integration/torch/test_tail_and_tune.py @@ -29,4 +29,4 @@ def test_tail_and_tune(embed_model, create_easy_data): output_dim=16, layer_name='linear_4', ) - assert rv['loss']['train'] + assert rv._loss_train diff --git a/tests/integration/torch/test_torch_trainer.py b/tests/integration/torch/test_torch_trainer.py index ca81c0118..5a1f64378 100644 --- a/tests/integration/torch/test_torch_trainer.py +++ b/tests/integration/torch/test_torch_trainer.py @@ -5,9 +5,9 @@ import torch import torch.nn as nn -from finetuner.tuner.pytorch import PytorchTuner -from finetuner.toydata import generate_fashion_match -from finetuner.toydata import generate_qa_match +from finetuner.tuner import fit, save +from finetuner.toydata import generate_fashion_match_catalog +from finetuner.toydata import generate_qa_match_catalog @pytest.mark.parametrize( @@ -31,20 +31,29 @@ def test_simple_sequential_model(tmpdir, params, loss): ) model_path = os.path.join(tmpdir, 'trained.pth') - pt = PytorchTuner(user_model, loss=loss) - # fit and save the checkpoint - pt.fit( - train_data=lambda: generate_fashion_match( - num_pos=10, num_neg=10, num_total=params['num_train'] - ), - eval_data=lambda: generate_fashion_match( - num_pos=10, num_neg=10, num_total=params['num_eval'], is_testset=True - ), + train_data, train_catalog = generate_fashion_match_catalog( + num_neg=10, num_pos=10, num_total=params['num_train'], pre_init_generator=False + ) + eval_data, eval_catalog = generate_fashion_match_catalog( + num_neg=10, + num_pos=10, + num_total=params['num_eval'], + is_testset=True, + pre_init_generator=False, + ) + train_catalog.extend(eval_catalog) + + fit( + user_model, + loss=loss, + train_data=train_data, + eval_data=eval_data, + catalog=train_catalog, epochs=params['epochs'], batch_size=params['batch_size'], ) - pt.save(model_path) + save(user_model, model_path) # load the checkpoint and ensure the dim user_model.load_state_dict(torch.load(model_path)) @@ -88,26 +97,33 @@ def forward(self, x): ) model_path = os.path.join(tmpdir, 'trained.pth') - pt = PytorchTuner(user_model, loss=loss) - # fit and save the checkpoint - pt.fit( - train_data=lambda: generate_qa_match( - num_total=params['num_train'], - max_seq_len=params['max_seq_len'], - num_neg=5, - is_testset=False, - ), - eval_data=lambda: generate_qa_match( - num_total=params['num_eval'], - max_seq_len=params['max_seq_len'], - num_neg=5, - is_testset=True, - ), + train_data, train_catalog = generate_qa_match_catalog( + num_total=params['num_train'], + max_seq_len=params['max_seq_len'], + num_neg=5, + is_testset=False, + pre_init_generator=False, + ) + eval_data, eval_catalog = generate_qa_match_catalog( + num_total=params['num_train'], + max_seq_len=params['max_seq_len'], + num_neg=5, + is_testset=True, + pre_init_generator=False, + ) + train_catalog.extend(eval_catalog) + + fit( + user_model, + loss=loss, + train_data=train_data, + eval_data=eval_data, + catalog=train_catalog, epochs=params['epochs'], batch_size=params['batch_size'], ) - pt.save(model_path) + save(user_model, model_path) # load the checkpoint and ensure the dim user_model.load_state_dict(torch.load(model_path)) diff --git a/tests/unit/toydata/test_data_gen.py b/tests/unit/toydata/test_data_gen.py index 05e28ba20..1cec95db8 100644 --- a/tests/unit/toydata/test_data_gen.py +++ b/tests/unit/toydata/test_data_gen.py @@ -8,7 +8,7 @@ def test_qa_data_generator(): - for d in generate_qa_match(): + for d in generate_qa_match_catalog()[0]: assert d.tags['question'] assert d.tags['answer'] assert d.tags['wrong_answer'] @@ -16,31 +16,33 @@ def test_qa_data_generator(): def test_train_test_generator(): - fmdg_train = generate_fashion_match(is_testset=True) - fmdg_test = generate_fashion_match(is_testset=False) + fmdg_train, _ = generate_fashion_match_catalog(is_testset=True) + fmdg_test, _ = generate_fashion_match_catalog(is_testset=False) for d1, d2 in zip(fmdg_train, fmdg_test): assert np.any(np.not_equal(d1.blob, d2.blob)) break def test_train_test_qa_generator(): - fmdg_train = generate_qa_match(is_testset=True) - fmdg_test = generate_qa_match(is_testset=False) + fmdg_train = generate_qa_match_catalog(is_testset=True)[0] + fmdg_test = generate_qa_match_catalog(is_testset=False)[0] for d1, d2 in zip(fmdg_train, fmdg_test): assert d1.id != d2.id assert np.any(np.not_equal(d1.blob, d2.blob)) def test_doc_generator(): - for d in generate_fashion_match(): + for d in generate_fashion_match_catalog()[0]: assert d.tags['class'] break @pytest.mark.parametrize('channels', [0, 1, 3]) -@pytest.mark.parametrize('upsampling', [1, 2, 4]) +@pytest.mark.parametrize('upsampling', [1, 2]) def test_doc_generator_channel(channels, upsampling): - for d in generate_fashion_match(channels=channels, upsampling=upsampling): + for d in generate_fashion_match_catalog(channels=channels, upsampling=upsampling)[ + 0 + ]: if channels == 0: assert d.blob.ndim == 2 else: @@ -57,9 +59,9 @@ def test_doc_generator_channel(channels, upsampling): @pytest.mark.parametrize('pos_value, neg_value', [(1, 0), (1, -1)]) @pytest.mark.parametrize('num_pos, num_neg', [(5, 7), (10, 10)]) def test_fashion_matches_generator(num_pos, num_neg, pos_value, neg_value): - for d in generate_fashion_match( + for d in generate_fashion_match_catalog( num_pos=num_pos, num_neg=num_neg, pos_value=pos_value, neg_value=neg_value - ): + )[0]: assert len(d.matches) == num_pos + num_neg all_labels = [int(d.tags[__default_tag_key__]['label']) for d in d.matches] assert all_labels.count(pos_value) == num_pos @@ -73,13 +75,17 @@ def test_fashion_matches_generator(num_pos, num_neg, pos_value, neg_value): def test_fashion_documentarray(): - da = DocumentArray(generate_fashion_match(num_total=10, num_pos=2, num_neg=3)) + da = DocumentArray( + generate_fashion_match_catalog( + num_total=10, num_catalog=1000, num_pos=2, num_neg=3 + )[0] + ) assert len(da) == 10 assert len(da[0].matches) == 5 def test_qa_documentarray(): - da = DocumentArray(generate_qa_match(num_total=10, num_neg=3)) + da = DocumentArray(generate_qa_match_catalog(num_total=10, num_neg=3)[0]) assert len(da) == 10 assert len(da[0].matches) == 4 @@ -88,9 +94,9 @@ def test_qa_documentarray(): @pytest.mark.parametrize('num_neg', [1, 2, 10]) @pytest.mark.parametrize('to_ndarray', [True, False]) def test_generate_qa_doc_match(pos_value, neg_value, num_neg, to_ndarray): - for d in generate_qa_match( + for d in generate_qa_match_catalog( num_neg=num_neg, pos_value=pos_value, neg_value=neg_value, to_ndarray=to_ndarray - ): + )[0]: assert len(d.matches) == 1 + num_neg all_labels = [int(d.tags[__default_tag_key__]['label']) for d in d.matches] assert all_labels.count(pos_value) == 1 @@ -105,7 +111,7 @@ def test_generate_qa_doc_match(pos_value, neg_value, num_neg, to_ndarray): @pytest.mark.parametrize('max_length', [1, 10, 100]) def test_qa_sequence_same_length(max_length): num_neg = 5 - for s in generate_qa_match(num_neg=num_neg, max_seq_len=max_length): + for s in generate_qa_match_catalog(num_neg=num_neg, max_seq_len=max_length)[0]: assert s.blob.shape[0] == max_length assert len(s.matches) == num_neg + 1 for m in s.matches: diff --git a/tests/unit/toydata/test_dataset.py b/tests/unit/toydata/test_dataset.py index 5cea052c3..462abdd8b 100644 --- a/tests/unit/toydata/test_dataset.py +++ b/tests/unit/toydata/test_dataset.py @@ -1,23 +1,26 @@ import numpy as np import pytest -from finetuner.toydata import generate_fashion_match +from finetuner.toydata import generate_fashion_match_catalog from finetuner.tuner.base import BaseDataset from finetuner.tuner.dataset import SiameseMixin, TripletMixin @pytest.mark.parametrize( - 'data_src', - [ - generate_fashion_match(num_pos=10, num_neg=10, num_total=100), - lambda: generate_fashion_match(num_pos=10, num_neg=10, num_total=100), - ], + 'pre_init_generator', + [True, False], ) -def test_siamese_dataset(data_src): +def test_siamese_dataset(pre_init_generator): class SD(SiameseMixin, BaseDataset): ... - sd = SD(data_src) + data, catalog = generate_fashion_match_catalog( + num_pos=10, + num_neg=10, + num_total=100, + pre_init_generator=pre_init_generator, + ) + sd = SD(data, catalog) for d in sd: assert len(d) == 2 assert len(d[0]) == 2 @@ -28,17 +31,21 @@ class SD(SiameseMixin, BaseDataset): @pytest.mark.parametrize( - 'data_src', - [ - generate_fashion_match(num_pos=10, num_neg=10, num_total=100), - lambda: generate_fashion_match(num_pos=10, num_neg=10, num_total=100), - ], + 'pre_init_generator', + [True, False], ) -def test_triplet_dataset(data_src): +def test_triplet_dataset(pre_init_generator): class SD(TripletMixin, BaseDataset): ... - sd = SD(data_src) + data, catalog = generate_fashion_match_catalog( + num_pos=10, + num_neg=10, + num_total=100, + pre_init_generator=pre_init_generator, + ) + + sd = SD(data, catalog) for d in sd: assert len(d) == 2 assert len(d[0]) == 3 diff --git a/tests/unit/tuner/keras/test_gpu.py b/tests/unit/tuner/keras/test_gpu.py index a7a418e69..b200c35cf 100644 --- a/tests/unit/tuner/keras/test_gpu.py +++ b/tests/unit/tuner/keras/test_gpu.py @@ -1,6 +1,6 @@ import pytest import tensorflow as tf - +from jina import DocumentArray from finetuner.tuner.keras import KerasTuner all_test_losses = [ @@ -24,10 +24,11 @@ def tf_gpu_config(): @pytest.mark.parametrize('loss', all_test_losses) def test_gpu_keras(generate_random_triplets, loss, caplog): data = generate_random_triplets(4, 4) + catalog = DocumentArray(data.traverse_flat(['m'])) embed_model = tf.keras.models.Sequential() embed_model.add(tf.keras.layers.InputLayer(input_shape=(4,))) embed_model.add(tf.keras.layers.Dense(4)) - tuner = KerasTuner(embed_model, loss) + tuner = KerasTuner(embed_model, catalog, loss) tuner.fit(data, data, epochs=2, batch_size=4, device='cuda') diff --git a/tests/unit/tuner/paddle/test_gpu.py b/tests/unit/tuner/paddle/test_gpu.py index f3a347c5b..2931000d2 100644 --- a/tests/unit/tuner/paddle/test_gpu.py +++ b/tests/unit/tuner/paddle/test_gpu.py @@ -1,6 +1,6 @@ import pytest import paddle.nn as nn - +from jina import DocumentArray from finetuner.tuner.paddle import PaddleTuner all_test_losses = [ @@ -16,11 +16,13 @@ def test_gpu_paddle(generate_random_triplets, loss): data = generate_random_triplets(4, 4) + catalog = DocumentArray(data.traverse_flat(['m'])) + embed_model = nn.Sequential( nn.Linear(in_features=4, out_features=4), ) - tuner = PaddleTuner(embed_model, loss=loss) + tuner = PaddleTuner(embed_model, catalog=catalog, loss=loss) tuner.fit(data, data, epochs=2, batch_size=4, device='cuda') diff --git a/tests/unit/tuner/torch/test_gpu.py b/tests/unit/tuner/torch/test_gpu.py index 13d802e15..0927e7739 100644 --- a/tests/unit/tuner/torch/test_gpu.py +++ b/tests/unit/tuner/torch/test_gpu.py @@ -1,6 +1,6 @@ import pytest import torch - +from jina import DocumentArray from finetuner.tuner.pytorch import PytorchTuner @@ -17,11 +17,13 @@ def test_gpu_pytorch(generate_random_triplets, loss): data = generate_random_triplets(4, 4) + catalog = DocumentArray(data.traverse_flat(['m'])) + embed_model = torch.nn.Sequential( torch.nn.Linear(in_features=4, out_features=4), ) - tuner = PytorchTuner(embed_model, loss) + tuner = PytorchTuner(embed_model, catalog, loss) # Run quick training - mainly makes sure no errors appear, and that the model # is moved to GPU