From 6fd3e1ea63ced61c32d1a12e4cb2d4ba76a8d870 Mon Sep 17 00:00:00 2001
From: Tadej Svetina
Date: Tue, 19 Oct 2021 16:20:22 +0200
Subject: [PATCH] docs(tuner): add docstrings (#148)

* docs(tuner): add docstrings

* fix: typo

* fix: apply suggestion

Co-authored-by: Wang Bo

Co-authored-by: Wang Bo
---
 finetuner/tuner/__init__.py         | 36 ++++++++++++
 finetuner/tuner/base.py             |  7 +++
 finetuner/tuner/keras/__init__.py   | 46 ++++++++++++++++
 finetuner/tuner/keras/losses.py     | 85 +++++++++++++++++++++++++++--
 finetuner/tuner/paddle/__init__.py  | 45 +++++++++++++++
 finetuner/tuner/paddle/losses.py    | 76 ++++++++++++++++++++++++++
 finetuner/tuner/pytorch/__init__.py | 45 +++++++++++++++
 finetuner/tuner/pytorch/losses.py   | 76 ++++++++++++++++++++++++++
 8 files changed, 411 insertions(+), 5 deletions(-)

diff --git a/finetuner/tuner/__init__.py b/finetuner/tuner/__init__.py
index d5ef27dc9..af1bb7684 100644
--- a/finetuner/tuner/__init__.py
+++ b/finetuner/tuner/__init__.py
@@ -38,6 +38,34 @@ def fit(
     device: str = 'cpu',
     **kwargs,
 ) -> TunerReturnType:
+    """Finetune the model on the training data.
+
+    :param train_data: Data on which to train the model
+    :param eval_data: Data on which to evaluate the model at the end of each epoch
+    :param epoch: Number of epochs to train the model
+    :param batch_size: The batch size to use for training and evaluation
+    :param learning_rate: Learning rate to use in training
+    :param optimizer: Which optimizer to use in training. Supported
+        values/optimizers are:
+        - ``"adam"`` for the Adam optimizer
+        - ``"rmsprop"`` for the RMSProp optimizer
+        - ``"sgd"`` for the SGD optimizer with momentum
+    :param optimizer_kwargs: Keyword arguments to pass to the optimizer. The
+        supported arguments, together with their default values, are:
+        - ``"adam"``: ``{'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-08}``
+        - ``"rmsprop"``::
+
+            {
+                'rho': 0.99,
+                'momentum': 0.0,
+                'epsilon': 1e-08,
+                'centered': False,
+            }
+
+        - ``"sgd"``: ``{'momentum': 0.0, 'nesterov': False}``
+    :param device: The device to which to move the model. Supported options are
+        ``"cpu"`` and ``"cuda"`` (for GPU)
+    """
     ft = get_tuner_class(embed_model)
     if catalog is None:
         train_data = DocumentArray(train_data() if callable(train_data) else train_data)
@@ -60,6 +88,14 @@ def fit(


 def save(embed_model: AnyDNN, model_path: str, *args, **kwargs) -> None:
+    """Save the embedding model.
+
+    :param embed_model: The embedding model to save
+    :param model_path: Path to file/folder where to save the model
+    :param args: Arguments to pass to framework-specific tuner's ``save`` method
+    :param kwargs: Keyword arguments to pass to framework-specific tuner's ``save``
+        method
+    """
     ft = get_tuner_class(embed_model)

     ft(embed_model).save(model_path, *args, **kwargs)
diff --git a/finetuner/tuner/base.py b/finetuner/tuner/base.py
index 5e4058cbb..d141b1915 100644
--- a/finetuner/tuner/base.py
+++ b/finetuner/tuner/base.py
@@ -27,6 +27,13 @@ def __init__(
         loss: Union[AnyDNN, str] = 'CosineSiameseLoss',
         **kwargs,
     ):
+        """Create the tuner instance.
+
+        :param embed_model: Model that produces embeddings from inputs
+        :param loss: Either the loss object instance, or the name of the loss function.
+            Currently available losses are ``CosineSiameseLoss``,
+            ``EuclideanSiameseLoss``, ``EuclideanTripletLoss`` and ``CosineTripletLoss``
+        """
         self._embed_model = embed_model
         self._loss = self._get_loss(loss)
         self._train_data_len = 0
diff --git a/finetuner/tuner/keras/__init__.py b/finetuner/tuner/keras/__init__.py
index 06d326a03..8394731e8 100644
--- a/finetuner/tuner/keras/__init__.py
+++ b/finetuner/tuner/keras/__init__.py
@@ -16,12 +16,15 @@ class KerasTuner(BaseTuner):
     def _get_loss(self, loss: Union[BaseLoss, str]):
+        """Get the loss layer."""
+
         if isinstance(loss, str):
             return getattr(losses, loss)()
         elif isinstance(loss, BaseLoss):
             return loss

     def _get_data_loader(self, inputs, batch_size: int, shuffle: bool):
+        """Get a tensorflow ``Dataset`` from the input data."""
+
         ds = get_dataset(datasets, self.arity)
         input_shape = self.embed_model.input_shape[1:]

@@ -45,6 +48,8 @@ def _get_data_loader(self, inputs, batch_size: int, shuffle: bool):
     def _get_optimizer(
         self, optimizer: str, optimizer_kwargs: Optional[dict], learning_rate: float
     ) -> Optimizer:
+        """Get the optimizer for training."""
+
         optimizer_kwargs = self._get_optimizer_kwargs(optimizer, optimizer_kwargs)

         if optimizer == 'adam':
@@ -59,6 +64,8 @@ def _get_optimizer(
         return keras.optimizers.SGD(learning_rate=learning_rate, **optimizer_kwargs)

     def _train(self, data, optimizer, description: str):
+        """Train the model on given labeled data"""
+
         losses = []

         log_generator = LogGenerator('T', losses)
@@ -88,6 +95,7 @@ def _train(self, data, optimizer, description: str):
         return losses

     def _eval(self, data, description: str = 'Evaluating', train_log: str = ''):
+        """Evaluate the model on given labeled data"""

         losses = []

@@ -120,6 +128,34 @@ def fit(
         device: str = 'cpu',
         **kwargs,
     ) -> TunerStats:
+        """Finetune the model on the training data.
+
+        :param train_data: Data on which to train the model
+        :param eval_data: Data on which to evaluate the model at the end of each epoch
+        :param epoch: Number of epochs to train the model
+        :param batch_size: The batch size to use for training and evaluation
+        :param learning_rate: Learning rate to use in training
+        :param optimizer: Which optimizer to use in training. Supported
+            values/optimizers are:
+            - ``"adam"`` for the Adam optimizer
+            - ``"rmsprop"`` for the RMSProp optimizer
+            - ``"sgd"`` for the SGD optimizer with momentum
+        :param optimizer_kwargs: Keyword arguments to pass to the optimizer. The
+            supported arguments, together with their default values, are:
+            - ``"adam"``: ``{'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-08}``
+            - ``"rmsprop"``::
+
+                {
+                    'rho': 0.99,
+                    'momentum': 0.0,
+                    'epsilon': 1e-08,
+                    'centered': False,
+                }
+
+            - ``"sgd"``: ``{'momentum': 0.0, 'nesterov': False}``
+        :param device: The device to which to move the model. Supported options are
+            ``"cpu"`` and ``"cuda"`` (for GPU)
+        """
         _train_data = self._get_data_loader(
             inputs=train_data, batch_size=batch_size, shuffle=False
         )
@@ -167,4 +203,14 @@ def get_embeddings(self, data: DocumentArrayLike):
             doc.embedding = np.array(embed)

     def save(self, *args, **kwargs):
+        """Save the embedding model.
+
+        You need to pass the path where to save the model in either ``args`` or
+        ``kwargs`` (for ``filepath`` key).
+
+        :param args: Arguments to pass to ``save`` method of the embedding model
+        :param kwargs: Keyword arguments to pass to ``save`` method of the embedding
+            model
+        """
+
         self.embed_model.save(*args, **kwargs)
diff --git a/finetuner/tuner/keras/losses.py b/finetuner/tuner/keras/losses.py
index fa04a2534..f37e2d5de 100644
--- a/finetuner/tuner/keras/losses.py
+++ b/finetuner/tuner/keras/losses.py
@@ -5,9 +5,27 @@
 class CosineSiameseLoss(BaseLoss, Layer):
+    """Computes the loss for a siamese network using cosine distance.
+
+    The loss for a pair of objects equals ``(target - cos_sim)^2``, where ``target``
+    should equal 1 when both objects belong to the same class, and -1 when they
+    belong to different classes. The ``cos_sim`` represents the cosine similarity
+    between both objects.
+
+    The final loss is the average over losses for all pairs of objects in the batch.
+    """
+
     arity = 2

     def call(self, inputs, **kwargs):
+        """Compute the loss.
+
+        :param inputs: Should be a list or a tuple containing three tensors:
+            - ``[N, D]`` tensor of embeddings of the first objects of the pair
+            - ``[N, D]`` tensor of embeddings of the second objects of the pair
+            - ``[N, ]`` tensor of target values
+        """
+
         l_emb, r_emb, target = inputs
         normalize_a = tf.nn.l2_normalize(l_emb, axis=-1)
         normalize_b = tf.nn.l2_normalize(r_emb, axis=-1)
@@ -16,6 +34,22 @@
 class EuclideanSiameseLoss(BaseLoss, Layer):
+    """Computes the loss for a siamese network using euclidean distance.
+
+    This loss is also known as contrastive loss.
+
+    The loss being optimized equals::
+
+        [is_sim * dist + (1 - is_sim) * max(margin - dist, 0)]^2
+
+    where ``target`` should equal 1 when both objects belong to the same class,
+    and 0 otherwise. The ``dist`` is the euclidean distance between the embeddings of
+    the objects, and ``margin`` is the distance beyond which dissimilar pairs stop
+    contributing to the loss.
+
+    The final loss is the average over losses for all pairs of objects in the batch.
+    """
+
     arity = 2

     def __init__(self, margin: float = 1.0):
@@ -23,6 +57,13 @@ def __init__(self, margin: float = 1.0):
         self.margin = margin

     def call(self, inputs, **kwargs):
+        """Compute the loss.
+
+        :param inputs: Should be a list or a tuple containing three tensors:
+            - ``[N, D]`` tensor of embeddings of the first objects of the pair
+            - ``[N, D]`` tensor of embeddings of the second objects of the pair
+            - ``[N, ]`` tensor of target values
+        """
         l_emb, r_emb, target = inputs
         eucl_dist = tf.reduce_sum(tf.math.squared_difference(l_emb, r_emb), axis=-1)
         is_similar = tf.cast(target > 0, tf.float32)
@@ -35,6 +76,17 @@ def call(self, inputs, **kwargs):
 class EuclideanTripletLoss(BaseLoss, Layer):
+    """Compute the loss for a triplet network using euclidean distance.
+
+    The loss is computed as ``max(dist_pos - dist_neg + margin, 0)``, where ``dist_pos``
+    is the euclidean distance between the anchor embedding and positive embedding,
+    ``dist_neg`` is the euclidean distance between the anchor and negative embedding,
+    and ``margin`` is the desired wedge between the anchor-negative and
+    anchor-positive distances.
+
+    The final loss is the average over losses for all triplets in the batch.
+    """
+
     arity = 3

     def __init__(self, margin: float = 1.0, **kwargs):
@@ -42,6 +94,13 @@ def __init__(self, margin: float = 1.0, **kwargs):
         self._margin = margin

     def call(self, inputs, **kwargs):
+        """Compute the loss.
+
+        :param inputs: Should be a list or a tuple containing three tensors:
+            - ``[N, D]`` tensor of embeddings of the anchor objects
+            - ``[N, D]`` tensor of embeddings of the positive objects
+            - ``[N, D]`` tensor of embeddings of the negative objects
+        """
         anchor, positive, negative, _ = inputs

         # Seems that tf.norm suffers from numeric instability as explained here
@@ -49,15 +108,24 @@ def call(self, inputs, **kwargs):
         dist_pos = tf.reduce_sum(tf.math.squared_difference(anchor, positive), axis=-1)
         dist_neg = tf.reduce_sum(tf.math.squared_difference(anchor, negative), axis=-1)

-        dist_pos = tf.maximum(dist_pos, 1e-9)
-        dist_neg = tf.maximum(dist_neg, 1e-9)
+        dist_pos = tf.sqrt(tf.maximum(dist_pos, 1e-9))
+        dist_neg = tf.sqrt(tf.maximum(dist_neg, 1e-9))

-        return tf.reduce_mean(
-            tf.nn.relu(tf.sqrt(dist_pos) - tf.sqrt(dist_neg) + self._margin)
-        )
+        return tf.reduce_mean(tf.nn.relu(dist_pos - dist_neg + self._margin))


 class CosineTripletLoss(BaseLoss, Layer):
+    """Compute the loss for a triplet network using cosine distance.
+
+    The loss is computed as ``max(dist_pos - dist_neg + margin, 0)``, where ``dist_pos``
+    is the cosine distance between the anchor embedding and positive embedding,
+    ``dist_neg`` is the cosine distance between the anchor and negative embedding, and
+    ``margin`` is the desired wedge between the anchor-negative and anchor-positive
+    distances.
+
+    The final loss is the average over losses for all triplets in the batch.
+    """
+
     arity = 3

     def __init__(self, margin: float = 1.0):
@@ -65,6 +133,13 @@ def __init__(self, margin: float = 1.0):
         self._margin = margin

     def call(self, inputs, **kwargs):
+        """Compute the loss.
+
+        :param inputs: Should be a list or a tuple containing three tensors:
+            - ``[N, D]`` tensor of embeddings of the anchor objects
+            - ``[N, D]`` tensor of embeddings of the positive objects
+            - ``[N, D]`` tensor of embeddings of the negative objects
+        """
         anchor, positive, negative, _ = inputs

         # Seems that tf.norm suffers from numeric instability as explained here
diff --git a/finetuner/tuner/paddle/__init__.py b/finetuner/tuner/paddle/__init__.py
index 2a7500c11..0632a75e9 100644
--- a/finetuner/tuner/paddle/__init__.py
+++ b/finetuner/tuner/paddle/__init__.py
@@ -16,12 +16,16 @@ class PaddleTuner(BaseTuner):
     def _get_loss(self, loss: Union[BaseLoss, str]):
+        """Get the loss layer."""
+
         if isinstance(loss, str):
             return getattr(losses, loss)()
         elif isinstance(loss, BaseLoss):
             return loss

     def _get_data_loader(self, inputs, batch_size: int, shuffle: bool):
+        """Get the paddle ``DataLoader`` from the input data."""
""" + ds = get_dataset(datasets, self.arity) return DataLoader( dataset=ds(inputs=inputs, catalog=self._catalog), @@ -32,6 +36,8 @@ def _get_data_loader(self, inputs, batch_size: int, shuffle: bool): def _get_optimizer( self, optimizer: str, optimizer_kwargs: Optional[dict], learning_rate: float ) -> Optimizer: + """Get the optimizer for training.""" + params = self._embed_model.parameters() optimizer_kwargs = self._get_optimizer_kwargs(optimizer, optimizer_kwargs) @@ -56,6 +62,8 @@ def _get_optimizer( ) def _eval(self, data, description: str = 'Evaluating', train_log: str = ''): + """Evaluate the model on given labeled data""" + self._embed_model.eval() losses = [] @@ -78,6 +86,7 @@ def _eval(self, data, description: str = 'Evaluating', train_log: str = ''): return losses def _train(self, data, optimizer: Optimizer, description: str): + """Train the model on given labeled data""" self._embed_model.train() @@ -119,6 +128,34 @@ def fit( device: str = 'cpu', **kwargs, ) -> TunerStats: + """Finetune the model on the training data. + + :param train_data: Data on which to train the model + :param eval_data: Data on which to evaluate the model at the end of each epoch + :param epoch: Number of epochs to train the model + :param batch_size: The batch size to use for training and evaluation + :param learning_rate: Learning rate to use in training + :param optimizer: Which optimizer to use in training. Supported + values/optimizers are: + - ``"adam"`` for the Adam optimizer + - ``"rmsprop"`` for the RMSProp optimizer + - ``"sgd"`` for the SGD optimizer with momentum + :param optimizer_kwargs: Keyword arguments to pass to the optimizer. The + supported arguments, togethere with their defailt values, are: + - ``"adam"``: ``{'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-08}`` + - ``"rmsprop"``:: + + { + 'rho': 0.99, + 'momentum': 0.0, + 'epsilon': 1e-08, + 'centered': False, + } + + - ``"sgd"``: ``{'momentum': 0.0, 'nesterov': False}`` + :param device: The device to which to move the model. Supported options are + ``"cpu"`` and ``"cuda"`` (for GPU) + """ if device == 'cuda': paddle.set_device('gpu:0') @@ -161,4 +198,12 @@ def get_embeddings(self, data: DocumentArrayLike): doc.embedding = np.array(embed) def save(self, *args, **kwargs): + """Save the embedding model. + + You need to pass the path where to save the model in either ``args`` or + ``kwargs`` (for ``path`` key). + + :param args: Arguments to pass to ``paddle.save`` function + :param kwargs: Keyword arguments to pass to ``paddle.save`` function + """ paddle.save(self.embed_model.state_dict(), *args, **kwargs) diff --git a/finetuner/tuner/paddle/losses.py b/finetuner/tuner/paddle/losses.py index 8c48a4569..30cab642c 100644 --- a/finetuner/tuner/paddle/losses.py +++ b/finetuner/tuner/paddle/losses.py @@ -8,11 +8,28 @@ class CosineSiameseLoss(BaseLoss, nn.Layer): + """Computes the loss for a siamese network using cosine distance. + + The loss for a pair of objects equals ``(target - cos_sim)^2``, where ``target`` + should equal 1 when both objects belong to the same class, and to -1 when they + belong to different classes. The ``cos_sim`` represents the cosime similarity + between both objects. + + The final loss is the average over losses for all pairs of objects in the batch. + """ + arity = 2 def forward( self, embeddings: List[paddle.Tensor], target: paddle.Tensor ) -> paddle.Tensor: + """Compute the loss. 
+
+        :param embeddings: Should be a list or a tuple containing two tensors:
+            - ``[N, D]`` tensor of embeddings of the first objects of the pair
+            - ``[N, D]`` tensor of embeddings of the second objects of the pair
+        :param target: A ``[N, ]`` tensor of target values
+        """
         l_emb, r_emb = embeddings
         cos_sim = F.cosine_similarity(l_emb, r_emb)
         loss = F.mse_loss(cos_sim, target)
@@ -20,6 +37,22 @@ def forward(
 class EuclideanSiameseLoss(BaseLoss, nn.Layer):
+    """Computes the loss for a siamese network using euclidean distance.
+
+    This loss is also known as contrastive loss.
+
+    The loss being optimized equals::
+
+        [is_sim * dist + (1 - is_sim) * max(margin - dist, 0)]^2
+
+    where ``target`` should equal 1 when both objects belong to the same class,
+    and 0 otherwise. The ``dist`` is the euclidean distance between the embeddings of
+    the objects, and ``margin`` is the distance beyond which dissimilar pairs stop
+    contributing to the loss.
+
+    The final loss is the average over losses for all pairs of objects in the batch.
+    """
+
     arity = 2

     def __init__(self, margin: float = 1.0):
@@ -30,6 +63,13 @@ def __init__(self, margin: float = 1.0):
     def forward(
         self, embeddings: List[paddle.Tensor], target: paddle.Tensor
     ) -> paddle.Tensor:
+        """Compute the loss.
+
+        :param embeddings: Should be a list or a tuple containing two tensors:
+            - ``[N, D]`` tensor of embeddings of the first objects of the pair
+            - ``[N, D]`` tensor of embeddings of the second objects of the pair
+        :param target: A ``[N, ]`` tensor of target values
+        """
         l_emb, r_emb = embeddings
         eucl_dist = self._dist(l_emb, r_emb)
         is_similar = paddle.cast(target > 0, paddle.float32)
@@ -41,6 +81,17 @@ def forward(
 class EuclideanTripletLoss(BaseLoss, nn.Layer):
+    """Compute the loss for a triplet network using euclidean distance.
+
+    The loss is computed as ``max(dist_pos - dist_neg + margin, 0)``, where ``dist_pos``
+    is the euclidean distance between the anchor embedding and positive embedding,
+    ``dist_neg`` is the euclidean distance between the anchor and negative embedding,
+    and ``margin`` is the desired wedge between the anchor-negative and
+    anchor-positive distances.
+
+    The final loss is the average over losses for all triplets in the batch.
+    """
+
     arity = 3

     def __init__(self, margin: float = 1.0):
@@ -51,6 +102,13 @@ def __init__(self, margin: float = 1.0):
     def forward(
         self, embeddings: List[paddle.Tensor], target: paddle.Tensor
     ) -> paddle.Tensor:
+        """Compute the loss.
+
+        :param embeddings: Should be a list or a tuple containing three tensors:
+            - ``[N, D]`` tensor of embeddings of the anchor objects
+            - ``[N, D]`` tensor of embeddings of the positive objects
+            - ``[N, D]`` tensor of embeddings of the negative objects
+        """
         anchor, positive, negative = embeddings
         dist_pos = self._dist(anchor, positive)
         dist_neg = self._dist(anchor, negative)
@@ -59,6 +117,17 @@ def forward(
 class CosineTripletLoss(BaseLoss, nn.Layer):
+    """Compute the loss for a triplet network using cosine distance.
+
+    The loss is computed as ``max(dist_pos - dist_neg + margin, 0)``, where ``dist_pos``
+    is the cosine distance between the anchor embedding and positive embedding,
+    ``dist_neg`` is the cosine distance between the anchor and negative embedding, and
+    ``margin`` is the desired wedge between the anchor-negative and anchor-positive
+    distances.
+
+    The final loss is the average over losses for all triplets in the batch.
+    """
+
     arity = 3

     def __init__(self, margin: float = 1.0):
@@ -68,6 +137,13 @@ def __init__(self, margin: float = 1.0):
     def forward(
         self, embeddings: List[paddle.Tensor], target: paddle.Tensor
     ) -> paddle.Tensor:
+        """Compute the loss.
+
+        :param embeddings: Should be a list or a tuple containing three tensors:
+            - ``[N, D]`` tensor of embeddings of the anchor objects
+            - ``[N, D]`` tensor of embeddings of the positive objects
+            - ``[N, D]`` tensor of embeddings of the negative objects
+        """
         anchor, positive, negative = embeddings
         dist_pos = 1 - F.cosine_similarity(anchor, positive)
         dist_neg = 1 - F.cosine_similarity(anchor, negative)
diff --git a/finetuner/tuner/pytorch/__init__.py b/finetuner/tuner/pytorch/__init__.py
index a514b2092..c85011c66 100644
--- a/finetuner/tuner/pytorch/__init__.py
+++ b/finetuner/tuner/pytorch/__init__.py
@@ -15,12 +15,16 @@ class PytorchTuner(BaseTuner):
     def _get_loss(self, loss: Union[BaseLoss, str]):
+        """Get the loss layer."""
+
         if isinstance(loss, str):
             return getattr(losses, loss)()
         elif isinstance(loss, BaseLoss):
             return loss

     def _get_data_loader(self, inputs, batch_size: int, shuffle: bool):
+        """Get the pytorch ``DataLoader`` from the input data."""
+
         ds = get_dataset(datasets, self.arity)
         return DataLoader(
             dataset=ds(inputs=inputs, catalog=self._catalog),
@@ -31,6 +35,8 @@ def _get_data_loader(self, inputs, batch_size: int, shuffle: bool):
     def _get_optimizer(
         self, optimizer: str, optimizer_kwargs: Optional[dict], learning_rate: float
     ) -> Optimizer:
+        """Get the optimizer for training."""
+
         params = self._embed_model.parameters()
         optimizer_kwargs = self._get_optimizer_kwargs(optimizer, optimizer_kwargs)

@@ -59,6 +65,8 @@ def _get_optimizer(
         )

     def _eval(self, data, description: str = 'Evaluating', train_log: str = ''):
+        """Evaluate the model on given labeled data"""
+
         self._embed_model.eval()

         losses = []
@@ -85,6 +93,7 @@ def _eval(self, data, description: str = 'Evaluating', train_log: str = ''):
         return losses

     def _train(self, data, optimizer: Optimizer, description: str):
+        """Train the model on given labeled data"""

         self._embed_model.train()

@@ -129,6 +138,34 @@ def fit(
         device: str = 'cpu',
         **kwargs,
     ) -> TunerStats:
+        """Finetune the model on the training data.
+
+        :param train_data: Data on which to train the model
+        :param eval_data: Data on which to evaluate the model at the end of each epoch
+        :param epoch: Number of epochs to train the model
+        :param batch_size: The batch size to use for training and evaluation
+        :param learning_rate: Learning rate to use in training
+        :param optimizer: Which optimizer to use in training. Supported
+            values/optimizers are:
+            - ``"adam"`` for the Adam optimizer
+            - ``"rmsprop"`` for the RMSProp optimizer
+            - ``"sgd"`` for the SGD optimizer with momentum
+        :param optimizer_kwargs: Keyword arguments to pass to the optimizer. The
+            supported arguments, together with their default values, are:
+            - ``"adam"``: ``{'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-08}``
+            - ``"rmsprop"``::
+
+                {
+                    'rho': 0.99,
+                    'momentum': 0.0,
+                    'epsilon': 1e-08,
+                    'centered': False,
+                }
+
+            - ``"sgd"``: ``{'momentum': 0.0, 'nesterov': False}``
+        :param device: The device to which to move the model. Supported options are
+            ``"cpu"`` and ``"cuda"`` (for GPU)
+        """
         if device == 'cpu':
             self.device = torch.device('cpu')
         elif device == 'cuda':
@@ -177,4 +214,12 @@ def get_embeddings(self, data: DocumentArrayLike):
             doc.embedding = embed.cpu().numpy()

     def save(self, *args, **kwargs):
+        """Save the embedding model.
+
+        You need to pass the path where to save the model in either ``args`` or
+        ``kwargs`` (for ``f`` key).
+
+        :param args: Arguments to pass to ``torch.save`` function
+        :param kwargs: Keyword arguments to pass to ``torch.save`` function
+        """
         torch.save(self.embed_model.state_dict(), *args, **kwargs)
diff --git a/finetuner/tuner/pytorch/losses.py b/finetuner/tuner/pytorch/losses.py
index 2eb5469b5..c8ee3e2a4 100644
--- a/finetuner/tuner/pytorch/losses.py
+++ b/finetuner/tuner/pytorch/losses.py
@@ -8,11 +8,28 @@
 class CosineSiameseLoss(BaseLoss, nn.Module):
+    """Computes the loss for a siamese network using cosine distance.
+
+    The loss for a pair of objects equals ``(target - cos_sim)^2``, where ``target``
+    should equal 1 when both objects belong to the same class, and -1 when they
+    belong to different classes. The ``cos_sim`` represents the cosine similarity
+    between both objects.
+
+    The final loss is the average over losses for all pairs of objects in the batch.
+    """
+
     arity = 2

     def forward(
         self, embeddings: List[torch.Tensor], target: torch.Tensor
     ) -> torch.Tensor:
+        """Compute the loss.
+
+        :param embeddings: Should be a list or a tuple containing two tensors:
+            - ``[N, D]`` tensor of embeddings of the first objects of the pair
+            - ``[N, D]`` tensor of embeddings of the second objects of the pair
+        :param target: A ``[N, ]`` tensor of target values
+        """
         l_emb, r_emb = embeddings
         cos_sim = F.cosine_similarity(l_emb, r_emb)
         loss = F.mse_loss(cos_sim, target)
@@ -20,6 +37,22 @@ def forward(
 class EuclideanSiameseLoss(BaseLoss, nn.Module):
+    """Computes the loss for a siamese network using euclidean distance.
+
+    This loss is also known as contrastive loss.
+
+    The loss being optimized equals::
+
+        [is_sim * dist + (1 - is_sim) * max(margin - dist, 0)]^2
+
+    where ``target`` should equal 1 when both objects belong to the same class,
+    and 0 otherwise. The ``dist`` is the euclidean distance between the embeddings of
+    the objects, and ``margin`` is the distance beyond which dissimilar pairs stop
+    contributing to the loss.
+
+    The final loss is the average over losses for all pairs of objects in the batch.
+    """
+
     arity = 2

     def __init__(self, margin: float = 1.0):
@@ -29,6 +62,13 @@ def __init__(self, margin: float = 1.0):
     def forward(
         self, embeddings: List[torch.Tensor], target: torch.Tensor
     ) -> torch.Tensor:
+        """Compute the loss.
+
+        :param embeddings: Should be a list or a tuple containing two tensors:
+            - ``[N, D]`` tensor of embeddings of the first objects of the pair
+            - ``[N, D]`` tensor of embeddings of the second objects of the pair
+        :param target: A ``[N, ]`` tensor of target values
+        """
         l_emb, r_emb = embeddings
         eucl_dist = F.pairwise_distance(l_emb, r_emb, p=2)
         is_similar = (target > 0).to(torch.float32)
@@ -40,6 +80,17 @@ def forward(
 class EuclideanTripletLoss(BaseLoss, nn.Module):
+    """Compute the loss for a triplet network using euclidean distance.
+
+    The loss is computed as ``max(dist_pos - dist_neg + margin, 0)``, where ``dist_pos``
+    is the euclidean distance between the anchor embedding and positive embedding,
+    ``dist_neg`` is the euclidean distance between the anchor and negative embedding,
+    and ``margin`` is the desired wedge between the anchor-negative and
+    anchor-positive distances.
+
+    The final loss is the average over losses for all triplets in the batch.
+    """
+
     arity = 3

     def __init__(self, margin: float = 1.0):
@@ -49,6 +100,13 @@ def __init__(self, margin: float = 1.0):
     def forward(
         self, embeddings: List[torch.Tensor], target: torch.Tensor
     ) -> torch.Tensor:
+        """Compute the loss.
+
+        :param embeddings: Should be a list or a tuple containing three tensors:
+            - ``[N, D]`` tensor of embeddings of the anchor objects
+            - ``[N, D]`` tensor of embeddings of the positive objects
+            - ``[N, D]`` tensor of embeddings of the negative objects
+        """
         anchor, positive, negative = embeddings
         dist_pos = F.pairwise_distance(anchor, positive, p=2)
         dist_neg = F.pairwise_distance(anchor, negative, p=2)
@@ -57,9 +115,27 @@ def forward(
 class CosineTripletLoss(EuclideanTripletLoss):
+    """Compute the loss for a triplet network using cosine distance.
+
+    The loss is computed as ``max(dist_pos - dist_neg + margin, 0)``, where ``dist_pos``
+    is the cosine distance between the anchor embedding and positive embedding,
+    ``dist_neg`` is the cosine distance between the anchor and negative embedding, and
+    ``margin`` is the desired wedge between the anchor-negative and anchor-positive
+    distances.
+
+    The final loss is the average over losses for all triplets in the batch.
+    """
+
     def forward(
         self, embeddings: List[torch.Tensor], target: torch.Tensor
     ) -> torch.Tensor:
+        """Compute the loss.
+
+        :param embeddings: Should be a list or a tuple containing three tensors:
+            - ``[N, D]`` tensor of embeddings of the anchor objects
+            - ``[N, D]`` tensor of embeddings of the positive objects
+            - ``[N, D]`` tensor of embeddings of the negative objects
+        """
         anchor, positive, negative = embeddings
         dist_pos = 1 - F.cosine_similarity(anchor, positive)
         dist_neg = 1 - F.cosine_similarity(anchor, negative)
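The triplet losses documented above all reduce to the same scalar formula, ``max(dist_pos - dist_neg + margin, 0)`` averaged over the batch. A minimal NumPy sketch of that computation follows; it is illustrative only, the function and variable names are assumptions, and the real implementations operate on framework tensors exactly as shown in the diffs above::

    import numpy as np

    def euclidean_triplet_loss(anchor, positive, negative, margin=1.0):
        """Average of max(d(a, p) - d(a, n) + margin, 0) over a batch of triplets."""
        # anchor, positive, negative: [N, D] arrays of embeddings
        dist_pos = np.linalg.norm(anchor - positive, axis=-1)  # [N] anchor-positive distances
        dist_neg = np.linalg.norm(anchor - negative, axis=-1)  # [N] anchor-negative distances
        return np.mean(np.maximum(dist_pos - dist_neg + margin, 0.0))

    # toy batch: 4 triplets of 8-dimensional embeddings
    rng = np.random.default_rng(0)
    a, p, n = (rng.normal(size=(4, 8)) for _ in range(3))
    print(euclidean_triplet_loss(a, p, n))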