From 45fcb9f89d85fab2854d5ffdd63a085f7ad6ab28 Mon Sep 17 00:00:00 2001 From: Pritha Gupta Date: Mon, 3 Jun 2019 12:51:58 +0200 Subject: [PATCH 1/5] Completed the description for context-dependent models --- csrank/choicefunctions/fate_choice.py | 13 ++++++------- csrank/choicefunctions/feta_choice.py | 15 ++++++++++++++- csrank/core/cmpnet_core.py | 2 +- csrank/discretechoice/fate_discrete_choice.py | 12 ++++++------ csrank/discretechoice/feta_discrete_choice.py | 13 +++++++++++++ csrank/objectranking/fate_object_ranker.py | 12 ++++++------ csrank/objectranking/feta_object_ranker.py | 14 +++++++++++++- 7 files changed, 59 insertions(+), 22 deletions(-) diff --git a/csrank/choicefunctions/fate_choice.py b/csrank/choicefunctions/fate_choice.py index 59e44be0..41e43130 100644 --- a/csrank/choicefunctions/fate_choice.py +++ b/csrank/choicefunctions/fate_choice.py @@ -17,11 +17,11 @@ def __init__(self, n_object_features, n_hidden_set_layers=2, n_hidden_set_units= optimizer=SGD(lr=1e-4, nesterov=True, momentum=0.9), batch_size=256, metrics=None, random_state=None, **kwargs): """ - Create a FATE-network architecture for leaning discrete choice function. Training complexity is quadratic in - the number of objects and prediction complexity is only linear. The first-aggregate-then-evaluate approach - learns an embedding of each object and then aggregates that into a context representation - :math:`\\mu_{C(x)}`, where :math`C(x) = Q \setminus \{x\}` and then scores each object :math:`x` using a - generalized utility function :math:`U (x, \\mu_{C(x)})`. + Create a FATE-network architecture for leaning discrete choice function. The first-aggregate-then-evaluate + approach learns an embedding of each object and then aggregates that into a context representation + :math:`\\mu_{C(x)}` and then scores each object :math:`x` using a generalized utility function + :math:`U (x, \\mu_{C(x)})`. + To make it computationally efficient we take the the context :math:`C(x)` as query set :math:`Q`. The context-representation is evaluated as: .. math:: @@ -29,12 +29,11 @@ def __init__(self, n_object_features, n_hidden_set_layers=2, n_hidden_set_units= where :math:`\phi \colon \mathcal{X} \\to \mathcal{Z}` maps each object :math:`y` to an :math:`m`-dimensional embedding space :math:`\mathcal{Z} \subseteq \mathbb{R}^m`. - To make it computationally efficient we take the the context as query set :math:`Q`. The choice set is defined as: .. math:: - c(Q) = \{ x_i \in Q \lvert \, U (x, \\mu_{C(x)}) > t \} + c(Q) = \{ x \in Q \lvert \, U (x, \\mu_{C(x)}) > t \} Parameters diff --git a/csrank/choicefunctions/feta_choice.py b/csrank/choicefunctions/feta_choice.py index ff0140e0..6cb2dd89 100644 --- a/csrank/choicefunctions/feta_choice.py +++ b/csrank/choicefunctions/feta_choice.py @@ -22,8 +22,21 @@ def __init__(self, n_objects, n_object_features, n_hidden=2, n_units=8, add_zero metrics=['binary_accuracy'], batch_size=256, random_state=None, **kwargs): """ - Create a FETA-network architecture for learning the choice functions. + Create a FETA-network architecture for learning choice functions. + The first-evaluate-then-aggregate approach approximates the context-dependent utility function using the + first-order utility function :math:`U_1 \colon \mathcal{X} \\times \mathcal{X} \\rightarrow [0,1]` + and zeroth-order utility function :math:`U_0 \colon \mathcal{X} \\rightarrow [0,1]`. + The scores each object :math:`x` using a context-dependent utility function :math:`U (x, C_i)`: + + .. 
math:: + U(x_i, C_i) = U_0(x_i) + \\frac{1}{n-1} \sum_{x_j \in Q \\setminus \{x_i\}} U_1(x_i , x_j) \, . + Training and prediction complexity is quadratic in the number of objects. + The choice set is defined as: + + .. math:: + + c(Q) = \{ x_i \in Q \lvert \, U (x_i, C_i) > t \} Parameters ---------- diff --git a/csrank/core/cmpnet_core.py b/csrank/core/cmpnet_core.py index 530b3ecf..f3c7deeb 100644 --- a/csrank/core/cmpnet_core.py +++ b/csrank/core/cmpnet_core.py @@ -67,7 +67,7 @@ def _convert_instances(self, X, Y): def construct_model(self): """ - Construct the CmpNEt which is used to approximate the :math:`U_1(x_i,x_j)`. For each pair of objects in + Construct the CmpNet which is used to approximate the :math:`U_1(x_i,x_j)`. For each pair of objects in :math:`x_i, x_j \in Q` we construct two sub-networks with weight sharing in all hidden layers. The output of these networks are connected to two sigmoid units that produces the outputs of the network, i.e., :math:`U(x_1,x_2), U(x_2,x_1)` for each pair of objects are evaluated. diff --git a/csrank/discretechoice/fate_discrete_choice.py b/csrank/discretechoice/fate_discrete_choice.py index 39501b8e..cff7697c 100644 --- a/csrank/discretechoice/fate_discrete_choice.py +++ b/csrank/discretechoice/fate_discrete_choice.py @@ -15,11 +15,11 @@ def __init__(self, n_object_features, n_hidden_set_layers=2, n_hidden_set_units= kernel_regularizer=l2(l=0.01), optimizer=SGD(lr=1e-4, nesterov=True, momentum=0.9), batch_size=256, random_state=None, **kwargs): """ - Create a FATE-network architecture for leaning discrete choice function. Training complexity is quadratic in - the number of objects and prediction complexity is only linear. The first-aggregate-then-evaluate approach - learns an embedding of each object and then aggregates that into a context representation - :math:`\\mu_{C(x)}`, where :math`C(x) = Q \setminus \{x\}` and then scores each object :math:`x` using a - generalized utility function :math:`U (x, \\mu_{C(x)})`. + Create a FATE-network architecture for leaning discrete choice function. The first-aggregate-then-evaluate + approach learns an embedding of each object and then aggregates that into a context representation + :math:`\\mu_{C(x)}` and then scores each object :math:`x` using a generalized utility function + :math:`U (x, \\mu_{C(x)})`. + To make it computationally efficient we take the the context :math:`C(x)` as query set :math:`Q`. The context-representation is evaluated as: .. math:: @@ -27,7 +27,7 @@ def __init__(self, n_object_features, n_hidden_set_layers=2, n_hidden_set_units= where :math:`\phi \colon \mathcal{X} \\to \mathcal{Z}` maps each object :math:`y` to an :math:`m`-dimensional embedding space :math:`\mathcal{Z} \subseteq \mathbb{R}^m`. - To make it computationally efficient we take the the context as query set :math:`Q`. + Training complexity is quadratic in the number of objects and prediction complexity is only linear. The discrete choice for the given query set :math:`Q` is defined as: .. math:: diff --git a/csrank/discretechoice/feta_discrete_choice.py b/csrank/discretechoice/feta_discrete_choice.py index f0941e0e..8d404f7c 100644 --- a/csrank/discretechoice/feta_discrete_choice.py +++ b/csrank/discretechoice/feta_discrete_choice.py @@ -20,7 +20,20 @@ def __init__(self, n_objects, n_object_features, n_hidden=2, n_units=8, add_zero metrics=['categorical_accuracy'], batch_size=256, random_state=None, **kwargs): """ Create a FETA-network architecture for learning the discrete choice functions. 
+ The first-evaluate-then-aggregate approach approximates the context-dependent utility function using the + first-order utility function :math:`U_1 \colon \mathcal{X} \\times \mathcal{X} \\rightarrow [0,1]` + and zeroth-order utility function :math:`U_0 \colon \mathcal{X} \\rightarrow [0,1]`. + The scores each object :math:`x` using a context-dependent utility function :math:`U (x, C_i)`: + + .. math:: + U(x_i, C_i) = U_0(x_i) + \\frac{1}{n-1} \sum_{x_j \in Q \\setminus \{x_i\}} U_1(x_i , x_j) \, . + Training and prediction complexity is quadratic in the number of objects. + The discrete choice for the given query set :math:`Q` is defined as: + + .. math:: + + dc(Q) := \operatorname{argmax}_{x_i \in Q} \; U (x_i, C_i) Parameters ---------- diff --git a/csrank/objectranking/fate_object_ranker.py b/csrank/objectranking/fate_object_ranker.py index cf7f317b..119cea21 100644 --- a/csrank/objectranking/fate_object_ranker.py +++ b/csrank/objectranking/fate_object_ranker.py @@ -16,11 +16,11 @@ def __init__(self, n_object_features, n_hidden_set_layers=2, n_hidden_set_units= loss_function=hinged_rank_loss, metrics=[zero_one_rank_loss_for_scores_ties], random_state=None, **kwargs): """ - Create a FATE-network architecture for leaning discrete choice function. Training complexity is quadratic in - the number of objects and prediction complexity is only linear. The first-aggregate-then-evaluate approach - learns an embedding of each object and then aggregates that into a context representation - :math:`\\mu_{C(x)}`, where :math`C(x) = Q \setminus \{x\}` and then scores each object :math:`x` using a - generalized utility function :math:`U (x, \\mu_{C(x)})`. + Create a FATE-network architecture for leaning object ranking function. The first-aggregate-then-evaluate + approach learns an embedding of each object and then aggregates that into a context representation + :math:`\\mu_{C(x)}` and then scores each object :math:`x` using a context-dependent utility function + :math:`U (x, \\mu_{C(x)})`. + To make it computationally efficient we take the the context :math:`C(x)` as query set :math:`Q`. The context-representation is evaluated as: .. math:: @@ -28,7 +28,7 @@ def __init__(self, n_object_features, n_hidden_set_layers=2, n_hidden_set_units= where :math:`\phi \colon \mathcal{X} \\to \mathcal{Z}` maps each object :math:`y` to an :math:`m`-dimensional embedding space :math:`\mathcal{Z} \subseteq \mathbb{R}^m`. - To make it computationally efficient we take the the context as query set :math:`Q`. + Training complexity is quadratic in the number of objects and prediction complexity is only linear. The ranking for the given query set :math:`Q` is defined as: .. math:: diff --git a/csrank/objectranking/feta_object_ranker.py b/csrank/objectranking/feta_object_ranker.py index 79a74d3a..6c044c3b 100644 --- a/csrank/objectranking/feta_object_ranker.py +++ b/csrank/objectranking/feta_object_ranker.py @@ -17,8 +17,20 @@ def __init__(self, n_objects, n_object_features, n_hidden=2, n_units=8, add_zero optimizer=SGD(lr=1e-4, nesterov=True, momentum=0.9), metrics=None, batch_size=256, random_state=None, **kwargs): """ - Create a FETA-network architecture for object ranking. + Create a FETA-network architecture for object ranking. The first-evaluate-then-aggregate approach + approximates the context-dependent utility function using the first-order utility function + :math:`U_1 \colon \mathcal{X} \\times \mathcal{X} \\rightarrow [0,1]` and zeroth-order utility + function :math:`U_0 \colon \mathcal{X} \\rightarrow [0,1]`. 
+ The scores each object :math:`x` using a context-dependent utility function :math:`U (x, C_i)`: + + .. math:: + U(x_i, C_i) = U_0(x_i) + \\frac{1}{n-1} \sum_{x_j \in Q \\setminus \{x_i\}} U_1(x_i , x_j) \, . + Training and prediction complexity is quadratic in the number of objects. + The ranking for the given query set :math:`Q` is defined as: + + .. math:: + ρ(Q) = \operatorname{argsort}_{x_i \in Q} \; U (x_i, C_i) Parameters ---------- From 404b1bcb4408b9d9c01cffa45e2feaf161566c45 Mon Sep 17 00:00:00 2001 From: Pritha Gupta Date: Mon, 3 Jun 2019 14:42:30 +0200 Subject: [PATCH 2/5] Completed description of models to learn the choice function --- csrank/choicefunctions/choice_functions.py | 29 +-- csrank/choicefunctions/cmpnet_choice.py | 40 +++- csrank/choicefunctions/fate_choice.py | 46 ++++- csrank/choicefunctions/feta_choice.py | 25 +++ .../generalized_linear_model.py | 176 +++++++++--------- csrank/choicefunctions/pairwise_choice.py | 18 ++ csrank/choicefunctions/ranknet_choice.py | 39 +++- csrank/choicefunctions/util.py | 83 +++++++++ csrank/core/cmpnet_core.py | 27 +-- csrank/core/fate_network.py | 10 +- csrank/core/feta_network.py | 2 +- csrank/core/ranknet_core.py | 33 ++-- csrank/discretechoice/discrete_choice.py | 36 ++-- csrank/learner.py | 2 +- csrank/objectranking/object_ranker.py | 7 +- 15 files changed, 412 insertions(+), 161 deletions(-) diff --git a/csrank/choicefunctions/choice_functions.py b/csrank/choicefunctions/choice_functions.py index 20695b62..a488bead 100644 --- a/csrank/choicefunctions/choice_functions.py +++ b/csrank/choicefunctions/choice_functions.py @@ -16,22 +16,25 @@ def learning_problem(self): return CHOICE_FUNCTION def predict_for_scores(self, scores, **kwargs): - """ Predict choices for scores for a given collection of sets of objects. + """ + Binary choice vector :math:`y` represents the choices amongst the objects in :math:`Q`, such that + :math:`y(k) = 1` represents that the object :math:`x_k` is chosen and :math:`y(k) = 0` represents it is not + chosen. Predict choices for the scores for a given collection of sets of objects (query sets). 
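As an illustration, the thresholding step that turns scores into such choice vectors can be sketched in NumPy; the score values and the threshold below are made-up assumptions, not outputs of the library:

.. code-block:: python

    import numpy as np

    scores = np.array([[0.9, 0.2, 0.7],
                       [0.1, 0.8, 0.4]])   # (n_instances, n_objects), assumed values
    t = 0.5                                # tuned threshold
    Y = (scores > t).astype(int)           # binary choice vectors
    # Y -> [[1, 0, 1],
    #       [0, 1, 0]]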
- Parameters - ---------- - scores : dict or numpy array - Dictionary with a mapping from ranking size to numpy arrays - or a single numpy array of size containing scores of each object of size: - (n_instances, n_objects) + Parameters + ---------- + scores : dict or numpy array + Dictionary with a mapping from query set size to numpy arrays + or a single numpy array of size containing scores of each object of size: + (n_instances, n_objects) - Returns - ------- - Y : dict or numpy array - Dictionary with a mapping from ranking size to numpy arrays - or a single numpy array containing predicted ranking of size: - (n_instances, n_objects) + Returns + ------- + Y : dict or numpy array + Dictionary with a mapping from query set size to numpy arrays + or a single numpy array containing predicted choice vectors of size: + (n_instances, n_objects) """ if isinstance(scores, dict): diff --git a/csrank/choicefunctions/cmpnet_choice.py b/csrank/choicefunctions/cmpnet_choice.py index 4ff7ac61..93bb9173 100644 --- a/csrank/choicefunctions/cmpnet_choice.py +++ b/csrank/choicefunctions/cmpnet_choice.py @@ -89,8 +89,44 @@ def _convert_instances(self, X, Y): def construct_model(self): return super().construct_model() - def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, tune_size=0.1, - thin_thresholds=1, verbose=0, **kwd): + def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, tune_size=0.1, thin_thresholds=1, verbose=0, + **kwd): + """ + Fit a CmptNet model for learning a choice fucntion on the provided set of queries X and preferences Y of + those objects. The provided queries and corresponding preferences are of a fixed size (numpy arrays). For + learning this network the binary cross entropy loss function for a pair of objects :math:`x_i, x_j \in Q` + is defined as: + + .. math:: + + C_{ij} = -\\tilde{P_{ij}}(0)\\cdot \log(U(x_i,x_j)) - \\tilde{P_{ij}}(1) \\cdot \log(U(x_j,x_i)) \ , + + where :math:`\\tilde{P_{ij}}` is ground truth probability of the preference of :math:`x_i` over :math:`x_j`. + :math:`\\tilde{P_{ij}} = (1,0)` if :math:`x_i \succ x_j` else :math:`\\tilde{P_{ij}} = (0,1)`. 
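A minimal numerical sketch of this pairwise loss, assuming hand-picked network outputs rather than values produced by CmpNet:

.. code-block:: python

    import numpy as np

    u_ij, u_ji = 0.8, 0.2          # assumed outputs U(x_i, x_j) and U(x_j, x_i)
    p_tilde = (1, 0)               # ground truth: x_i is preferred over x_j
    c_ij = -p_tilde[0] * np.log(u_ij) - p_tilde[1] * np.log(u_ji)
    # c_ij ~= 0.223; the loss shrinks as U(x_i, x_j) approaches 1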
+ + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Preferences in form of Orderings or Choices for given n_objects + epochs : int + Number of epochs to run if training for a fixed query size + callbacks : list + List of callbacks to be called during optimization + validation_split : float (range : [0,1]) + Percentage of instances to split off to validate on + tune_size: float (range : [0,1]) + Percentage of instances to split off to tune the threshold for the choice function + thin_thresholds: int + The number of instances of scores to skip while tuning the threshold + verbose : bool + Print verbose information + **kwd : + Keyword arguments for the fit function + """ if tune_size > 0: X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=tune_size, random_state=self.random_state) try: diff --git a/csrank/choicefunctions/fate_choice.py b/csrank/choicefunctions/fate_choice.py index 41e43130..348ca094 100644 --- a/csrank/choicefunctions/fate_choice.py +++ b/csrank/choicefunctions/fate_choice.py @@ -102,7 +102,51 @@ def _construct_layers(self, **kwargs): def construct_model(self, n_features, n_objects): return super().construct_model(n_features, n_objects) - def fit(self, X, Y, tune_size=0.1, thin_thresholds=1, **kwargs): + def fit(self, X, Y, epochs=35, inner_epochs=1, callbacks=None, validation_split=0.1, verbose=0, global_lr=1.0, + global_momentum=0.9, min_bucket_size=500, refit=False, tune_size=0.1, thin_thresholds=1, **kwargs): + """ + Fit a generic FATE-network model for learning a choice function on a provided set of queries. + + The provided queries can be of a fixed size (numpy arrays) or of varying sizes in which case dictionaries + are expected as input. For varying sizes a meta gradient descent is performed across the + different query sizes. + + Parameters + ---------- + X : numpy array or dict + Feature vectors of the objects + (n_instances, n_objects, n_features) if numpy array or map from n_objects to numpy arrays + Y : numpy array or dict + Choices for given objects in the query + (n_instances, n_objects) if numpy array or map from n_objects to numpy arrays + epochs : int + Number of epochs to run if training for a fixed query size or + number of epochs of the meta gradient descent for the variadic model + inner_epochs : int + Number of epochs to train for each query size inside the variadic + model + callbacks : list + List of callbacks to be called during optimization + validation_split : float (range : [0,1]) + Percentage of instances to split off to validate on + verbose : bool + Print verbose information + global_lr : float + Learning rate of the meta gradient descent (variadic model only) + global_momentum : float + Momentum for the meta gradient descent (variadic model only) + min_bucket_size : int + Restrict the training to queries of a minimum size + refit : bool + If True, create a new model object, otherwise continue fitting the + existing one if one exists. 
+ tune_size: float (range : [0,1]) + Percentage of instances to split off to tune the threshold for the choice function + thin_thresholds: int + The number of instances of scores to skip while tuning the threshold + **kwargs : + Keyword arguments for the fit function + """ if tune_size > 0: X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=tune_size, random_state=self.random_state) try: diff --git a/csrank/choicefunctions/feta_choice.py b/csrank/choicefunctions/feta_choice.py index 6cb2dd89..3d32f4ea 100644 --- a/csrank/choicefunctions/feta_choice.py +++ b/csrank/choicefunctions/feta_choice.py @@ -178,6 +178,31 @@ def create_input_lambda(i): def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, tune_size=0.1, thin_thresholds=1, verbose=0, **kwd): + """ + Fit a FETA-Network for learning a choice function on the provided set of queries X and preferences Y of + those objects. The provided queries and corresponding preferences are of a fixed size (numpy arrays). + + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Choices for given objects in the query + epochs : int + Number of epochs to run if training for a fixed query size + callbacks : list + List of callbacks to be called during optimization + validation_split : float (range : [0,1]) + Percentage of instances to split off to validate on + verbose : bool + Print verbose information + tune_size: float (range : [0,1]) + Percentage of instances to split off to tune the threshold for the choice function + thin_thresholds: int + The number of instances of scores to skip while tuning the threshold + **kwd : + Keyword arguments for the fit function + """ if tune_size > 0: X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=tune_size, random_state=self.random_state) try: diff --git a/csrank/choicefunctions/generalized_linear_model.py b/csrank/choicefunctions/generalized_linear_model.py index 16bf01fa..0a14d82d 100644 --- a/csrank/choicefunctions/generalized_linear_model.py +++ b/csrank/choicefunctions/generalized_linear_model.py @@ -5,12 +5,11 @@ import pymc3 as pm import theano import theano.tensor as tt -from pymc3 import Discrete -from pymc3.distributions.dist_math import bound from sklearn.model_selection import train_test_split from sklearn.utils import check_random_state import csrank.theano_util as ttu +from csrank.choicefunctions.util import create_weight_dictionary, BinaryCrossEntropyLikelihood from csrank.learner import Learner from csrank.util import print_dictionary from .choice_functions import ChoiceFunctions @@ -19,7 +18,20 @@ class GeneralizedLinearModel(ChoiceFunctions, Learner): def __init__(self, n_object_features, regularization='l2', random_state=None, **kwargs): """ - Create an instance of the GeneralizedLinearModel model. + Create an instance of the GeneralizedLinearModel model for learning the choice function. This model is + adapted from the multinomial logit model :class:`MultinomialLogitModel`. The utility score for each object + in query set :math:`Q` is defined as :math:`U(x) = w \cdot x`, where :math:`w` is the weight vector. + The probability of choosing an object :math:`x_i` is defined by taking sigmoid over the utility scores: + + .. math:: + + P(x_i \\lvert Q) = \\frac{1}{1+exp(-U(x_i))} + + The choice set is defined as: + + .. 
math:: + + c(Q) = \{ x_i \in Q \lvert \, P(x_i \\lvert Q) > t \} Parameters ---------- @@ -51,26 +63,56 @@ def __init__(self, n_object_features, regularization='l2', random_state=None, ** self.p = None @property - def default_configuration(self): + def model_configuration(self): + """ + Constructs the dictionary containing the priors for the parameters for the model according to the + regularization function. + Returns + ------- + configuration : dict + Dictionary containing the priors applies on the weights + """ if self.regularization == 'l2': weight = pm.Normal prior = 'sd' elif self.regularization == 'l1': weight = pm.Laplace prior = 'b' - config_dict = { + configuration = { 'weights': [weight, {'mu': (pm.Normal, {'mu': 0, 'sd': 10}), prior: (pm.HalfCauchy, {'beta': 1})}]} - self.logger.info('Creating default config {}'.format(print_dictionary(config_dict))) - return config_dict + self.logger.info('Creating default config {}'.format(print_dictionary(configuration))) + return configuration def construct_model(self, X, Y): - self.logger.info('Creating model_args config {}'.format(print_dictionary(self.default_configuration))) + """ + Constructs the linear logit model which evaluated the utility score as :math:`U(x) = w \cdot x`, where + :math:`w` is the weight vector. The probability of choosing the object :math:`x_i` from the query set + :math:`Q = \{x_1, \ldots ,x_n\}` is: + + .. math:: + + P_i = P(x_i \\lvert Q) = \\frac{1}{1+exp(-U(x_i))} + + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Preferences in form of Choices for given objects + + Returns + ------- + model : pymc3 Model :class:`pm.Model` + """ + self.logger.info('Creating model_args config {}'.format(print_dictionary(self.model_configuration))) with pm.Model() as self.model: self.Xt = theano.shared(X) self.Yt = theano.shared(Y) shapes = {'weights': self.n_object_features} # shapes = {'weights': (self.n_object_features, 3)} - weights_dict = create_weight_dictionary(self.default_configuration, shapes) + weights_dict = create_weight_dictionary(self.model_configuration, shapes) intercept = pm.Normal('intercept', mu=0, sd=10) utility = tt.dot(self.Xt, weights_dict['weights']) + intercept self.p = ttu.sigmoid(utility) @@ -78,6 +120,36 @@ def construct_model(self, X, Y): self.logger.info("Model construction completed") def fit(self, X, Y, sampler='vi', tune_size=0.1, thin_thresholds=1, **kwargs): + """ + Fit a generalized logit model on the provided set of queries X and preferences Y of those objects. The + provided queries and corresponding preferences are of a fixed size (numpy arrays). For learning this network + the binary cross entropy loss function for each object :math:`x_i \in Q` is defined as: + + .. math:: + + C_{ij} = -y(i)\log(P_i) - (1 - y(i))\log(1 - P_i) \enspace, + + where :math:`y` is ground-truth choice vector of the objects in the given query set :math:`Q`. + The value :math:`y(i) = 1` if object :math:`x_i` is chosen else :math:`y(i) = 0`. + + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Choices for given objects in the query + sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string + The sampler used to estimate the posterior mean and mass matrix from the trace. 
+ * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler + tune_size: float (range : [0,1]) + Percentage of instances to split off to tune the threshold for the choice function + thin_thresholds: int + The number of instances of scores to skip while tuning the threshold + **kwargs : + Keyword arguments for the fit function + """ if tune_size > 0: X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=tune_size, random_state=self.random_state) try: @@ -153,6 +225,16 @@ def predict_for_scores(self, scores, **kwargs): return ChoiceFunctions.predict_for_scores(self, scores, **kwargs) def set_tunable_parameters(self, regularization="l1", **point): + """ + Set tunable parameters of the Generalized Linear model to the values provided. + + Parameters + ---------- + regularization : {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + point: dict + Dictionary containing parameter values which are not tuned for the network + """ self.regularization = regularization self.model = None self.trace = None @@ -163,79 +245,3 @@ def set_tunable_parameters(self, regularization="l1", **point): if len(point) > 0: self.logger.warning('This ranking algorithm does not support' ' tunable parameters called: {}'.format(print_dictionary(point))) - - -def create_weight_dictionary(model_args, shapes): - weights_dict = dict() - for key, value in model_args.items(): - prior, params = copy.deepcopy(value) - for k in params.keys(): - if isinstance(params[k], tuple): - params[k][1]['name'] = '{}_{}'.format(key, k) - params[k] = params[k][0](**params[k][1]) - params['name'] = key - params['shape'] = shapes[key] - weights_dict[key] = prior(**params) - return weights_dict - - -def binary_crossentropy(p, y_true): - if p.ndim > 1: - l = (tt.nnet.binary_crossentropy(p, y_true).sum(axis=1)).mean() - else: - l = tt.nnet.binary_crossentropy(p, y_true).mean(axis=0) - return -l - - -def categorical_crossentropy(p, y_true): - return -tt.nnet.categorical_crossentropy(p, y_true) - - -def categorical_hinge(p, y_true): - pos = tt.sum(y_true * p, axis=-1) - neg = tt.max((1. - y_true) * p, axis=-1) - return -tt.maximum(0., neg - pos + 1.) - - -class BinaryCrossEntropyLikelihood(Discrete): - R""" - Categorical log-likelihood. - - The most general discrete distribution. - - .. math:: f(x \mid p) = p_x - - ======== =================================== - Support :math:`x \in \{0, 1, \ldots, |p|-1\}` - ======== =================================== - - Parameters - ---------- - p : array of floats - p > 0 and the elements of p must sum to 1. They will be automatically - rescaled otherwise. 
- """ - - def __init__(self, p, *args, **kwargs): - super(BinaryCrossEntropyLikelihood, self).__init__(*args, **kwargs) - self.loss_func = categorical_hinge - try: - self.k = tt.shape(p)[-1].tag.test_value - except AttributeError: - self.k = tt.shape(p)[-1] - self.p = tt.as_tensor_variable(p) - self.mode = tt.argmax(p) - - def random(self, **kwargs): - return NotImplemented - - def logp(self, value): - p = self.p - k = self.k - a = self.loss_func(p, value) - p = ttu.normalize(p) - sum_to1 = theano.gradient.zero_grad( - tt.le(abs(tt.sum(p, axis=-1) - 1), 1e-5)) - - value_k = tt.argmax(value, axis=1) - return bound(a, value_k >= 0, value_k <= (k - 1), sum_to1) diff --git a/csrank/choicefunctions/pairwise_choice.py b/csrank/choicefunctions/pairwise_choice.py index 897e0a08..842b7422 100644 --- a/csrank/choicefunctions/pairwise_choice.py +++ b/csrank/choicefunctions/pairwise_choice.py @@ -61,6 +61,24 @@ def _convert_instances(self, X, Y): return x_train, y_single def fit(self, X, Y, tune_size=0.1, thin_thresholds=1, **kwd): + """ + Fit a generic preference learning model on a provided set of queries. + The provided queries can be of a fixed size (numpy arrays). + + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Choices for given objects in the query + tune_size: float (range : [0,1]) + Percentage of instances to split off to tune the threshold for the choice function + thin_thresholds: int + The number of instances of scores to skip while tuning the threshold + **kwd : + Keyword arguments for the fit function + + """ if tune_size > 0: X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=tune_size, random_state=self.random_state) try: diff --git a/csrank/choicefunctions/ranknet_choice.py b/csrank/choicefunctions/ranknet_choice.py index 08f5d3c2..3871468a 100644 --- a/csrank/choicefunctions/ranknet_choice.py +++ b/csrank/choicefunctions/ranknet_choice.py @@ -88,17 +88,48 @@ def _convert_instances(self, X, Y): def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, tune_size=0.1, thin_thresholds=1, verbose=0, **kwd): + """ + Fit RankNet model for learning choice function on a provided set of queries. The provided queries can be of + a fixed size (numpy arrays). For learning this network the binary cross entropy loss function for a pair of + objects :math:`x_i, x_j \in Q` is defined as: + + .. math:: + + C_{ij} = -\\tilde{P_{ij}}\log(P_{ij}) - (1 - \\tilde{P_{ij}})\log(1 - P{ij}) \enspace, + + where :math:`\\tilde{P_{ij}}` is ground truth probability of the preference of :math:`x_i` over :math:`x_j`. + :math:`\\tilde{P_{ij}} = 1` if :math:`x_i \succ x_j` else :math:`\\tilde{P_{ij}} = 0`. 
+ + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Preferences in form of Orderings or Choices for given n_objects + epochs : int + Number of epochs to run if training for a fixed query size + callbacks : list + List of callbacks to be called during optimization + validation_split : float (range : [0,1]) + Percentage of instances to split off to validate on + tune_size: float (range : [0,1]) + Percentage of instances to split off to tune the threshold for the choice function + thin_thresholds: int + The number of instances of scores to skip while tuning the threshold + verbose : bool + Print verbose information + **kwd : + Keyword arguments for the fit function + """ if tune_size > 0: X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=tune_size, random_state=self.random_state) try: - super().fit(X_train, Y_train, epochs, callbacks, - validation_split, verbose, **kwd) + super().fit(X_train, Y_train, epochs, callbacks, validation_split, verbose, **kwd) finally: self.logger.info('Fitting utility function finished. Start tuning threshold.') self.threshold = self._tune_threshold(X_val, Y_val, thin_thresholds=thin_thresholds) else: - super().fit(X, Y, epochs, callbacks, validation_split, verbose, - **kwd) + super().fit(X, Y, epochs, callbacks, validation_split, verbose, **kwd) self.threshold = 0.5 def _predict_scores_fixed(self, X, **kwargs): diff --git a/csrank/choicefunctions/util.py b/csrank/choicefunctions/util.py index 356fb9f5..19d4dd08 100644 --- a/csrank/choicefunctions/util.py +++ b/csrank/choicefunctions/util.py @@ -1,6 +1,13 @@ +import copy from itertools import product import numpy as np +import theano +from pymc3 import Discrete +from pymc3.distributions.dist_math import bound +from theano import tensor as tt + +from csrank import theano_util as ttu def generate_pairwise_instances(x, choice): @@ -38,3 +45,79 @@ def generate_complete_pairwise_dataset(X, Y): Y_single = np.array(Y_single) X_train = X1 - X2 return X1, X2, X_train, Y_double, Y_single + + +def create_weight_dictionary(model_args, shapes): + weights_dict = dict() + for key, value in model_args.items(): + prior, params = copy.deepcopy(value) + for k in params.keys(): + if isinstance(params[k], tuple): + params[k][1]['name'] = '{}_{}'.format(key, k) + params[k] = params[k][0](**params[k][1]) + params['name'] = key + params['shape'] = shapes[key] + weights_dict[key] = prior(**params) + return weights_dict + + +def binary_crossentropy(p, y_true): + if p.ndim > 1: + l = (tt.nnet.binary_crossentropy(p, y_true).sum(axis=1)).mean() + else: + l = tt.nnet.binary_crossentropy(p, y_true).mean(axis=0) + return -l + + +def categorical_crossentropy(p, y_true): + return -tt.nnet.categorical_crossentropy(p, y_true) + + +def categorical_hinge(p, y_true): + pos = tt.sum(y_true * p, axis=-1) + neg = tt.max((1. - y_true) * p, axis=-1) + return -tt.maximum(0., neg - pos + 1.) + + +class BinaryCrossEntropyLikelihood(Discrete): + R""" + Categorical log-likelihood. + + The most general discrete distribution. + + .. math:: f(x \mid p) = p_x + + ======== =================================== + Support :math:`x \in \{0, 1, \ldots, |p|-1\}` + ======== =================================== + + Parameters + ---------- + p : array of floats + p > 0 and the elements of p must sum to 1. They will be automatically + rescaled otherwise. 
+ """ + + def __init__(self, p, *args, **kwargs): + super(BinaryCrossEntropyLikelihood, self).__init__(*args, **kwargs) + self.loss_func = categorical_hinge + try: + self.k = tt.shape(p)[-1].tag.test_value + except AttributeError: + self.k = tt.shape(p)[-1] + self.p = tt.as_tensor_variable(p) + self.mode = tt.argmax(p) + + def random(self, **kwargs): + return NotImplemented + + def logp(self, value): + p = self.p + k = self.k + a = self.loss_func(p, value) + p = ttu.normalize(p) + sum_to1 = theano.gradient.zero_grad( + tt.le(abs(tt.sum(p, axis=-1) - 1), 1e-5)) + + value_k = tt.argmax(value, axis=1) + return bound(a, value_k >= 0, value_k <= (k - 1), sum_to1) diff --git a/csrank/core/cmpnet_core.py b/csrank/core/cmpnet_core.py index f3c7deeb..a2895371 100644 --- a/csrank/core/cmpnet_core.py +++ b/csrank/core/cmpnet_core.py @@ -70,17 +70,8 @@ def construct_model(self): Construct the CmpNet which is used to approximate the :math:`U_1(x_i,x_j)`. For each pair of objects in :math:`x_i, x_j \in Q` we construct two sub-networks with weight sharing in all hidden layers. The output of these networks are connected to two sigmoid units that produces the outputs of the network, - i.e., :math:`U(x_1,x_2), U(x_2,x_1)` for each pair of objects are evaluated. - :math:`U(x_1,x_2)` is a measure of how favorable it is to choose :math:`x_1` over :math:`x_2`. - For learning this network the binary cross entropy loss function for a pair of example :math:`x_i, x_j \in Q` - is defined as: - - .. math:: - - C_{ij} = -\\tilde{P_{ij}}\log(U(x_i,x_j)) - (1 - \\tilde{P_{ij}})\log(U(x_j,x_i)) \enspace, - - where :math:`\\tilde{P_{ij}}` is ground truth probability of the preference of :math:`x_i` over :math:`x_j`. - :math:`\\tilde{P_{ij}} = 1` if :math:`x_i \succ x_j` else 0. + i.e., :math:`U(x_1,x_2), U(x_2,x_1)` for each pair of objects are evaluated. :math:`U(x_1,x_2)` is a measure + of how favorable it is to choose :math:`x_1` over :math:`x_2`. Returns ------- @@ -106,6 +97,15 @@ def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, """ Fit a generic preference learning CmptNet on the provided set of queries X and preferences Y of those objects. The provided queries and corresponding preferences are of a fixed size (numpy arrays). + For learning this network the binary cross entropy loss function for a pair of objects + :math:`x_i, x_j \in Q` is defined as: + + .. math:: + + C_{ij} = -\\tilde{P_{ij}}(0)\\cdot \log(U(x_i,x_j)) - \\tilde{P_{ij}}(1) \\cdot \log(U(x_j,x_i)) \ , + + where :math:`\\tilde{P_{ij}}` is ground truth probability of the preference of :math:`x_i` over :math:`x_j`. + :math:`\\tilde{P_{ij}} = (1,0)` if :math:`x_i \succ x_j` else :math:`\\tilde{P_{ij}} = (0,1)`. 
Parameters ---------- @@ -115,15 +115,16 @@ def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, Y : numpy array (n_instances, n_objects) Preferences in form of Orderings or Choices for given n_objects - epochs : int Number of epochs to run if training for a fixed query size callbacks : list List of callbacks to be called during optimization - validation_split : float + validation_split : float (range : [0,1]) Percentage of instances to split off to validate on verbose : bool Print verbose information + **kwd : + Keyword arguments for the fit function """ x1, x2, y_double = self._convert_instances(X, Y) diff --git a/csrank/core/fate_network.py b/csrank/core/fate_network.py index 827dc0ae..7c337d6c 100644 --- a/csrank/core/fate_network.py +++ b/csrank/core/fate_network.py @@ -358,11 +358,11 @@ def fit(self, X, Y, epochs=35, inner_epochs=1, callbacks=None, validation_split= Parameters ---------- X : numpy array or dict - (n_instances, n_objects, n_features) if numpy array or - map from n_objects to numpy arrays + Feature vectors of the objects + (n_instances, n_objects, n_features) if numpy array or map from n_objects to numpy arrays Y : numpy array or dict - (n_instances, n_objects) if numpy array or - map from n_objects to numpy arrays + Preferences in form of rankings or choices for given objects + (n_instances, n_objects) if numpy array or map from n_objects to numpy arrays epochs : int Number of epochs to run if training for a fixed query size or number of epochs of the meta gradient descent for the variadic model @@ -371,7 +371,7 @@ def fit(self, X, Y, epochs=35, inner_epochs=1, callbacks=None, validation_split= model callbacks : list List of callbacks to be called during optimization - validation_split : float + validation_split : float (range : [0,1]) Percentage of instances to split off to validate on verbose : bool Print verbose information diff --git a/csrank/core/feta_network.py b/csrank/core/feta_network.py index 4155b40c..67671ad8 100644 --- a/csrank/core/feta_network.py +++ b/csrank/core/feta_network.py @@ -234,7 +234,7 @@ def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, Number of epochs to run if training for a fixed query size callbacks : list List of callbacks to be called during optimization - validation_split : float + validation_split : float (range : [0,1]) Percentage of instances to split off to validate on verbose : bool Print verbose information diff --git a/csrank/core/ranknet_core.py b/csrank/core/ranknet_core.py index 9c112c50..977ae6dd 100644 --- a/csrank/core/ranknet_core.py +++ b/csrank/core/ranknet_core.py @@ -64,16 +64,7 @@ def construct_model(self): :math:`x_i, x_j \in Q` we construct two sub-networks with weight sharing in all hidden layer apart form the last layer for which weights are mirrored version of each other. The output of these networks are connected to a sigmoid unit that produces the output :math:`P_{ij}` which is the probability of preferring object - :math:`x_i` over :math:`x_j`, to approximate the :math:`U(x)`. - For learning this network the binary cross entropy loss function for a pair of example :math:`x_i, x_j \in Q` - is defined as: - - .. math:: - - C_{ij} = -\\tilde{P_{ij}}\log(P_{ij}) - (1 - \\tilde{P_{ij}})\log(1 - P{ij}) \enspace, - - where :math:`\\tilde{P_{ij}}` is ground truth probability of the preference of :math:`x_i` over :math:`x_j`. - :math:`\\tilde{P_{ij}} = 1` if :math:`x_i \succ x_j` else 0. + :math:`x_i` over :math:`x_j`, to approximate the :math:`U(x)`. 
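One way to realise such a weight-sharing pair network in Keras is sketched below; the layer sizes, activations and the explicit score subtraction are illustrative assumptions, not the defaults of this class:

.. code-block:: python

    from keras.layers import Activation, Dense, Input, Subtract
    from keras.models import Model

    n_features = 5                                    # assumed feature dimension
    x_i, x_j = Input(shape=(n_features,)), Input(shape=(n_features,))
    hidden = Dense(8, activation='relu')              # shared hidden layer
    score = Dense(1, activation='linear')             # shared scoring layer U(x)
    diff = Subtract()([score(hidden(x_i)), score(hidden(x_j))])
    p_ij = Activation('sigmoid')(diff)                # P_ij = sigmoid(U(x_i) - U(x_j))
    model = Model(inputs=[x_i, x_j], outputs=p_ij)
    model.compile(optimizer='sgd', loss='binary_crossentropy')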
Returns ------- @@ -98,25 +89,33 @@ def _convert_instances(self, X, Y): def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, **kwd): """ - Fit a generic preference learning model on a provided set of queries. - The provided queries can be of a fixed size (numpy arrays). + Fit a generic preference learning RankNet model on a provided set of queries. The provided queries can be of + a fixed size (numpy arrays). For learning this network the binary cross entropy loss function for a pair of + objects :math:`x_i, x_j \in Q` is defined as: + + .. math:: + + C_{ij} = -\\tilde{P_{ij}}\log(P_{ij}) - (1 - \\tilde{P_{ij}})\log(1 - P{ij}) \enspace, + + where :math:`\\tilde{P_{ij}}` is ground truth probability of the preference of :math:`x_i` over :math:`x_j`. + :math:`\\tilde{P_{ij}} = 1` if :math:`x_i \succ x_j` else :math:`\\tilde{P_{ij}} = 0`. Parameters ---------- - X : numpy array - (n_instances, n_objects, n_features) + X : numpy array (n_instances, n_objects, n_features) Feature vectors of the objects - Y : numpy array - (n_instances, n_objects) + Y : numpy array (n_instances, n_objects) Preferences in form of Orderings or Choices for given n_objects epochs : int Number of epochs to run if training for a fixed query size callbacks : list List of callbacks to be called during optimization - validation_split : float + validation_split : float (range : [0,1]) Percentage of instances to split off to validate on verbose : bool Print verbose information + **kwd : + Keyword arguments for the fit function """ X1, X2, Y_single = self._convert_instances(X, Y) diff --git a/csrank/discretechoice/discrete_choice.py b/csrank/discretechoice/discrete_choice.py index 2c6642fb..bf2ea081 100644 --- a/csrank/discretechoice/discrete_choice.py +++ b/csrank/discretechoice/discrete_choice.py @@ -13,22 +13,26 @@ def learning_problem(self): return DISCRETE_CHOICE def predict_for_scores(self, scores): - """ Predict discrete choice for a given collection scores for the sets of objects. - - Parameters - ---------- - scores : dict or numpy array - Dictionary with a mapping from size of the choice set to numpy arrays - or a single numpy array of size containing scores of each object of size: - (n_instances, n_objects) - - - Returns - ------- - Y : dict or numpy array - Dictionary with a mapping from size of the choice set to numpy arrays - or a single numpy array containing discrete choices of size: - (n_instances, 1) + """ + Binary discrete choice vector :math:`y` represents the choices amongst the objects in :math:`Q`, such that + :math:`y(k) = 1` represents that the object :math:`x_k` is chosen and :math:`y(k) = 0` represents + it is not chosen. For choice to be discrete :math:`\sum_{x_i \in Q} y(i) = 1`. Predict discrete choices for + the scores for a given collection of sets of objects (query sets). 
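The conversion from scores to a one-hot discrete-choice vector can be sketched as follows (the score matrix is a made-up example):

.. code-block:: python

    import numpy as np

    scores = np.array([[0.9, 0.2, 0.7],
                       [0.1, 0.8, 0.4]])              # (n_instances, n_objects)
    Y = np.zeros_like(scores, dtype=int)
    Y[np.arange(len(scores)), scores.argmax(axis=1)] = 1
    # Y -> [[1, 0, 0],
    #       [0, 1, 0]]  exactly one object chosen per query set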
+ + Parameters + ---------- + scores : dict or numpy array + Dictionary with a mapping from query set size to numpy arrays + or a single numpy array of size containing scores of each object of size: + (n_instances, n_objects) + + + Returns + ------- + Y : dict or numpy array + Dictionary with a mapping from query set size to numpy arrays + or a single numpy array containing predicted discrete choice vectors of size: + (n_instances, n_objects) """ if isinstance(scores, dict): result = dict() diff --git a/csrank/learner.py b/csrank/learner.py index 57e762fe..0bbaec00 100644 --- a/csrank/learner.py +++ b/csrank/learner.py @@ -75,7 +75,7 @@ def predict_scores(self, X, **kwargs): def predict(self, X, **kwargs): """ Predict preferences in the form of rankings or choices for a given collection of sets of objects called - a query set. + a query set using the function :meth:`.predict_for_scores`. Parameters ---------- diff --git a/csrank/objectranking/object_ranker.py b/csrank/objectranking/object_ranker.py index ce35e982..9be76abc 100644 --- a/csrank/objectranking/object_ranker.py +++ b/csrank/objectranking/object_ranker.py @@ -14,9 +14,10 @@ def learning_problem(self): def predict_for_scores(self, scores, **kwargs): """ - :math:`\pi` represents the ranking amongst the objects in :math:`Q`, such that :math:`\pi(k)` is the - position of the :math:`k`-th object :math:`x_k`, and :math:`\pi^{-1}(k)` is the index of the object on - position :math:`k`. Predict rankings for the scores for a given collection of sets of objects (query set). + The permutation vector :math:`\pi` represents the ranking amongst the objects in :math:`Q`, such that + :math:`\pi(k)` is the position of the :math:`k`-th object :math:`x_k`, and :math:`\pi^{-1}(k)` is the index + of the object on position :math:`k`. Predict rankings for the scores for a given collection of sets of + objects (query sets). Parameters ---------- From fcd9e8f76f3fa870f2b2b4c64a7340b634aad2e8 Mon Sep 17 00:00:00 2001 From: Pritha Gupta Date: Mon, 3 Jun 2019 15:15:00 +0200 Subject: [PATCH 3/5] Fine tuned remaining rankers --- csrank/objectranking/cmp_net.py | 35 +++++++++++++++++++++-- csrank/objectranking/list_net.py | 49 ++++++++++++++++++++------------ csrank/objectranking/rank_net.py | 12 ++++++-- 3 files changed, 74 insertions(+), 22 deletions(-) diff --git a/csrank/objectranking/cmp_net.py b/csrank/objectranking/cmp_net.py index 3461c24e..9d320309 100644 --- a/csrank/objectranking/cmp_net.py +++ b/csrank/objectranking/cmp_net.py @@ -93,8 +93,39 @@ def _convert_instances(self, X, Y): def construct_model(self): return super().construct_model() - def fit(self, X, Y, **kwd): - super().fit(X, Y, **kwd) + def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, **kwd): + """ + Fit an object ranking learning CmpNet model on a provided set of queries. The provided queries can be of a + fixed size (numpy arrays). For learning this network the binary cross entropy loss function for a pair of + objects :math:`x_i, x_j \in Q` is defined as: + + .. math:: + + C_{ij} = -\\tilde{P_{ij}}(0)\\cdot \log(U(x_i,x_j)) - \\tilde{P_{ij}}(1) \\cdot \log(U(x_j,x_i)) \ , + + where :math:`\\tilde{P_{ij}}` is ground truth probability of the preference of :math:`x_i` over :math:`x_j`. + :math:`\\tilde{P_{ij}} = (1,0)` if :math:`x_i \succ x_j` else :math:`\\tilde{P_{ij}} = (0,1)`. 
+ + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Rankings of the given objects + epochs : int + Number of epochs to run if training for a fixed query size + callbacks : list + List of callbacks to be called during optimization + validation_split : float + Percentage of instances to split off to validate on + verbose : bool + Print verbose information + **kwd + Keyword arguments for the fit function + """ + super().fit(X, Y, epochs=epochs, callbacks=callbacks, validation_split=validation_split, verbose=verbose, **kwd) def _predict_scores_fixed(self, X, **kwargs): return super()._predict_scores_fixed(X, **kwargs) diff --git a/csrank/objectranking/list_net.py b/csrank/objectranking/list_net.py index 82457660..bc23f16d 100644 --- a/csrank/objectranking/list_net.py +++ b/csrank/objectranking/list_net.py @@ -25,16 +25,15 @@ def __init__(self, n_object_features, n_top, n_hidden=2, n_units=8, loss_functio batch_normalization=False, kernel_regularizer=l2(l=1e-4), activation="selu", kernel_initializer='lecun_normal', optimizer=SGD(lr=1e-4, nesterov=True, momentum=0.9), metrics=[zero_one_rank_loss_for_scores_ties], batch_size=256, random_state=None, **kwargs): - """ Create an instance of the ListNet architecture. - ListNet trains a latent utility model based on top-k-subrankings of the objects. - A listwise loss function like the negative Plackett-Luce likelihood is used for training. - For example for query set :math:`Q = \{x_1,x_2,x_3\}`, the scores are :math:`Q = (s_1,s_2,s_3)` - and the ranking is :math:`\pi = (3,1,2)`. The Plackett-Luce likelihood is defined as: + """ Create an instance of the ListNet architecture. ListNet trains a latent utility model based on + top-k-subrankings of the objects. This network learns a latent utility score for each object in the given + query set :math:`Q = \{x_1, \ldots ,x_n\}` using the equation :math:`U(x) = F(x, w)` where :math:`w` is the + weight vector. A listwise loss function like the negative Plackett-Luce likelihood is used for training. + The ranking for the given query set :math:`Q` is defined as: .. math:: - P_l(\pi) = \\frac{s_2}{s_1+s_2+s_3} \cdot \\frac{s_3}{s_1+s_3} \cdot \\frac{s_1}{s_1} - Note: For k=2 we obtain :class:`RankNet` as a special case. + ρ(Q) = \operatorname{argsort}_{x \in Q} \; U(x) Parameters ---------- @@ -125,8 +124,15 @@ def _create_topk(self, X, Y): def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, **kwd): """ - Fit an object ranking learning model on a provided set of queries. - The provided queries can be of a fixed size (numpy arrays). + Fit an object ranking learning ListNet on the top-k-subrankings in the provided set of queries. The provided + queries can be of a fixed size (numpy arrays). For fitting the model we maximize the Plackett-Luce + likelihood. For example for query set :math:`Q = \{x_1,x_2,x_3\}`, the scores are :math:`Q = (s_1,s_2,s_3)` + and the ranking is :math:`\pi = (3,1,2)`. The Plackett-Luce likelihood is defined as: + + .. math:: + P_l(\pi) = \\frac{s_2}{s_1+s_2+s_3} \cdot \\frac{s_3}{s_1+s_3} \cdot \\frac{s_1}{s_1} + + Note: For k=2 we obtain :class:`RankNet` as a special case. 
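The worked example above can be reproduced with a short sketch; the scores below are assumed positive numbers, whereas the actual listwise loss in the library may operate on normalised scores:

.. code-block:: python

    import numpy as np

    s = np.array([2.0, 5.0, 3.0])        # assumed scores (s1, s2, s3)
    order = [1, 2, 0]                    # pi = (3, 1, 2): x2 first, then x3, then x1
    remaining = s[order]
    likelihood = 1.0
    for k in range(len(order)):
        likelihood *= remaining[k] / remaining[k:].sum()
    # likelihood = s2/(s1+s2+s3) * s3/(s1+s3) * s1/s1 = 0.3
    nll = -np.log(likelihood)            # negative Plackett-Luce log-likelihood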
Parameters ---------- @@ -153,9 +159,7 @@ def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, self.logger.debug("Finished creating the dataset") self.logger.debug("Creating the model") - output = self.construct_model() - self.model = Model(inputs=self.input_layer, outputs=output) - self.model.compile(loss=self.loss_function, optimizer=self.optimizer, metrics=self.metrics) + self.model = self.construct_model() self.logger.debug("Finished creating the model, now fitting...") self.model.fit(X, Y, batch_size=self.batch_size, epochs=epochs, callbacks=callbacks, validation_split=validation_split, verbose=verbose, **kwd) @@ -163,20 +167,30 @@ def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, def construct_model(self): """ - Construct the ListNet architecture. - Weight sharing guarantees that we have a latent utility model for any given object. + Construct the ListNet architecture which takes topk-subrankings from the given queries and minimize a + listwise loss on the utility scores of top objects. Weight sharing guarantees that we learn the shared + weights :math:`w` of the latent utility function :math:`U(x) = F(x, w)`. + + Returns + ------- + model: keras model :class:`Model` + ListNet model used to learn the utiliy function using the top-k-subrankings in the provided set of queries. """ hid = [create_input_lambda(i)(self.input_layer) for i in range(self.n_top)] for hidden_layer in self.hidden_layers: hid = [hidden_layer(x) for x in hid] outputs = [self.output_node(x) for x in hid] merged = concatenate(outputs) - return merged + model = Model(inputs=self.input_layer, outputs=merged) + model.compile(loss=self.loss_function, optimizer=self.optimizer, metrics=self.metrics) + return model @property def scoring_model(self): """ - Creates a scoring model for the trained ListNet, which predicts the utility scores for given set of objects. + Creates a scoring model from the trained ListNet, which predicts the utility scores for given set of objects. + This network consist of a sequential network which predicts the utility score for each object :math:`x \in Q` + using the latent utility function :math:`U(x) = F(x, w)` where :math:`w` is the weights of the model. Returns ------- @@ -230,8 +244,7 @@ def clear_memory(self, **kwargs): self._construct_layers(kernel_regularizer=self.kernel_regularizer, kernel_initializer=self.kernel_initializer, activation=self.activation, **self.kwargs) - output = self.construct_model() - self.model = Model(inputs=self.input_layer, outputs=output) + self.model = self.construct_model() self.model.load_weights(self.hash_file) else: self.logger.info("Cannot clear the memory") diff --git a/csrank/objectranking/rank_net.py b/csrank/objectranking/rank_net.py index c61786f5..938346f4 100644 --- a/csrank/objectranking/rank_net.py +++ b/csrank/objectranking/rank_net.py @@ -88,8 +88,16 @@ def _convert_instances(self, X, Y): def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, **kwd): """ - Fit an object ranking learning model on a provided set of queries. - The provided queries can be of a fixed size (numpy arrays). + Fit an object ranking learning RankNet model on a provided set of queries. The provided queries can be of a + fixed size (numpy arrays). For learning this network the binary cross entropy loss function for a pair of + objects :math:`x_i, x_j \in Q` is defined as: + + .. 
math:: + + C_{ij} = -\\tilde{P_{ij}}\log(P_{ij}) - (1 - \\tilde{P_{ij}})\log(1 - P{ij}) \enspace, + + where :math:`\\tilde{P_{ij}}` is ground truth probability of the preference of :math:`x_i` over :math:`x_j`. + :math:`\\tilde{P_{ij}} = 1` if :math:`x_i \succ x_j` else :math:`\\tilde{P_{ij}} = 0`. Parameters ---------- From ff3d65184bf029237d8faee55c857165f18484f5 Mon Sep 17 00:00:00 2001 From: Pritha Gupta Date: Mon, 3 Jun 2019 15:55:53 +0200 Subject: [PATCH 4/5] Improved the documentation of discrete choice functions --- csrank/choicefunctions/feta_choice.py | 2 +- .../generalized_linear_model.py | 14 +- csrank/discretechoice/feta_discrete_choice.py | 2 +- .../generalized_nested_logit.py | 30 +++- .../discretechoice/multinomial_logit_model.py | 112 ++++++++++++- csrank/discretechoice/nested_logit_model.py | 157 ++++++++++++++---- .../paired_combinatorial_logit.py | 30 +++- 7 files changed, 296 insertions(+), 51 deletions(-) diff --git a/csrank/choicefunctions/feta_choice.py b/csrank/choicefunctions/feta_choice.py index 3d32f4ea..ccf5bea8 100644 --- a/csrank/choicefunctions/feta_choice.py +++ b/csrank/choicefunctions/feta_choice.py @@ -41,7 +41,7 @@ def __init__(self, n_objects, n_object_features, n_hidden=2, n_units=8, add_zero Parameters ---------- n_objects : int - Number of objects to be ranked + Number of objects in each query set n_object_features : int Dimensionality of the feature space of each object n_hidden : int diff --git a/csrank/choicefunctions/generalized_linear_model.py b/csrank/choicefunctions/generalized_linear_model.py index 0a14d82d..a173d4d6 100644 --- a/csrank/choicefunctions/generalized_linear_model.py +++ b/csrank/choicefunctions/generalized_linear_model.py @@ -19,9 +19,10 @@ class GeneralizedLinearModel(ChoiceFunctions, Learner): def __init__(self, n_object_features, regularization='l2', random_state=None, **kwargs): """ Create an instance of the GeneralizedLinearModel model for learning the choice function. This model is - adapted from the multinomial logit model :class:`MultinomialLogitModel`. The utility score for each object - in query set :math:`Q` is defined as :math:`U(x) = w \cdot x`, where :math:`w` is the weight vector. - The probability of choosing an object :math:`x_i` is defined by taking sigmoid over the utility scores: + adapted from the multinomial logit model :class:`csrank.discretechoice.multinomial_logit_model.MultinomialLogitModel`. + The utility score for each object in query set :math:`Q` is defined as :math:`U(x) = w \cdot x`, + where :math:`w` is the weight vector. The probability of choosing an object :math:`x_i` is defined by taking + sigmoid over the utility scores: .. math:: @@ -65,8 +66,9 @@ def __init__(self, n_object_features, regularization='l2', random_state=None, ** @property def model_configuration(self): """ - Constructs the dictionary containing the priors for the parameters for the model according to the + Constructs the dictionary containing the priors for the weight vector for the model according to the regularization function. + Returns ------- configuration : dict @@ -121,13 +123,13 @@ def construct_model(self, X, Y): def fit(self, X, Y, sampler='vi', tune_size=0.1, thin_thresholds=1, **kwargs): """ - Fit a generalized logit model on the provided set of queries X and preferences Y of those objects. The + Fit a generalized logit model on the provided set of queries X and choices Y of those objects. The provided queries and corresponding preferences are of a fixed size (numpy arrays). 
For learning this network the binary cross entropy loss function for each object :math:`x_i \in Q` is defined as: .. math:: - C_{ij} = -y(i)\log(P_i) - (1 - y(i))\log(1 - P_i) \enspace, + C_{i} = -y(i)\log(P_i) - (1 - y(i))\log(1 - P_i) \enspace, where :math:`y` is ground-truth choice vector of the objects in the given query set :math:`Q`. The value :math:`y(i) = 1` if object :math:`x_i` is chosen else :math:`y(i) = 0`. diff --git a/csrank/discretechoice/feta_discrete_choice.py b/csrank/discretechoice/feta_discrete_choice.py index 8d404f7c..4143a757 100644 --- a/csrank/discretechoice/feta_discrete_choice.py +++ b/csrank/discretechoice/feta_discrete_choice.py @@ -38,7 +38,7 @@ def __init__(self, n_objects, n_object_features, n_hidden=2, n_units=8, add_zero Parameters ---------- n_objects : int - Number of objects to be ranked + Number of objects in each query set n_object_features : int Dimensionality of the feature space of each object n_hidden : int diff --git a/csrank/discretechoice/generalized_nested_logit.py b/csrank/discretechoice/generalized_nested_logit.py index 7a71247a..4cb58caa 100644 --- a/csrank/discretechoice/generalized_nested_logit.py +++ b/csrank/discretechoice/generalized_nested_logit.py @@ -107,6 +107,32 @@ def construct_model(self, X, Y): self.logger.info("Model construction completed") def fit(self, X, Y, sampler="vi", **kwargs): + """ + Fit a generalized nested logit model on the provided set of queries X and choices Y of those objects. The + provided queries and corresponding preferences are of a fixed size (numpy arrays). For learning this network + the categorical cross entropy loss function for each object :math:`x_i \in Q` is defined as: + + .. math:: + + C_{i} = -y(i)\log(P_i) \enspace, + + where :math:`y` is ground-truth discrete choice vector of the objects in the given query set :math:`Q`. + The value :math:`y(i) = 1` if object :math:`x_i` is chosen else :math:`y(i) = 0`. + + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Choices for given objects in the query + sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string + The sampler used to estimate the posterior mean and mass matrix from the trace. 
+ * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler + **kwargs : + Keyword arguments for the fit function + """ self.construct_model(X, Y) kwargs['random_seed'] = self.random_state.randint(2 ** 32, dtype='uint32') callbacks = kwargs['vi_params'].get('callbacks', []) @@ -175,10 +201,6 @@ def predict_scores(self, X, **kwargs): def predict_for_scores(self, scores, **kwargs): return DiscreteObjectChooser.predict_for_scores(self, scores, **kwargs) - def clear_memory(self, **kwargs): - self.logger.info("Clearing memory") - pass - def set_tunable_parameters(self, alpha=None, n_nests=None, loss_function='', regularization='l2', **point): if alpha is not None: self.alpha = alpha diff --git a/csrank/discretechoice/multinomial_logit_model.py b/csrank/discretechoice/multinomial_logit_model.py index 44872af1..49ffc553 100644 --- a/csrank/discretechoice/multinomial_logit_model.py +++ b/csrank/discretechoice/multinomial_logit_model.py @@ -14,7 +14,40 @@ class MultinomialLogitModel(DiscreteObjectChooser, Learner): - def __init__(self, n_object_features, loss_function='', regularization='l2', model_args={}, **kwargs): + def __init__(self, n_object_features, loss_function='', regularization='l2', **kwargs): + """ + Create an instance of the MultinomialLogitModel model for learning the discrete choice function. The utility + score for each object in query set :math:`Q` is defined as :math:`U(x) = w \cdot x`, where :math:`w` is + the weight vector. The probability of choosing an object :math:`x_i` is defined by taking softmax over the + utility scores of the objects: + + .. math:: + + P(x_i \\lvert Q) = \\frac{exp(U(x_i))}{\sum_{x_j \in Q} exp(U(x_j))} + + The discrete choice for the given query set :math:`Q` is defined as: + + .. math:: + + dc(Q) := \operatorname{argmax}_{x_i \in Q } \; P(x_i \\lvert Q) + + Parameters + ---------- + n_object_features : int + Number of features of the object space + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + random_state : int or object + Numpy random state + **kwargs + Keyword arguments for the algorithms + + References + ---------- + [1] Kenneth E Train. „Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap Logit, pp. 41–86. + """ self.logger = logging.getLogger(MultinomialLogitModel.__name__) self.n_object_features = n_object_features self.loss_function = likelihood_dict.get(loss_function, None) @@ -31,7 +64,16 @@ def __init__(self, n_object_features, loss_function='', regularization='l2', mod self.p = None @property - def model_priors(self): + def model_configuration(self): + """ + Constructs the dictionary containing the priors for the weight vector for the model according to the + regularization function. 
+ + Returns + ------- + configuration : dict + Dictionary containing the priors applies on the weights + """ if self._config is None: if self.regularization == 'l2': weight = pm.Normal @@ -45,7 +87,29 @@ def model_priors(self): return self._config def construct_model(self, X, Y): - self.logger.info('Creating model_args config {}'.format(print_dictionary(self.model_priors))) + """ + Constructs the multinomial logit model which evaluated the utility score as :math:`U(x) = w \cdot x`, where + :math:`w` is the weight vector. The probability of choosing the object :math:`x_i` from the query set + :math:`Q = \{x_1, \ldots ,x_n\}` is: + + .. math:: + + P_i = P(x_i \\lvert Q) = \\frac{exp(U(x_i))}{\sum_{x_j \in Q} exp(U(x_j))} + + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Preferences in the form of discrete choices for given objects + + Returns + ------- + model : pymc3 Model :class:`pm.Model` + """ + self.logger.info('Creating model_args config {}'.format(print_dictionary(self.model_configuration))) with pm.Model() as self.model: self.Xt = theano.shared(X) self.Yt = theano.shared(Y) @@ -60,6 +124,32 @@ def construct_model(self, X, Y): self.logger.info("Model construction completed") def fit(self, X, Y, sampler='vi', **kwargs): + """ + Fit a multinomial logit model on the provided set of queries X and choices Y of those objects. The + provided queries and corresponding preferences are of a fixed size (numpy arrays). For learning this network + the categorical cross entropy loss function for each object :math:`x_i \in Q` is defined as: + + .. math:: + + C_{i} = -y(i)\log(P_i) \enspace, + + where :math:`y` is ground-truth discrete choice vector of the objects in the given query set :math:`Q`. + The value :math:`y(i) = 1` if object :math:`x_i` is chosen else :math:`y(i) = 0`. + + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Choices for given objects in the query + sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string + The sampler used to estimate the posterior mean and mass matrix from the trace. + * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler + **kwargs : + Keyword arguments for the fit function + """ self.construct_model(X, Y) kwargs['random_seed'] = self.random_state.randint(2 ** 32, dtype='uint32') callbacks = kwargs['vi_params'].get('callbacks', []) @@ -123,11 +213,19 @@ def predict_scores(self, X, **kwargs): def predict_for_scores(self, scores, **kwargs): return DiscreteObjectChooser.predict_for_scores(self, scores, **kwargs) - def clear_memory(self, **kwargs): - self.logger.info("Clearing memory") - pass - def set_tunable_parameters(self, loss_function='', regularization="l1", **point): + """ + Set tunable parameters of the Multinomial Logit model to the values provided. 
+ + Parameters + ---------- + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + point: dict + Dictionary containing parameter values which are not tuned for the network + """ if loss_function in likelihood_dict.keys(): self.loss_function = likelihood_dict.get(loss_function, None) self.regularization = regularization diff --git a/csrank/discretechoice/nested_logit_model.py b/csrank/discretechoice/nested_logit_model.py index 2e61f33b..1eba43f3 100644 --- a/csrank/discretechoice/nested_logit_model.py +++ b/csrank/discretechoice/nested_logit_model.py @@ -20,6 +20,38 @@ class NestedLogitModel(DiscreteObjectChooser, Learner): def __init__(self, n_object_features, n_objects, n_nests=None, loss_function='', regularization='l1', alpha=1e-2, random_state=None, **kwd): + """ + Create an instance of the NestedLogitModel model for learning the discrete choice function. + + .. math:: + + P(x_i \\lvert Q) = \\frac{exp(U(x_i))}{\sum_{x_j \in Q} exp(U(x_j))} + + The discrete choice for the given query set :math:`Q` is defined as: + + .. math:: + + dc(Q) := \operatorname{argmax}_{x_i \in Q } \; P(x_i \\lvert Q) + + Parameters + ---------- + n_object_features : int + Number of features of the object space + n_objects: int + Number of objects in each query set + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + random_state : int or object + Numpy random state + **kwargs + Keyword arguments for the algorithms + + References + ---------- + [1] Kenneth E Train. „Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap Logit, pp. 41–86. + """ self.logger = logging.getLogger(NestedLogitModel.__name__) self.n_object_features = n_object_features self.n_objects = n_objects @@ -46,7 +78,18 @@ def __init__(self, n_object_features, n_objects, n_nests=None, loss_function='', self.y_nests = None @property - def model_priors(self): + def model_configuration(self): + """ + Constructs the dictionary containing the priors for the weight vectors for the model according to the + regularization function. 
The parameters are: + * weights : + * weights_k : + + Returns + ------- + configuration : dict + Dictionary containing the priors applies on the weights + """ if self._config is None: if self.regularization == 'l2': weight = pm.Normal @@ -60,7 +103,23 @@ def model_priors(self): self.logger.info('Creating model with config {}'.format(print_dictionary(self._config))) return self._config - def eval_utility(self, weights): + def create_nests(self, X): + n, n_obj, n_dim = X.shape + objects = X.reshape(n * n_obj, n_dim) + if self.cluster_model is None: + self.cluster_model = MiniBatchKMeans(n_clusters=self.n_nests, random_state=self.random_state).fit(objects) + self.features_nests = self.cluster_model.cluster_centers_ + prediction = self.cluster_model.labels_ + else: + prediction = self.cluster_model.predict(objects) + y_nests = [] + for i in np.arange(0, n * n_obj, step=n_obj): + nest_ids = prediction[i:i + n_obj] + y_nests.append(nest_ids) + y_nests = np.array(y_nests) + return y_nests + + def _eval_utility(self, weights): utility = tt.zeros(tuple(self.y_nests.shape)) for i in range(self.n_nests): rows, cols = tt.eq(self.y_nests, i).nonzero() @@ -87,14 +146,14 @@ def get_probability(self, utility, lambda_k, utility_k): p = pni_k * pn_k return p - def eval_utility_np(self, x_t, y_nests, weights): + def _eval_utility_np(self, x_t, y_nests, weights): utility = np.zeros(tuple(y_nests.shape)) for i in range(self.n_nests): rows, cols = np.where(y_nests == i) utility[rows, cols] = np.dot(x_t[rows, cols], weights[i]) return utility - def get_probability_np(self, y_nests, utility, lambda_k, utility_k): + def _get_probability_np(self, y_nests, utility, lambda_k, utility_k): n_instances, n_objects = y_nests.shape pni_k = np.zeros((n_instances, n_objects)) ivm = np.zeros((n_instances, self.n_nests)) @@ -113,23 +172,27 @@ def get_probability_np(self, y_nests, utility, lambda_k, utility_k): p = pni_k * pn_k return p - def create_nests(self, X): - n, n_obj, n_dim = X.shape - objects = X.reshape(n * n_obj, n_dim) - if self.cluster_model is None: - self.cluster_model = MiniBatchKMeans(n_clusters=self.n_nests, random_state=self.random_state).fit(objects) - self.features_nests = self.cluster_model.cluster_centers_ - prediction = self.cluster_model.labels_ - else: - prediction = self.cluster_model.predict(objects) - y_nests = [] - for i in np.arange(0, n * n_obj, step=n_obj): - nest_ids = prediction[i:i + n_obj] - y_nests.append(nest_ids) - y_nests = np.array(y_nests) - return y_nests - def construct_model(self, X, Y): + """ + Constructs the nested logit model. + + .. 
math:: + + P_i = P(x_i \\lvert Q) = \\frac{exp(U(x_i))}{\sum_{x_j \in Q} exp(U(x_j))} + + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Preferences in the form of discrete choices for given objects + + Returns + ------- + model : pymc3 Model :class:`pm.Model` + """ y_nests = self.create_nests(X) with pm.Model() as self.model: self.Xt = theano.shared(X) @@ -137,10 +200,10 @@ def construct_model(self, X, Y): self.y_nests = theano.shared(y_nests) shapes = {'weights': self.n_object_features, 'weights_k': self.n_object_features} - weights_dict = create_weight_dictionary(self.model_priors, shapes) + weights_dict = create_weight_dictionary(self.model_configuration, shapes) lambda_k = pm.Uniform('lambda_k', self.alpha, 1.0, shape=self.n_nests) weights = (weights_dict['weights'] / lambda_k[:, None]) - utility = self.eval_utility(weights) + utility = self._eval_utility(weights) utility_k = tt.dot(self.features_nests, weights_dict['weights_k']) self.p = self.get_probability(utility, lambda_k, utility_k) @@ -148,6 +211,32 @@ def construct_model(self, X, Y): self.logger.info("Model construction completed") def fit(self, X, Y, sampler="vi", **kwargs): + """ + Fit a nested logit model on the provided set of queries X and choices Y of those objects. The + provided queries and corresponding preferences are of a fixed size (numpy arrays). For learning this network + the categorical cross entropy loss function for each object :math:`x_i \in Q` is defined as: + + .. math:: + + C_{i} = -y(i)\log(P_i) \enspace, + + where :math:`y` is ground-truth discrete choice vector of the objects in the given query set :math:`Q`. + The value :math:`y(i) = 1` if object :math:`x_i` is chosen else :math:`y(i) = 0`. + + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Choices for given objects in the query + sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string + The sampler used to estimate the posterior mean and mass matrix from the trace. 
+ * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler + **kwargs : + Keyword arguments for the fit function + """ self.construct_model(X, Y) kwargs['random_seed'] = self.random_state.randint(2 ** 32, dtype='uint32') callbacks = kwargs['vi_params'].get('callbacks', []) @@ -202,8 +291,8 @@ def _predict_scores_fixed(self, X, **kwargs): lambda_k = np.array([mean_trace['lambda_k__{}'.format(i)] for i in range(self.n_nests)]) weights = (weights / lambda_k[:, None]) utility_k = np.dot(self.features_nests, weights_k) - utility = self.eval_utility_np(X, y_nests, weights) - scores = self.get_probability_np(y_nests, utility, lambda_k, utility_k) + utility = self._eval_utility_np(X, y_nests, weights) + scores = self._get_probability_np(y_nests, utility, lambda_k, utility_k) return scores def predict(self, X, **kwargs): @@ -215,11 +304,23 @@ def predict_scores(self, X, **kwargs): def predict_for_scores(self, scores, **kwargs): return DiscreteObjectChooser.predict_for_scores(self, scores, **kwargs) - def clear_memory(self, **kwargs): - self.logger.info("Clearing memory") - pass - def set_tunable_parameters(self, alpha=None, n_nests=None, loss_function='', regularization="l1", **point): + """ + Set tunable parameters of the Multinomial Logit model to the values provided. + + Parameters + ---------- + alpha: float (range : [0,1]) + The lower bound of the correlations between the objects in a nest + n_nests: int (range : [2,n_objects]) + The number of nests in which the objects are divided + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + point: dict + Dictionary containing parameter values which are not tuned for the network + """ if alpha is not None: self.alpha = alpha if n_nests is None: diff --git a/csrank/discretechoice/paired_combinatorial_logit.py b/csrank/discretechoice/paired_combinatorial_logit.py index 9951ff0e..015f3013 100644 --- a/csrank/discretechoice/paired_combinatorial_logit.py +++ b/csrank/discretechoice/paired_combinatorial_logit.py @@ -112,6 +112,32 @@ def construct_model(self, X, Y): self.logger.info("Model construction completed") def fit(self, X, Y, sampler="vi", **kwargs): + """ + Fit a paired combinatorial logit model on the provided set of queries X and choices Y of those objects. The + provided queries and corresponding preferences are of a fixed size (numpy arrays). For learning this network + the categorical cross entropy loss function for each object :math:`x_i \in Q` is defined as: + + .. math:: + + C_{i} = -y(i)\log(P_i) \enspace, + + where :math:`y` is ground-truth discrete choice vector of the objects in the given query set :math:`Q`. + The value :math:`y(i) = 1` if object :math:`x_i` is chosen else :math:`y(i) = 0`. + + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Choices for given objects in the query + sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string + The sampler used to estimate the posterior mean and mass matrix from the trace. 
+ * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler + **kwargs : + Keyword arguments for the fit function + """ self.construct_model(X, Y) kwargs['random_seed'] = self.random_state.randint(2 ** 32, dtype='uint32') callbacks = kwargs['vi_params'].get('callbacks', []) @@ -175,10 +201,6 @@ def predict_scores(self, X, **kwargs): def predict_for_scores(self, scores, **kwargs): return DiscreteObjectChooser.predict_for_scores(self, scores, **kwargs) - def clear_memory(self, **kwargs): - self.logger.info("Clearing memory") - pass - def set_tunable_parameters(self, alpha=5e-2, loss_function='', regularization='l2', **point): if alpha is not None: self.alpha = alpha From b10e373917bebeaa99debbf0905af915bbb2db56 Mon Sep 17 00:00:00 2001 From: Pritha Gupta Date: Tue, 4 Jun 2019 10:56:48 +0200 Subject: [PATCH 5/5] Completed the description of logit models Completed Paired combinatorial Logit model completed the documentation of logit models --- .../generalized_linear_model.py | 35 ++-- .../generalized_nested_logit.py | 164 ++++++++++++++++-- csrank/discretechoice/mixed_logit_model.py | 135 +++++++++++++- .../discretechoice/multinomial_logit_model.py | 37 ++-- csrank/discretechoice/nested_logit_model.py | 136 ++++++++++++--- .../paired_combinatorial_logit.py | 159 ++++++++++++++++- .../discretechoice/ranknet_discrete_choice.py | 2 +- 7 files changed, 593 insertions(+), 75 deletions(-) diff --git a/csrank/choicefunctions/generalized_linear_model.py b/csrank/choicefunctions/generalized_linear_model.py index a173d4d6..ba243177 100644 --- a/csrank/choicefunctions/generalized_linear_model.py +++ b/csrank/choicefunctions/generalized_linear_model.py @@ -48,6 +48,8 @@ def __init__(self, n_object_features, regularization='l2', random_state=None, ** References ---------- [1] Kenneth E Train. „Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap Logit, pp. 41–86. + + [2] Kenneth Train. Qualitative choice analysis. Cambridge, MA: MIT Press, 1986 """ self.logger = logging.getLogger(GeneralizedLinearModel.__name__) self.n_object_features = n_object_features @@ -66,13 +68,25 @@ def __init__(self, n_object_features, regularization='l2', random_state=None, ** @property def model_configuration(self): """ - Constructs the dictionary containing the priors for the weight vector for the model according to the - regularization function. + Constructs the dictionary containing the priors for the weight vectors for the model according to the + regularization function. The parameters are: + * **weights** : Weights to evaluates the utility of the objects - Returns - ------- - configuration : dict - Dictionary containing the priors applies on the weights + For ``l1`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{b}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Laplace}(\\text{mu}=\\text{mu}_w, \\text{b}=\\text{b}_w) + + For ``l2`` regularization the priors are: + + .. 
math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{sd}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Normal}(\\text{mu}=\\text{mu}_w, \\text{sd}=\\text{sd}_w) """ if self.regularization == 'l2': weight = pm.Normal @@ -141,10 +155,11 @@ def fit(self, X, Y, sampler='vi', tune_size=0.1, thin_thresholds=1, **kwargs): Y : numpy array (n_instances, n_objects) Choices for given objects in the query sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string - The sampler used to estimate the posterior mean and mass matrix from the trace. - * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix - * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler - * **nuts** : Use the No-U-Turn sampler + The sampler used to estimate the posterior mean and mass matrix from the trace + + * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler tune_size: float (range : [0,1]) Percentage of instances to split off to tune the threshold for the choice function thin_thresholds: int diff --git a/csrank/discretechoice/generalized_nested_logit.py b/csrank/discretechoice/generalized_nested_logit.py index 4cb58caa..b65c30d9 100644 --- a/csrank/discretechoice/generalized_nested_logit.py +++ b/csrank/discretechoice/generalized_nested_logit.py @@ -18,7 +18,54 @@ class GeneralizedNestedLogitModel(DiscreteObjectChooser, Learner): def __init__(self, n_object_features, n_objects, n_nests=None, loss_function='None', regularization='l2', - alpha=5e-2, random_state=None, model_args={}, **kwd): + alpha=5e-2, random_state=None, **kwd): + """ + Create an instance of the Generalized Nested Logit model for learning the discrete choice function. This + model divides objects into subsets called nests, such that the each object is associtated to each nest to some degree. + This model structure is 1-layer of hierarchy and the :math:`\lambda` for each nest :math:`B_k` signifies the degree of independence + and :math:`1-\lambda` signifies the correlations between the object in it. We learn two weight vectors and the :math:`\lambda s`. + The probability of choosing an object :math:`x_i` from the given query set :math:`Q` is defined by product + of choosing the nest in which :math:`x_i` exists and then choosing the the object from the nest. + + .. math:: + + P(x_i \\lvert Q) = P_i = \sum_{\substack{B_k \in \mathcal{B} \\ i \in B_k}}P_{i \\lvert B_k} P_{B_k} \enspace , + + + The discrete choice for the given query set :math:`Q` is defined as: + + .. math:: + + dc(Q) := \operatorname{argmax}_{x_i \in Q } \; P(x_i \\lvert Q) + + Parameters + ---------- + n_object_features : int + Number of features of the object space + n_objects: int + Number of objects in each query set + n_nests : int range : [2,n_objects/2] + The number of nests/subsets in which the objects are divided + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + alpha: float (range : [0,1]) + The lower bound of the correlations between the objects in a nest + random_state : int or object + Numpy random state + **kwargs + Keyword arguments for the algorithms + + References + ---------- + [1] Kenneth E Train. 
„Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap GEV, pp. 87–111. + + [2] Kenneth Train. Qualitative choice analysis. Cambridge, MA: MIT Press, 1986 + + [3] Chieh-Hua Wen and Frank S Koppelman. „The generalized nested logit model“. In: Transportation Research Part B: Methodological 35.7 (2001), pp. 627–641 + + """ self.logger = logging.getLogger(GeneralizedNestedLogitModel.__name__) self.n_object_features = n_object_features @@ -45,7 +92,34 @@ def __init__(self, n_object_features, n_objects, n_nests=None, loss_function='No self._config = None @property - def model_priors(self): + def model_configuration(self): + """ + Constructs the dictionary containing the priors for the weight vectors for the model according to the + regularization function. The parameters are: + * **weights** : Weights to evaluates the utility of the objects + * **weights_k** : Weights to evaluates the fractional allocation of each object in :math:'Q' to each nest + + For ``l1`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{b}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Laplace}(\\text{mu}=\\text{mu}_w, \\text{b}=\\text{b}_w) + + For ``l2`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{sd}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Normal}(\\text{mu}=\\text{mu}_w, \\text{sd}=\\text{sd}_w) + + Returns + ------- + configuration : dict + Dictionary containing the priors applies on the weights + """ if self._config is None: if self.regularization == 'l2': weight = pm.Normal @@ -60,6 +134,40 @@ def model_priors(self): return self._config def get_probabilities(self, utility, lambda_k, alpha_ik): + """ + This method calculates the probability of choosing an object from the query set using the following parameters of the model which are used: + + * **weights** (:math:`w`): Weights to get the utility of the object :math:`Y_i = U(x_i) = w \cdot x_i` + * **weights_k** (:math:`w_k`): Weights to get fractional allocation of each object :math:'x_j' in :math:'Q' to each nest math:`B_k` as :math:`\alpha_{ik} = w_k \cdot x_i`. + * **lambda_k** (:math:`\lambda_k`): Lambda for nest :math:`B_k` for correlations between the obejcts. + + The probability of choosing the object :math:`x_i` from the query set :math:`Q`: + + .. 
math:: + P_i = \sum_{\substack{B_k \in \mathcal{B} \\ i \in B_k}} P_{i \\lvert {B_k}} P_{B_k} \enspace where, \\\\ + P_{B_k} = \\frac{{\\left(\sum_{j \in B_k} {\\left(\\alpha_{jk} \\boldsymbol{e}^{V_j} \\right)}^ {^{1}/{\lambda_k}} \\right)}^{\lambda_k}}{\sum_{\ell = 1}^{K} {\\left( \sum_{j \in B_{\ell}} {\\left( \\alpha_{j\ell} \\boldsymbol{e}^{V_j} \\right)}^{^{1}/{\lambda_\ell}} \\right)^{\lambda_{\ell}}}} \\\\ + P_{{i} \\lvert {B_k}} = \\frac{{\\left(\\alpha_{ik} \\boldsymbol{e}^{V_i} \\right)}^{^{1}/{\lambda_k}}}{\sum_{j \in B_k} {\\left(\\alpha_{jk} \\boldsymbol{e}^{V_j} \\right)}^{^{1}/{\lambda_k}}} \enspace , + + + Parameters + ---------- + utility : theano tensor + (n_instances, n_objects) + Utility :math:`Y_i` of the objects :math:`x_i \in Q` in the query sets + lambda_k : theano tensor (range : [alpha, 1.0]) + (n_nests) + Measure of independence amongst the obejcts in each nests + alpha_ik : theano tensor + (n_instances, n_objects, n_nests) + Fractional allocation of each object :math:`x_i` in each nest math:`B_k` + + Returns + ------- + p : theano tensor + (n_instances, n_objects) + Choice probabilities :math:`P_i` of the objects :math:`x_i \in Q` in the query sets + + """ n_nests = self.n_nests n_instances, n_objects = utility.shape pik = tt.zeros((n_instances, n_objects, n_nests)) @@ -75,7 +183,7 @@ def get_probabilities(self, utility, lambda_k, alpha_ik): p = p.sum(axis=2) return p - def get_probabilities_np(self, utility, lambda_k, alpha_ik): + def _get_probabilities_np(self, utility, lambda_k, alpha_ik): n_nests = self.n_nests n_instances, n_objects = utility.shape pik = np.zeros((n_instances, n_objects, n_nests)) @@ -92,11 +200,30 @@ def get_probabilities_np(self, utility, lambda_k, alpha_ik): return p def construct_model(self, X, Y): + """ + Constructs the nested logit model by applying priors on weight vectors **weights** and **weights_k** as per + :meth:`model_configuration`. Then we apply a uniform prior to the :math:`\lambda s`, i.e. + :math:`\lambda s \sim Uniform(\\text{alpha}, 1.0)`.The probability of choosing the object :math:`x_i` from the + query set :math:`Q = \{x_1, \ldots ,x_n\}` is evaluated in :meth:`get_probabilities`. + + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Preferences in the form of discrete choices for given objects + + Returns + ------- + model : pymc3 Model :class:`pm.Model` + """ with pm.Model() as self.model: self.Xt = theano.shared(X) self.Yt = theano.shared(Y) shapes = {'weights': self.n_object_features, 'weights_ik': (self.n_object_features, self.n_nests)} - weights_dict = create_weight_dictionary(self.model_priors, shapes) + weights_dict = create_weight_dictionary(self.model_configuration, shapes) alpha_ik = tt.dot(self.Xt, weights_dict['weights_ik']) alpha_ik = ttu.softmax(alpha_ik, axis=2) @@ -125,11 +252,12 @@ def fit(self, X, Y, sampler="vi", **kwargs): Feature vectors of the objects Y : numpy array (n_instances, n_objects) Choices for given objects in the query - sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string - The sampler used to estimate the posterior mean and mass matrix from the trace. 
- * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix - * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler - * **nuts** : Use the No-U-Turn sampler + sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string + The sampler used to estimate the posterior mean and mass matrix from the trace + + * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler **kwargs : Keyword arguments for the fit function """ @@ -189,7 +317,7 @@ def _predict_scores_fixed(self, X, **kwargs): alpha_ik = np.dot(X, weights_ik) alpha_ik = npu.softmax(alpha_ik, axis=2) utility = np.dot(X, weights) - p = self.get_probabilities_np(utility, lambda_k, alpha_ik) + p = self._get_probabilities_np(utility, lambda_k, alpha_ik) return p def predict(self, X, **kwargs): @@ -202,6 +330,22 @@ def predict_for_scores(self, scores, **kwargs): return DiscreteObjectChooser.predict_for_scores(self, scores, **kwargs) def set_tunable_parameters(self, alpha=None, n_nests=None, loss_function='', regularization='l2', **point): + """ + Set tunable parameters of the Nested Logit model to the values provided. + + Parameters + ---------- + alpha: float (range : [0,1]) + The lower bound of the correlations between the objects in a nest + n_nests: int (range : [2,n_objects]) + The number of nests in which the objects are divided + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + point: dict + Dictionary containing parameter values which are not tuned for the network + """ if alpha is not None: self.alpha = alpha if n_nests is None: diff --git a/csrank/discretechoice/mixed_logit_model.py b/csrank/discretechoice/mixed_logit_model.py index 6bdab0c0..bbe4876f 100644 --- a/csrank/discretechoice/mixed_logit_model.py +++ b/csrank/discretechoice/mixed_logit_model.py @@ -16,7 +16,48 @@ class MixedLogitModel(DiscreteObjectChooser, Learner): - def __init__(self, n_object_features, n_mixtures=4, loss_function='', regularization='l2', model_args={}, **kwargs): + def __init__(self, n_object_features, n_mixtures=4, loss_function='', regularization='l2', **kwargs): + """ + Create an instance of the Mixed Logit model for learning the discrete choice function. In this model we + assume weights of this model to be random due to which this model can learn different variations in choices + amongst the individuals. The utility score for each object in query set :math:`Q` is defined as + :math:`U_r(x) = w_r \cdot x`, where :math:`w_r` is the k-th sample weight vector from the underlying distribution + The probability of choosing an object :math:`x_i` is defined by taking softmax over the + utility scores of the objects: + + .. math:: + + P(x_i \\lvert Q) = \\frac{1}{R} \sum_{r=1}^R \\frac{exp(U_r(x_i))}{\sum_{x_j \in Q} exp(U_r(x_j))} + + The discrete choice for the given query set :math:`Q` is defined as: + + .. 
math:: + + dc(Q) := \operatorname{argmax}_{x_i \in Q } \; P(x_i \\lvert Q) + + Parameters + ---------- + n_object_features : int + Number of features of the object space + n_mixtures: int (range : [2, inf]) + The number of logit models (:math:`R`) which are used to estimate the choice probability + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + random_state : int or object + Numpy random state + **kwargs + Keyword arguments for the algorithms + + References + ---------- + [1] Kenneth E Train. „Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap Mixed Logit, pp. 153–172. + + [2] Kenneth Train. Qualitative choice analysis. Cambridge, MA: MIT Press, 1986 + + [3] Daniel McFadden and Kenneth Train. „Mixed MNL models for discrete response“. In: Journal of applied Econometrics 15.5 (2000), pp. 447–470 + """ self.logger = logging.getLogger(MixedLogitModel.__name__) self.n_object_features = n_object_features self.loss_function = likelihood_dict.get(loss_function, None) @@ -34,7 +75,28 @@ def __init__(self, n_object_features, n_mixtures=4, loss_function='', regulariza self.p = None @property - def model_priors(self): + def model_configuration(self): + """ + Constructs the dictionary containing the priors for the weight vectors for the model according to the + regularization function. The parameters are: + * **weights** : Distribution of the weigh vectors to evaluates the utility of the objects + + For ``l1`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{b}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Laplace}(\\text{mu}=\\text{mu}_w, \\text{b}=\\text{b}_w) + + For ``l2`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{sd}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Normal}(\\text{mu}=\\text{mu}_w, \\text{sd}=\\text{sd}_w) + """ if self._config is None: if self.regularization == 'l2': weight = pm.Normal @@ -48,17 +110,66 @@ def model_priors(self): return self._config def construct_model(self, X, Y): + """ + Constructs the mixed logit model by applying priors on weight vectors **weights** as per + :meth:`model_configuration`. The probability of choosing the object :math:`x_i` from the query set + :math:`Q = \{x_1, \ldots ,x_n\}` assuming we draw :math:`R` samples of the weight vectors is: + + .. 
math:: + + P(x_i \\lvert Q) = \\frac{1}{R} \sum_{r=1}^R \\frac{exp(U_r(x_i))}{\sum_{x_j \in Q} exp(U_r(x_j))} + + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Preferences in the form of discrete choices for given objects + + Returns + ------- + model : pymc3 Model :class:`pm.Model` + """ with pm.Model() as self.model: self.Xt = theano.shared(X) self.Yt = theano.shared(Y) shapes = {'weights': (self.n_object_features, self.n_mixtures)} - weights_dict = create_weight_dictionary(self.model_priors, shapes) + weights_dict = create_weight_dictionary(self.model_configuration, shapes) utility = tt.dot(self.Xt, weights_dict['weights']) self.p = tt.mean(ttu.softmax(utility, axis=1), axis=2) yl = LogLikelihood('yl', loss_func=self.loss_function, p=self.p, observed=self.Yt) self.logger.info("Model construction completed") def fit(self, X, Y, sampler='vi', **kwargs): + """ + Fit a multinomial logit model on the provided set of queries X and choices Y of those objects. The + provided queries and corresponding preferences are of a fixed size (numpy arrays). For learning this network + the categorical cross entropy loss function for each object :math:`x_i \in Q` is defined as: + + .. math:: + + C_{i} = -y(i)\log(P_i) \enspace, + + where :math:`y` is ground-truth discrete choice vector of the objects in the given query set :math:`Q`. + The value :math:`y(i) = 1` if object :math:`x_i` is chosen else :math:`y(i) = 0`. + + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Choices for given objects in the query + sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string + The sampler used to estimate the posterior mean and mass matrix from the trace + + * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler + **kwargs : + Keyword arguments for the fit function + """ self.construct_model(X, Y) kwargs['random_seed'] = self.random_state.randint(2 ** 32, dtype='uint32') callbacks = kwargs['vi_params'].get('callbacks', []) @@ -123,11 +234,21 @@ def predict_scores(self, X, **kwargs): def predict_for_scores(self, scores, **kwargs): return DiscreteObjectChooser.predict_for_scores(self, scores, **kwargs) - def clear_memory(self, **kwargs): - self.logger.info("Clearing memory") - pass + def set_tunable_parameters(self, n_mixtures=4, loss_function='', regularization="l1", **point): + """ + Set tunable parameters of the Mixed Logit model to the values provided. 
- def set_tunable_parameters(self, loss_function='', regularization="l1", n_mixtures=4, **point): + Parameters + ---------- + n_mixtures: int (range : [2, inf]) + The number of logit models (:math:`R`) which are used to estimate the choice probability + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + point: dict + Dictionary containing parameter values which are not tuned for the network + """ if loss_function in likelihood_dict.keys(): self.loss_function = likelihood_dict.get(loss_function, None) self.n_mixtures = n_mixtures diff --git a/csrank/discretechoice/multinomial_logit_model.py b/csrank/discretechoice/multinomial_logit_model.py index 49ffc553..5613e59f 100644 --- a/csrank/discretechoice/multinomial_logit_model.py +++ b/csrank/discretechoice/multinomial_logit_model.py @@ -16,7 +16,7 @@ class MultinomialLogitModel(DiscreteObjectChooser, Learner): def __init__(self, n_object_features, loss_function='', regularization='l2', **kwargs): """ - Create an instance of the MultinomialLogitModel model for learning the discrete choice function. The utility + Create an instance of the Multinomial Logit model for learning the discrete choice function. The utility score for each object in query set :math:`Q` is defined as :math:`U(x) = w \cdot x`, where :math:`w` is the weight vector. The probability of choosing an object :math:`x_i` is defined by taking softmax over the utility scores of the objects: @@ -47,6 +47,8 @@ def __init__(self, n_object_features, loss_function='', regularization='l2', **k References ---------- [1] Kenneth E Train. „Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap Logit, pp. 41–86. + + [2] Kenneth Train. Qualitative choice analysis. Cambridge, MA: MIT Press, 1986 """ self.logger = logging.getLogger(MultinomialLogitModel.__name__) self.n_object_features = n_object_features @@ -66,13 +68,25 @@ def __init__(self, n_object_features, loss_function='', regularization='l2', **k @property def model_configuration(self): """ - Constructs the dictionary containing the priors for the weight vector for the model according to the - regularization function. + Constructs the dictionary containing the priors for the weight vectors for the model according to the + regularization function. The parameters are: + * **weights** : Weights to evaluates the utility of the objects - Returns - ------- - configuration : dict - Dictionary containing the priors applies on the weights + For ``l1`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{b}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Laplace}(\\text{mu}=\\text{mu}_w, \\text{b}=\\text{b}_w) + + For ``l2`` regularization the priors are: + + .. 
math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{sd}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Normal}(\\text{mu}=\\text{mu}_w, \\text{sd}=\\text{sd}_w) """ if self._config is None: if self.regularization == 'l2': @@ -143,10 +157,11 @@ def fit(self, X, Y, sampler='vi', **kwargs): Y : numpy array (n_instances, n_objects) Choices for given objects in the query sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string - The sampler used to estimate the posterior mean and mass matrix from the trace. - * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix - * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler - * **nuts** : Use the No-U-Turn sampler + The sampler used to estimate the posterior mean and mass matrix from the trace + + * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler **kwargs : Keyword arguments for the fit function """ diff --git a/csrank/discretechoice/nested_logit_model.py b/csrank/discretechoice/nested_logit_model.py index 1eba43f3..396cd28d 100644 --- a/csrank/discretechoice/nested_logit_model.py +++ b/csrank/discretechoice/nested_logit_model.py @@ -21,11 +21,19 @@ class NestedLogitModel(DiscreteObjectChooser, Learner): def __init__(self, n_object_features, n_objects, n_nests=None, loss_function='', regularization='l1', alpha=1e-2, random_state=None, **kwd): """ - Create an instance of the NestedLogitModel model for learning the discrete choice function. + Create an instance of the Nested Logit model for learning the discrete choice function. This model divides + objects into disjoint subsets called nests,such that the objects which are similar to each other are in same + nest. This model structure is 1-layer of hierarchy and the :math:`\lambda` for each nest :math:`B_k` signifies + the degree of independence and :math:`1-\lambda` signifies the correlations between the object in it. We + learn two weight vectors and the :math:`\lambda s`. + + The probability of choosing an object :math:`x_i` from the given query set :math:`Q` is defined by product + of choosing the nest in which :math:`x_i` exists and then choosing the the object from the nest. .. math:: - P(x_i \\lvert Q) = \\frac{exp(U(x_i))}{\sum_{x_j \in Q} exp(U(x_j))} + P(x_i \\lvert Q) = P_i = P_{i \lvert B_k} P_{B_k} \enspace , + The discrete choice for the given query set :math:`Q` is defined as: @@ -39,10 +47,14 @@ def __init__(self, n_object_features, n_objects, n_nests=None, loss_function='', Number of features of the object space n_objects: int Number of objects in each query set + n_nests : int range : [2,n_objects/2] + The number of nests/subsets in which the objects are divided loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} Loss function to be used for the discrete choice decision from the query set regularization : string, {‘l1’, ‘l2’}, string Regularizer function (L1 or L2) applied to the `kernel` weights matrix + alpha: float (range : [0,1]) + The lower bound of the correlations between the objects in a nest random_state : int or object Numpy random state **kwargs @@ -50,7 +62,11 @@ def __init__(self, n_object_features, n_objects, n_nests=None, loss_function='', References ---------- - [1] Kenneth E Train. „Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap Logit, pp. 41–86. 
+ [1] Kenneth E Train. „Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap GEV, pp. 87–111. + + [2] Kenneth Train. Qualitative choice analysis. Cambridge, MA: MIT Press, 1986 + + [3] Kenneth Train and Daniel McFadden. „The goods/leisure tradeoff and disaggregate work trip mode choice models“. In: Transportation research 12.5 (1978), pp. 349–353 """ self.logger = logging.getLogger(NestedLogitModel.__name__) self.n_object_features = n_object_features @@ -82,8 +98,25 @@ def model_configuration(self): """ Constructs the dictionary containing the priors for the weight vectors for the model according to the regularization function. The parameters are: - * weights : - * weights_k : + * **weights** : Weights to evaluates the utility of the objects + * **weights_k** : Weights to evaluates the utility of the nests + + For ``l1`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{b}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Laplace}(\\text{mu}=\\text{mu}_w, \\text{b}=\\text{b}_w) + + For ``l2`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{sd}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Normal}(\\text{mu}=\\text{mu}_w, \\text{sd}=\\text{sd}_w) + Returns ------- @@ -104,6 +137,22 @@ def model_configuration(self): return self._config def create_nests(self, X): + """ + For allocating the objects to different nests we use the clustering algorithm with number of clusters + :math:`k` and allocate the similar objects in query set :math:`Q`. + + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects in the query sets + + Returns + ------- + Yn : numpy array + (n_instances, n_objects) Values for each object implying the nest it belongs to. For example for :math:`2` nests the value 0 implies that object is allocated to nest 1 and value 1 implies it is allocated to nest 2. 
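+
+        As an illustrative sketch (hypothetical values, since the actual allocation depends on the fitted
+        :class:`MiniBatchKMeans` clustering), for two instances with four objects each and ``n_nests=2`` the
+        returned array could look like::
+
+            np.array([[0, 1, 1, 0],
+                      [1, 0, 0, 1]])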
+ + """ n, n_obj, n_dim = X.shape objects = X.reshape(n * n_obj, n_dim) if self.cluster_model is None: @@ -112,12 +161,12 @@ def create_nests(self, X): prediction = self.cluster_model.labels_ else: prediction = self.cluster_model.predict(objects) - y_nests = [] + Yn = [] for i in np.arange(0, n * n_obj, step=n_obj): nest_ids = prediction[i:i + n_obj] - y_nests.append(nest_ids) - y_nests = np.array(y_nests) - return y_nests + Yn.append(nest_ids) + Yn = np.array(Yn) + return Yn def _eval_utility(self, weights): utility = tt.zeros(tuple(self.y_nests.shape)) @@ -126,7 +175,40 @@ def _eval_utility(self, weights): utility = tt.set_subtensor(utility[rows, cols], tt.dot(self.Xt[rows, cols], weights[i])) return utility - def get_probability(self, utility, lambda_k, utility_k): + def get_probabilities(self, utility, lambda_k, utility_k): + """ + This method calculates the probability of choosing an object from the query set using the following parameters of the model which are used: + + * **weights** (:math:`w`): Weights to get the utility of the object :math:`Y_i = U(x_i) = w \cdot x_i` + * **weights_k** (:math:`w_k`): Weights to get the utility of the next :math:`W_k = U_k(x) = w_k \cdot c_k`, where :math:`c_k` is the center of the object space of nest :math:`B_k` + * **lambda_k** (:math:`\lambda_k`): Lambda is the measure of independence amongst the obejcts in the nest :math:`B_k` + + The probability of choosing the object :math:`x_i` from the query set :math:`Q`: + + .. math:: + P_i = \\frac{\\boldsymbol{e}^{ ^{Y_i} /_{\lambda_k}}}{\sum_{j \in B_k} \\boldsymbol{e}^{^{Y_j} /_{\lambda_k}}} \\frac {\\boldsymbol{e}^{W_k + \lambda_k I_k}} {\sum_{\\ell = 1}^{K} \\boldsymbol{e}^{ W_{\\ell } + \lambda_{\\ell} I_{\\ell}}} \quad i \in B_k \enspace , \\\\ + where,\enspace I_k = \ln \sum_{ j \in B_k} \\boldsymbol{e}^{^{Y_j} /_{\lambda_k}} + + + Parameters + ---------- + utility : theano tensor + (n_instances, n_objects) + Utility :math:`Y_i` of the objects :math:`x_i \in Q` in the query sets + lambda_k : theano tensor (range : [alpha, 1.0]) + (n_nests) + Measure of independence amongst the obejcts in each nests + utility_k : theano tensor + (n_instances, n_nests) + Utilities of the nests :math:`B_k \in \mathcal{B}` + + Returns + ------- + p : theano tensor + (n_instances, n_objects) + Choice probabilities :math:`P_i` of the objects :math:`x_i \in Q` in the query sets + + """ n_instances, n_objects = self.y_nests.shape pni_k = tt.zeros((n_instances, n_objects)) ivm = tt.zeros((n_instances, self.n_nests)) @@ -153,20 +235,20 @@ def _eval_utility_np(self, x_t, y_nests, weights): utility[rows, cols] = np.dot(x_t[rows, cols], weights[i]) return utility - def _get_probability_np(self, y_nests, utility, lambda_k, utility_k): - n_instances, n_objects = y_nests.shape + def _get_probabilities_np(self, Y_n, utility, lambda_k, utility_k): + n_instances, n_objects = Y_n.shape pni_k = np.zeros((n_instances, n_objects)) ivm = np.zeros((n_instances, self.n_nests)) for i in range(self.n_nests): sub_tensor = np.copy(utility) - sub_tensor[np.where(y_nests != i)] = -1e50 + sub_tensor[np.where(Y_n != i)] = -1e50 ink = npu.logsumexp(sub_tensor) - pni_k[np.where(y_nests == i)] = np.exp(sub_tensor - ink)[np.where(y_nests == i)] + pni_k[np.where(Y_n == i)] = np.exp(sub_tensor - ink)[np.where(Y_n == i)] ivm[:, i] = lambda_k[i] * ink[:, 0] + utility_k[i] pk = np.exp(ivm - npu.logsumexp(ivm)) pn_k = np.zeros((n_instances, n_objects)) for i in range(self.n_nests): - rows, cols = np.where(y_nests == i) + rows, cols = np.where(Y_n == i) p 
= np.ones((n_instances, n_objects)) * pk[:, i][:, None] pn_k[rows, cols] = p[rows, cols] p = pni_k * pn_k @@ -174,11 +256,10 @@ def _get_probability_np(self, y_nests, utility, lambda_k, utility_k): def construct_model(self, X, Y): """ - Constructs the nested logit model. - - .. math:: - - P_i = P(x_i \\lvert Q) = \\frac{exp(U(x_i))}{\sum_{x_j \in Q} exp(U(x_j))} + Constructs the nested logit model by applying priors on weight vectors **weights** and **weights_k** as per + :meth:`model_configuration`. Then we apply a uniform prior to the :math:`\lambda s`, i.e. + :math:`\lambda s \sim Uniform(\\text{alpha}, 1.0)`.The probability of choosing the object :math:`x_i` from + the query set :math:`Q = \{x_1, \ldots ,x_n\}` is evaluated in :meth:`get_probabilities`. Parameters ---------- @@ -205,7 +286,7 @@ def construct_model(self, X, Y): weights = (weights_dict['weights'] / lambda_k[:, None]) utility = self._eval_utility(weights) utility_k = tt.dot(self.features_nests, weights_dict['weights_k']) - self.p = self.get_probability(utility, lambda_k, utility_k) + self.p = self.get_probabilities(utility, lambda_k, utility_k) yl = LogLikelihood('yl', loss_func=self.loss_function, p=self.p, observed=self.Yt) self.logger.info("Model construction completed") @@ -230,10 +311,11 @@ def fit(self, X, Y, sampler="vi", **kwargs): Y : numpy array (n_instances, n_objects) Choices for given objects in the query sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string - The sampler used to estimate the posterior mean and mass matrix from the trace. - * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix - * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler - * **nuts** : Use the No-U-Turn sampler + The sampler used to estimate the posterior mean and mass matrix from the trace + + * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler **kwargs : Keyword arguments for the fit function """ @@ -292,7 +374,7 @@ def _predict_scores_fixed(self, X, **kwargs): weights = (weights / lambda_k[:, None]) utility_k = np.dot(self.features_nests, weights_k) utility = self._eval_utility_np(X, y_nests, weights) - scores = self._get_probability_np(y_nests, utility, lambda_k, utility_k) + scores = self._get_probabilities_np(y_nests, utility, lambda_k, utility_k) return scores def predict(self, X, **kwargs): @@ -306,7 +388,7 @@ def predict_for_scores(self, scores, **kwargs): def set_tunable_parameters(self, alpha=None, n_nests=None, loss_function='', regularization="l1", **point): """ - Set tunable parameters of the Multinomial Logit model to the values provided. + Set tunable parameters of the Nested Logit model to the values provided. Parameters ---------- diff --git a/csrank/discretechoice/paired_combinatorial_logit.py b/csrank/discretechoice/paired_combinatorial_logit.py index 015f3013..460d1bf4 100644 --- a/csrank/discretechoice/paired_combinatorial_logit.py +++ b/csrank/discretechoice/paired_combinatorial_logit.py @@ -19,7 +19,57 @@ class PairedCombinatorialLogit(DiscreteObjectChooser, Learner): def __init__(self, n_object_features, n_objects, loss_function='', regularization='l2', alpha=5e-2, - random_state=None, model_args={}, **kwd): + random_state=None, **kwd): + """ + Create an instance of the Paired Combinatorial Logit model for learning the discrete choice function. 
This + model considering each pair of objects as a different nest allowing unique covariances for each pair of objects, + and each object is a member of :math:`n - 1` nests. This model structure is 1-layer of hierarchy and the + :math:`\lambda` for each nest :math:`B_k` signifies the degree of independence and :math:`1-\lambda` signifies + the correlations between the object in it. We learn two weight vectors and the :math:`\lambda s`. + * **weights** (:math:`w`): Weights to get the utility of the object :math:`Y_i = U(x_i) = w \cdot x_i` + * **lambda_k** (:math:`\lambda_k`): Lambda for nest nest :math:`B_k` for correlations between the obejcts. + + The probability of choosing an object :math:`x_i` from the given query set :math:`Q` is defined by product + of choosing the nest in which :math:`x_i` exists and then choosing the the object from the nest. + + .. math:: + + P(x_i \\lvert Q) = P_i = \sum_{\substack{B_k \in \mathcal{B} \\ i \in B_k}}P_{i \\lvert B_k} P_{B_k} \enspace , + + + The discrete choice for the given query set :math:`Q` is defined as: + + .. math:: + + dc(Q) := \operatorname{argmax}_{x_i \in Q } \; P(x_i \\lvert Q) + + Parameters + ---------- + n_object_features : int + Number of features of the object space + n_objects: int + Number of objects in each query set + n_nests : int range : [2,n_objects/2] + The number of nests/subsets in which the objects are divided + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + alpha: float (range : [0,1]) + The lower bound of the correlations between the objects in a nest + random_state : int or object + Numpy random state + **kwargs + Keyword arguments for the algorithms + + References + ---------- + [1] Kenneth E Train. „Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap GEV, pp. 87–111. + + [2] Kenneth Train. Qualitative choice analysis. Cambridge, MA: MIT Press, 1986 + + [3] Chaushie Chu. „A paired combinatorial logit model for travel demand analysis“. In: Proceedings of the fifth world conference on transportation research. Vol. 4.1989, pp. 295–309 + """ self.logger = logging.getLogger(PairedCombinatorialLogit.__name__) self.n_object_features = n_object_features self.n_objects = n_objects @@ -41,7 +91,33 @@ def __init__(self, n_object_features, n_objects, loss_function='', regularizatio self.p = None @property - def model_priors(self): + def model_configuration(self): + """ + Constructs the dictionary containing the priors for the weight vectors for the model according to the + regularization function. The parameters are: + * **weights** : Weights to evaluates the utility of the objects + + For ``l1`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{b}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Laplace}(\\text{mu}=\\text{mu}_w, \\text{b}=\\text{b}_w) + + For ``l2`` regularization the priors are: + + .. 
math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{sd}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Normal}(\\text{mu}=\\text{mu}_w, \\text{sd}=\\text{sd}_w) + + Returns + ------- + configuration : dict + Dictionary containing the priors applies on the weights + """ if self._config is None: if self.regularization == 'l2': weight = pm.Normal @@ -54,8 +130,39 @@ def model_priors(self): 'weights_k': [weight, {'mu': (pm.Normal, {'mu': 0, 'sd': 5}), prior: (pm.HalfCauchy, {'beta': 1})}]} self.logger.info('Creating model with config {}'.format(print_dictionary(self._config))) return self._config +# def get_probabilities(self, utility, lambda_k): + """ + This method calculates the probability of choosing an object from the query set using the following parameters of the model which are used: + + * **weights** (:math:`w`): Weights to get the utility of the object :math:`Y_i = U(x_i) = w \cdot x_i` + * **lambda_k** (:math:`\lambda_k`): Lambda is the measure of independence amongst the obejcts in the nest :math:`B_k` + + The probability of choosing the object :math:`x_i` from the query set :math:`Q`: + + .. math:: + P_i = \sum_{j \in I \setminus i} P_{{i} \\lvert {ij}} P_{ij} \enspace where, \\\\ + P_{i \\lvert ij} = \\frac{\\boldsymbol{e}^{^{Y_i} /_{\lambda_{ij}}}}{\\boldsymbol{e}^{^{Y_i} /_{\lambda_{ij}}} + \\boldsymbol{e}^{^{Y_j} /_{\lambda_{ij}}}} \enspace ,\\\\ + P_{ij} = \\frac{{\\left( \\boldsymbol{e}^{^{V_i}/{\lambda_{ij}}} + \\boldsymbol{e}^{^{V_j}/{\lambda_{ij}}} \\right)}^{\lambda_{ij}}}{\sum_{k=1}^{n-1} \sum_{\ell = k + 1}^{n} {\\left( \\boldsymbol{e}^{^{V_k}/{\lambda_{k\ell}}} + \\boldsymbol{e}^{^{V_{\ell}}/{\lambda_{k\ell}}} \\right)}^{\lambda_{k\ell}}} + + + Parameters + ---------- + utility : theano tensor + (n_instances, n_objects) + Utility :math:`Y_i` of the objects :math:`x_i \in Q` in the query sets + lambda_k : theano tensor (range : [alpha, 1.0]) + (n_nests) + Measure of independence amongst the obejcts in each nests + + Returns + ------- + p : theano tensor + (n_instances, n_objects) + Choice probabilities :math:`P_i` of the objects :math:`x_i \in Q` in the query sets + + """ n_objects = self.n_objects nests_indices = self.nests_indices n_nests = self.n_nests @@ -79,7 +186,7 @@ def get_probabilities(self, utility, lambda_k): p = tt.set_subtensor(p[:, i2], p[:, i2] + x2) return p - def get_probabilities_np(self, utility, lambda_k): + def _get_probabilities_np(self, utility, lambda_k): n_objects = self.n_objects nests_indices = self.nests_indices n_nests = self.n_nests @@ -100,6 +207,25 @@ def get_probabilities_np(self, utility, lambda_k): return p def construct_model(self, X, Y): + """ + Constructs the nested logit model by applying priors on weight vectors **weights** as per :meth:`model_configuration`. + Then we apply a uniform prior to the :math:`\lambda s`, i.e. :math:`\lambda s \sim Uniform(\\text{alpha}, 1.0)`. + The probability of choosing the object :math:`x_i` from the query set :math:`Q = \{x_1, \ldots ,x_n\}` is + evaluated in :meth:`get_probabilities`. 
+
+        Parameters
+        ----------
+        X : numpy array
+            (n_instances, n_objects, n_features)
+            Feature vectors of the objects
+        Y : numpy array
+            (n_instances, n_objects)
+            Preferences in the form of discrete choices for given objects
+
+        Returns
+        -------
+        model : pymc3 Model :class:`pm.Model`
+        """
         with pm.Model() as self.model:
             self.Xt = theano.shared(X)
             self.Yt = theano.shared(Y)
@@ -113,7 +239,7 @@ def construct_model(self, X, Y):
     def fit(self, X, Y, sampler="vi", **kwargs):
         """
-        Fit a paired combinatorial logit model on the provided set of queries X and choices Y of those objects. The
+        Fit a paired combinatorial logit model on the provided set of queries X and choices Y of those objects. The
         provided queries and corresponding preferences are of a fixed size (numpy arrays). For learning this network
         the categorical cross entropy loss function for each object :math:`x_i \in Q` is defined as:
@@ -131,10 +257,11 @@ def fit(self, X, Y, sampler="vi", **kwargs):
         Y : numpy array
             (n_instances, n_objects)
             Choices for given objects in the query
         sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string
-            The sampler used to estimate the posterior mean and mass matrix from the trace.
-            * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix
-            * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler
-            * **nuts** : Use the No-U-Turn sampler
+            The sampler used to estimate the posterior mean and mass matrix from the trace
+
+            * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix
+            * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler
+            * **nuts** : Use the No-U-Turn sampler
         **kwargs :
             Keyword arguments for the fit function
         """
@@ -189,7 +316,7 @@ def _predict_scores_fixed(self, X, **kwargs):
         weights = np.array([mean_trace['weights__{}'.format(i)] for i in range(self.n_object_features)])
         lambda_k = np.array([mean_trace['lambda_k__{}'.format(i)] for i in range(self.n_nests)])
         utility = np.dot(X, weights)
-        p = self.get_probabilities_np(utility, lambda_k)
+        p = self._get_probabilities_np(utility, lambda_k)
         return p
 
     def predict(self, X, **kwargs):
@@ -202,6 +329,20 @@ def predict_for_scores(self, scores, **kwargs):
         return DiscreteObjectChooser.predict_for_scores(self, scores, **kwargs)
 
     def set_tunable_parameters(self, alpha=5e-2, loss_function='', regularization='l2', **point):
+        """
+        Set tunable parameters of the Paired Combinatorial Logit model to the values provided.
+
+        Parameters
+        ----------
+        alpha : float (range : [0,1])
+            The lower bound of the correlations between the objects in a nest
+        loss_function : string, {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’}
+            Loss function to be used for the discrete choice decision from the query set
+        regularization : string, {‘l1’, ‘l2’}
+            Regularizer function (L1 or L2) applied to the `kernel` weights matrix
+        point : dict
+            Dictionary containing parameter values which are not tuned for the network
+        """
         if alpha is not None:
             self.alpha = alpha
         if loss_function in likelihood_dict.keys():
diff --git a/csrank/discretechoice/ranknet_discrete_choice.py b/csrank/discretechoice/ranknet_discrete_choice.py
index 786008ab..1f15fbc8 100644
--- a/csrank/discretechoice/ranknet_discrete_choice.py
+++ b/csrank/discretechoice/ranknet_discrete_choice.py
@@ -107,4 +107,4 @@ def clear_memory(self, **kwargs):
     def set_tunable_parameters(self, n_hidden=32, n_units=2, reg_strength=1e-4, learning_rate=1e-3, batch_size=128,
                                **point):
         super().set_tunable_parameters(n_hidden=n_hidden, n_units=n_units, reg_strength=reg_strength,
-                                       learning_rate=learning_rate, batch_size=batch_size, **point)
\ No newline at end of file
+                                       learning_rate=learning_rate, batch_size=batch_size, **point)
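Note on the probabilities documented above: they can be sanity-checked outside of theano/pymc3. The following is a minimal NumPy sketch of the paired combinatorial logit probabilities described in :meth:`get_probabilities`; the function name, the pair ordering via `itertools.combinations`, and the input layout are illustrative assumptions and not part of this patch:

import numpy as np
from itertools import combinations


def pcl_probabilities(utility, lambda_k):
    """Paired combinatorial logit choice probabilities.

    utility  : array (n_instances, n_objects), utilities Y_i = w . x_i
    lambda_k : array (n_nests,), one lambda per pair-nest, ordered like
               itertools.combinations(range(n_objects), 2)
    Returns  : array (n_instances, n_objects) of choice probabilities P_i
    """
    n_instances, n_objects = utility.shape
    pairs = list(combinations(range(n_objects), 2))  # every pair {i, j} is a nest
    assert len(pairs) == len(lambda_k)

    # Nest terms (e^{Y_i/l} + e^{Y_j/l})^l for every pair-nest
    nest_terms = np.empty((n_instances, len(pairs)))
    for k, (i, j) in enumerate(pairs):
        l = lambda_k[k]
        nest_terms[:, k] = (np.exp(utility[:, i] / l) + np.exp(utility[:, j] / l)) ** l
    denominator = nest_terms.sum(axis=1)

    # P_i = sum over nests containing i of P(i | nest) * P(nest)
    p = np.zeros_like(utility, dtype=float)
    for k, (i, j) in enumerate(pairs):
        l = lambda_k[k]
        e_i = np.exp(utility[:, i] / l)
        e_j = np.exp(utility[:, j] / l)
        p_nest = nest_terms[:, k] / denominator      # P_{ij}
        p[:, i] += (e_i / (e_i + e_j)) * p_nest      # P_{i|ij} * P_{ij}
        p[:, j] += (e_j / (e_i + e_j)) * p_nest
    return p

Under this construction the probabilities within a query set sum to one, and setting every :math:`\lambda_{ij} = 1` recovers the standard multinomial logit, which is a convenient correctness check.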
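For completeness, a hedged end-to-end usage sketch of the estimator documented in this patch. The import path, the one-hot encoding of the choices, and the exact constructor keywords are assumptions inferred from the docstrings above and may differ from the released csrank API:

import numpy as np

from csrank import PairedCombinatorialLogit  # import path assumed

# Toy data: 200 queries with 5 objects of 3 features each; the chosen object
# in every query is marked by a one-hot row (assumed discrete-choice format).
n_instances, n_objects, n_features = 200, 5, 3
X = np.random.randn(n_instances, n_objects, n_features)
Y = np.eye(n_objects)[np.random.randint(n_objects, size=n_instances)]

pcl = PairedCombinatorialLogit(n_object_features=n_features, n_objects=n_objects,
                               regularization='l2', alpha=5e-2)
pcl.fit(X, Y, sampler='vi')   # ADVI, as described in the fit docstring
choices = pcl.predict(X)      # discrete choice for each query set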