From 45fcb9f89d85fab2854d5ffdd63a085f7ad6ab28 Mon Sep 17 00:00:00 2001 From: Pritha Gupta Date: Mon, 3 Jun 2019 12:51:58 +0200 Subject: [PATCH 1/5] Completed the description for context-dependent models --- csrank/choicefunctions/fate_choice.py | 13 ++++++------- csrank/choicefunctions/feta_choice.py | 15 ++++++++++++++- csrank/core/cmpnet_core.py | 2 +- csrank/discretechoice/fate_discrete_choice.py | 12 ++++++------ csrank/discretechoice/feta_discrete_choice.py | 13 +++++++++++++ csrank/objectranking/fate_object_ranker.py | 12 ++++++------ csrank/objectranking/feta_object_ranker.py | 14 +++++++++++++- 7 files changed, 59 insertions(+), 22 deletions(-) diff --git a/csrank/choicefunctions/fate_choice.py b/csrank/choicefunctions/fate_choice.py index 59e44be0..41e43130 100644 --- a/csrank/choicefunctions/fate_choice.py +++ b/csrank/choicefunctions/fate_choice.py @@ -17,11 +17,11 @@ def __init__(self, n_object_features, n_hidden_set_layers=2, n_hidden_set_units= optimizer=SGD(lr=1e-4, nesterov=True, momentum=0.9), batch_size=256, metrics=None, random_state=None, **kwargs): """ - Create a FATE-network architecture for leaning discrete choice function. Training complexity is quadratic in - the number of objects and prediction complexity is only linear. The first-aggregate-then-evaluate approach - learns an embedding of each object and then aggregates that into a context representation - :math:`\\mu_{C(x)}`, where :math`C(x) = Q \setminus \{x\}` and then scores each object :math:`x` using a - generalized utility function :math:`U (x, \\mu_{C(x)})`. + Create a FATE-network architecture for leaning discrete choice function. The first-aggregate-then-evaluate + approach learns an embedding of each object and then aggregates that into a context representation + :math:`\\mu_{C(x)}` and then scores each object :math:`x` using a generalized utility function + :math:`U (x, \\mu_{C(x)})`. + To make it computationally efficient we take the the context :math:`C(x)` as query set :math:`Q`. The context-representation is evaluated as: .. math:: @@ -29,12 +29,11 @@ def __init__(self, n_object_features, n_hidden_set_layers=2, n_hidden_set_units= where :math:`\phi \colon \mathcal{X} \\to \mathcal{Z}` maps each object :math:`y` to an :math:`m`-dimensional embedding space :math:`\mathcal{Z} \subseteq \mathbb{R}^m`. - To make it computationally efficient we take the the context as query set :math:`Q`. The choice set is defined as: .. math:: - c(Q) = \{ x_i \in Q \lvert \, U (x, \\mu_{C(x)}) > t \} + c(Q) = \{ x \in Q \lvert \, U (x, \\mu_{C(x)}) > t \} Parameters diff --git a/csrank/choicefunctions/feta_choice.py b/csrank/choicefunctions/feta_choice.py index ff0140e0..6cb2dd89 100644 --- a/csrank/choicefunctions/feta_choice.py +++ b/csrank/choicefunctions/feta_choice.py @@ -22,8 +22,21 @@ def __init__(self, n_objects, n_object_features, n_hidden=2, n_units=8, add_zero metrics=['binary_accuracy'], batch_size=256, random_state=None, **kwargs): """ - Create a FETA-network architecture for learning the choice functions. + Create a FETA-network architecture for learning choice functions. + The first-evaluate-then-aggregate approach approximates the context-dependent utility function using the + first-order utility function :math:`U_1 \colon \mathcal{X} \\times \mathcal{X} \\rightarrow [0,1]` + and zeroth-order utility function :math:`U_0 \colon \mathcal{X} \\rightarrow [0,1]`. + The scores each object :math:`x` using a context-dependent utility function :math:`U (x, C_i)`: + + .. 
math:: + U(x_i, C_i) = U_0(x_i) + \\frac{1}{n-1} \sum_{x_j \in Q \\setminus \{x_i\}} U_1(x_i , x_j) \, . + Training and prediction complexity is quadratic in the number of objects. + The choice set is defined as: + + .. math:: + + c(Q) = \{ x_i \in Q \lvert \, U (x_i, C_i) > t \} Parameters ---------- diff --git a/csrank/core/cmpnet_core.py b/csrank/core/cmpnet_core.py index 530b3ecf..f3c7deeb 100644 --- a/csrank/core/cmpnet_core.py +++ b/csrank/core/cmpnet_core.py @@ -67,7 +67,7 @@ def _convert_instances(self, X, Y): def construct_model(self): """ - Construct the CmpNEt which is used to approximate the :math:`U_1(x_i,x_j)`. For each pair of objects in + Construct the CmpNet which is used to approximate the :math:`U_1(x_i,x_j)`. For each pair of objects in :math:`x_i, x_j \in Q` we construct two sub-networks with weight sharing in all hidden layers. The output of these networks are connected to two sigmoid units that produces the outputs of the network, i.e., :math:`U(x_1,x_2), U(x_2,x_1)` for each pair of objects are evaluated. diff --git a/csrank/discretechoice/fate_discrete_choice.py b/csrank/discretechoice/fate_discrete_choice.py index 39501b8e..cff7697c 100644 --- a/csrank/discretechoice/fate_discrete_choice.py +++ b/csrank/discretechoice/fate_discrete_choice.py @@ -15,11 +15,11 @@ def __init__(self, n_object_features, n_hidden_set_layers=2, n_hidden_set_units= kernel_regularizer=l2(l=0.01), optimizer=SGD(lr=1e-4, nesterov=True, momentum=0.9), batch_size=256, random_state=None, **kwargs): """ - Create a FATE-network architecture for leaning discrete choice function. Training complexity is quadratic in - the number of objects and prediction complexity is only linear. The first-aggregate-then-evaluate approach - learns an embedding of each object and then aggregates that into a context representation - :math:`\\mu_{C(x)}`, where :math`C(x) = Q \setminus \{x\}` and then scores each object :math:`x` using a - generalized utility function :math:`U (x, \\mu_{C(x)})`. + Create a FATE-network architecture for leaning discrete choice function. The first-aggregate-then-evaluate + approach learns an embedding of each object and then aggregates that into a context representation + :math:`\\mu_{C(x)}` and then scores each object :math:`x` using a generalized utility function + :math:`U (x, \\mu_{C(x)})`. + To make it computationally efficient we take the the context :math:`C(x)` as query set :math:`Q`. The context-representation is evaluated as: .. math:: @@ -27,7 +27,7 @@ def __init__(self, n_object_features, n_hidden_set_layers=2, n_hidden_set_units= where :math:`\phi \colon \mathcal{X} \\to \mathcal{Z}` maps each object :math:`y` to an :math:`m`-dimensional embedding space :math:`\mathcal{Z} \subseteq \mathbb{R}^m`. - To make it computationally efficient we take the the context as query set :math:`Q`. + Training complexity is quadratic in the number of objects and prediction complexity is only linear. The discrete choice for the given query set :math:`Q` is defined as: .. math:: diff --git a/csrank/discretechoice/feta_discrete_choice.py b/csrank/discretechoice/feta_discrete_choice.py index f0941e0e..8d404f7c 100644 --- a/csrank/discretechoice/feta_discrete_choice.py +++ b/csrank/discretechoice/feta_discrete_choice.py @@ -20,7 +20,20 @@ def __init__(self, n_objects, n_object_features, n_hidden=2, n_units=8, add_zero metrics=['categorical_accuracy'], batch_size=256, random_state=None, **kwargs): """ Create a FETA-network architecture for learning the discrete choice functions. 
+ The first-evaluate-then-aggregate approach approximates the context-dependent utility function using the + first-order utility function :math:`U_1 \colon \mathcal{X} \\times \mathcal{X} \\rightarrow [0,1]` + and zeroth-order utility function :math:`U_0 \colon \mathcal{X} \\rightarrow [0,1]`. + The scores each object :math:`x` using a context-dependent utility function :math:`U (x, C_i)`: + + .. math:: + U(x_i, C_i) = U_0(x_i) + \\frac{1}{n-1} \sum_{x_j \in Q \\setminus \{x_i\}} U_1(x_i , x_j) \, . + Training and prediction complexity is quadratic in the number of objects. + The discrete choice for the given query set :math:`Q` is defined as: + + .. math:: + + dc(Q) := \operatorname{argmax}_{x_i \in Q} \; U (x_i, C_i) Parameters ---------- diff --git a/csrank/objectranking/fate_object_ranker.py b/csrank/objectranking/fate_object_ranker.py index cf7f317b..119cea21 100644 --- a/csrank/objectranking/fate_object_ranker.py +++ b/csrank/objectranking/fate_object_ranker.py @@ -16,11 +16,11 @@ def __init__(self, n_object_features, n_hidden_set_layers=2, n_hidden_set_units= loss_function=hinged_rank_loss, metrics=[zero_one_rank_loss_for_scores_ties], random_state=None, **kwargs): """ - Create a FATE-network architecture for leaning discrete choice function. Training complexity is quadratic in - the number of objects and prediction complexity is only linear. The first-aggregate-then-evaluate approach - learns an embedding of each object and then aggregates that into a context representation - :math:`\\mu_{C(x)}`, where :math`C(x) = Q \setminus \{x\}` and then scores each object :math:`x` using a - generalized utility function :math:`U (x, \\mu_{C(x)})`. + Create a FATE-network architecture for leaning object ranking function. The first-aggregate-then-evaluate + approach learns an embedding of each object and then aggregates that into a context representation + :math:`\\mu_{C(x)}` and then scores each object :math:`x` using a context-dependent utility function + :math:`U (x, \\mu_{C(x)})`. + To make it computationally efficient we take the the context :math:`C(x)` as query set :math:`Q`. The context-representation is evaluated as: .. math:: @@ -28,7 +28,7 @@ def __init__(self, n_object_features, n_hidden_set_layers=2, n_hidden_set_units= where :math:`\phi \colon \mathcal{X} \\to \mathcal{Z}` maps each object :math:`y` to an :math:`m`-dimensional embedding space :math:`\mathcal{Z} \subseteq \mathbb{R}^m`. - To make it computationally efficient we take the the context as query set :math:`Q`. + Training complexity is quadratic in the number of objects and prediction complexity is only linear. The ranking for the given query set :math:`Q` is defined as: .. math:: diff --git a/csrank/objectranking/feta_object_ranker.py b/csrank/objectranking/feta_object_ranker.py index 79a74d3a..6c044c3b 100644 --- a/csrank/objectranking/feta_object_ranker.py +++ b/csrank/objectranking/feta_object_ranker.py @@ -17,8 +17,20 @@ def __init__(self, n_objects, n_object_features, n_hidden=2, n_units=8, add_zero optimizer=SGD(lr=1e-4, nesterov=True, momentum=0.9), metrics=None, batch_size=256, random_state=None, **kwargs): """ - Create a FETA-network architecture for object ranking. + Create a FETA-network architecture for object ranking. The first-evaluate-then-aggregate approach + approximates the context-dependent utility function using the first-order utility function + :math:`U_1 \colon \mathcal{X} \\times \mathcal{X} \\rightarrow [0,1]` and zeroth-order utility + function :math:`U_0 \colon \mathcal{X} \\rightarrow [0,1]`. 
+ The scores each object :math:`x` using a context-dependent utility function :math:`U (x, C_i)`: + + .. math:: + U(x_i, C_i) = U_0(x_i) + \\frac{1}{n-1} \sum_{x_j \in Q \\setminus \{x_i\}} U_1(x_i , x_j) \, . + Training and prediction complexity is quadratic in the number of objects. + The ranking for the given query set :math:`Q` is defined as: + + .. math:: + ρ(Q) = \operatorname{argsort}_{x_i \in Q} \; U (x_i, C_i) Parameters ---------- From 404b1bcb4408b9d9c01cffa45e2feaf161566c45 Mon Sep 17 00:00:00 2001 From: Pritha Gupta Date: Mon, 3 Jun 2019 14:42:30 +0200 Subject: [PATCH 2/5] Completed description of models to learn the choice function --- csrank/choicefunctions/choice_functions.py | 29 +-- csrank/choicefunctions/cmpnet_choice.py | 40 +++- csrank/choicefunctions/fate_choice.py | 46 ++++- csrank/choicefunctions/feta_choice.py | 25 +++ .../generalized_linear_model.py | 176 +++++++++--------- csrank/choicefunctions/pairwise_choice.py | 18 ++ csrank/choicefunctions/ranknet_choice.py | 39 +++- csrank/choicefunctions/util.py | 83 +++++++++ csrank/core/cmpnet_core.py | 27 +-- csrank/core/fate_network.py | 10 +- csrank/core/feta_network.py | 2 +- csrank/core/ranknet_core.py | 33 ++-- csrank/discretechoice/discrete_choice.py | 36 ++-- csrank/learner.py | 2 +- csrank/objectranking/object_ranker.py | 7 +- 15 files changed, 412 insertions(+), 161 deletions(-) diff --git a/csrank/choicefunctions/choice_functions.py b/csrank/choicefunctions/choice_functions.py index 20695b62..a488bead 100644 --- a/csrank/choicefunctions/choice_functions.py +++ b/csrank/choicefunctions/choice_functions.py @@ -16,22 +16,25 @@ def learning_problem(self): return CHOICE_FUNCTION def predict_for_scores(self, scores, **kwargs): - """ Predict choices for scores for a given collection of sets of objects. + """ + Binary choice vector :math:`y` represents the choices amongst the objects in :math:`Q`, such that + :math:`y(k) = 1` represents that the object :math:`x_k` is chosen and :math:`y(k) = 0` represents it is not + chosen. Predict choices for the scores for a given collection of sets of objects (query sets). 
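As an illustration, the thresholding step that turns scores into such choice vectors can be sketched in NumPy; the score values and the threshold below are made-up assumptions, not outputs of the library:

.. code-block:: python

    import numpy as np

    scores = np.array([[0.9, 0.2, 0.7],
                       [0.1, 0.8, 0.4]])   # (n_instances, n_objects), assumed values
    t = 0.5                                # tuned threshold
    Y = (scores > t).astype(int)           # binary choice vectors
    # Y -> [[1, 0, 1],
    #       [0, 1, 0]]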
- Parameters - ---------- - scores : dict or numpy array - Dictionary with a mapping from ranking size to numpy arrays - or a single numpy array of size containing scores of each object of size: - (n_instances, n_objects) + Parameters + ---------- + scores : dict or numpy array + Dictionary with a mapping from query set size to numpy arrays + or a single numpy array of size containing scores of each object of size: + (n_instances, n_objects) - Returns - ------- - Y : dict or numpy array - Dictionary with a mapping from ranking size to numpy arrays - or a single numpy array containing predicted ranking of size: - (n_instances, n_objects) + Returns + ------- + Y : dict or numpy array + Dictionary with a mapping from query set size to numpy arrays + or a single numpy array containing predicted choice vectors of size: + (n_instances, n_objects) """ if isinstance(scores, dict): diff --git a/csrank/choicefunctions/cmpnet_choice.py b/csrank/choicefunctions/cmpnet_choice.py index 4ff7ac61..93bb9173 100644 --- a/csrank/choicefunctions/cmpnet_choice.py +++ b/csrank/choicefunctions/cmpnet_choice.py @@ -89,8 +89,44 @@ def _convert_instances(self, X, Y): def construct_model(self): return super().construct_model() - def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, tune_size=0.1, - thin_thresholds=1, verbose=0, **kwd): + def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, tune_size=0.1, thin_thresholds=1, verbose=0, + **kwd): + """ + Fit a CmptNet model for learning a choice fucntion on the provided set of queries X and preferences Y of + those objects. The provided queries and corresponding preferences are of a fixed size (numpy arrays). For + learning this network the binary cross entropy loss function for a pair of objects :math:`x_i, x_j \in Q` + is defined as: + + .. math:: + + C_{ij} = -\\tilde{P_{ij}}(0)\\cdot \log(U(x_i,x_j)) - \\tilde{P_{ij}}(1) \\cdot \log(U(x_j,x_i)) \ , + + where :math:`\\tilde{P_{ij}}` is ground truth probability of the preference of :math:`x_i` over :math:`x_j`. + :math:`\\tilde{P_{ij}} = (1,0)` if :math:`x_i \succ x_j` else :math:`\\tilde{P_{ij}} = (0,1)`. 
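A minimal numerical sketch of this pairwise loss, assuming hand-picked network outputs rather than values produced by CmpNet:

.. code-block:: python

    import numpy as np

    u_ij, u_ji = 0.8, 0.2          # assumed outputs U(x_i, x_j) and U(x_j, x_i)
    p_tilde = (1, 0)               # ground truth: x_i is preferred over x_j
    c_ij = -p_tilde[0] * np.log(u_ij) - p_tilde[1] * np.log(u_ji)
    # c_ij ~= 0.223; the loss shrinks as U(x_i, x_j) approaches 1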
+ + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Preferences in form of Orderings or Choices for given n_objects + epochs : int + Number of epochs to run if training for a fixed query size + callbacks : list + List of callbacks to be called during optimization + validation_split : float (range : [0,1]) + Percentage of instances to split off to validate on + tune_size: float (range : [0,1]) + Percentage of instances to split off to tune the threshold for the choice function + thin_thresholds: int + The number of instances of scores to skip while tuning the threshold + verbose : bool + Print verbose information + **kwd : + Keyword arguments for the fit function + """ if tune_size > 0: X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=tune_size, random_state=self.random_state) try: diff --git a/csrank/choicefunctions/fate_choice.py b/csrank/choicefunctions/fate_choice.py index 41e43130..348ca094 100644 --- a/csrank/choicefunctions/fate_choice.py +++ b/csrank/choicefunctions/fate_choice.py @@ -102,7 +102,51 @@ def _construct_layers(self, **kwargs): def construct_model(self, n_features, n_objects): return super().construct_model(n_features, n_objects) - def fit(self, X, Y, tune_size=0.1, thin_thresholds=1, **kwargs): + def fit(self, X, Y, epochs=35, inner_epochs=1, callbacks=None, validation_split=0.1, verbose=0, global_lr=1.0, + global_momentum=0.9, min_bucket_size=500, refit=False, tune_size=0.1, thin_thresholds=1, **kwargs): + """ + Fit a generic FATE-network model for learning a choice function on a provided set of queries. + + The provided queries can be of a fixed size (numpy arrays) or of varying sizes in which case dictionaries + are expected as input. For varying sizes a meta gradient descent is performed across the + different query sizes. + + Parameters + ---------- + X : numpy array or dict + Feature vectors of the objects + (n_instances, n_objects, n_features) if numpy array or map from n_objects to numpy arrays + Y : numpy array or dict + Choices for given objects in the query + (n_instances, n_objects) if numpy array or map from n_objects to numpy arrays + epochs : int + Number of epochs to run if training for a fixed query size or + number of epochs of the meta gradient descent for the variadic model + inner_epochs : int + Number of epochs to train for each query size inside the variadic + model + callbacks : list + List of callbacks to be called during optimization + validation_split : float (range : [0,1]) + Percentage of instances to split off to validate on + verbose : bool + Print verbose information + global_lr : float + Learning rate of the meta gradient descent (variadic model only) + global_momentum : float + Momentum for the meta gradient descent (variadic model only) + min_bucket_size : int + Restrict the training to queries of a minimum size + refit : bool + If True, create a new model object, otherwise continue fitting the + existing one if one exists. 
+ tune_size: float (range : [0,1]) + Percentage of instances to split off to tune the threshold for the choice function + thin_thresholds: int + The number of instances of scores to skip while tuning the threshold + **kwargs : + Keyword arguments for the fit function + """ if tune_size > 0: X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=tune_size, random_state=self.random_state) try: diff --git a/csrank/choicefunctions/feta_choice.py b/csrank/choicefunctions/feta_choice.py index 6cb2dd89..3d32f4ea 100644 --- a/csrank/choicefunctions/feta_choice.py +++ b/csrank/choicefunctions/feta_choice.py @@ -178,6 +178,31 @@ def create_input_lambda(i): def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, tune_size=0.1, thin_thresholds=1, verbose=0, **kwd): + """ + Fit a FETA-Network for learning a choice function on the provided set of queries X and preferences Y of + those objects. The provided queries and corresponding preferences are of a fixed size (numpy arrays). + + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Choices for given objects in the query + epochs : int + Number of epochs to run if training for a fixed query size + callbacks : list + List of callbacks to be called during optimization + validation_split : float (range : [0,1]) + Percentage of instances to split off to validate on + verbose : bool + Print verbose information + tune_size: float (range : [0,1]) + Percentage of instances to split off to tune the threshold for the choice function + thin_thresholds: int + The number of instances of scores to skip while tuning the threshold + **kwd : + Keyword arguments for the fit function + """ if tune_size > 0: X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=tune_size, random_state=self.random_state) try: diff --git a/csrank/choicefunctions/generalized_linear_model.py b/csrank/choicefunctions/generalized_linear_model.py index 16bf01fa..0a14d82d 100644 --- a/csrank/choicefunctions/generalized_linear_model.py +++ b/csrank/choicefunctions/generalized_linear_model.py @@ -5,12 +5,11 @@ import pymc3 as pm import theano import theano.tensor as tt -from pymc3 import Discrete -from pymc3.distributions.dist_math import bound from sklearn.model_selection import train_test_split from sklearn.utils import check_random_state import csrank.theano_util as ttu +from csrank.choicefunctions.util import create_weight_dictionary, BinaryCrossEntropyLikelihood from csrank.learner import Learner from csrank.util import print_dictionary from .choice_functions import ChoiceFunctions @@ -19,7 +18,20 @@ class GeneralizedLinearModel(ChoiceFunctions, Learner): def __init__(self, n_object_features, regularization='l2', random_state=None, **kwargs): """ - Create an instance of the GeneralizedLinearModel model. + Create an instance of the GeneralizedLinearModel model for learning the choice function. This model is + adapted from the multinomial logit model :class:`MultinomialLogitModel`. The utility score for each object + in query set :math:`Q` is defined as :math:`U(x) = w \cdot x`, where :math:`w` is the weight vector. + The probability of choosing an object :math:`x_i` is defined by taking sigmoid over the utility scores: + + .. math:: + + P(x_i \\lvert Q) = \\frac{1}{1+exp(-U(x_i))} + + The choice set is defined as: + + .. 
math:: + + c(Q) = \{ x_i \in Q \lvert \, P(x_i \\lvert Q) > t \} Parameters ---------- @@ -51,26 +63,56 @@ def __init__(self, n_object_features, regularization='l2', random_state=None, ** self.p = None @property - def default_configuration(self): + def model_configuration(self): + """ + Constructs the dictionary containing the priors for the parameters for the model according to the + regularization function. + Returns + ------- + configuration : dict + Dictionary containing the priors applies on the weights + """ if self.regularization == 'l2': weight = pm.Normal prior = 'sd' elif self.regularization == 'l1': weight = pm.Laplace prior = 'b' - config_dict = { + configuration = { 'weights': [weight, {'mu': (pm.Normal, {'mu': 0, 'sd': 10}), prior: (pm.HalfCauchy, {'beta': 1})}]} - self.logger.info('Creating default config {}'.format(print_dictionary(config_dict))) - return config_dict + self.logger.info('Creating default config {}'.format(print_dictionary(configuration))) + return configuration def construct_model(self, X, Y): - self.logger.info('Creating model_args config {}'.format(print_dictionary(self.default_configuration))) + """ + Constructs the linear logit model which evaluated the utility score as :math:`U(x) = w \cdot x`, where + :math:`w` is the weight vector. The probability of choosing the object :math:`x_i` from the query set + :math:`Q = \{x_1, \ldots ,x_n\}` is: + + .. math:: + + P_i = P(x_i \\lvert Q) = \\frac{1}{1+exp(-U(x_i))} + + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Preferences in form of Choices for given objects + + Returns + ------- + model : pymc3 Model :class:`pm.Model` + """ + self.logger.info('Creating model_args config {}'.format(print_dictionary(self.model_configuration))) with pm.Model() as self.model: self.Xt = theano.shared(X) self.Yt = theano.shared(Y) shapes = {'weights': self.n_object_features} # shapes = {'weights': (self.n_object_features, 3)} - weights_dict = create_weight_dictionary(self.default_configuration, shapes) + weights_dict = create_weight_dictionary(self.model_configuration, shapes) intercept = pm.Normal('intercept', mu=0, sd=10) utility = tt.dot(self.Xt, weights_dict['weights']) + intercept self.p = ttu.sigmoid(utility) @@ -78,6 +120,36 @@ def construct_model(self, X, Y): self.logger.info("Model construction completed") def fit(self, X, Y, sampler='vi', tune_size=0.1, thin_thresholds=1, **kwargs): + """ + Fit a generalized logit model on the provided set of queries X and preferences Y of those objects. The + provided queries and corresponding preferences are of a fixed size (numpy arrays). For learning this network + the binary cross entropy loss function for each object :math:`x_i \in Q` is defined as: + + .. math:: + + C_{ij} = -y(i)\log(P_i) - (1 - y(i))\log(1 - P_i) \enspace, + + where :math:`y` is ground-truth choice vector of the objects in the given query set :math:`Q`. + The value :math:`y(i) = 1` if object :math:`x_i` is chosen else :math:`y(i) = 0`. + + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Choices for given objects in the query + sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string + The sampler used to estimate the posterior mean and mass matrix from the trace. 
+ * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler + tune_size: float (range : [0,1]) + Percentage of instances to split off to tune the threshold for the choice function + thin_thresholds: int + The number of instances of scores to skip while tuning the threshold + **kwargs : + Keyword arguments for the fit function + """ if tune_size > 0: X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=tune_size, random_state=self.random_state) try: @@ -153,6 +225,16 @@ def predict_for_scores(self, scores, **kwargs): return ChoiceFunctions.predict_for_scores(self, scores, **kwargs) def set_tunable_parameters(self, regularization="l1", **point): + """ + Set tunable parameters of the Generalized Linear model to the values provided. + + Parameters + ---------- + regularization : {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + point: dict + Dictionary containing parameter values which are not tuned for the network + """ self.regularization = regularization self.model = None self.trace = None @@ -163,79 +245,3 @@ def set_tunable_parameters(self, regularization="l1", **point): if len(point) > 0: self.logger.warning('This ranking algorithm does not support' ' tunable parameters called: {}'.format(print_dictionary(point))) - - -def create_weight_dictionary(model_args, shapes): - weights_dict = dict() - for key, value in model_args.items(): - prior, params = copy.deepcopy(value) - for k in params.keys(): - if isinstance(params[k], tuple): - params[k][1]['name'] = '{}_{}'.format(key, k) - params[k] = params[k][0](**params[k][1]) - params['name'] = key - params['shape'] = shapes[key] - weights_dict[key] = prior(**params) - return weights_dict - - -def binary_crossentropy(p, y_true): - if p.ndim > 1: - l = (tt.nnet.binary_crossentropy(p, y_true).sum(axis=1)).mean() - else: - l = tt.nnet.binary_crossentropy(p, y_true).mean(axis=0) - return -l - - -def categorical_crossentropy(p, y_true): - return -tt.nnet.categorical_crossentropy(p, y_true) - - -def categorical_hinge(p, y_true): - pos = tt.sum(y_true * p, axis=-1) - neg = tt.max((1. - y_true) * p, axis=-1) - return -tt.maximum(0., neg - pos + 1.) - - -class BinaryCrossEntropyLikelihood(Discrete): - R""" - Categorical log-likelihood. - - The most general discrete distribution. - - .. math:: f(x \mid p) = p_x - - ======== =================================== - Support :math:`x \in \{0, 1, \ldots, |p|-1\}` - ======== =================================== - - Parameters - ---------- - p : array of floats - p > 0 and the elements of p must sum to 1. They will be automatically - rescaled otherwise. 
- """ - - def __init__(self, p, *args, **kwargs): - super(BinaryCrossEntropyLikelihood, self).__init__(*args, **kwargs) - self.loss_func = categorical_hinge - try: - self.k = tt.shape(p)[-1].tag.test_value - except AttributeError: - self.k = tt.shape(p)[-1] - self.p = tt.as_tensor_variable(p) - self.mode = tt.argmax(p) - - def random(self, **kwargs): - return NotImplemented - - def logp(self, value): - p = self.p - k = self.k - a = self.loss_func(p, value) - p = ttu.normalize(p) - sum_to1 = theano.gradient.zero_grad( - tt.le(abs(tt.sum(p, axis=-1) - 1), 1e-5)) - - value_k = tt.argmax(value, axis=1) - return bound(a, value_k >= 0, value_k <= (k - 1), sum_to1) diff --git a/csrank/choicefunctions/pairwise_choice.py b/csrank/choicefunctions/pairwise_choice.py index 897e0a08..842b7422 100644 --- a/csrank/choicefunctions/pairwise_choice.py +++ b/csrank/choicefunctions/pairwise_choice.py @@ -61,6 +61,24 @@ def _convert_instances(self, X, Y): return x_train, y_single def fit(self, X, Y, tune_size=0.1, thin_thresholds=1, **kwd): + """ + Fit a generic preference learning model on a provided set of queries. + The provided queries can be of a fixed size (numpy arrays). + + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Choices for given objects in the query + tune_size: float (range : [0,1]) + Percentage of instances to split off to tune the threshold for the choice function + thin_thresholds: int + The number of instances of scores to skip while tuning the threshold + **kwd : + Keyword arguments for the fit function + + """ if tune_size > 0: X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=tune_size, random_state=self.random_state) try: diff --git a/csrank/choicefunctions/ranknet_choice.py b/csrank/choicefunctions/ranknet_choice.py index 08f5d3c2..3871468a 100644 --- a/csrank/choicefunctions/ranknet_choice.py +++ b/csrank/choicefunctions/ranknet_choice.py @@ -88,17 +88,48 @@ def _convert_instances(self, X, Y): def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, tune_size=0.1, thin_thresholds=1, verbose=0, **kwd): + """ + Fit RankNet model for learning choice function on a provided set of queries. The provided queries can be of + a fixed size (numpy arrays). For learning this network the binary cross entropy loss function for a pair of + objects :math:`x_i, x_j \in Q` is defined as: + + .. math:: + + C_{ij} = -\\tilde{P_{ij}}\log(P_{ij}) - (1 - \\tilde{P_{ij}})\log(1 - P{ij}) \enspace, + + where :math:`\\tilde{P_{ij}}` is ground truth probability of the preference of :math:`x_i` over :math:`x_j`. + :math:`\\tilde{P_{ij}} = 1` if :math:`x_i \succ x_j` else :math:`\\tilde{P_{ij}} = 0`. 
+ + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Preferences in form of Orderings or Choices for given n_objects + epochs : int + Number of epochs to run if training for a fixed query size + callbacks : list + List of callbacks to be called during optimization + validation_split : float (range : [0,1]) + Percentage of instances to split off to validate on + tune_size: float (range : [0,1]) + Percentage of instances to split off to tune the threshold for the choice function + thin_thresholds: int + The number of instances of scores to skip while tuning the threshold + verbose : bool + Print verbose information + **kwd : + Keyword arguments for the fit function + """ if tune_size > 0: X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=tune_size, random_state=self.random_state) try: - super().fit(X_train, Y_train, epochs, callbacks, - validation_split, verbose, **kwd) + super().fit(X_train, Y_train, epochs, callbacks, validation_split, verbose, **kwd) finally: self.logger.info('Fitting utility function finished. Start tuning threshold.') self.threshold = self._tune_threshold(X_val, Y_val, thin_thresholds=thin_thresholds) else: - super().fit(X, Y, epochs, callbacks, validation_split, verbose, - **kwd) + super().fit(X, Y, epochs, callbacks, validation_split, verbose, **kwd) self.threshold = 0.5 def _predict_scores_fixed(self, X, **kwargs): diff --git a/csrank/choicefunctions/util.py b/csrank/choicefunctions/util.py index 356fb9f5..19d4dd08 100644 --- a/csrank/choicefunctions/util.py +++ b/csrank/choicefunctions/util.py @@ -1,6 +1,13 @@ +import copy from itertools import product import numpy as np +import theano +from pymc3 import Discrete +from pymc3.distributions.dist_math import bound +from theano import tensor as tt + +from csrank import theano_util as ttu def generate_pairwise_instances(x, choice): @@ -38,3 +45,79 @@ def generate_complete_pairwise_dataset(X, Y): Y_single = np.array(Y_single) X_train = X1 - X2 return X1, X2, X_train, Y_double, Y_single + + +def create_weight_dictionary(model_args, shapes): + weights_dict = dict() + for key, value in model_args.items(): + prior, params = copy.deepcopy(value) + for k in params.keys(): + if isinstance(params[k], tuple): + params[k][1]['name'] = '{}_{}'.format(key, k) + params[k] = params[k][0](**params[k][1]) + params['name'] = key + params['shape'] = shapes[key] + weights_dict[key] = prior(**params) + return weights_dict + + +def binary_crossentropy(p, y_true): + if p.ndim > 1: + l = (tt.nnet.binary_crossentropy(p, y_true).sum(axis=1)).mean() + else: + l = tt.nnet.binary_crossentropy(p, y_true).mean(axis=0) + return -l + + +def categorical_crossentropy(p, y_true): + return -tt.nnet.categorical_crossentropy(p, y_true) + + +def categorical_hinge(p, y_true): + pos = tt.sum(y_true * p, axis=-1) + neg = tt.max((1. - y_true) * p, axis=-1) + return -tt.maximum(0., neg - pos + 1.) + + +class BinaryCrossEntropyLikelihood(Discrete): + R""" + Categorical log-likelihood. + + The most general discrete distribution. + + .. math:: f(x \mid p) = p_x + + ======== =================================== + Support :math:`x \in \{0, 1, \ldots, |p|-1\}` + ======== =================================== + + Parameters + ---------- + p : array of floats + p > 0 and the elements of p must sum to 1. They will be automatically + rescaled otherwise. 
+ """ + + def __init__(self, p, *args, **kwargs): + super(BinaryCrossEntropyLikelihood, self).__init__(*args, **kwargs) + self.loss_func = categorical_hinge + try: + self.k = tt.shape(p)[-1].tag.test_value + except AttributeError: + self.k = tt.shape(p)[-1] + self.p = tt.as_tensor_variable(p) + self.mode = tt.argmax(p) + + def random(self, **kwargs): + return NotImplemented + + def logp(self, value): + p = self.p + k = self.k + a = self.loss_func(p, value) + p = ttu.normalize(p) + sum_to1 = theano.gradient.zero_grad( + tt.le(abs(tt.sum(p, axis=-1) - 1), 1e-5)) + + value_k = tt.argmax(value, axis=1) + return bound(a, value_k >= 0, value_k <= (k - 1), sum_to1) diff --git a/csrank/core/cmpnet_core.py b/csrank/core/cmpnet_core.py index f3c7deeb..a2895371 100644 --- a/csrank/core/cmpnet_core.py +++ b/csrank/core/cmpnet_core.py @@ -70,17 +70,8 @@ def construct_model(self): Construct the CmpNet which is used to approximate the :math:`U_1(x_i,x_j)`. For each pair of objects in :math:`x_i, x_j \in Q` we construct two sub-networks with weight sharing in all hidden layers. The output of these networks are connected to two sigmoid units that produces the outputs of the network, - i.e., :math:`U(x_1,x_2), U(x_2,x_1)` for each pair of objects are evaluated. - :math:`U(x_1,x_2)` is a measure of how favorable it is to choose :math:`x_1` over :math:`x_2`. - For learning this network the binary cross entropy loss function for a pair of example :math:`x_i, x_j \in Q` - is defined as: - - .. math:: - - C_{ij} = -\\tilde{P_{ij}}\log(U(x_i,x_j)) - (1 - \\tilde{P_{ij}})\log(U(x_j,x_i)) \enspace, - - where :math:`\\tilde{P_{ij}}` is ground truth probability of the preference of :math:`x_i` over :math:`x_j`. - :math:`\\tilde{P_{ij}} = 1` if :math:`x_i \succ x_j` else 0. + i.e., :math:`U(x_1,x_2), U(x_2,x_1)` for each pair of objects are evaluated. :math:`U(x_1,x_2)` is a measure + of how favorable it is to choose :math:`x_1` over :math:`x_2`. Returns ------- @@ -106,6 +97,15 @@ def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, """ Fit a generic preference learning CmptNet on the provided set of queries X and preferences Y of those objects. The provided queries and corresponding preferences are of a fixed size (numpy arrays). + For learning this network the binary cross entropy loss function for a pair of objects + :math:`x_i, x_j \in Q` is defined as: + + .. math:: + + C_{ij} = -\\tilde{P_{ij}}(0)\\cdot \log(U(x_i,x_j)) - \\tilde{P_{ij}}(1) \\cdot \log(U(x_j,x_i)) \ , + + where :math:`\\tilde{P_{ij}}` is ground truth probability of the preference of :math:`x_i` over :math:`x_j`. + :math:`\\tilde{P_{ij}} = (1,0)` if :math:`x_i \succ x_j` else :math:`\\tilde{P_{ij}} = (0,1)`. 
Parameters ---------- @@ -115,15 +115,16 @@ def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, Y : numpy array (n_instances, n_objects) Preferences in form of Orderings or Choices for given n_objects - epochs : int Number of epochs to run if training for a fixed query size callbacks : list List of callbacks to be called during optimization - validation_split : float + validation_split : float (range : [0,1]) Percentage of instances to split off to validate on verbose : bool Print verbose information + **kwd : + Keyword arguments for the fit function """ x1, x2, y_double = self._convert_instances(X, Y) diff --git a/csrank/core/fate_network.py b/csrank/core/fate_network.py index 827dc0ae..7c337d6c 100644 --- a/csrank/core/fate_network.py +++ b/csrank/core/fate_network.py @@ -358,11 +358,11 @@ def fit(self, X, Y, epochs=35, inner_epochs=1, callbacks=None, validation_split= Parameters ---------- X : numpy array or dict - (n_instances, n_objects, n_features) if numpy array or - map from n_objects to numpy arrays + Feature vectors of the objects + (n_instances, n_objects, n_features) if numpy array or map from n_objects to numpy arrays Y : numpy array or dict - (n_instances, n_objects) if numpy array or - map from n_objects to numpy arrays + Preferences in form of rankings or choices for given objects + (n_instances, n_objects) if numpy array or map from n_objects to numpy arrays epochs : int Number of epochs to run if training for a fixed query size or number of epochs of the meta gradient descent for the variadic model @@ -371,7 +371,7 @@ def fit(self, X, Y, epochs=35, inner_epochs=1, callbacks=None, validation_split= model callbacks : list List of callbacks to be called during optimization - validation_split : float + validation_split : float (range : [0,1]) Percentage of instances to split off to validate on verbose : bool Print verbose information diff --git a/csrank/core/feta_network.py b/csrank/core/feta_network.py index 4155b40c..67671ad8 100644 --- a/csrank/core/feta_network.py +++ b/csrank/core/feta_network.py @@ -234,7 +234,7 @@ def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, Number of epochs to run if training for a fixed query size callbacks : list List of callbacks to be called during optimization - validation_split : float + validation_split : float (range : [0,1]) Percentage of instances to split off to validate on verbose : bool Print verbose information diff --git a/csrank/core/ranknet_core.py b/csrank/core/ranknet_core.py index 9c112c50..977ae6dd 100644 --- a/csrank/core/ranknet_core.py +++ b/csrank/core/ranknet_core.py @@ -64,16 +64,7 @@ def construct_model(self): :math:`x_i, x_j \in Q` we construct two sub-networks with weight sharing in all hidden layer apart form the last layer for which weights are mirrored version of each other. The output of these networks are connected to a sigmoid unit that produces the output :math:`P_{ij}` which is the probability of preferring object - :math:`x_i` over :math:`x_j`, to approximate the :math:`U(x)`. - For learning this network the binary cross entropy loss function for a pair of example :math:`x_i, x_j \in Q` - is defined as: - - .. math:: - - C_{ij} = -\\tilde{P_{ij}}\log(P_{ij}) - (1 - \\tilde{P_{ij}})\log(1 - P{ij}) \enspace, - - where :math:`\\tilde{P_{ij}}` is ground truth probability of the preference of :math:`x_i` over :math:`x_j`. - :math:`\\tilde{P_{ij}} = 1` if :math:`x_i \succ x_j` else 0. + :math:`x_i` over :math:`x_j`, to approximate the :math:`U(x)`. 
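One way to realise such a weight-sharing pair network in Keras is sketched below; the layer sizes, activations and the explicit score subtraction are illustrative assumptions, not the defaults of this class:

.. code-block:: python

    from keras.layers import Activation, Dense, Input, Subtract
    from keras.models import Model

    n_features = 5                                    # assumed feature dimension
    x_i, x_j = Input(shape=(n_features,)), Input(shape=(n_features,))
    hidden = Dense(8, activation='relu')              # shared hidden layer
    score = Dense(1, activation='linear')             # shared scoring layer U(x)
    diff = Subtract()([score(hidden(x_i)), score(hidden(x_j))])
    p_ij = Activation('sigmoid')(diff)                # P_ij = sigmoid(U(x_i) - U(x_j))
    model = Model(inputs=[x_i, x_j], outputs=p_ij)
    model.compile(optimizer='sgd', loss='binary_crossentropy')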
Returns ------- @@ -98,25 +89,33 @@ def _convert_instances(self, X, Y): def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, **kwd): """ - Fit a generic preference learning model on a provided set of queries. - The provided queries can be of a fixed size (numpy arrays). + Fit a generic preference learning RankNet model on a provided set of queries. The provided queries can be of + a fixed size (numpy arrays). For learning this network the binary cross entropy loss function for a pair of + objects :math:`x_i, x_j \in Q` is defined as: + + .. math:: + + C_{ij} = -\\tilde{P_{ij}}\log(P_{ij}) - (1 - \\tilde{P_{ij}})\log(1 - P{ij}) \enspace, + + where :math:`\\tilde{P_{ij}}` is ground truth probability of the preference of :math:`x_i` over :math:`x_j`. + :math:`\\tilde{P_{ij}} = 1` if :math:`x_i \succ x_j` else :math:`\\tilde{P_{ij}} = 0`. Parameters ---------- - X : numpy array - (n_instances, n_objects, n_features) + X : numpy array (n_instances, n_objects, n_features) Feature vectors of the objects - Y : numpy array - (n_instances, n_objects) + Y : numpy array (n_instances, n_objects) Preferences in form of Orderings or Choices for given n_objects epochs : int Number of epochs to run if training for a fixed query size callbacks : list List of callbacks to be called during optimization - validation_split : float + validation_split : float (range : [0,1]) Percentage of instances to split off to validate on verbose : bool Print verbose information + **kwd : + Keyword arguments for the fit function """ X1, X2, Y_single = self._convert_instances(X, Y) diff --git a/csrank/discretechoice/discrete_choice.py b/csrank/discretechoice/discrete_choice.py index 2c6642fb..bf2ea081 100644 --- a/csrank/discretechoice/discrete_choice.py +++ b/csrank/discretechoice/discrete_choice.py @@ -13,22 +13,26 @@ def learning_problem(self): return DISCRETE_CHOICE def predict_for_scores(self, scores): - """ Predict discrete choice for a given collection scores for the sets of objects. - - Parameters - ---------- - scores : dict or numpy array - Dictionary with a mapping from size of the choice set to numpy arrays - or a single numpy array of size containing scores of each object of size: - (n_instances, n_objects) - - - Returns - ------- - Y : dict or numpy array - Dictionary with a mapping from size of the choice set to numpy arrays - or a single numpy array containing discrete choices of size: - (n_instances, 1) + """ + Binary discrete choice vector :math:`y` represents the choices amongst the objects in :math:`Q`, such that + :math:`y(k) = 1` represents that the object :math:`x_k` is chosen and :math:`y(k) = 0` represents + it is not chosen. For choice to be discrete :math:`\sum_{x_i \in Q} y(i) = 1`. Predict discrete choices for + the scores for a given collection of sets of objects (query sets). 
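The conversion from scores to a one-hot discrete-choice vector can be sketched as follows (the score matrix is a made-up example):

.. code-block:: python

    import numpy as np

    scores = np.array([[0.9, 0.2, 0.7],
                       [0.1, 0.8, 0.4]])              # (n_instances, n_objects)
    Y = np.zeros_like(scores, dtype=int)
    Y[np.arange(len(scores)), scores.argmax(axis=1)] = 1
    # Y -> [[1, 0, 0],
    #       [0, 1, 0]]  exactly one object chosen per query set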
+ + Parameters + ---------- + scores : dict or numpy array + Dictionary with a mapping from query set size to numpy arrays + or a single numpy array of size containing scores of each object of size: + (n_instances, n_objects) + + + Returns + ------- + Y : dict or numpy array + Dictionary with a mapping from query set size to numpy arrays + or a single numpy array containing predicted discrete choice vectors of size: + (n_instances, n_objects) """ if isinstance(scores, dict): result = dict() diff --git a/csrank/learner.py b/csrank/learner.py index 57e762fe..0bbaec00 100644 --- a/csrank/learner.py +++ b/csrank/learner.py @@ -75,7 +75,7 @@ def predict_scores(self, X, **kwargs): def predict(self, X, **kwargs): """ Predict preferences in the form of rankings or choices for a given collection of sets of objects called - a query set. + a query set using the function :meth:`.predict_for_scores`. Parameters ---------- diff --git a/csrank/objectranking/object_ranker.py b/csrank/objectranking/object_ranker.py index ce35e982..9be76abc 100644 --- a/csrank/objectranking/object_ranker.py +++ b/csrank/objectranking/object_ranker.py @@ -14,9 +14,10 @@ def learning_problem(self): def predict_for_scores(self, scores, **kwargs): """ - :math:`\pi` represents the ranking amongst the objects in :math:`Q`, such that :math:`\pi(k)` is the - position of the :math:`k`-th object :math:`x_k`, and :math:`\pi^{-1}(k)` is the index of the object on - position :math:`k`. Predict rankings for the scores for a given collection of sets of objects (query set). + The permutation vector :math:`\pi` represents the ranking amongst the objects in :math:`Q`, such that + :math:`\pi(k)` is the position of the :math:`k`-th object :math:`x_k`, and :math:`\pi^{-1}(k)` is the index + of the object on position :math:`k`. Predict rankings for the scores for a given collection of sets of + objects (query sets). Parameters ---------- From fcd9e8f76f3fa870f2b2b4c64a7340b634aad2e8 Mon Sep 17 00:00:00 2001 From: Pritha Gupta Date: Mon, 3 Jun 2019 15:15:00 +0200 Subject: [PATCH 3/5] Fine tuned remaining rankers --- csrank/objectranking/cmp_net.py | 35 +++++++++++++++++++++-- csrank/objectranking/list_net.py | 49 ++++++++++++++++++++------------ csrank/objectranking/rank_net.py | 12 ++++++-- 3 files changed, 74 insertions(+), 22 deletions(-) diff --git a/csrank/objectranking/cmp_net.py b/csrank/objectranking/cmp_net.py index 3461c24e..9d320309 100644 --- a/csrank/objectranking/cmp_net.py +++ b/csrank/objectranking/cmp_net.py @@ -93,8 +93,39 @@ def _convert_instances(self, X, Y): def construct_model(self): return super().construct_model() - def fit(self, X, Y, **kwd): - super().fit(X, Y, **kwd) + def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, **kwd): + """ + Fit an object ranking learning CmpNet model on a provided set of queries. The provided queries can be of a + fixed size (numpy arrays). For learning this network the binary cross entropy loss function for a pair of + objects :math:`x_i, x_j \in Q` is defined as: + + .. math:: + + C_{ij} = -\\tilde{P_{ij}}(0)\\cdot \log(U(x_i,x_j)) - \\tilde{P_{ij}}(1) \\cdot \log(U(x_j,x_i)) \ , + + where :math:`\\tilde{P_{ij}}` is ground truth probability of the preference of :math:`x_i` over :math:`x_j`. + :math:`\\tilde{P_{ij}} = (1,0)` if :math:`x_i \succ x_j` else :math:`\\tilde{P_{ij}} = (0,1)`. 
+ + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Rankings of the given objects + epochs : int + Number of epochs to run if training for a fixed query size + callbacks : list + List of callbacks to be called during optimization + validation_split : float + Percentage of instances to split off to validate on + verbose : bool + Print verbose information + **kwd + Keyword arguments for the fit function + """ + super().fit(X, Y, epochs=epochs, callbacks=callbacks, validation_split=validation_split, verbose=verbose, **kwd) def _predict_scores_fixed(self, X, **kwargs): return super()._predict_scores_fixed(X, **kwargs) diff --git a/csrank/objectranking/list_net.py b/csrank/objectranking/list_net.py index 82457660..bc23f16d 100644 --- a/csrank/objectranking/list_net.py +++ b/csrank/objectranking/list_net.py @@ -25,16 +25,15 @@ def __init__(self, n_object_features, n_top, n_hidden=2, n_units=8, loss_functio batch_normalization=False, kernel_regularizer=l2(l=1e-4), activation="selu", kernel_initializer='lecun_normal', optimizer=SGD(lr=1e-4, nesterov=True, momentum=0.9), metrics=[zero_one_rank_loss_for_scores_ties], batch_size=256, random_state=None, **kwargs): - """ Create an instance of the ListNet architecture. - ListNet trains a latent utility model based on top-k-subrankings of the objects. - A listwise loss function like the negative Plackett-Luce likelihood is used for training. - For example for query set :math:`Q = \{x_1,x_2,x_3\}`, the scores are :math:`Q = (s_1,s_2,s_3)` - and the ranking is :math:`\pi = (3,1,2)`. The Plackett-Luce likelihood is defined as: + """ Create an instance of the ListNet architecture. ListNet trains a latent utility model based on + top-k-subrankings of the objects. This network learns a latent utility score for each object in the given + query set :math:`Q = \{x_1, \ldots ,x_n\}` using the equation :math:`U(x) = F(x, w)` where :math:`w` is the + weight vector. A listwise loss function like the negative Plackett-Luce likelihood is used for training. + The ranking for the given query set :math:`Q` is defined as: .. math:: - P_l(\pi) = \\frac{s_2}{s_1+s_2+s_3} \cdot \\frac{s_3}{s_1+s_3} \cdot \\frac{s_1}{s_1} - Note: For k=2 we obtain :class:`RankNet` as a special case. + ρ(Q) = \operatorname{argsort}_{x \in Q} \; U(x) Parameters ---------- @@ -125,8 +124,15 @@ def _create_topk(self, X, Y): def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, **kwd): """ - Fit an object ranking learning model on a provided set of queries. - The provided queries can be of a fixed size (numpy arrays). + Fit an object ranking learning ListNet on the top-k-subrankings in the provided set of queries. The provided + queries can be of a fixed size (numpy arrays). For fitting the model we maximize the Plackett-Luce + likelihood. For example for query set :math:`Q = \{x_1,x_2,x_3\}`, the scores are :math:`Q = (s_1,s_2,s_3)` + and the ranking is :math:`\pi = (3,1,2)`. The Plackett-Luce likelihood is defined as: + + .. math:: + P_l(\pi) = \\frac{s_2}{s_1+s_2+s_3} \cdot \\frac{s_3}{s_1+s_3} \cdot \\frac{s_1}{s_1} + + Note: For k=2 we obtain :class:`RankNet` as a special case. 
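The worked example above can be reproduced with a short sketch; the scores below are assumed positive numbers, whereas the actual listwise loss in the library may operate on normalised scores:

.. code-block:: python

    import numpy as np

    s = np.array([2.0, 5.0, 3.0])        # assumed scores (s1, s2, s3)
    order = [1, 2, 0]                    # pi = (3, 1, 2): x2 first, then x3, then x1
    remaining = s[order]
    likelihood = 1.0
    for k in range(len(order)):
        likelihood *= remaining[k] / remaining[k:].sum()
    # likelihood = s2/(s1+s2+s3) * s3/(s1+s3) * s1/s1 = 0.3
    nll = -np.log(likelihood)            # negative Plackett-Luce log-likelihood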
Parameters ---------- @@ -153,9 +159,7 @@ def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, self.logger.debug("Finished creating the dataset") self.logger.debug("Creating the model") - output = self.construct_model() - self.model = Model(inputs=self.input_layer, outputs=output) - self.model.compile(loss=self.loss_function, optimizer=self.optimizer, metrics=self.metrics) + self.model = self.construct_model() self.logger.debug("Finished creating the model, now fitting...") self.model.fit(X, Y, batch_size=self.batch_size, epochs=epochs, callbacks=callbacks, validation_split=validation_split, verbose=verbose, **kwd) @@ -163,20 +167,30 @@ def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, def construct_model(self): """ - Construct the ListNet architecture. - Weight sharing guarantees that we have a latent utility model for any given object. + Construct the ListNet architecture which takes topk-subrankings from the given queries and minimize a + listwise loss on the utility scores of top objects. Weight sharing guarantees that we learn the shared + weights :math:`w` of the latent utility function :math:`U(x) = F(x, w)`. + + Returns + ------- + model: keras model :class:`Model` + ListNet model used to learn the utiliy function using the top-k-subrankings in the provided set of queries. """ hid = [create_input_lambda(i)(self.input_layer) for i in range(self.n_top)] for hidden_layer in self.hidden_layers: hid = [hidden_layer(x) for x in hid] outputs = [self.output_node(x) for x in hid] merged = concatenate(outputs) - return merged + model = Model(inputs=self.input_layer, outputs=merged) + model.compile(loss=self.loss_function, optimizer=self.optimizer, metrics=self.metrics) + return model @property def scoring_model(self): """ - Creates a scoring model for the trained ListNet, which predicts the utility scores for given set of objects. + Creates a scoring model from the trained ListNet, which predicts the utility scores for given set of objects. + This network consist of a sequential network which predicts the utility score for each object :math:`x \in Q` + using the latent utility function :math:`U(x) = F(x, w)` where :math:`w` is the weights of the model. Returns ------- @@ -230,8 +244,7 @@ def clear_memory(self, **kwargs): self._construct_layers(kernel_regularizer=self.kernel_regularizer, kernel_initializer=self.kernel_initializer, activation=self.activation, **self.kwargs) - output = self.construct_model() - self.model = Model(inputs=self.input_layer, outputs=output) + self.model = self.construct_model() self.model.load_weights(self.hash_file) else: self.logger.info("Cannot clear the memory") diff --git a/csrank/objectranking/rank_net.py b/csrank/objectranking/rank_net.py index c61786f5..938346f4 100644 --- a/csrank/objectranking/rank_net.py +++ b/csrank/objectranking/rank_net.py @@ -88,8 +88,16 @@ def _convert_instances(self, X, Y): def fit(self, X, Y, epochs=10, callbacks=None, validation_split=0.1, verbose=0, **kwd): """ - Fit an object ranking learning model on a provided set of queries. - The provided queries can be of a fixed size (numpy arrays). + Fit an object ranking learning RankNet model on a provided set of queries. The provided queries can be of a + fixed size (numpy arrays). For learning this network the binary cross entropy loss function for a pair of + objects :math:`x_i, x_j \in Q` is defined as: + + .. 
math:: + + C_{ij} = -\\tilde{P_{ij}}\log(P_{ij}) - (1 - \\tilde{P_{ij}})\log(1 - P{ij}) \enspace, + + where :math:`\\tilde{P_{ij}}` is ground truth probability of the preference of :math:`x_i` over :math:`x_j`. + :math:`\\tilde{P_{ij}} = 1` if :math:`x_i \succ x_j` else :math:`\\tilde{P_{ij}} = 0`. Parameters ---------- From ff3d65184bf029237d8faee55c857165f18484f5 Mon Sep 17 00:00:00 2001 From: Pritha Gupta Date: Mon, 3 Jun 2019 15:55:53 +0200 Subject: [PATCH 4/5] Improved the documentation of discrete choice functions --- csrank/choicefunctions/feta_choice.py | 2 +- .../generalized_linear_model.py | 14 +- csrank/discretechoice/feta_discrete_choice.py | 2 +- .../generalized_nested_logit.py | 30 +++- .../discretechoice/multinomial_logit_model.py | 112 ++++++++++++- csrank/discretechoice/nested_logit_model.py | 157 ++++++++++++++---- .../paired_combinatorial_logit.py | 30 +++- 7 files changed, 296 insertions(+), 51 deletions(-) diff --git a/csrank/choicefunctions/feta_choice.py b/csrank/choicefunctions/feta_choice.py index 3d32f4ea..ccf5bea8 100644 --- a/csrank/choicefunctions/feta_choice.py +++ b/csrank/choicefunctions/feta_choice.py @@ -41,7 +41,7 @@ def __init__(self, n_objects, n_object_features, n_hidden=2, n_units=8, add_zero Parameters ---------- n_objects : int - Number of objects to be ranked + Number of objects in each query set n_object_features : int Dimensionality of the feature space of each object n_hidden : int diff --git a/csrank/choicefunctions/generalized_linear_model.py b/csrank/choicefunctions/generalized_linear_model.py index 0a14d82d..a173d4d6 100644 --- a/csrank/choicefunctions/generalized_linear_model.py +++ b/csrank/choicefunctions/generalized_linear_model.py @@ -19,9 +19,10 @@ class GeneralizedLinearModel(ChoiceFunctions, Learner): def __init__(self, n_object_features, regularization='l2', random_state=None, **kwargs): """ Create an instance of the GeneralizedLinearModel model for learning the choice function. This model is - adapted from the multinomial logit model :class:`MultinomialLogitModel`. The utility score for each object - in query set :math:`Q` is defined as :math:`U(x) = w \cdot x`, where :math:`w` is the weight vector. - The probability of choosing an object :math:`x_i` is defined by taking sigmoid over the utility scores: + adapted from the multinomial logit model :class:`csrank.discretechoice.multinomial_logit_model.MultinomialLogitModel`. + The utility score for each object in query set :math:`Q` is defined as :math:`U(x) = w \cdot x`, + where :math:`w` is the weight vector. The probability of choosing an object :math:`x_i` is defined by taking + sigmoid over the utility scores: .. math:: @@ -65,8 +66,9 @@ def __init__(self, n_object_features, regularization='l2', random_state=None, ** @property def model_configuration(self): """ - Constructs the dictionary containing the priors for the parameters for the model according to the + Constructs the dictionary containing the priors for the weight vector for the model according to the regularization function. + Returns ------- configuration : dict @@ -121,13 +123,13 @@ def construct_model(self, X, Y): def fit(self, X, Y, sampler='vi', tune_size=0.1, thin_thresholds=1, **kwargs): """ - Fit a generalized logit model on the provided set of queries X and preferences Y of those objects. The + Fit a generalized logit model on the provided set of queries X and choices Y of those objects. The provided queries and corresponding preferences are of a fixed size (numpy arrays). 
For learning this network the binary cross entropy loss function for each object :math:`x_i \in Q` is defined as: .. math:: - C_{ij} = -y(i)\log(P_i) - (1 - y(i))\log(1 - P_i) \enspace, + C_{i} = -y(i)\log(P_i) - (1 - y(i))\log(1 - P_i) \enspace, where :math:`y` is ground-truth choice vector of the objects in the given query set :math:`Q`. The value :math:`y(i) = 1` if object :math:`x_i` is chosen else :math:`y(i) = 0`. diff --git a/csrank/discretechoice/feta_discrete_choice.py b/csrank/discretechoice/feta_discrete_choice.py index 8d404f7c..4143a757 100644 --- a/csrank/discretechoice/feta_discrete_choice.py +++ b/csrank/discretechoice/feta_discrete_choice.py @@ -38,7 +38,7 @@ def __init__(self, n_objects, n_object_features, n_hidden=2, n_units=8, add_zero Parameters ---------- n_objects : int - Number of objects to be ranked + Number of objects in each query set n_object_features : int Dimensionality of the feature space of each object n_hidden : int diff --git a/csrank/discretechoice/generalized_nested_logit.py b/csrank/discretechoice/generalized_nested_logit.py index 7a71247a..4cb58caa 100644 --- a/csrank/discretechoice/generalized_nested_logit.py +++ b/csrank/discretechoice/generalized_nested_logit.py @@ -107,6 +107,32 @@ def construct_model(self, X, Y): self.logger.info("Model construction completed") def fit(self, X, Y, sampler="vi", **kwargs): + """ + Fit a generalized nested logit model on the provided set of queries X and choices Y of those objects. The + provided queries and corresponding preferences are of a fixed size (numpy arrays). For learning this network + the categorical cross entropy loss function for each object :math:`x_i \in Q` is defined as: + + .. math:: + + C_{i} = -y(i)\log(P_i) \enspace, + + where :math:`y` is ground-truth discrete choice vector of the objects in the given query set :math:`Q`. + The value :math:`y(i) = 1` if object :math:`x_i` is chosen else :math:`y(i) = 0`. + + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Choices for given objects in the query + sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string + The sampler used to estimate the posterior mean and mass matrix from the trace. 
+ * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler + **kwargs : + Keyword arguments for the fit function + """ self.construct_model(X, Y) kwargs['random_seed'] = self.random_state.randint(2 ** 32, dtype='uint32') callbacks = kwargs['vi_params'].get('callbacks', []) @@ -175,10 +201,6 @@ def predict_scores(self, X, **kwargs): def predict_for_scores(self, scores, **kwargs): return DiscreteObjectChooser.predict_for_scores(self, scores, **kwargs) - def clear_memory(self, **kwargs): - self.logger.info("Clearing memory") - pass - def set_tunable_parameters(self, alpha=None, n_nests=None, loss_function='', regularization='l2', **point): if alpha is not None: self.alpha = alpha diff --git a/csrank/discretechoice/multinomial_logit_model.py b/csrank/discretechoice/multinomial_logit_model.py index 44872af1..49ffc553 100644 --- a/csrank/discretechoice/multinomial_logit_model.py +++ b/csrank/discretechoice/multinomial_logit_model.py @@ -14,7 +14,40 @@ class MultinomialLogitModel(DiscreteObjectChooser, Learner): - def __init__(self, n_object_features, loss_function='', regularization='l2', model_args={}, **kwargs): + def __init__(self, n_object_features, loss_function='', regularization='l2', **kwargs): + """ + Create an instance of the MultinomialLogitModel model for learning the discrete choice function. The utility + score for each object in query set :math:`Q` is defined as :math:`U(x) = w \cdot x`, where :math:`w` is + the weight vector. The probability of choosing an object :math:`x_i` is defined by taking softmax over the + utility scores of the objects: + + .. math:: + + P(x_i \\lvert Q) = \\frac{exp(U(x_i))}{\sum_{x_j \in Q} exp(U(x_j))} + + The discrete choice for the given query set :math:`Q` is defined as: + + .. math:: + + dc(Q) := \operatorname{argmax}_{x_i \in Q } \; P(x_i \\lvert Q) + + Parameters + ---------- + n_object_features : int + Number of features of the object space + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + random_state : int or object + Numpy random state + **kwargs + Keyword arguments for the algorithms + + References + ---------- + [1] Kenneth E Train. „Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap Logit, pp. 41–86. + """ self.logger = logging.getLogger(MultinomialLogitModel.__name__) self.n_object_features = n_object_features self.loss_function = likelihood_dict.get(loss_function, None) @@ -31,7 +64,16 @@ def __init__(self, n_object_features, loss_function='', regularization='l2', mod self.p = None @property - def model_priors(self): + def model_configuration(self): + """ + Constructs the dictionary containing the priors for the weight vector for the model according to the + regularization function. 
+ + Returns + ------- + configuration : dict + Dictionary containing the priors applies on the weights + """ if self._config is None: if self.regularization == 'l2': weight = pm.Normal @@ -45,7 +87,29 @@ def model_priors(self): return self._config def construct_model(self, X, Y): - self.logger.info('Creating model_args config {}'.format(print_dictionary(self.model_priors))) + """ + Constructs the multinomial logit model which evaluated the utility score as :math:`U(x) = w \cdot x`, where + :math:`w` is the weight vector. The probability of choosing the object :math:`x_i` from the query set + :math:`Q = \{x_1, \ldots ,x_n\}` is: + + .. math:: + + P_i = P(x_i \\lvert Q) = \\frac{exp(U(x_i))}{\sum_{x_j \in Q} exp(U(x_j))} + + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Preferences in the form of discrete choices for given objects + + Returns + ------- + model : pymc3 Model :class:`pm.Model` + """ + self.logger.info('Creating model_args config {}'.format(print_dictionary(self.model_configuration))) with pm.Model() as self.model: self.Xt = theano.shared(X) self.Yt = theano.shared(Y) @@ -60,6 +124,32 @@ def construct_model(self, X, Y): self.logger.info("Model construction completed") def fit(self, X, Y, sampler='vi', **kwargs): + """ + Fit a multinomial logit model on the provided set of queries X and choices Y of those objects. The + provided queries and corresponding preferences are of a fixed size (numpy arrays). For learning this network + the categorical cross entropy loss function for each object :math:`x_i \in Q` is defined as: + + .. math:: + + C_{i} = -y(i)\log(P_i) \enspace, + + where :math:`y` is ground-truth discrete choice vector of the objects in the given query set :math:`Q`. + The value :math:`y(i) = 1` if object :math:`x_i` is chosen else :math:`y(i) = 0`. + + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Choices for given objects in the query + sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string + The sampler used to estimate the posterior mean and mass matrix from the trace. + * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler + **kwargs : + Keyword arguments for the fit function + """ self.construct_model(X, Y) kwargs['random_seed'] = self.random_state.randint(2 ** 32, dtype='uint32') callbacks = kwargs['vi_params'].get('callbacks', []) @@ -123,11 +213,19 @@ def predict_scores(self, X, **kwargs): def predict_for_scores(self, scores, **kwargs): return DiscreteObjectChooser.predict_for_scores(self, scores, **kwargs) - def clear_memory(self, **kwargs): - self.logger.info("Clearing memory") - pass - def set_tunable_parameters(self, loss_function='', regularization="l1", **point): + """ + Set tunable parameters of the Multinomial Logit model to the values provided. 
+ + Parameters + ---------- + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + point: dict + Dictionary containing parameter values which are not tuned for the network + """ if loss_function in likelihood_dict.keys(): self.loss_function = likelihood_dict.get(loss_function, None) self.regularization = regularization diff --git a/csrank/discretechoice/nested_logit_model.py b/csrank/discretechoice/nested_logit_model.py index 2e61f33b..1eba43f3 100644 --- a/csrank/discretechoice/nested_logit_model.py +++ b/csrank/discretechoice/nested_logit_model.py @@ -20,6 +20,38 @@ class NestedLogitModel(DiscreteObjectChooser, Learner): def __init__(self, n_object_features, n_objects, n_nests=None, loss_function='', regularization='l1', alpha=1e-2, random_state=None, **kwd): + """ + Create an instance of the NestedLogitModel model for learning the discrete choice function. + + .. math:: + + P(x_i \\lvert Q) = \\frac{exp(U(x_i))}{\sum_{x_j \in Q} exp(U(x_j))} + + The discrete choice for the given query set :math:`Q` is defined as: + + .. math:: + + dc(Q) := \operatorname{argmax}_{x_i \in Q } \; P(x_i \\lvert Q) + + Parameters + ---------- + n_object_features : int + Number of features of the object space + n_objects: int + Number of objects in each query set + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + random_state : int or object + Numpy random state + **kwargs + Keyword arguments for the algorithms + + References + ---------- + [1] Kenneth E Train. „Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap Logit, pp. 41–86. + """ self.logger = logging.getLogger(NestedLogitModel.__name__) self.n_object_features = n_object_features self.n_objects = n_objects @@ -46,7 +78,18 @@ def __init__(self, n_object_features, n_objects, n_nests=None, loss_function='', self.y_nests = None @property - def model_priors(self): + def model_configuration(self): + """ + Constructs the dictionary containing the priors for the weight vectors for the model according to the + regularization function. 
The parameters are: + * weights : + * weights_k : + + Returns + ------- + configuration : dict + Dictionary containing the priors applies on the weights + """ if self._config is None: if self.regularization == 'l2': weight = pm.Normal @@ -60,7 +103,23 @@ def model_priors(self): self.logger.info('Creating model with config {}'.format(print_dictionary(self._config))) return self._config - def eval_utility(self, weights): + def create_nests(self, X): + n, n_obj, n_dim = X.shape + objects = X.reshape(n * n_obj, n_dim) + if self.cluster_model is None: + self.cluster_model = MiniBatchKMeans(n_clusters=self.n_nests, random_state=self.random_state).fit(objects) + self.features_nests = self.cluster_model.cluster_centers_ + prediction = self.cluster_model.labels_ + else: + prediction = self.cluster_model.predict(objects) + y_nests = [] + for i in np.arange(0, n * n_obj, step=n_obj): + nest_ids = prediction[i:i + n_obj] + y_nests.append(nest_ids) + y_nests = np.array(y_nests) + return y_nests + + def _eval_utility(self, weights): utility = tt.zeros(tuple(self.y_nests.shape)) for i in range(self.n_nests): rows, cols = tt.eq(self.y_nests, i).nonzero() @@ -87,14 +146,14 @@ def get_probability(self, utility, lambda_k, utility_k): p = pni_k * pn_k return p - def eval_utility_np(self, x_t, y_nests, weights): + def _eval_utility_np(self, x_t, y_nests, weights): utility = np.zeros(tuple(y_nests.shape)) for i in range(self.n_nests): rows, cols = np.where(y_nests == i) utility[rows, cols] = np.dot(x_t[rows, cols], weights[i]) return utility - def get_probability_np(self, y_nests, utility, lambda_k, utility_k): + def _get_probability_np(self, y_nests, utility, lambda_k, utility_k): n_instances, n_objects = y_nests.shape pni_k = np.zeros((n_instances, n_objects)) ivm = np.zeros((n_instances, self.n_nests)) @@ -113,23 +172,27 @@ def get_probability_np(self, y_nests, utility, lambda_k, utility_k): p = pni_k * pn_k return p - def create_nests(self, X): - n, n_obj, n_dim = X.shape - objects = X.reshape(n * n_obj, n_dim) - if self.cluster_model is None: - self.cluster_model = MiniBatchKMeans(n_clusters=self.n_nests, random_state=self.random_state).fit(objects) - self.features_nests = self.cluster_model.cluster_centers_ - prediction = self.cluster_model.labels_ - else: - prediction = self.cluster_model.predict(objects) - y_nests = [] - for i in np.arange(0, n * n_obj, step=n_obj): - nest_ids = prediction[i:i + n_obj] - y_nests.append(nest_ids) - y_nests = np.array(y_nests) - return y_nests - def construct_model(self, X, Y): + """ + Constructs the nested logit model. + + .. 
math:: + + P_i = P(x_i \\lvert Q) = \\frac{exp(U(x_i))}{\sum_{x_j \in Q} exp(U(x_j))} + + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Preferences in the form of discrete choices for given objects + + Returns + ------- + model : pymc3 Model :class:`pm.Model` + """ y_nests = self.create_nests(X) with pm.Model() as self.model: self.Xt = theano.shared(X) @@ -137,10 +200,10 @@ def construct_model(self, X, Y): self.y_nests = theano.shared(y_nests) shapes = {'weights': self.n_object_features, 'weights_k': self.n_object_features} - weights_dict = create_weight_dictionary(self.model_priors, shapes) + weights_dict = create_weight_dictionary(self.model_configuration, shapes) lambda_k = pm.Uniform('lambda_k', self.alpha, 1.0, shape=self.n_nests) weights = (weights_dict['weights'] / lambda_k[:, None]) - utility = self.eval_utility(weights) + utility = self._eval_utility(weights) utility_k = tt.dot(self.features_nests, weights_dict['weights_k']) self.p = self.get_probability(utility, lambda_k, utility_k) @@ -148,6 +211,32 @@ def construct_model(self, X, Y): self.logger.info("Model construction completed") def fit(self, X, Y, sampler="vi", **kwargs): + """ + Fit a nested logit model on the provided set of queries X and choices Y of those objects. The + provided queries and corresponding preferences are of a fixed size (numpy arrays). For learning this network + the categorical cross entropy loss function for each object :math:`x_i \in Q` is defined as: + + .. math:: + + C_{i} = -y(i)\log(P_i) \enspace, + + where :math:`y` is ground-truth discrete choice vector of the objects in the given query set :math:`Q`. + The value :math:`y(i) = 1` if object :math:`x_i` is chosen else :math:`y(i) = 0`. + + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Choices for given objects in the query + sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string + The sampler used to estimate the posterior mean and mass matrix from the trace. 
+ * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler + **kwargs : + Keyword arguments for the fit function + """ self.construct_model(X, Y) kwargs['random_seed'] = self.random_state.randint(2 ** 32, dtype='uint32') callbacks = kwargs['vi_params'].get('callbacks', []) @@ -202,8 +291,8 @@ def _predict_scores_fixed(self, X, **kwargs): lambda_k = np.array([mean_trace['lambda_k__{}'.format(i)] for i in range(self.n_nests)]) weights = (weights / lambda_k[:, None]) utility_k = np.dot(self.features_nests, weights_k) - utility = self.eval_utility_np(X, y_nests, weights) - scores = self.get_probability_np(y_nests, utility, lambda_k, utility_k) + utility = self._eval_utility_np(X, y_nests, weights) + scores = self._get_probability_np(y_nests, utility, lambda_k, utility_k) return scores def predict(self, X, **kwargs): @@ -215,11 +304,23 @@ def predict_scores(self, X, **kwargs): def predict_for_scores(self, scores, **kwargs): return DiscreteObjectChooser.predict_for_scores(self, scores, **kwargs) - def clear_memory(self, **kwargs): - self.logger.info("Clearing memory") - pass - def set_tunable_parameters(self, alpha=None, n_nests=None, loss_function='', regularization="l1", **point): + """ + Set tunable parameters of the Multinomial Logit model to the values provided. + + Parameters + ---------- + alpha: float (range : [0,1]) + The lower bound of the correlations between the objects in a nest + n_nests: int (range : [2,n_objects]) + The number of nests in which the objects are divided + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + point: dict + Dictionary containing parameter values which are not tuned for the network + """ if alpha is not None: self.alpha = alpha if n_nests is None: diff --git a/csrank/discretechoice/paired_combinatorial_logit.py b/csrank/discretechoice/paired_combinatorial_logit.py index 9951ff0e..015f3013 100644 --- a/csrank/discretechoice/paired_combinatorial_logit.py +++ b/csrank/discretechoice/paired_combinatorial_logit.py @@ -112,6 +112,32 @@ def construct_model(self, X, Y): self.logger.info("Model construction completed") def fit(self, X, Y, sampler="vi", **kwargs): + """ + Fit a paired combinatorial logit model on the provided set of queries X and choices Y of those objects. The + provided queries and corresponding preferences are of a fixed size (numpy arrays). For learning this network + the categorical cross entropy loss function for each object :math:`x_i \in Q` is defined as: + + .. math:: + + C_{i} = -y(i)\log(P_i) \enspace, + + where :math:`y` is ground-truth discrete choice vector of the objects in the given query set :math:`Q`. + The value :math:`y(i) = 1` if object :math:`x_i` is chosen else :math:`y(i) = 0`. + + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Choices for given objects in the query + sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string + The sampler used to estimate the posterior mean and mass matrix from the trace. 
+ * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler + **kwargs : + Keyword arguments for the fit function + """ self.construct_model(X, Y) kwargs['random_seed'] = self.random_state.randint(2 ** 32, dtype='uint32') callbacks = kwargs['vi_params'].get('callbacks', []) @@ -175,10 +201,6 @@ def predict_scores(self, X, **kwargs): def predict_for_scores(self, scores, **kwargs): return DiscreteObjectChooser.predict_for_scores(self, scores, **kwargs) - def clear_memory(self, **kwargs): - self.logger.info("Clearing memory") - pass - def set_tunable_parameters(self, alpha=5e-2, loss_function='', regularization='l2', **point): if alpha is not None: self.alpha = alpha From b10e373917bebeaa99debbf0905af915bbb2db56 Mon Sep 17 00:00:00 2001 From: Pritha Gupta Date: Tue, 4 Jun 2019 10:56:48 +0200 Subject: [PATCH 5/5] Completed the description of logit models Completed Paired combinatorial Logit model completed the documentation of logit models --- .../generalized_linear_model.py | 35 ++-- .../generalized_nested_logit.py | 164 ++++++++++++++++-- csrank/discretechoice/mixed_logit_model.py | 135 +++++++++++++- .../discretechoice/multinomial_logit_model.py | 37 ++-- csrank/discretechoice/nested_logit_model.py | 136 ++++++++++++--- .../paired_combinatorial_logit.py | 159 ++++++++++++++++- .../discretechoice/ranknet_discrete_choice.py | 2 +- 7 files changed, 593 insertions(+), 75 deletions(-) diff --git a/csrank/choicefunctions/generalized_linear_model.py b/csrank/choicefunctions/generalized_linear_model.py index a173d4d6..ba243177 100644 --- a/csrank/choicefunctions/generalized_linear_model.py +++ b/csrank/choicefunctions/generalized_linear_model.py @@ -48,6 +48,8 @@ def __init__(self, n_object_features, regularization='l2', random_state=None, ** References ---------- [1] Kenneth E Train. „Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap Logit, pp. 41–86. + + [2] Kenneth Train. Qualitative choice analysis. Cambridge, MA: MIT Press, 1986 """ self.logger = logging.getLogger(GeneralizedLinearModel.__name__) self.n_object_features = n_object_features @@ -66,13 +68,25 @@ def __init__(self, n_object_features, regularization='l2', random_state=None, ** @property def model_configuration(self): """ - Constructs the dictionary containing the priors for the weight vector for the model according to the - regularization function. + Constructs the dictionary containing the priors for the weight vectors for the model according to the + regularization function. The parameters are: + * **weights** : Weights to evaluates the utility of the objects - Returns - ------- - configuration : dict - Dictionary containing the priors applies on the weights + For ``l1`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{b}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Laplace}(\\text{mu}=\\text{mu}_w, \\text{b}=\\text{b}_w) + + For ``l2`` regularization the priors are: + + .. 
math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{sd}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Normal}(\\text{mu}=\\text{mu}_w, \\text{sd}=\\text{sd}_w) """ if self.regularization == 'l2': weight = pm.Normal @@ -141,10 +155,11 @@ def fit(self, X, Y, sampler='vi', tune_size=0.1, thin_thresholds=1, **kwargs): Y : numpy array (n_instances, n_objects) Choices for given objects in the query sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string - The sampler used to estimate the posterior mean and mass matrix from the trace. - * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix - * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler - * **nuts** : Use the No-U-Turn sampler + The sampler used to estimate the posterior mean and mass matrix from the trace + + * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler tune_size: float (range : [0,1]) Percentage of instances to split off to tune the threshold for the choice function thin_thresholds: int diff --git a/csrank/discretechoice/generalized_nested_logit.py b/csrank/discretechoice/generalized_nested_logit.py index 4cb58caa..b65c30d9 100644 --- a/csrank/discretechoice/generalized_nested_logit.py +++ b/csrank/discretechoice/generalized_nested_logit.py @@ -18,7 +18,54 @@ class GeneralizedNestedLogitModel(DiscreteObjectChooser, Learner): def __init__(self, n_object_features, n_objects, n_nests=None, loss_function='None', regularization='l2', - alpha=5e-2, random_state=None, model_args={}, **kwd): + alpha=5e-2, random_state=None, **kwd): + """ + Create an instance of the Generalized Nested Logit model for learning the discrete choice function. This + model divides objects into subsets called nests, such that the each object is associtated to each nest to some degree. + This model structure is 1-layer of hierarchy and the :math:`\lambda` for each nest :math:`B_k` signifies the degree of independence + and :math:`1-\lambda` signifies the correlations between the object in it. We learn two weight vectors and the :math:`\lambda s`. + The probability of choosing an object :math:`x_i` from the given query set :math:`Q` is defined by product + of choosing the nest in which :math:`x_i` exists and then choosing the the object from the nest. + + .. math:: + + P(x_i \\lvert Q) = P_i = \sum_{\substack{B_k \in \mathcal{B} \\ i \in B_k}}P_{i \\lvert B_k} P_{B_k} \enspace , + + + The discrete choice for the given query set :math:`Q` is defined as: + + .. math:: + + dc(Q) := \operatorname{argmax}_{x_i \in Q } \; P(x_i \\lvert Q) + + Parameters + ---------- + n_object_features : int + Number of features of the object space + n_objects: int + Number of objects in each query set + n_nests : int range : [2,n_objects/2] + The number of nests/subsets in which the objects are divided + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + alpha: float (range : [0,1]) + The lower bound of the correlations between the objects in a nest + random_state : int or object + Numpy random state + **kwargs + Keyword arguments for the algorithms + + References + ---------- + [1] Kenneth E Train. 
„Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap GEV, pp. 87–111. + + [2] Kenneth Train. Qualitative choice analysis. Cambridge, MA: MIT Press, 1986 + + [3] Chieh-Hua Wen and Frank S Koppelman. „The generalized nested logit model“. In: Transportation Research Part B: Methodological 35.7 (2001), pp. 627–641 + + """ self.logger = logging.getLogger(GeneralizedNestedLogitModel.__name__) self.n_object_features = n_object_features @@ -45,7 +92,34 @@ def __init__(self, n_object_features, n_objects, n_nests=None, loss_function='No self._config = None @property - def model_priors(self): + def model_configuration(self): + """ + Constructs the dictionary containing the priors for the weight vectors for the model according to the + regularization function. The parameters are: + * **weights** : Weights to evaluates the utility of the objects + * **weights_k** : Weights to evaluates the fractional allocation of each object in :math:'Q' to each nest + + For ``l1`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{b}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Laplace}(\\text{mu}=\\text{mu}_w, \\text{b}=\\text{b}_w) + + For ``l2`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{sd}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Normal}(\\text{mu}=\\text{mu}_w, \\text{sd}=\\text{sd}_w) + + Returns + ------- + configuration : dict + Dictionary containing the priors applies on the weights + """ if self._config is None: if self.regularization == 'l2': weight = pm.Normal @@ -60,6 +134,40 @@ def model_priors(self): return self._config def get_probabilities(self, utility, lambda_k, alpha_ik): + """ + This method calculates the probability of choosing an object from the query set using the following parameters of the model which are used: + + * **weights** (:math:`w`): Weights to get the utility of the object :math:`Y_i = U(x_i) = w \cdot x_i` + * **weights_k** (:math:`w_k`): Weights to get fractional allocation of each object :math:'x_j' in :math:'Q' to each nest math:`B_k` as :math:`\alpha_{ik} = w_k \cdot x_i`. + * **lambda_k** (:math:`\lambda_k`): Lambda for nest :math:`B_k` for correlations between the obejcts. + + The probability of choosing the object :math:`x_i` from the query set :math:`Q`: + + .. 
math:: + P_i = \sum_{\substack{B_k \in \mathcal{B} \\ i \in B_k}} P_{i \\lvert {B_k}} P_{B_k} \enspace where, \\\\ + P_{B_k} = \\frac{{\\left(\sum_{j \in B_k} {\\left(\\alpha_{jk} \\boldsymbol{e}^{V_j} \\right)}^ {^{1}/{\lambda_k}} \\right)}^{\lambda_k}}{\sum_{\ell = 1}^{K} {\\left( \sum_{j \in B_{\ell}} {\\left( \\alpha_{j\ell} \\boldsymbol{e}^{V_j} \\right)}^{^{1}/{\lambda_\ell}} \\right)^{\lambda_{\ell}}}} \\\\ + P_{{i} \\lvert {B_k}} = \\frac{{\\left(\\alpha_{ik} \\boldsymbol{e}^{V_i} \\right)}^{^{1}/{\lambda_k}}}{\sum_{j \in B_k} {\\left(\\alpha_{jk} \\boldsymbol{e}^{V_j} \\right)}^{^{1}/{\lambda_k}}} \enspace , + + + Parameters + ---------- + utility : theano tensor + (n_instances, n_objects) + Utility :math:`Y_i` of the objects :math:`x_i \in Q` in the query sets + lambda_k : theano tensor (range : [alpha, 1.0]) + (n_nests) + Measure of independence amongst the obejcts in each nests + alpha_ik : theano tensor + (n_instances, n_objects, n_nests) + Fractional allocation of each object :math:`x_i` in each nest math:`B_k` + + Returns + ------- + p : theano tensor + (n_instances, n_objects) + Choice probabilities :math:`P_i` of the objects :math:`x_i \in Q` in the query sets + + """ n_nests = self.n_nests n_instances, n_objects = utility.shape pik = tt.zeros((n_instances, n_objects, n_nests)) @@ -75,7 +183,7 @@ def get_probabilities(self, utility, lambda_k, alpha_ik): p = p.sum(axis=2) return p - def get_probabilities_np(self, utility, lambda_k, alpha_ik): + def _get_probabilities_np(self, utility, lambda_k, alpha_ik): n_nests = self.n_nests n_instances, n_objects = utility.shape pik = np.zeros((n_instances, n_objects, n_nests)) @@ -92,11 +200,30 @@ def get_probabilities_np(self, utility, lambda_k, alpha_ik): return p def construct_model(self, X, Y): + """ + Constructs the nested logit model by applying priors on weight vectors **weights** and **weights_k** as per + :meth:`model_configuration`. Then we apply a uniform prior to the :math:`\lambda s`, i.e. + :math:`\lambda s \sim Uniform(\\text{alpha}, 1.0)`.The probability of choosing the object :math:`x_i` from the + query set :math:`Q = \{x_1, \ldots ,x_n\}` is evaluated in :meth:`get_probabilities`. + + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Preferences in the form of discrete choices for given objects + + Returns + ------- + model : pymc3 Model :class:`pm.Model` + """ with pm.Model() as self.model: self.Xt = theano.shared(X) self.Yt = theano.shared(Y) shapes = {'weights': self.n_object_features, 'weights_ik': (self.n_object_features, self.n_nests)} - weights_dict = create_weight_dictionary(self.model_priors, shapes) + weights_dict = create_weight_dictionary(self.model_configuration, shapes) alpha_ik = tt.dot(self.Xt, weights_dict['weights_ik']) alpha_ik = ttu.softmax(alpha_ik, axis=2) @@ -125,11 +252,12 @@ def fit(self, X, Y, sampler="vi", **kwargs): Feature vectors of the objects Y : numpy array (n_instances, n_objects) Choices for given objects in the query - sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string - The sampler used to estimate the posterior mean and mass matrix from the trace. 
- * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix - * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler - * **nuts** : Use the No-U-Turn sampler + sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string + The sampler used to estimate the posterior mean and mass matrix from the trace + + * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler **kwargs : Keyword arguments for the fit function """ @@ -189,7 +317,7 @@ def _predict_scores_fixed(self, X, **kwargs): alpha_ik = np.dot(X, weights_ik) alpha_ik = npu.softmax(alpha_ik, axis=2) utility = np.dot(X, weights) - p = self.get_probabilities_np(utility, lambda_k, alpha_ik) + p = self._get_probabilities_np(utility, lambda_k, alpha_ik) return p def predict(self, X, **kwargs): @@ -202,6 +330,22 @@ def predict_for_scores(self, scores, **kwargs): return DiscreteObjectChooser.predict_for_scores(self, scores, **kwargs) def set_tunable_parameters(self, alpha=None, n_nests=None, loss_function='', regularization='l2', **point): + """ + Set tunable parameters of the Nested Logit model to the values provided. + + Parameters + ---------- + alpha: float (range : [0,1]) + The lower bound of the correlations between the objects in a nest + n_nests: int (range : [2,n_objects]) + The number of nests in which the objects are divided + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + point: dict + Dictionary containing parameter values which are not tuned for the network + """ if alpha is not None: self.alpha = alpha if n_nests is None: diff --git a/csrank/discretechoice/mixed_logit_model.py b/csrank/discretechoice/mixed_logit_model.py index 6bdab0c0..bbe4876f 100644 --- a/csrank/discretechoice/mixed_logit_model.py +++ b/csrank/discretechoice/mixed_logit_model.py @@ -16,7 +16,48 @@ class MixedLogitModel(DiscreteObjectChooser, Learner): - def __init__(self, n_object_features, n_mixtures=4, loss_function='', regularization='l2', model_args={}, **kwargs): + def __init__(self, n_object_features, n_mixtures=4, loss_function='', regularization='l2', **kwargs): + """ + Create an instance of the Mixed Logit model for learning the discrete choice function. In this model we + assume weights of this model to be random due to which this model can learn different variations in choices + amongst the individuals. The utility score for each object in query set :math:`Q` is defined as + :math:`U_r(x) = w_r \cdot x`, where :math:`w_r` is the k-th sample weight vector from the underlying distribution + The probability of choosing an object :math:`x_i` is defined by taking softmax over the + utility scores of the objects: + + .. math:: + + P(x_i \\lvert Q) = \\frac{1}{R} \sum_{r=1}^R \\frac{exp(U_r(x_i))}{\sum_{x_j \in Q} exp(U_r(x_j))} + + The discrete choice for the given query set :math:`Q` is defined as: + + .. 
math:: + + dc(Q) := \operatorname{argmax}_{x_i \in Q } \; P(x_i \\lvert Q) + + Parameters + ---------- + n_object_features : int + Number of features of the object space + n_mixtures: int (range : [2, inf]) + The number of logit models (:math:`R`) which are used to estimate the choice probability + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + random_state : int or object + Numpy random state + **kwargs + Keyword arguments for the algorithms + + References + ---------- + [1] Kenneth E Train. „Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap Mixed Logit, pp. 153–172. + + [2] Kenneth Train. Qualitative choice analysis. Cambridge, MA: MIT Press, 1986 + + [3] Daniel McFadden and Kenneth Train. „Mixed MNL models for discrete response“. In: Journal of applied Econometrics 15.5 (2000), pp. 447–470 + """ self.logger = logging.getLogger(MixedLogitModel.__name__) self.n_object_features = n_object_features self.loss_function = likelihood_dict.get(loss_function, None) @@ -34,7 +75,28 @@ def __init__(self, n_object_features, n_mixtures=4, loss_function='', regulariza self.p = None @property - def model_priors(self): + def model_configuration(self): + """ + Constructs the dictionary containing the priors for the weight vectors for the model according to the + regularization function. The parameters are: + * **weights** : Distribution of the weigh vectors to evaluates the utility of the objects + + For ``l1`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{b}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Laplace}(\\text{mu}=\\text{mu}_w, \\text{b}=\\text{b}_w) + + For ``l2`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{sd}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Normal}(\\text{mu}=\\text{mu}_w, \\text{sd}=\\text{sd}_w) + """ if self._config is None: if self.regularization == 'l2': weight = pm.Normal @@ -48,17 +110,66 @@ def model_priors(self): return self._config def construct_model(self, X, Y): + """ + Constructs the mixed logit model by applying priors on weight vectors **weights** as per + :meth:`model_configuration`. The probability of choosing the object :math:`x_i` from the query set + :math:`Q = \{x_1, \ldots ,x_n\}` assuming we draw :math:`R` samples of the weight vectors is: + + .. 
math:: + + P(x_i \\lvert Q) = \\frac{1}{R} \sum_{r=1}^R \\frac{exp(U_r(x_i))}{\sum_{x_j \in Q} exp(U_r(x_j))} + + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array + (n_instances, n_objects) + Preferences in the form of discrete choices for given objects + + Returns + ------- + model : pymc3 Model :class:`pm.Model` + """ with pm.Model() as self.model: self.Xt = theano.shared(X) self.Yt = theano.shared(Y) shapes = {'weights': (self.n_object_features, self.n_mixtures)} - weights_dict = create_weight_dictionary(self.model_priors, shapes) + weights_dict = create_weight_dictionary(self.model_configuration, shapes) utility = tt.dot(self.Xt, weights_dict['weights']) self.p = tt.mean(ttu.softmax(utility, axis=1), axis=2) yl = LogLikelihood('yl', loss_func=self.loss_function, p=self.p, observed=self.Yt) self.logger.info("Model construction completed") def fit(self, X, Y, sampler='vi', **kwargs): + """ + Fit a multinomial logit model on the provided set of queries X and choices Y of those objects. The + provided queries and corresponding preferences are of a fixed size (numpy arrays). For learning this network + the categorical cross entropy loss function for each object :math:`x_i \in Q` is defined as: + + .. math:: + + C_{i} = -y(i)\log(P_i) \enspace, + + where :math:`y` is ground-truth discrete choice vector of the objects in the given query set :math:`Q`. + The value :math:`y(i) = 1` if object :math:`x_i` is chosen else :math:`y(i) = 0`. + + Parameters + ---------- + X : numpy array (n_instances, n_objects, n_features) + Feature vectors of the objects + Y : numpy array (n_instances, n_objects) + Choices for given objects in the query + sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string + The sampler used to estimate the posterior mean and mass matrix from the trace + + * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler + **kwargs : + Keyword arguments for the fit function + """ self.construct_model(X, Y) kwargs['random_seed'] = self.random_state.randint(2 ** 32, dtype='uint32') callbacks = kwargs['vi_params'].get('callbacks', []) @@ -123,11 +234,21 @@ def predict_scores(self, X, **kwargs): def predict_for_scores(self, scores, **kwargs): return DiscreteObjectChooser.predict_for_scores(self, scores, **kwargs) - def clear_memory(self, **kwargs): - self.logger.info("Clearing memory") - pass + def set_tunable_parameters(self, n_mixtures=4, loss_function='', regularization="l1", **point): + """ + Set tunable parameters of the Mixed Logit model to the values provided. 
- def set_tunable_parameters(self, loss_function='', regularization="l1", n_mixtures=4, **point): + Parameters + ---------- + n_mixtures: int (range : [2, inf]) + The number of logit models (:math:`R`) which are used to estimate the choice probability + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + point: dict + Dictionary containing parameter values which are not tuned for the network + """ if loss_function in likelihood_dict.keys(): self.loss_function = likelihood_dict.get(loss_function, None) self.n_mixtures = n_mixtures diff --git a/csrank/discretechoice/multinomial_logit_model.py b/csrank/discretechoice/multinomial_logit_model.py index 49ffc553..5613e59f 100644 --- a/csrank/discretechoice/multinomial_logit_model.py +++ b/csrank/discretechoice/multinomial_logit_model.py @@ -16,7 +16,7 @@ class MultinomialLogitModel(DiscreteObjectChooser, Learner): def __init__(self, n_object_features, loss_function='', regularization='l2', **kwargs): """ - Create an instance of the MultinomialLogitModel model for learning the discrete choice function. The utility + Create an instance of the Multinomial Logit model for learning the discrete choice function. The utility score for each object in query set :math:`Q` is defined as :math:`U(x) = w \cdot x`, where :math:`w` is the weight vector. The probability of choosing an object :math:`x_i` is defined by taking softmax over the utility scores of the objects: @@ -47,6 +47,8 @@ def __init__(self, n_object_features, loss_function='', regularization='l2', **k References ---------- [1] Kenneth E Train. „Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap Logit, pp. 41–86. + + [2] Kenneth Train. Qualitative choice analysis. Cambridge, MA: MIT Press, 1986 """ self.logger = logging.getLogger(MultinomialLogitModel.__name__) self.n_object_features = n_object_features @@ -66,13 +68,25 @@ def __init__(self, n_object_features, loss_function='', regularization='l2', **k @property def model_configuration(self): """ - Constructs the dictionary containing the priors for the weight vector for the model according to the - regularization function. + Constructs the dictionary containing the priors for the weight vectors for the model according to the + regularization function. The parameters are: + * **weights** : Weights to evaluates the utility of the objects - Returns - ------- - configuration : dict - Dictionary containing the priors applies on the weights + For ``l1`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{b}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Laplace}(\\text{mu}=\\text{mu}_w, \\text{b}=\\text{b}_w) + + For ``l2`` regularization the priors are: + + .. 
math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{sd}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Normal}(\\text{mu}=\\text{mu}_w, \\text{sd}=\\text{sd}_w) """ if self._config is None: if self.regularization == 'l2': @@ -143,10 +157,11 @@ def fit(self, X, Y, sampler='vi', **kwargs): Y : numpy array (n_instances, n_objects) Choices for given objects in the query sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string - The sampler used to estimate the posterior mean and mass matrix from the trace. - * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix - * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler - * **nuts** : Use the No-U-Turn sampler + The sampler used to estimate the posterior mean and mass matrix from the trace + + * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler **kwargs : Keyword arguments for the fit function """ diff --git a/csrank/discretechoice/nested_logit_model.py b/csrank/discretechoice/nested_logit_model.py index 1eba43f3..396cd28d 100644 --- a/csrank/discretechoice/nested_logit_model.py +++ b/csrank/discretechoice/nested_logit_model.py @@ -21,11 +21,19 @@ class NestedLogitModel(DiscreteObjectChooser, Learner): def __init__(self, n_object_features, n_objects, n_nests=None, loss_function='', regularization='l1', alpha=1e-2, random_state=None, **kwd): """ - Create an instance of the NestedLogitModel model for learning the discrete choice function. + Create an instance of the Nested Logit model for learning the discrete choice function. This model divides + objects into disjoint subsets called nests,such that the objects which are similar to each other are in same + nest. This model structure is 1-layer of hierarchy and the :math:`\lambda` for each nest :math:`B_k` signifies + the degree of independence and :math:`1-\lambda` signifies the correlations between the object in it. We + learn two weight vectors and the :math:`\lambda s`. + + The probability of choosing an object :math:`x_i` from the given query set :math:`Q` is defined by product + of choosing the nest in which :math:`x_i` exists and then choosing the the object from the nest. .. math:: - P(x_i \\lvert Q) = \\frac{exp(U(x_i))}{\sum_{x_j \in Q} exp(U(x_j))} + P(x_i \\lvert Q) = P_i = P_{i \lvert B_k} P_{B_k} \enspace , + The discrete choice for the given query set :math:`Q` is defined as: @@ -39,10 +47,14 @@ def __init__(self, n_object_features, n_objects, n_nests=None, loss_function='', Number of features of the object space n_objects: int Number of objects in each query set + n_nests : int range : [2,n_objects/2] + The number of nests/subsets in which the objects are divided loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} Loss function to be used for the discrete choice decision from the query set regularization : string, {‘l1’, ‘l2’}, string Regularizer function (L1 or L2) applied to the `kernel` weights matrix + alpha: float (range : [0,1]) + The lower bound of the correlations between the objects in a nest random_state : int or object Numpy random state **kwargs @@ -50,7 +62,11 @@ def __init__(self, n_object_features, n_objects, n_nests=None, loss_function='', References ---------- - [1] Kenneth E Train. „Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap Logit, pp. 41–86. 
+ [1] Kenneth E Train. „Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap GEV, pp. 87–111. + + [2] Kenneth Train. Qualitative choice analysis. Cambridge, MA: MIT Press, 1986 + + [3] Kenneth Train and Daniel McFadden. „The goods/leisure tradeoff and disaggregate work trip mode choice models“. In: Transportation research 12.5 (1978), pp. 349–353 """ self.logger = logging.getLogger(NestedLogitModel.__name__) self.n_object_features = n_object_features @@ -82,8 +98,25 @@ def model_configuration(self): """ Constructs the dictionary containing the priors for the weight vectors for the model according to the regularization function. The parameters are: - * weights : - * weights_k : + * **weights** : Weights to evaluates the utility of the objects + * **weights_k** : Weights to evaluates the utility of the nests + + For ``l1`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{b}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Laplace}(\\text{mu}=\\text{mu}_w, \\text{b}=\\text{b}_w) + + For ``l2`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{sd}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Normal}(\\text{mu}=\\text{mu}_w, \\text{sd}=\\text{sd}_w) + Returns ------- @@ -104,6 +137,22 @@ def model_configuration(self): return self._config def create_nests(self, X): + """ + For allocating the objects to different nests we use the clustering algorithm with number of clusters + :math:`k` and allocate the similar objects in query set :math:`Q`. + + Parameters + ---------- + X : numpy array + (n_instances, n_objects, n_features) + Feature vectors of the objects in the query sets + + Returns + ------- + Yn : numpy array + (n_instances, n_objects) Values for each object implying the nest it belongs to. For example for :math:`2` nests the value 0 implies that object is allocated to nest 1 and value 1 implies it is allocated to nest 2. 
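+
+        As an illustrative sketch (hypothetical values, since the actual allocation depends on the fitted
+        :class:`MiniBatchKMeans` clustering), for two instances with four objects each and ``n_nests=2`` the
+        returned array could look like::
+
+            np.array([[0, 1, 1, 0],
+                      [1, 0, 0, 1]])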
+ + """ n, n_obj, n_dim = X.shape objects = X.reshape(n * n_obj, n_dim) if self.cluster_model is None: @@ -112,12 +161,12 @@ def create_nests(self, X): prediction = self.cluster_model.labels_ else: prediction = self.cluster_model.predict(objects) - y_nests = [] + Yn = [] for i in np.arange(0, n * n_obj, step=n_obj): nest_ids = prediction[i:i + n_obj] - y_nests.append(nest_ids) - y_nests = np.array(y_nests) - return y_nests + Yn.append(nest_ids) + Yn = np.array(Yn) + return Yn def _eval_utility(self, weights): utility = tt.zeros(tuple(self.y_nests.shape)) @@ -126,7 +175,40 @@ def _eval_utility(self, weights): utility = tt.set_subtensor(utility[rows, cols], tt.dot(self.Xt[rows, cols], weights[i])) return utility - def get_probability(self, utility, lambda_k, utility_k): + def get_probabilities(self, utility, lambda_k, utility_k): + """ + This method calculates the probability of choosing an object from the query set using the following parameters of the model which are used: + + * **weights** (:math:`w`): Weights to get the utility of the object :math:`Y_i = U(x_i) = w \cdot x_i` + * **weights_k** (:math:`w_k`): Weights to get the utility of the next :math:`W_k = U_k(x) = w_k \cdot c_k`, where :math:`c_k` is the center of the object space of nest :math:`B_k` + * **lambda_k** (:math:`\lambda_k`): Lambda is the measure of independence amongst the obejcts in the nest :math:`B_k` + + The probability of choosing the object :math:`x_i` from the query set :math:`Q`: + + .. math:: + P_i = \\frac{\\boldsymbol{e}^{ ^{Y_i} /_{\lambda_k}}}{\sum_{j \in B_k} \\boldsymbol{e}^{^{Y_j} /_{\lambda_k}}} \\frac {\\boldsymbol{e}^{W_k + \lambda_k I_k}} {\sum_{\\ell = 1}^{K} \\boldsymbol{e}^{ W_{\\ell } + \lambda_{\\ell} I_{\\ell}}} \quad i \in B_k \enspace , \\\\ + where,\enspace I_k = \ln \sum_{ j \in B_k} \\boldsymbol{e}^{^{Y_j} /_{\lambda_k}} + + + Parameters + ---------- + utility : theano tensor + (n_instances, n_objects) + Utility :math:`Y_i` of the objects :math:`x_i \in Q` in the query sets + lambda_k : theano tensor (range : [alpha, 1.0]) + (n_nests) + Measure of independence amongst the obejcts in each nests + utility_k : theano tensor + (n_instances, n_nests) + Utilities of the nests :math:`B_k \in \mathcal{B}` + + Returns + ------- + p : theano tensor + (n_instances, n_objects) + Choice probabilities :math:`P_i` of the objects :math:`x_i \in Q` in the query sets + + """ n_instances, n_objects = self.y_nests.shape pni_k = tt.zeros((n_instances, n_objects)) ivm = tt.zeros((n_instances, self.n_nests)) @@ -153,20 +235,20 @@ def _eval_utility_np(self, x_t, y_nests, weights): utility[rows, cols] = np.dot(x_t[rows, cols], weights[i]) return utility - def _get_probability_np(self, y_nests, utility, lambda_k, utility_k): - n_instances, n_objects = y_nests.shape + def _get_probabilities_np(self, Y_n, utility, lambda_k, utility_k): + n_instances, n_objects = Y_n.shape pni_k = np.zeros((n_instances, n_objects)) ivm = np.zeros((n_instances, self.n_nests)) for i in range(self.n_nests): sub_tensor = np.copy(utility) - sub_tensor[np.where(y_nests != i)] = -1e50 + sub_tensor[np.where(Y_n != i)] = -1e50 ink = npu.logsumexp(sub_tensor) - pni_k[np.where(y_nests == i)] = np.exp(sub_tensor - ink)[np.where(y_nests == i)] + pni_k[np.where(Y_n == i)] = np.exp(sub_tensor - ink)[np.where(Y_n == i)] ivm[:, i] = lambda_k[i] * ink[:, 0] + utility_k[i] pk = np.exp(ivm - npu.logsumexp(ivm)) pn_k = np.zeros((n_instances, n_objects)) for i in range(self.n_nests): - rows, cols = np.where(y_nests == i) + rows, cols = np.where(Y_n == i) p 
= np.ones((n_instances, n_objects)) * pk[:, i][:, None] pn_k[rows, cols] = p[rows, cols] p = pni_k * pn_k @@ -174,11 +256,10 @@ def _get_probability_np(self, y_nests, utility, lambda_k, utility_k): def construct_model(self, X, Y): """ - Constructs the nested logit model. - - .. math:: - - P_i = P(x_i \\lvert Q) = \\frac{exp(U(x_i))}{\sum_{x_j \in Q} exp(U(x_j))} + Constructs the nested logit model by applying priors on weight vectors **weights** and **weights_k** as per + :meth:`model_configuration`. Then we apply a uniform prior to the :math:`\lambda s`, i.e. + :math:`\lambda s \sim Uniform(\\text{alpha}, 1.0)`.The probability of choosing the object :math:`x_i` from + the query set :math:`Q = \{x_1, \ldots ,x_n\}` is evaluated in :meth:`get_probabilities`. Parameters ---------- @@ -205,7 +286,7 @@ def construct_model(self, X, Y): weights = (weights_dict['weights'] / lambda_k[:, None]) utility = self._eval_utility(weights) utility_k = tt.dot(self.features_nests, weights_dict['weights_k']) - self.p = self.get_probability(utility, lambda_k, utility_k) + self.p = self.get_probabilities(utility, lambda_k, utility_k) yl = LogLikelihood('yl', loss_func=self.loss_function, p=self.p, observed=self.Yt) self.logger.info("Model construction completed") @@ -230,10 +311,11 @@ def fit(self, X, Y, sampler="vi", **kwargs): Y : numpy array (n_instances, n_objects) Choices for given objects in the query sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string - The sampler used to estimate the posterior mean and mass matrix from the trace. - * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix - * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler - * **nuts** : Use the No-U-Turn sampler + The sampler used to estimate the posterior mean and mass matrix from the trace + + * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix + * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler + * **nuts** : Use the No-U-Turn sampler **kwargs : Keyword arguments for the fit function """ @@ -292,7 +374,7 @@ def _predict_scores_fixed(self, X, **kwargs): weights = (weights / lambda_k[:, None]) utility_k = np.dot(self.features_nests, weights_k) utility = self._eval_utility_np(X, y_nests, weights) - scores = self._get_probability_np(y_nests, utility, lambda_k, utility_k) + scores = self._get_probabilities_np(y_nests, utility, lambda_k, utility_k) return scores def predict(self, X, **kwargs): @@ -306,7 +388,7 @@ def predict_for_scores(self, scores, **kwargs): def set_tunable_parameters(self, alpha=None, n_nests=None, loss_function='', regularization="l1", **point): """ - Set tunable parameters of the Multinomial Logit model to the values provided. + Set tunable parameters of the Nested Logit model to the values provided. Parameters ---------- diff --git a/csrank/discretechoice/paired_combinatorial_logit.py b/csrank/discretechoice/paired_combinatorial_logit.py index 015f3013..460d1bf4 100644 --- a/csrank/discretechoice/paired_combinatorial_logit.py +++ b/csrank/discretechoice/paired_combinatorial_logit.py @@ -19,7 +19,57 @@ class PairedCombinatorialLogit(DiscreteObjectChooser, Learner): def __init__(self, n_object_features, n_objects, loss_function='', regularization='l2', alpha=5e-2, - random_state=None, model_args={}, **kwd): + random_state=None, **kwd): + """ + Create an instance of the Paired Combinatorial Logit model for learning the discrete choice function. 
This + model considering each pair of objects as a different nest allowing unique covariances for each pair of objects, + and each object is a member of :math:`n - 1` nests. This model structure is 1-layer of hierarchy and the + :math:`\lambda` for each nest :math:`B_k` signifies the degree of independence and :math:`1-\lambda` signifies + the correlations between the object in it. We learn two weight vectors and the :math:`\lambda s`. + * **weights** (:math:`w`): Weights to get the utility of the object :math:`Y_i = U(x_i) = w \cdot x_i` + * **lambda_k** (:math:`\lambda_k`): Lambda for nest nest :math:`B_k` for correlations between the obejcts. + + The probability of choosing an object :math:`x_i` from the given query set :math:`Q` is defined by product + of choosing the nest in which :math:`x_i` exists and then choosing the the object from the nest. + + .. math:: + + P(x_i \\lvert Q) = P_i = \sum_{\substack{B_k \in \mathcal{B} \\ i \in B_k}}P_{i \\lvert B_k} P_{B_k} \enspace , + + + The discrete choice for the given query set :math:`Q` is defined as: + + .. math:: + + dc(Q) := \operatorname{argmax}_{x_i \in Q } \; P(x_i \\lvert Q) + + Parameters + ---------- + n_object_features : int + Number of features of the object space + n_objects: int + Number of objects in each query set + n_nests : int range : [2,n_objects/2] + The number of nests/subsets in which the objects are divided + loss_function : string , {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’} + Loss function to be used for the discrete choice decision from the query set + regularization : string, {‘l1’, ‘l2’}, string + Regularizer function (L1 or L2) applied to the `kernel` weights matrix + alpha: float (range : [0,1]) + The lower bound of the correlations between the objects in a nest + random_state : int or object + Numpy random state + **kwargs + Keyword arguments for the algorithms + + References + ---------- + [1] Kenneth E Train. „Discrete choice methods with simulation“. In: Cambridge university press, 2009. Chap GEV, pp. 87–111. + + [2] Kenneth Train. Qualitative choice analysis. Cambridge, MA: MIT Press, 1986 + + [3] Chaushie Chu. „A paired combinatorial logit model for travel demand analysis“. In: Proceedings of the fifth world conference on transportation research. Vol. 4.1989, pp. 295–309 + """ self.logger = logging.getLogger(PairedCombinatorialLogit.__name__) self.n_object_features = n_object_features self.n_objects = n_objects @@ -41,7 +91,33 @@ def __init__(self, n_object_features, n_objects, loss_function='', regularizatio self.p = None @property - def model_priors(self): + def model_configuration(self): + """ + Constructs the dictionary containing the priors for the weight vectors for the model according to the + regularization function. The parameters are: + * **weights** : Weights to evaluates the utility of the objects + + For ``l1`` regularization the priors are: + + .. math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{b}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Laplace}(\\text{mu}=\\text{mu}_w, \\text{b}=\\text{b}_w) + + For ``l2`` regularization the priors are: + + .. 
math:: + + \\text{mu}_w \sim \\text{Normal}(\\text{mu}=0, \\text{sd}=5.0) \\\\ + \\text{sd}_w \sim \\text{HalfCauchy}(\\beta=1.0) \\\\ + \\text{weights} \sim \\text{Normal}(\\text{mu}=\\text{mu}_w, \\text{sd}=\\text{sd}_w) + + Returns + ------- + configuration : dict + Dictionary containing the priors applies on the weights + """ if self._config is None: if self.regularization == 'l2': weight = pm.Normal @@ -54,8 +130,39 @@ def model_priors(self): 'weights_k': [weight, {'mu': (pm.Normal, {'mu': 0, 'sd': 5}), prior: (pm.HalfCauchy, {'beta': 1})}]} self.logger.info('Creating model with config {}'.format(print_dictionary(self._config))) return self._config +# def get_probabilities(self, utility, lambda_k): + """ + This method calculates the probability of choosing an object from the query set using the following parameters of the model which are used: + + * **weights** (:math:`w`): Weights to get the utility of the object :math:`Y_i = U(x_i) = w \cdot x_i` + * **lambda_k** (:math:`\lambda_k`): Lambda is the measure of independence amongst the obejcts in the nest :math:`B_k` + + The probability of choosing the object :math:`x_i` from the query set :math:`Q`: + + .. math:: + P_i = \sum_{j \in I \setminus i} P_{{i} \\lvert {ij}} P_{ij} \enspace where, \\\\ + P_{i \\lvert ij} = \\frac{\\boldsymbol{e}^{^{Y_i} /_{\lambda_{ij}}}}{\\boldsymbol{e}^{^{Y_i} /_{\lambda_{ij}}} + \\boldsymbol{e}^{^{Y_j} /_{\lambda_{ij}}}} \enspace ,\\\\ + P_{ij} = \\frac{{\\left( \\boldsymbol{e}^{^{V_i}/{\lambda_{ij}}} + \\boldsymbol{e}^{^{V_j}/{\lambda_{ij}}} \\right)}^{\lambda_{ij}}}{\sum_{k=1}^{n-1} \sum_{\ell = k + 1}^{n} {\\left( \\boldsymbol{e}^{^{V_k}/{\lambda_{k\ell}}} + \\boldsymbol{e}^{^{V_{\ell}}/{\lambda_{k\ell}}} \\right)}^{\lambda_{k\ell}}} + + + Parameters + ---------- + utility : theano tensor + (n_instances, n_objects) + Utility :math:`Y_i` of the objects :math:`x_i \in Q` in the query sets + lambda_k : theano tensor (range : [alpha, 1.0]) + (n_nests) + Measure of independence amongst the obejcts in each nests + + Returns + ------- + p : theano tensor + (n_instances, n_objects) + Choice probabilities :math:`P_i` of the objects :math:`x_i \in Q` in the query sets + + """ n_objects = self.n_objects nests_indices = self.nests_indices n_nests = self.n_nests @@ -79,7 +186,7 @@ def get_probabilities(self, utility, lambda_k): p = tt.set_subtensor(p[:, i2], p[:, i2] + x2) return p - def get_probabilities_np(self, utility, lambda_k): + def _get_probabilities_np(self, utility, lambda_k): n_objects = self.n_objects nests_indices = self.nests_indices n_nests = self.n_nests @@ -100,6 +207,25 @@ def get_probabilities_np(self, utility, lambda_k): return p def construct_model(self, X, Y): + """ + Constructs the nested logit model by applying priors on weight vectors **weights** as per :meth:`model_configuration`. + Then we apply a uniform prior to the :math:`\lambda s`, i.e. :math:`\lambda s \sim Uniform(\\text{alpha}, 1.0)`. + The probability of choosing the object :math:`x_i` from the query set :math:`Q = \{x_1, \ldots ,x_n\}` is + evaluated in :meth:`get_probabilities`. 
+
+        Parameters
+        ----------
+        X : numpy array
+            (n_instances, n_objects, n_features)
+            Feature vectors of the objects
+        Y : numpy array
+            (n_instances, n_objects)
+            Preferences in the form of discrete choices for given objects
+
+        Returns
+        -------
+        model : pymc3 Model :class:`pm.Model`
+        """
         with pm.Model() as self.model:
             self.Xt = theano.shared(X)
             self.Yt = theano.shared(Y)
@@ -113,7 +239,7 @@ def construct_model(self, X, Y):
     def fit(self, X, Y, sampler="vi", **kwargs):
         """
-        Fit a paired combinatorial logit model on the provided set of queries X and choices Y of those objects. The
+        Fit a paired combinatorial logit model on the provided set of queries X and choices Y of those objects. The
         provided queries and corresponding preferences are of a fixed size (numpy arrays). For learning this network
         the categorical cross entropy loss function for each object :math:`x_i \in Q` is defined as:
@@ -131,10 +257,11 @@ def fit(self, X, Y, sampler="vi", **kwargs):
         Y : numpy array
             (n_instances, n_objects)
             Choices for given objects in the query
         sampler : {‘vi’, ‘metropolis’, ‘nuts’}, string
-            The sampler used to estimate the posterior mean and mass matrix from the trace.
-            * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix
-            * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler
-            * **nuts** : Use the No-U-Turn sampler
+            The sampler used to estimate the posterior mean and mass matrix from the trace
+
+            * **vi** : Run ADVI to estimate posterior mean and diagonal mass matrix
+            * **metropolis** : Use the MAP as starting point and Metropolis-Hastings sampler
+            * **nuts** : Use the No-U-Turn sampler
         **kwargs :
             Keyword arguments for the fit function
         """
@@ -189,7 +316,7 @@ def _predict_scores_fixed(self, X, **kwargs):
         weights = np.array([mean_trace['weights__{}'.format(i)] for i in range(self.n_object_features)])
         lambda_k = np.array([mean_trace['lambda_k__{}'.format(i)] for i in range(self.n_nests)])
         utility = np.dot(X, weights)
-        p = self.get_probabilities_np(utility, lambda_k)
+        p = self._get_probabilities_np(utility, lambda_k)
         return p
 
     def predict(self, X, **kwargs):
@@ -202,6 +329,20 @@ def predict_for_scores(self, scores, **kwargs):
         return DiscreteObjectChooser.predict_for_scores(self, scores, **kwargs)
 
     def set_tunable_parameters(self, alpha=5e-2, loss_function='', regularization='l2', **point):
+        """
+        Set tunable parameters of the Paired Combinatorial Logit model to the values provided.
+
+        Parameters
+        ----------
+        alpha : float (range : [0,1])
+            The lower bound of the correlations between the objects in a nest
+        loss_function : string, {‘categorical_crossentropy’, ‘binary_crossentropy’, ’categorical_hinge’}
+            Loss function to be used for the discrete choice decision from the query set
+        regularization : string, {‘l1’, ‘l2’}
+            Regularizer function (L1 or L2) applied to the `kernel` weights matrix
+        point : dict
+            Dictionary containing parameter values which are not tuned for the network
+        """
         if alpha is not None:
             self.alpha = alpha
         if loss_function in likelihood_dict.keys():
diff --git a/csrank/discretechoice/ranknet_discrete_choice.py b/csrank/discretechoice/ranknet_discrete_choice.py
index 786008ab..1f15fbc8 100644
--- a/csrank/discretechoice/ranknet_discrete_choice.py
+++ b/csrank/discretechoice/ranknet_discrete_choice.py
@@ -107,4 +107,4 @@ def clear_memory(self, **kwargs):
     def set_tunable_parameters(self, n_hidden=32, n_units=2, reg_strength=1e-4, learning_rate=1e-3, batch_size=128,
                                **point):
         super().set_tunable_parameters(n_hidden=n_hidden, n_units=n_units, reg_strength=reg_strength,
-                                       learning_rate=learning_rate, batch_size=batch_size, **point)
\ No newline at end of file
+                                       learning_rate=learning_rate, batch_size=batch_size, **point)
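Note on the probabilities documented above: they can be sanity-checked outside of theano/pymc3. The following is a minimal NumPy sketch of the paired combinatorial logit probabilities described in :meth:`get_probabilities`; the function name, the pair ordering via `itertools.combinations`, and the input layout are illustrative assumptions and not part of this patch:

import numpy as np
from itertools import combinations


def pcl_probabilities(utility, lambda_k):
    """Paired combinatorial logit choice probabilities.

    utility  : array (n_instances, n_objects), utilities Y_i = w . x_i
    lambda_k : array (n_nests,), one lambda per pair-nest, ordered like
               itertools.combinations(range(n_objects), 2)
    Returns  : array (n_instances, n_objects) of choice probabilities P_i
    """
    n_instances, n_objects = utility.shape
    pairs = list(combinations(range(n_objects), 2))  # every pair {i, j} is a nest
    assert len(pairs) == len(lambda_k)

    # Nest terms (e^{Y_i/l} + e^{Y_j/l})^l for every pair-nest
    nest_terms = np.empty((n_instances, len(pairs)))
    for k, (i, j) in enumerate(pairs):
        l = lambda_k[k]
        nest_terms[:, k] = (np.exp(utility[:, i] / l) + np.exp(utility[:, j] / l)) ** l
    denominator = nest_terms.sum(axis=1)

    # P_i = sum over nests containing i of P(i | nest) * P(nest)
    p = np.zeros_like(utility, dtype=float)
    for k, (i, j) in enumerate(pairs):
        l = lambda_k[k]
        e_i = np.exp(utility[:, i] / l)
        e_j = np.exp(utility[:, j] / l)
        p_nest = nest_terms[:, k] / denominator      # P_{ij}
        p[:, i] += (e_i / (e_i + e_j)) * p_nest      # P_{i|ij} * P_{ij}
        p[:, j] += (e_j / (e_i + e_j)) * p_nest
    return p

Under this construction the probabilities within a query set sum to one, and setting every :math:`\lambda_{ij} = 1` recovers the standard multinomial logit, which is a convenient correctness check.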
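For completeness, a hedged end-to-end usage sketch of the estimator documented in this patch. The import path, the one-hot encoding of the choices, and the exact constructor keywords are assumptions inferred from the docstrings above and may differ from the released csrank API:

import numpy as np

from csrank import PairedCombinatorialLogit  # import path assumed

# Toy data: 200 queries with 5 objects of 3 features each; the chosen object
# in every query is marked by a one-hot row (assumed discrete-choice format).
n_instances, n_objects, n_features = 200, 5, 3
X = np.random.randn(n_instances, n_objects, n_features)
Y = np.eye(n_objects)[np.random.randint(n_objects, size=n_instances)]

pcl = PairedCombinatorialLogit(n_object_features=n_features, n_objects=n_objects,
                               regularization='l2', alpha=5e-2)
pcl.fit(X, Y, sampler='vi')   # ADVI, as described in the fit docstring
choices = pcl.predict(X)      # discrete choice for each query set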