ENH/REF: support pd ordered Categorical in formula

josef-pkt · Sep 7, 2020 · 6fc1e65 · 6fc1e65
1 parent 3f7dede
commit 6fc1e65
Show file tree

Hide file tree

Showing 3 changed files with 100 additions and 21 deletions.
diff --git a/statsmodels/base/model.py b/statsmodels/base/model.py
@@ -103,6 +103,9 @@ def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
         for key in kwargs:
             if key in ['design_info', 'formula']:  # leave attached to data
                 continue
+            if key == 'frame':
+                data.frame = kwargs["frame"]
+                continue
             # pop so we do not start keeping all these twice or references
             try:
                 setattr(self, key, data.__dict__.pop(key))
@@ -191,12 +194,14 @@ def from_formula(cls, formula, data, subset=None, drop_cols=None,
         kwargs.update({'missing_idx': missing_idx,
                        'missing': missing,
                        'formula': formula,  # attach formula for unpckling
-                       'design_info': design_info})
+                       'design_info': design_info,
+                       'frame': data
+                       })
         mod = cls(endog, exog, *args, **kwargs)
         mod.formula = formula
 
         # since we got a dataframe, attach the original
-        mod.data.frame = data
+        # mod.data.frame = data
         return mod
 
     @property

diff --git a/statsmodels/miscmodels/ordinal_model.py b/statsmodels/miscmodels/ordinal_model.py
@@ -10,8 +10,8 @@
 import pandas as pd
 from pandas.api.types import CategoricalDtype
 from scipy import stats
-from statsmodels.base.model import GenericLikelihoodModel, \
-    GenericLikelihoodModelResults
+from statsmodels.base.model import (
+    GenericLikelihoodModel, GenericLikelihoodModelResults, LikelihoodModel)
 from statsmodels.compat.pandas import Appender
 
 
@@ -23,7 +23,8 @@ class OrderedModel(GenericLikelihoodModel):
     The mode assumes that the endogenous variable is ordered but that the
     labels have no numeric interpretation besides the ordering.
 
-    The model is based on a latent linear variable, where we observe only
+    The model is based on a latent linear variable, where we observe only a
+    discretization.
 
     y_latent = X beta + u
 
@@ -46,18 +47,18 @@ class OrderedModel(GenericLikelihoodModel):
     distribution. We use standardized distributions to avoid identifiability
     problems.
 
-
     Parameters
     ----------
     endog : array_like
-        endogenous or dependent ordered categorical variable with k levels.
+        Endogenous or dependent ordered categorical variable with k levels.
         Labels or values of endog will internally transformed to consecutive
         integers, 0, 1, 2, ...
         pd.Series with Categorical as dtype should be preferred as it gives
         the order relation between the levels.
+        If endog is not a pandas Categorical, then categories are
+        sorted in lexicographic order (by numpy.unique).
     exog : array_like
-        exogenous explanatory variables. This should not include an intercept.
-        (TODO: verify)
+        Exogenous, explanatory variables. This should not include an intercept.
         pd.DataFrame are also accepted.
     distr : string 'probit' or 'logit', or a distribution instance
         The default is currently 'probit' which uses the normal distribution
@@ -69,6 +70,7 @@ class OrderedModel(GenericLikelihoodModel):
     Status: initial version, subclasses `GenericLikelihoodModel`
 
     """
+    _formula_max_endog = np.inf
 
     def __init__(self, endog, exog, offset=None, distr='probit', **kwds):
 
@@ -87,12 +89,24 @@ def __init__(self, endog, exog, offset=None, distr='probit', **kwds):
 
         endog, labels, is_pandas = self._check_inputs(endog, exog)
 
+        frame = kwds.pop("frame", None)
         super(OrderedModel, self).__init__(endog, exog, **kwds)
 
+        if frame is not None:
+            self.data.frame = frame
+
         if not is_pandas:
-            unique, index = np.unique(self.endog, return_inverse=True)
-            self.endog = index
-            labels = unique
+            # TODO: maybe handle 2-dim endog obtained from formula
+            if self.endog.ndim == 1:
+                unique, index = np.unique(self.endog, return_inverse=True)
+                self.endog = index
+                labels = unique
+            elif self.endog.ndim == 2:
+                endog_, labels, ynames = self._handle_formula_categorical()
+                # replace endog with categorical
+                self.endog = endog_
+                # fix yname
+                self.data.ynames = ynames
 
         self.labels = labels
         self.k_levels = len(labels)
@@ -114,11 +128,13 @@ def __init__(self, endog, exog, offset=None, distr='probit', **kwds):
         self.results_class = OrderedResults
 
     def _check_inputs(self, endog, exog):
+        """handle endog that is pandas Categorical
+
+        checks if self.distrib is legal and does the Pandas Categorical
+        support for endog.
         """
-        checks if self.distrib is legal and does the Pandas
-        support for endog and exog. Also retrieves columns & categories
-        names for .summary() of the results class.
-        """
+
+        # TOCO: maybe remove this if we want to have duck distributions
         if not isinstance(self.distr, stats.rv_continuous):
             msg = (
                 f"{self.distr.name} must be a scipy.stats distribution."
@@ -135,6 +151,7 @@ def _check_inputs(self, endog, exog):
                                   "risk of capturing a wrong order for the "
                                   "categories. ordered == True preferred.",
                                   Warning)
+
                 endog_name = endog.name
                 labels = endog.values.categories
                 endog = endog.cat.codes
@@ -143,13 +160,45 @@ def _check_inputs(self, endog, exog):
                                      "not supported")
                 endog.name = endog_name
                 is_pandas = True
-            else:
-                msg = ("If endog is a pandas.Series, "
-                       "it must be of CategoricalDtype.")
-                raise ValueError(msg)
+#             else:
+#                 msg = ("If endog is a pandas.Series, "
+#                        "it must be of CategoricalDtype.")
+#                 raise ValueError(msg)
 
         return endog, labels, is_pandas
 
+    def _handle_formula_categorical(self):
+        """handle 2dim endog,
+
+        raise if not from formula with pandas ordered Categorical endog
+
+        """
+        # get info about formula and original data
+        if not hasattr(self.data.orig_endog, "design_info"):
+            msg = "2-dim endog are not supported"
+            raise ValueError(msg)
+
+        di_endog = self.data.orig_endog.design_info
+        if len(di_endog.terms) > 1:
+            raise ValueError("more than one term in endog")
+
+        factor = list(di_endog.factor_infos.values())[0]
+        labels = factor.categories
+        name = factor.state["eval_code"]
+        original_endog = self.data.frame[name]
+        if not (isinstance(original_endog.dtype, CategoricalDtype)
+                and original_endog.dtype.ordered):
+            msg = ("Only ordered pandas Categorical are supported as endog "
+                   "in formulas")
+            raise ValueError(msg)
+
+        # Now we should only have an ordered pandas Categorical
+
+        endog = self.endog.argmax(1)
+        # fix yname
+        ynames = name
+        return endog, labels, ynames
+
     def cdf(self, x):
         """cdf evaluated at x
         """
@@ -268,7 +317,7 @@ def score_obs_(self, params):
         """score, first derivative of loglike for each observations
 
         This currently only implements the derivative with respect to the
-        exog parameters, but not with respect to threshold-constant parameters.
+        exog parameters, but not with respect to threshold parameters.
 
         """
         low, upp = self._bounds(params)

diff --git a/statsmodels/miscmodels/tests/test_ordinal_model.py b/statsmodels/miscmodels/tests/test_ordinal_model.py
@@ -4,6 +4,7 @@
 
 import numpy as np
 import scipy.stats as stats
+import pytest
 
 from numpy.testing import assert_allclose, assert_equal
 from .results.results_ordinal_model import data_store as ds
@@ -210,6 +211,30 @@ def test_loglikerelated(self):
         res_null = mod_null.fit(method='bfgs', disp=False)
         assert_allclose(res_null.params, null_params[mod.k_vars:], rtol=1e-8)
 
+    def test_formula_categorical(self):
+
+        resp = self.resp
+        data = ds.df
+
+        "apply ~ pared + public + gpa - 1"
+        modf2 = OrderedModel.from_formula("apply ~ pared + public + gpa - 1",
+                                          data, distr='probit')
+        resf2 = modf2.fit(method='bfgs', disp=False)
+        assert_allclose(resf2.params, resp.params, atol=1e-8)
+        assert modf2.exog_names == resp.model.exog_names
+        assert modf2.data.ynames == resp.model.data.ynames
+        assert hasattr(modf2.data, "frame")
+        assert not hasattr(modf2, "frame")
+
+        with pytest.raises(ValueError):
+            OrderedModel.from_formula(
+                "apply ~ pared + public + gpa - 1",
+                data={"apply": np.asarray(data['apply']),
+                      "pared": data['pared'],
+                      "public": data['public'],
+                      "gpa": data['gpa']},
+                distr='probit')
+
 
 class TestCLogLogModel(CheckOrdinalModelMixin):