Skip to content

Commit

Permalink
ENH/REF: support pd ordered Categorical in formula
Browse files Browse the repository at this point in the history
  • Loading branch information
josef-pkt committed Sep 7, 2020
1 parent 3f7dede commit 6fc1e65
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 21 deletions.
9 changes: 7 additions & 2 deletions statsmodels/base/model.py
Expand Up @@ -103,6 +103,9 @@ def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
for key in kwargs:
if key in ['design_info', 'formula']: # leave attached to data
continue
if key == 'frame':
data.frame = kwargs["frame"]
continue
# pop so we do not start keeping all these twice or references
try:
setattr(self, key, data.__dict__.pop(key))
Expand Down Expand Up @@ -191,12 +194,14 @@ def from_formula(cls, formula, data, subset=None, drop_cols=None,
kwargs.update({'missing_idx': missing_idx,
'missing': missing,
'formula': formula, # attach formula for unpckling
'design_info': design_info})
'design_info': design_info,
'frame': data
})
mod = cls(endog, exog, *args, **kwargs)
mod.formula = formula

# since we got a dataframe, attach the original
mod.data.frame = data
# mod.data.frame = data
return mod

@property
Expand Down
87 changes: 68 additions & 19 deletions statsmodels/miscmodels/ordinal_model.py
Expand Up @@ -10,8 +10,8 @@
import pandas as pd
from pandas.api.types import CategoricalDtype
from scipy import stats
from statsmodels.base.model import GenericLikelihoodModel, \
GenericLikelihoodModelResults
from statsmodels.base.model import (
GenericLikelihoodModel, GenericLikelihoodModelResults, LikelihoodModel)
from statsmodels.compat.pandas import Appender


Expand All @@ -23,7 +23,8 @@ class OrderedModel(GenericLikelihoodModel):
The mode assumes that the endogenous variable is ordered but that the
labels have no numeric interpretation besides the ordering.
The model is based on a latent linear variable, where we observe only
The model is based on a latent linear variable, where we observe only a
discretization.
y_latent = X beta + u
Expand All @@ -46,18 +47,18 @@ class OrderedModel(GenericLikelihoodModel):
distribution. We use standardized distributions to avoid identifiability
problems.
Parameters
----------
endog : array_like
endogenous or dependent ordered categorical variable with k levels.
Endogenous or dependent ordered categorical variable with k levels.
Labels or values of endog will internally transformed to consecutive
integers, 0, 1, 2, ...
pd.Series with Categorical as dtype should be preferred as it gives
the order relation between the levels.
If endog is not a pandas Categorical, then categories are
sorted in lexicographic order (by numpy.unique).
exog : array_like
exogenous explanatory variables. This should not include an intercept.
(TODO: verify)
Exogenous, explanatory variables. This should not include an intercept.
pd.DataFrame are also accepted.
distr : string 'probit' or 'logit', or a distribution instance
The default is currently 'probit' which uses the normal distribution
Expand All @@ -69,6 +70,7 @@ class OrderedModel(GenericLikelihoodModel):
Status: initial version, subclasses `GenericLikelihoodModel`
"""
_formula_max_endog = np.inf

def __init__(self, endog, exog, offset=None, distr='probit', **kwds):

Expand All @@ -87,12 +89,24 @@ def __init__(self, endog, exog, offset=None, distr='probit', **kwds):

endog, labels, is_pandas = self._check_inputs(endog, exog)

frame = kwds.pop("frame", None)
super(OrderedModel, self).__init__(endog, exog, **kwds)

if frame is not None:
self.data.frame = frame

if not is_pandas:
unique, index = np.unique(self.endog, return_inverse=True)
self.endog = index
labels = unique
# TODO: maybe handle 2-dim endog obtained from formula
if self.endog.ndim == 1:
unique, index = np.unique(self.endog, return_inverse=True)
self.endog = index
labels = unique
elif self.endog.ndim == 2:
endog_, labels, ynames = self._handle_formula_categorical()
# replace endog with categorical
self.endog = endog_
# fix yname
self.data.ynames = ynames

self.labels = labels
self.k_levels = len(labels)
Expand All @@ -114,11 +128,13 @@ def __init__(self, endog, exog, offset=None, distr='probit', **kwds):
self.results_class = OrderedResults

def _check_inputs(self, endog, exog):
"""handle endog that is pandas Categorical
checks if self.distrib is legal and does the Pandas Categorical
support for endog.
"""
checks if self.distrib is legal and does the Pandas
support for endog and exog. Also retrieves columns & categories
names for .summary() of the results class.
"""

# TOCO: maybe remove this if we want to have duck distributions
if not isinstance(self.distr, stats.rv_continuous):
msg = (
f"{self.distr.name} must be a scipy.stats distribution."
Expand All @@ -135,6 +151,7 @@ def _check_inputs(self, endog, exog):
"risk of capturing a wrong order for the "
"categories. ordered == True preferred.",
Warning)

endog_name = endog.name
labels = endog.values.categories
endog = endog.cat.codes
Expand All @@ -143,13 +160,45 @@ def _check_inputs(self, endog, exog):
"not supported")
endog.name = endog_name
is_pandas = True
else:
msg = ("If endog is a pandas.Series, "
"it must be of CategoricalDtype.")
raise ValueError(msg)
# else:
# msg = ("If endog is a pandas.Series, "
# "it must be of CategoricalDtype.")
# raise ValueError(msg)

return endog, labels, is_pandas

def _handle_formula_categorical(self):
"""handle 2dim endog,
raise if not from formula with pandas ordered Categorical endog
"""
# get info about formula and original data
if not hasattr(self.data.orig_endog, "design_info"):
msg = "2-dim endog are not supported"
raise ValueError(msg)

di_endog = self.data.orig_endog.design_info
if len(di_endog.terms) > 1:
raise ValueError("more than one term in endog")

factor = list(di_endog.factor_infos.values())[0]
labels = factor.categories
name = factor.state["eval_code"]
original_endog = self.data.frame[name]
if not (isinstance(original_endog.dtype, CategoricalDtype)
and original_endog.dtype.ordered):
msg = ("Only ordered pandas Categorical are supported as endog "
"in formulas")
raise ValueError(msg)

# Now we should only have an ordered pandas Categorical

endog = self.endog.argmax(1)
# fix yname
ynames = name
return endog, labels, ynames

def cdf(self, x):
"""cdf evaluated at x
"""
Expand Down Expand Up @@ -268,7 +317,7 @@ def score_obs_(self, params):
"""score, first derivative of loglike for each observations
This currently only implements the derivative with respect to the
exog parameters, but not with respect to threshold-constant parameters.
exog parameters, but not with respect to threshold parameters.
"""
low, upp = self._bounds(params)
Expand Down
25 changes: 25 additions & 0 deletions statsmodels/miscmodels/tests/test_ordinal_model.py
Expand Up @@ -4,6 +4,7 @@

import numpy as np
import scipy.stats as stats
import pytest

from numpy.testing import assert_allclose, assert_equal
from .results.results_ordinal_model import data_store as ds
Expand Down Expand Up @@ -210,6 +211,30 @@ def test_loglikerelated(self):
res_null = mod_null.fit(method='bfgs', disp=False)
assert_allclose(res_null.params, null_params[mod.k_vars:], rtol=1e-8)

def test_formula_categorical(self):

resp = self.resp
data = ds.df

"apply ~ pared + public + gpa - 1"
modf2 = OrderedModel.from_formula("apply ~ pared + public + gpa - 1",
data, distr='probit')
resf2 = modf2.fit(method='bfgs', disp=False)
assert_allclose(resf2.params, resp.params, atol=1e-8)
assert modf2.exog_names == resp.model.exog_names
assert modf2.data.ynames == resp.model.data.ynames
assert hasattr(modf2.data, "frame")
assert not hasattr(modf2, "frame")

with pytest.raises(ValueError):
OrderedModel.from_formula(
"apply ~ pared + public + gpa - 1",
data={"apply": np.asarray(data['apply']),
"pared": data['pared'],
"public": data['public'],
"gpa": data['gpa']},
distr='probit')


class TestCLogLogModel(CheckOrdinalModelMixin):

Expand Down

0 comments on commit 6fc1e65

Please sign in to comment.