Merge pull request #11 from joshloyal/josh/t_ratio_test
Added t-ratio test for individual SIR components
joshloyal committed Jun 18, 2018
2 parents 7574cf3 + 5d48967 commit 51d88fb
Showing 2 changed files with 138 additions and 3 deletions.
74 changes: 71 additions & 3 deletions sliced/sir.py
@@ -7,6 +7,7 @@

import numpy as np
import scipy.linalg as linalg
import scipy.stats as stats

from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
@@ -19,6 +20,32 @@
__all__ = ['SlicedInverseRegression']


def diagonal_precision(X, R=None):
"""Calculate he diagonal elements of the empirical precision matrix
from a pre-computed QR decomposition of the centered data matrix X.
Parameters
----------
X : numpy-array, shape (n_samples, n_features)
The centered data matrix.
R : numpy-array, shape (n_features, n_features), optional
The upper triangular matrix obtained by the economic QR decomposition
of the centered data matrix.
Returns
-------
precisions : numpy-array, shape (n_features,)
The diagonal elements of the precision (inverse covariance) matrix of
the centered data.
"""
if R is None:
Q, R = linalg.qr(X, mode='economic')

    # X = QR implies cov = R^T R / n, so the precision matrix is
    # n * R^{-1} R^{-T}; its diagonal is n times the row sums of R^{-1} ** 2.
    R_inv = linalg.solve_triangular(R, np.eye(R.shape[1]))
    return np.sum(R_inv ** 2, axis=1) * X.shape[0]
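
As a quick sanity check (an illustrative sketch, not part of this diff): for
centered data, the values returned by ``diagonal_precision`` should agree with
directly inverting the empirical covariance matrix.

    import numpy as np
    from sliced.sir import diagonal_precision

    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)
    X -= X.mean(axis=0)  # the helper assumes a centered data matrix

    # empirical covariance with denominator n, matching the n = X.shape[0]
    # scaling used inside diagonal_precision
    cov = np.dot(X.T, X) / X.shape[0]
    np.testing.assert_allclose(
        diagonal_precision(X), np.diag(np.linalg.inv(cov)))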


class SlicedInverseRegression(BaseEstimator, TransformerMixin):
"""Sliced Inverse Regression (SIR) [1]
@@ -57,6 +84,13 @@ class SlicedInverseRegression(BaseEstimator, TransformerMixin):
The number of slices used when calculating the inverse regression
curve. Truncated to at most the number of unique values of ``y``.
alpha : float or None (default=None)
        Significance level for the two-sided t-test used to check for
        non-zero coefficients. Must be a number between 0 and 1. If not
        `None`, the non-zero components of each direction are determined
        from an asymptotic normal approximation. Useful when the directions
        are expected to be sparse in the features.
copy : bool (default=True)
If False, data passed to fit are overwritten and running
fit(X).transform(X) will not yield the expected results,
@@ -73,7 +107,7 @@ class SlicedInverseRegression(BaseEstimator, TransformerMixin):
The eigenvalues corresponding to each of the selected directions.
These are the eigenvalues of the covariance matrix
of the inverse regression curve. Larger eigenvalues indicate
more prevelant directions.
more prevalent directions.
Examples
--------
@@ -84,7 +118,7 @@ class SlicedInverseRegression(BaseEstimator, TransformerMixin):
>>> X, y = make_cubic(random_state=123)
>>> sir = SlicedInverseRegression(n_directions=2)
>>> sir.fit(X, y)
SlicedInverseRegression(copy=True, n_directions=2, n_slices=10)
SlicedInverseRegression(alpha=None, copy=True, n_directions=2, n_slices=10)
>>> X_sir = sir.transform(X)
References
@@ -93,10 +127,13 @@ class SlicedInverseRegression(BaseEstimator, TransformerMixin):
[1] Li, K C. (1991)
"Sliced Inverse Regression for Dimension Reduction (with discussion)",
Journal of the American Statistical Association, 86, 316-342.
[2] Chen, C.H., and Li, K.C. (1998), "Can SIR Be as Popular as Multiple
Linear Regression?" Statistica Sinica, 8, 289-316.
"""
def __init__(self, n_directions='auto', n_slices=10, copy=True):
def __init__(self, n_directions='auto', n_slices=10, alpha=None, copy=True):
self.n_directions = n_directions
self.n_slices = n_slices
self.alpha = alpha
self.copy = copy

def fit(self, X, y):
@@ -135,6 +172,11 @@ def fit(self, X, y):
else:
n_directions = self.n_directions

if self.alpha is not None and (self.alpha <= 0 or self.alpha >= 1):
raise ValueError("The significance level `alpha` "
"must be between 0 and 1. Got `alpha`={}".format(
self.alpha))

# validate y
if is_multioutput(y):
raise TypeError("The target `y` cannot be multi-output.")
@@ -187,6 +229,32 @@ def fit(self, X, y):
self.directions_ = directions.T
self.eigenvalues_ = evals[:self.n_directions_]

# Drop components in each direction using the t-ratio approach
# suggested in section 4 of Chen and Li (1998).
if self.alpha is not None:
            # As in multiple linear regression, the standard error
            # estimates are proportional to the diagonal elements of the
            # inverse covariance matrix.
precs = diagonal_precision(X, R=R)
weights = (1 - self.eigenvalues_) / (n_samples * self.eigenvalues_)
std_error = np.sqrt(weights.reshape(-1, 1) * precs)

# perform a two-sided t-test at level alpha for coefs equal to zero
# NOTE: we are not correcting for multiple tests and this is
# a very rough approximation, so do not expect the test
# to be close to the nominal level.
df = n_samples - n_features - 1
crit_val = stats.distributions.t.ppf(1 - self.alpha/2, df)
for j in range(self.n_directions_):
test_stat = np.abs(self.directions_[j, :] / std_error[j])
zero_mask = test_stat < crit_val
                if np.sum(zero_mask) == n_features:
                    warnings.warn("Not zeroing out coefficients, since no "
                                  "coefficient is significantly different "
                                  "from zero.", RuntimeWarning)
                else:
                    self.directions_[j, zero_mask] = 0.

return self

def transform(self, X):
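
For context, a minimal usage sketch of the new ``alpha`` option (mirroring
``test_sparse_coefficient`` below): when a significance level is supplied,
coefficients whose t-ratios fail the two-sided test are zeroed out, giving
sparse directions.

    import numpy as np
    from sliced import SlicedInverseRegression

    rng = np.random.RandomState(123)
    X = rng.randn(300, 6)
    beta = np.array([1, 0, 0, -1, 0, 0])
    y = np.exp(-0.75 * np.dot(X, beta)) + 0.5 * rng.randn(300)

    # with alpha set, insignificant coefficients are set to zero
    sir = SlicedInverseRegression(alpha=0.05).fit(X, y)
    print(sir.directions_[0])  # nonzero only where beta is nonzero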
67 changes: 67 additions & 0 deletions sliced/test/test_sir.py
@@ -10,6 +10,7 @@
from scipy import sparse
from scipy import linalg
from sklearn.datasets import load_digits
from sklearn.datasets import load_breast_cancer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sliced import SlicedInverseRegression
@@ -140,3 +141,69 @@ def test_matches_athletes():

np.testing.assert_allclose(
sir.directions_, expected_directions)


def test_errors_alpha_out_of_bounds():
X, y = datasets.make_cubic(random_state=123)

sir = SlicedInverseRegression(alpha=10)

with pytest.raises(ValueError):
sir.fit(X, y)


def test_sparse_coefficient():
"""Test the component-wise t-test works on a simple synthetic dataset.
"""
rng = np.random.RandomState(123)

n_samples = 300
n_features = 6
X = rng.randn(n_samples, n_features)
noise = rng.randn(n_samples).reshape(-1, 1)
beta = np.array([1, 0, 0, -1, 0, 0]).reshape(-1, 1)
y = np.exp(-0.75 * np.dot(X, beta)) + 0.5 * noise

sir = SlicedInverseRegression(alpha=0.05)
sir.fit(X, y.ravel())

np.testing.assert_array_equal(
sir.directions_.ravel() != 0, beta.ravel() != 0)


def test_sparse_coefficient_multiple_dimensions():
"""Perform the t-test on a dataset with two directions.
"""
rng = np.random.RandomState(123)

n_samples = 300
n_features = 15
beta = np.zeros((2, n_features))
beta[0, :9] = 1
beta[1, 9:] = 1
X = rng.randn(n_samples, n_features)
y = (np.sign(np.dot(X, beta[0, :])) *
np.log(np.abs(np.dot(X, beta[1, :]) + 5)))

sir = SlicedInverseRegression(n_directions=2, alpha=0.05)
sir.fit(X, y.ravel())

    # the direction corresponding to beta[0] is recovered without error
np.testing.assert_array_equal(
sir.directions_[1, :].ravel() != 0, beta[0, :].ravel() != 0)

# the second dimension picks up a few spurious coefficients
assert np.all(sir.directions_[0, :].ravel()[9:] != 0)


def test_all_zero_coefficients_warns_and_does_not_zero_out():
"""To avoid errors a t-test that indicates that all coefficients are
zero will not zero-out the directions vector.
"""
X, y = load_breast_cancer(return_X_y=True)
sir = SlicedInverseRegression(n_directions=2, alpha=0.05)

with pytest.warns(RuntimeWarning):
sir.fit(X, y)

assert np.any(sir.directions_ != 0)
