Merge pull request #11 from joshloyal/josh/t_ratio_test
Added t-ratio test for individual SIR components
joshloyal committed Jun 18, 2018
2 parents 7574cf3 + 5d48967 commit 51d88fb
Showing 2 changed files with 138 additions and 3 deletions.
74 changes: 71 additions & 3 deletions sliced/sir.py
@@ -7,6 +7,7 @@

import numpy as np
import scipy.linalg as linalg
import scipy.stats as stats

from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
@@ -19,6 +20,32 @@
__all__ = ['SlicedInverseRegression']


def diagonal_precision(X, R=None):
"""Calculate he diagonal elements of the empirical precision matrix
from a pre-computed QR decomposition of the centered data matrix X.
Parameters
----------
X : numpy-array, shape (n_samples, n_features)
The centered data matrix.
R : numpy-array, shape (n_features, n_features), optional
The upper triangular matrix obtained by the economic QR decomposition
of the centered data matrix.
Returns
-------
precisions : numpy-array, shape (n_features,)
The diagonal elements of the precision (inverse covariance) matrix of
the centered data.
"""
if R is None:
Q, R = linalg.qr(X, mode='economic')

    # X = QR implies cov = R^T R / n, so the precision matrix is
    # n * R^{-1} R^{-T}; its diagonal is n times the row sums of R^{-1} ** 2.
    R_inv = linalg.solve_triangular(R, np.eye(R.shape[1]))
    return np.sum(R_inv ** 2, axis=1) * X.shape[0]
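
As a quick sanity check (an illustrative sketch, not part of this diff): for
centered data, the values returned by ``diagonal_precision`` should agree with
directly inverting the empirical covariance matrix.

    import numpy as np
    from sliced.sir import diagonal_precision

    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)
    X -= X.mean(axis=0)  # the helper assumes a centered data matrix

    # empirical covariance with denominator n, matching the n = X.shape[0]
    # scaling used inside diagonal_precision
    cov = np.dot(X.T, X) / X.shape[0]
    np.testing.assert_allclose(
        diagonal_precision(X), np.diag(np.linalg.inv(cov)))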


class SlicedInverseRegression(BaseEstimator, TransformerMixin):
"""Sliced Inverse Regression (SIR) [1]
@@ -57,6 +84,13 @@ class SlicedInverseRegression(BaseEstimator, TransformerMixin):
The number of slices used when calculating the inverse regression
curve. Truncated to at most the number of unique values of ``y``.
alpha : float or None (default=None)
        Significance level for the two-sided t-test used to check for
        non-zero coefficients. Must be a number between 0 and 1. If not
        `None`, the non-zero components of each direction are determined
        from an asymptotic normal approximation. Useful when the directions
        are expected to be sparse in the features.
copy : bool (default=True)
If False, data passed to fit are overwritten and running
fit(X).transform(X) will not yield the expected results,
@@ -73,7 +107,7 @@ class SlicedInverseRegression(BaseEstimator, TransformerMixin):
The eigenvalues corresponding to each of the selected directions.
These are the eigenvalues of the covariance matrix
of the inverse regression curve. Larger eigenvalues indicate
more prevelant directions.
more prevalent directions.
Examples
--------
@@ -84,7 +118,7 @@ class SlicedInverseRegression(BaseEstimator, TransformerMixin):
>>> X, y = make_cubic(random_state=123)
>>> sir = SlicedInverseRegression(n_directions=2)
>>> sir.fit(X, y)
SlicedInverseRegression(copy=True, n_directions=2, n_slices=10)
SlicedInverseRegression(alpha=None, copy=True, n_directions=2, n_slices=10)
>>> X_sir = sir.transform(X)
References
@@ -93,10 +127,13 @@ class SlicedInverseRegression(BaseEstimator, TransformerMixin):
[1] Li, K C. (1991)
"Sliced Inverse Regression for Dimension Reduction (with discussion)",
Journal of the American Statistical Association, 86, 316-342.
[2] Chen, C.H., and Li, K.C. (1998), "Can SIR Be as Popular as Multiple
Linear Regression?" Statistica Sinica, 8, 289-316.
"""
def __init__(self, n_directions='auto', n_slices=10, copy=True):
def __init__(self, n_directions='auto', n_slices=10, alpha=None, copy=True):
self.n_directions = n_directions
self.n_slices = n_slices
self.alpha = alpha
self.copy = copy

def fit(self, X, y):
@@ -135,6 +172,11 @@ def fit(self, X, y):
else:
n_directions = self.n_directions

if self.alpha is not None and (self.alpha <= 0 or self.alpha >= 1):
raise ValueError("The significance level `alpha` "
"must be between 0 and 1. Got `alpha`={}".format(
self.alpha))

# validate y
if is_multioutput(y):
raise TypeError("The target `y` cannot be multi-output.")
@@ -187,6 +229,32 @@ def fit(self, X, y):
self.directions_ = directions.T
self.eigenvalues_ = evals[:self.n_directions_]

# Drop components in each direction using the t-ratio approach
# suggested in section 4 of Chen and Li (1998).
if self.alpha is not None:
            # As in multiple linear regression, the standard error
            # estimates are proportional to the diagonal elements of the
            # inverse covariance matrix.
precs = diagonal_precision(X, R=R)
weights = (1 - self.eigenvalues_) / (n_samples * self.eigenvalues_)
std_error = np.sqrt(weights.reshape(-1, 1) * precs)

# perform a two-sided t-test at level alpha for coefs equal to zero
# NOTE: we are not correcting for multiple tests and this is
# a very rough approximation, so do not expect the test
# to be close to the nominal level.
df = n_samples - n_features - 1
crit_val = stats.distributions.t.ppf(1 - self.alpha/2, df)
for j in range(self.n_directions_):
test_stat = np.abs(self.directions_[j, :] / std_error[j])
zero_mask = test_stat < crit_val
                if np.sum(zero_mask) == n_features:
                    warnings.warn("Not zeroing out coefficients, since no "
                                  "coefficient is significantly different "
                                  "from zero.", RuntimeWarning)
                else:
                    self.directions_[j, zero_mask] = 0.

return self

def transform(self, X):
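
For context, a minimal usage sketch of the new ``alpha`` option (mirroring
``test_sparse_coefficient`` below): when a significance level is supplied,
coefficients whose t-ratios fail the two-sided test are zeroed out, giving
sparse directions.

    import numpy as np
    from sliced import SlicedInverseRegression

    rng = np.random.RandomState(123)
    X = rng.randn(300, 6)
    beta = np.array([1, 0, 0, -1, 0, 0])
    y = np.exp(-0.75 * np.dot(X, beta)) + 0.5 * rng.randn(300)

    # with alpha set, insignificant coefficients are set to zero
    sir = SlicedInverseRegression(alpha=0.05).fit(X, y)
    print(sir.directions_[0])  # nonzero only where beta is nonzero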
67 changes: 67 additions & 0 deletions sliced/test/test_sir.py
@@ -10,6 +10,7 @@
from scipy import sparse
from scipy import linalg
from sklearn.datasets import load_digits
from sklearn.datasets import load_breast_cancer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sliced import SlicedInverseRegression
@@ -140,3 +141,69 @@ def test_matches_athletes():

np.testing.assert_allclose(
sir.directions_, expected_directions)


def test_errors_alpha_out_of_bounds():
X, y = datasets.make_cubic(random_state=123)

sir = SlicedInverseRegression(alpha=10)

with pytest.raises(ValueError):
sir.fit(X, y)


def test_sparse_coefficient():
"""Test the component-wise t-test works on a simple synthetic dataset.
"""
rng = np.random.RandomState(123)

n_samples = 300
n_features = 6
X = rng.randn(n_samples, n_features)
noise = rng.randn(n_samples).reshape(-1, 1)
beta = np.array([1, 0, 0, -1, 0, 0]).reshape(-1, 1)
y = np.exp(-0.75 * np.dot(X, beta)) + 0.5 * noise

sir = SlicedInverseRegression(alpha=0.05)
sir.fit(X, y.ravel())

np.testing.assert_array_equal(
sir.directions_.ravel() != 0, beta.ravel() != 0)


def test_sparse_coefficient_multiple_dimensions():
"""Perform the t-test on a dataset with two directions.
"""
rng = np.random.RandomState(123)

n_samples = 300
n_features = 15
beta = np.zeros((2, n_features))
beta[0, :9] = 1
beta[1, 9:] = 1
X = rng.randn(n_samples, n_features)
y = (np.sign(np.dot(X, beta[0, :])) *
np.log(np.abs(np.dot(X, beta[1, :]) + 5)))

sir = SlicedInverseRegression(n_directions=2, alpha=0.05)
sir.fit(X, y.ravel())

    # the direction corresponding to beta[0] is recovered without error
np.testing.assert_array_equal(
sir.directions_[1, :].ravel() != 0, beta[0, :].ravel() != 0)

# the second dimension picks up a few spurious coefficients
assert np.all(sir.directions_[0, :].ravel()[9:] != 0)


def test_all_zero_coefficients_warns_and_does_not_zero_out():
"""To avoid errors a t-test that indicates that all coefficients are
zero will not zero-out the directions vector.
"""
X, y = load_breast_cancer(return_X_y=True)
sir = SlicedInverseRegression(n_directions=2, alpha=0.05)

with pytest.warns(RuntimeWarning):
sir.fit(X, y)

assert np.any(sir.directions_ != 0)
