In [4]:
import functools, uuid

import numpy as np, pandas as pd
from scipy.stats import norm as ndist

import regreg.api as rr

from selection.tests.instance import gaussian_instance


from selection.learning.utils import full_model_inference, pivot_plot
from selection.learning.core import split_sampler, keras_fit

from sklearn.linear_model import lasso_path




Using TensorFlow backend.


In [5]:
import json

# Metadata dictionary that is serialized to JSON at the end of this cell.
# This variant asks for `selected_vars` to be captured as a 'set' selection;
# the list form means more than one thing could be captured in a cell.
#
# Other metadata shapes tried in this notebook:
#   {'data_input': [['X', 'X.csv'], ['y', 'y.csv']]}
#   {'data_model': {'sufficient_statistics': 'compute_sufficient_statistics',
#                   'estimators': 'estimators',
#                   'resample_data': 'resample_data'}}
metad = dict(
    capture_selection=[
        dict(name='selected_vars', selection_type='set'),
    ]
)

json.dumps(metad)

'{"capture_selection": [{"name": "selected_vars", "selection_type": "set"}]}'

In [6]:
# researcher loads in data
#
# Problem dimensions and noise/signal levels. In this cell they are only
# referenced by the (commented-out) generation recipe below.
n = 100
p = 50
s = 5
sigma = 1.
signal = 10.

# Recipe that was used to create the CSV files, kept for provenance:
#X, y, truth = gaussian_instance(n=n,
#                                p=p,
#                                s=s,
#                                equicorrelated=False,
#                                rho=0.1,
#                                sigma=sigma,
#                                signal=signal,
#                                random_signs=True,
#                                scale=True)[:3]
#np.savetxt("X.csv", X, delimiter=',')
#np.savetxt('y.csv', y, delimiter=',')

# Read the design matrix first, then the response, from comma-delimited files.
X, y = (np.loadtxt(csv_name, delimiter=',') for csv_name in ("X.csv", 'y.csv'))

In [14]:
# we don't need to make any assumption on signature of selection algorithm

def selection_algorithm(X, y, min_success=6, ntries=10):
    """
    Select variables by counting lasso-path inclusions over random subsamples.

    A grid of penalties is taken from a lasso path on the full data, keeping
    only the penalties at which fewer than sqrt(0.8 * p) coefficients are
    nonzero. Then, `ntries` times, half of the rows are drawn without
    replacement and a lasso path is run on that grid using the subsample's
    X'y and Gram matrix. A variable is selected when, for at least one
    penalty in the grid, its coefficient was nonzero in strictly more than
    `min_success` of the `ntries` runs.

    Parameters
    ----------
    X : ndarray, shape (n, p)
        Design matrix.
    y : ndarray, shape (n,)
        Response.
    min_success : int, optional
        A variable must be active in strictly more than this many subsample
        runs (at a single penalty) to be selected. Default 6.
    ntries : int, optional
        Number of random half-subsamples. Default 10.

    Returns
    -------
    tuple of int
        Indices of the selected columns of X, in ascending order.

    Notes
    -----
    Uses the global NumPy random state (`np.random.choice`); seed it
    beforehand for reproducible selections.
    """
    n, p = X.shape

    # Full-data Gram matrix; reused by the lasso path that defines the grid.
    # (No inverse is needed anywhere, so a singular X'X is not an error here.)
    XTX = X.T.dot(X)

    def _alpha_grid(X, y, center, XTX):
        # Penalties from the full-data path at which the fit is still sparse.
        p = X.shape[1]
        alphas, coefs, _ = lasso_path(X, y, Xy=center, precompute=XTX)
        nselected = np.count_nonzero(coefs, axis=0)
        return alphas[nselected < np.sqrt(0.8 * p)]

    alpha_grid = _alpha_grid(X, y, X.T.dot(y), XTX)

    # success[j, k]: number of subsample runs in which variable j is active
    # at the k-th penalty of the grid.
    success = np.zeros((p, alpha_grid.shape[0]))

    for _ in range(ntries):
        subsample = np.random.choice(range(n), n // 2, replace=False)
        Xsub = X[subsample]
        noisy_S = Xsub.T.dot(y[subsample])
        # NOTE(review): the full X, y are passed positionally while Xy and the
        # Gram matrix come from the subsample — presumably the solver only
        # uses the precomputed quantities; confirm against the installed
        # scikit-learn version.
        _, coefs, _ = lasso_path(X, y, Xy=noisy_S, precompute=Xsub.T.dot(Xsub), alphas=alpha_grid)
        success += np.abs(np.sign(coefs))

    # Row-wise "any penalty exceeds the threshold"; np.nonzero yields the
    # selected indices already unique and in ascending order.
    selected = (success > min_success).any(axis=1)
    return tuple(int(i) for i in np.nonzero(selected)[0])

# Run the selection on the loaded data and show the result as the cell output.
# NOTE(review): the recorded output below is a quoted string, whereas
# selection_algorithm returns a tuple — the saved output may be stale; re-run to confirm.
selected_vars = selection_algorithm(X, y)
selected_vars

'[12, 13, 48, 29, 31]'

In [5]:
# function to compute sufficient statistics

# let's say that selection type can be "fixed" or "set" i.e. "Lee/selected" vs. "Liu/full"

# model (and hence sufficient statistics) can be a function of all "fixed" selections

# below: fixed_selection and set_selection are passed in as JSON strings that decode to dictionaries (python) or lists (R)

def compute_sufficient_statistics(data, fixed_selection):
    """
    Build the sufficient-statistic vector for the data.

    Parameters
    ----------
    data : dict
        Must provide the design matrix under 'X' and the response under 'y'.
    fixed_selection : str
        JSON text; it is decoded (so malformed JSON raises) but the decoded
        value is not used in the computation.

    Returns
    -------
    ndarray
        X'y followed by one extra entry holding the sum of squares of y.
    """
    json.loads(fixed_selection)
    design, response = data['X'], data['y']
    cross_term = design.T.dot(response)
    sum_of_squares = (response ** 2).sum().reshape(-1)
    return np.hstack([cross_term, sum_of_squares])

# compute estimator from sufficient statistics
def estimators(suff_stat, fixed_selection, set_selection):
    """
    Least-squares estimates for the selected variables, computed from the
    sufficient statistics.

    Parameters
    ----------
    suff_stat : array-like
        Vector laid out as produced by `compute_sufficient_statistics`:
        X'y followed by a final sum-of-squares entry.
    fixed_selection : str
        JSON text; decoded but not otherwise used.
    set_selection : str
        JSON text decoding to a mapping with a 'selected_vars' list of
        column indices.

    Returns
    -------
    ndarray
        Entries of pinv(X'X) X'y at the selected indices, in the order given
        by 'selected_vars'.

    Notes
    -----
    The design matrix is read from the notebook-level variable `X`; it is not
    part of the signature. The X'y term comes from `suff_stat`, so the result
    follows whatever (e.g. resampled) statistics are passed in rather than
    the notebook-level `y`.
    """
    fixed_selection = json.loads(fixed_selection)
    set_selection = json.loads(set_selection)
    # Drop the trailing sum-of-squares entry; what remains is X'y.
    XTy = np.asarray(suff_stat)[:-1]
    return np.linalg.pinv(X.T.dot(X)).dot(XTy)[set_selection['selected_vars']]

# resample data -- should return a dictionary with keys found in various 'data_input' cells
def resample_data(data, fixed_selection):
    """
    Draw a new response by resampling least-squares residuals.

    The response is regressed on the design (via a pseudo-inverse of X'X),
    the residuals are resampled with replacement using the global NumPy
    random state, and the resampled residuals are added back to the fitted
    values.

    Parameters
    ----------
    data : dict
        Must provide the design matrix under 'X' and the response under 'y'.
    fixed_selection : str
        JSON text; decoded but not otherwise used.

    Returns
    -------
    dict
        {'X': the same design object, 'y': the resampled response}.
    """
    json.loads(fixed_selection)
    design, response = data['X'], data['y']
    nobs, _ = design.shape

    ols_coef = np.linalg.pinv(design.T.dot(design)).dot(design.T.dot(response))
    residuals = response - design.dot(ols_coef)
    # Recover the fit from the residuals, keeping the original arithmetic order.
    fitted_values = response - residuals

    draw = np.random.choice(range(nobs), nobs, replace=True)
    return {'X': design, 'y': fitted_values + residuals[draw]}

In [22]:
from IPython.display import _display_mimetype

ImportError: cannot import name '_display_mimetype'

In [18]:
# Scratch: try emitting the captured selection through IPython's JSON display hook.
# NOTE(review): `display` is not bound by any import visible in this notebook —
# presumably `from IPython import display` ran earlier in the kernel; confirm before re-running.
display.display_json(selected_vars)

In [23]:
# Scratch: IPython `??` introspection of display_html; exploration leftover, not part of the analysis.
display.display_html??

In [21]:
# Scratch: probe for a private helper on the display module; the recorded run raised AttributeError here.
display._display_mimetype

AttributeError: module 'IPython.display' has no attribute '_display_mimetype'

In [28]:
# Scratch: display a dict keyed by a custom mimetype-like string.
# NOTE(review): with raw=False the recorded output is just the dict's repr; if the intent
# was to publish a raw mimebundle, raw=True may be what is wanted — confirm against IPython docs.
display.display({'application.blah': selected_vars}, raw=False)

{'application.blah': '[12, 13, 48, 29, 31]'}