In [1]:
import numpy as np
import pandas as pd

from scipy.stats import bernoulli

from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

In [2]:
def loglik(theta, X, y):
    """Mean Bernoulli log-likelihood of a logistic-regression model.

    Parameters
    ----------
    theta : array_like, shape (p,)
        Coefficient vector.
    X : array_like, shape (n, p)
        Design matrix.
    y : array_like, shape (n,)
        Binary responses coded 0/1.

    Returns
    -------
    float
        Average over the n observations of ``y*eta - log(1 + exp(eta))``
        where ``eta = X.dot(theta)`` is the linear predictor.
    """
    eta = np.asarray(X).dot(theta)
    # The Bernoulli log-pmf under the logit link is y*eta - log(1 + exp(eta)).
    # logaddexp(0, eta) evaluates log(1 + exp(eta)) without overflowing for
    # large |eta|, and avoids passing the linear predictor where a probability
    # is expected.
    return np.mean(np.asarray(y) * eta - np.logaddexp(0, eta))

In [3]:
def grad_loglik(theta, X, y):
    """Gradient of the mean Bernoulli log-likelihood with respect to ``theta``.

    Parameters
    ----------
    theta : array_like, shape (p,)
        Coefficient vector.
    X : ndarray, shape (n, p)
        Design matrix.
    y : array_like, shape (n,)
        Binary responses coded 0/1.

    Returns
    -------
    ndarray, shape (p,)
        ``X.T.dot(y - mu) / n`` with ``mu = sigmoid(X.dot(theta))``.
    """
    y = np.asarray(y)
    n = y.shape[0]
    mu = 1./(1 + np.exp(-X.dot(theta)))
    # Divide the score vector by n (the log-likelihood is a mean); taking
    # np.mean of the vector would collapse the p partial derivatives into a
    # single number.
    return X.T.dot(y - mu) / n

In [4]:
def hess_loglik(theta, X):
    """Return the logistic-regression information matrix ``X^T D X``.

    ``D`` is diagonal with entries ``mu * (1 - mu)`` where
    ``mu = sigmoid(X.dot(theta))``.

    Note that ``X^T D X`` is the *negative* of the Hessian of the summed (not
    averaged) Bernoulli log-likelihood, so the sign and scaling differ from a
    literal Hessian of a mean log-likelihood.

    Parameters
    ----------
    theta : array_like, shape (p,)
        Coefficient vector.
    X : array_like, shape (n, p)
        Design matrix.

    Returns
    -------
    ndarray, shape (p, p)
    """
    X = np.asarray(X)
    mu = 1./(1 + np.exp(-X.dot(theta)))
    weights = mu * (1 - mu)
    # Scale the rows of X by the weights instead of materialising the dense
    # n-by-n matrix np.diag(weights); same result, O(n*p) memory.
    return X.T.dot(weights[:, None] * X)

If we're doing predictive deviance relative to $\beta^*$, the relevant term we need to estimate is approximately just $p$.

If it's relative to $\beta_{full}$ when we're doing some kind of selection, should it be $\mathrm{tr}(I - S)$?

In [54]:
def mc_sim_term_star(theta, n=5000, niter=200, seed=None):
    """Monte Carlo estimate of ``E[(beta_hat - theta)^T X^T (y_star - y)]``.

    Each iteration draws a standard-normal design ``X`` of shape (n, p), two
    independent Bernoulli response vectors ``y`` and ``y_star`` with success
    probability ``sigmoid(X.dot(theta))``, fits an unpenalized logistic
    regression (no intercept) to ``(X, y)``, and records the inner product
    above.

    Parameters
    ----------
    theta : array_like, shape (p,)
        True coefficient vector used to generate the data.
    n : int, default 5000
        Number of observations per simulated data set.
    niter : int, default 200
        Number of Monte Carlo replications.
    seed : int or None, default None
        If given, a private ``np.random.RandomState(seed)`` drives all
        sampling so the result is reproducible. If None, NumPy's global
        random state is used, exactly as before.

    Returns
    -------
    float
        Mean of the per-iteration inner products.
    """
    theta = np.asarray(theta, dtype=float)
    p = len(theta)

    if seed is None:
        rng = None  # scipy falls back to NumPy's global state
        draw_design = np.random.randn
    else:
        rng = np.random.RandomState(seed)
        draw_design = rng.randn

    ests = np.zeros(niter)

    for i in range(niter):
        X = draw_design(n, p)
        mu = 1/(1 + np.exp(-X.dot(theta)))
        y = bernoulli.rvs(mu, random_state=rng)
        y_star = bernoulli.rvs(mu, random_state=rng)

        # C=np.inf switches the penalty off; the string penalty='none' is no
        # longer accepted by current scikit-learn releases.
        m = LogisticRegression(fit_intercept=False, C=np.inf)
        m.fit(X, y)
        # coef_ has shape (1, p); take row 0 so the dot product below is a
        # true scalar rather than a length-1 array.
        beta_hat = m.coef_[0]

        ests[i] = (beta_hat - theta).dot(X.T.dot(y_star - y))

    return ests.mean()
    

In [55]:
# Dense truth: every one of the p coefficients is an independent N(0, 1) draw.
p = 20
beta_0 = np.random.standard_normal(p)
beta_0.shape[0]

20

In [56]:
# Run the simulation with its defaults (n=5000, niter=200); every iteration
# fits one unpenalized logistic regression, so this cell takes a while.
est = mc_sim_term_star(beta_0)

In [57]:
est

-20.59966633147342

In [17]:
def est_dev_term_lasso(theta, n=5000, niter=200, seed=None):
    """Simulate the deviance-correction term for an L1-penalized logistic fit.

    Each iteration draws a standard-normal design ``X`` (n, p) and two
    independent Bernoulli response vectors ``y`` / ``y_star`` with success
    probability ``sigmoid(X.dot(theta))``. It then fits an unpenalized model
    (``beta_full``) and an L1-penalized model with ``C=0.1`` (``beta_sel``),
    both without intercept, and records three quantities.

    Parameters
    ----------
    theta : array_like, shape (p,)
        True coefficient vector used to generate the data.
    n : int, default 5000
        Number of observations per simulated data set.
    niter : int, default 200
        Number of Monte Carlo replications.
    seed : int or None, default None
        If given, a private ``np.random.RandomState(seed)`` drives the
        sampling and the liblinear solver. If None, NumPy's global random
        state is used, exactly as before.

    Returns
    -------
    tuple of float
        ``(mc, math, s)``, the means over iterations of

        * ``mc``:   ``(beta_full - beta_sel)^T X^T (y_star - y)``
        * ``math``: ``sum((XE^T W XE)^{-1} S_E (X^T W X)^{-1} beta_full) - p``
          with ``W = diag(sigmoid(X.dot(beta_sel)))``, ``E`` the non-zero
          pattern of ``beta_sel`` and ``S_E`` the row selector for ``E``
        * ``s``:    ``|E| - p``

    NOTE(review): ``math`` reproduces the original expression unchanged. It
    sums the entries of a vector rather than taking the trace of a matrix, and
    it weights by ``mu`` rather than ``mu * (1 - mu)``, so it is probably not
    the intended ``tr(S) - p``. Confirm the formula before relying on it.
    """
    theta = np.asarray(theta, dtype=float)
    p = len(theta)

    if seed is None:
        rng = None  # scipy / scikit-learn fall back to NumPy's global state
        draw_design = np.random.randn
    else:
        rng = np.random.RandomState(seed)
        draw_design = rng.randn

    mc_ests = np.zeros(niter)
    math_ests = np.zeros(niter)
    s = np.zeros(niter)

    for i in range(niter):
        X = draw_design(n, p)
        mu = 1/(1 + np.exp(-X.dot(theta)))
        y = bernoulli.rvs(mu, random_state=rng)
        y_star = bernoulli.rvs(mu, random_state=rng)

        # C=np.inf switches the penalty off; the string penalty='none' is no
        # longer accepted by current scikit-learn releases.
        fm = LogisticRegression(fit_intercept=False, C=np.inf)
        fm.fit(X, y)
        beta_full = fm.coef_[0]

        sm = LogisticRegression(fit_intercept=False, penalty='l1', solver='liblinear',
                                C=.1, random_state=rng)
        sm.fit(X, y)
        beta_sel = sm.coef_[0]

        mc_ests[i] = (beta_full - beta_sel).dot(X.T.dot(y_star - y))

        E = beta_sel != 0
        s[i] = E.sum() - p

        # Keep the weights as a length-n vector and scale rows of X with them;
        # np.diag(w) would allocate a dense n-by-n matrix on every iteration.
        w = 1./(1 + np.exp(-X.dot(beta_sel)))
        if E.any():
            XE = X[:, E]
            # (X^T W X)^{-1} beta_full, then S_E picks out the entries in E.
            v = np.linalg.solve(X.T.dot(w[:, None] * X), beta_full)
            HE = np.linalg.solve(XE.T.dot(w[:, None] * XE), v[E])
            math_ests[i] = HE.sum() - p
        else:
            # Nothing selected: the sum over an empty vector is zero.
            math_ests[i] = -p

    return mc_ests.mean(), math_ests.mean(), s.mean()
    

In [18]:
# Sparse truth: start from N(0, 1) coefficients, then set a random
# three quarters of them (chosen without replacement) to exactly zero.
p = 20
beta_0 = np.random.standard_normal(p)
beta_0[np.random.choice(p, size=int(3 * p / 4), replace=False)] = 0
beta_0.shape[0]

20

In [19]:
# Unpack (Monte Carlo term, closed-form attempt, mean of |E| - p) for the
# sparse beta_0, using the default n=5000 and niter=200.
mc_est_lasso, math_est_lasso, s_est_lasso = est_dev_term_lasso(beta_0)

In [20]:
mc_est_lasso

-4.563374027671653

In [21]:
math_est_lasso

-20.000000032481175

In [22]:
s_est_lasso

-4.12

In [75]:
def est_dev_term_rl(theta, n=5000, niter=200, support=None, seed=None):
    """Simulate the deviance-correction term for an unpenalized refit on a selected set.

    Each iteration draws a standard-normal design ``X`` (n, p) and two
    independent Bernoulli response vectors ``y`` / ``y_star`` with success
    probability ``sigmoid(X.dot(theta))``. It fits the full unpenalized model
    (``beta_full``), chooses a set of columns ``E``, refits an unpenalized
    model on ``X[:, E]`` and embeds those coefficients in a length-p vector
    ``beta_rl`` (zeros off ``E``). No model has an intercept.

    Parameters
    ----------
    theta : array_like, shape (p,)
        True coefficient vector used to generate the data.
    n : int, default 5000
        Number of observations per simulated data set.
    niter : int, default 200
        Number of Monte Carlo replications.
    support : array_like of bool, shape (p,), optional
        Fixed selected set ``E`` to use in every iteration, e.g.
        ``theta != 0`` for the correctly specified model. If None (default),
        ``E`` is the non-zero pattern of an L1-penalized fit with ``C=0.1``,
        as before.
    seed : int or None, default None
        If given, a private ``np.random.RandomState(seed)`` drives the
        sampling and the liblinear solver. If None, NumPy's global random
        state is used, exactly as before.

    Returns
    -------
    tuple of float
        ``(mc, math, s)``, the means over iterations of

        * ``mc``:   ``(beta_full - beta_rl)^T X^T (y_star - y)``
        * ``math``: placeholder that is never filled in, so it is 0.0; no
          closed-form estimate is implemented here
        * ``s``:    ``|E|``, the number of selected columns

    Raises
    ------
    ValueError
        If ``support`` is given but is not a length-p mask.
    """
    theta = np.asarray(theta, dtype=float)
    p = len(theta)

    fixed_E = None
    if support is not None:
        fixed_E = np.asarray(support, dtype=bool)
        if fixed_E.shape != (p,):
            raise ValueError("support must be a boolean mask of length len(theta)")

    if seed is None:
        rng = None  # scipy / scikit-learn fall back to NumPy's global state
        draw_design = np.random.randn
    else:
        rng = np.random.RandomState(seed)
        draw_design = rng.randn

    mc_ests = np.zeros(niter)
    math_ests = np.zeros(niter)  # kept only so the return shape is unchanged
    s = np.zeros(niter)

    for i in range(niter):
        X = draw_design(n, p)
        mu = 1/(1 + np.exp(-X.dot(theta)))
        y = bernoulli.rvs(mu, random_state=rng)
        y_star = bernoulli.rvs(mu, random_state=rng)

        # C=np.inf switches the penalty off; the string penalty='none' is no
        # longer accepted by current scikit-learn releases.
        fm = LogisticRegression(fit_intercept=False, C=np.inf)
        fm.fit(X, y)
        beta_full = fm.coef_[0]

        if fixed_E is None:
            sm = LogisticRegression(fit_intercept=False, penalty='l1', solver='liblinear',
                                    C=.1, random_state=rng)
            sm.fit(X, y)
            E = sm.coef_[0] != 0
        else:
            E = fixed_E
        s[i] = E.sum()

        beta_rl = np.zeros_like(beta_full)
        # An empty selection leaves beta_rl at zero; fitting on a design with
        # no columns would raise.
        if E.any():
            rm = LogisticRegression(fit_intercept=False, C=np.inf)
            rm.fit(X[:, E], y)
            beta_rl[E] = rm.coef_[0]

        mc_ests[i] = (beta_full - beta_rl).dot(X.T.dot(y_star - y))

    return mc_ests.mean(), math_ests.mean(), s.mean()
    

In [76]:
# Sparse truth for the refit experiment: N(0, 1) coefficients with a random
# three quarters zeroed out; the cell echoes (p, number of non-zeros).
p = 20
beta_0 = np.random.standard_normal(p)
beta_0[np.random.choice(p, size=int(3 * p / 4), replace=False)] = 0
beta_0.shape[0], (beta_0 != 0).sum()

(20, 5)

In [77]:
# Unpack (Monte Carlo term, placeholder that is always 0.0, mean size of the
# selected set) from the unpenalized refit on the selected columns.
mc_est_rl, math_est_rl, s_rl = est_dev_term_rl(beta_0)

In [78]:
mc_est_rl

-0.1538609084019507

In [79]:
s_rl

15.885

In [73]:
mc_est_rl

-11.684117589769489

In [74]:
s_rl

8.0

In [61]:
mc_est_rl

-14.538643519431188

In [62]:
s_rl

5.0

Seems to be true for properly specified $E$, i.e. correction is $\mathrm{tr}(I-S)$

In [66]:
# Larger problem: p = 700 coefficients, a random three quarters set to zero;
# the cell echoes (p, number of non-zeros).
p = 700
beta_0 = np.random.standard_normal(p)
beta_0[np.random.choice(p, size=int(3 * p / 4), replace=False)] = 0
beta_0.shape[0], (beta_0 != 0).sum()

(700, 175)

In [67]:
# Same simulation at p = 700 with the default n = 5000, so p/n = 0.14.
# Slow: each of the 200 iterations fits logistic regressions on a 5000 x 700 design.
# NOTE(review): with p this large relative to n, fixed-p asymptotics may not
# describe the result; interpret the value below with that in mind.
mc_est_rl, math_est_rl, s_rl = est_dev_term_rl(beta_0)

In [68]:
mc_est_rl

-9619.843287457727

In [69]:
s_rl

175.0