In [1]:
import numpy as np
import pandas as pd
import mpmath
import matplotlib
import sklearn
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn import linear_model
import parametric_lasso 
import util 
import ci 
from sklearn.linear_model import LogisticRegression
from scipy import stats
import warnings
import random

In [2]:
# Generate data
def gen_Xy(n, beta_vec):
    """Simulate a design matrix and binary responses from a logistic model.

    Parameters
    ----------
    n : int
        Number of rows to simulate.
    beta_vec : sequence of float
        Coefficients; the first entry multiplies the all-ones column and the
        remaining ``len(beta_vec) - 1`` entries multiply the random columns.

    Returns
    -------
    X : ndarray of shape (n, len(beta_vec))
        A column of ones followed by standard-normal draws.
    y : ndarray of shape (n,)
        Bernoulli draws with success probability ``1 / (1 + exp(-X @ beta_vec))``.

    Notes
    -----
    Draws come from NumPy's global random state (``np.random``).
    """
    n_slopes = len(beta_vec) - 1

    # random covariates first, then prepend the all-ones column
    covariates = np.random.randn(n, n_slopes)
    ones_col = np.ones((n, 1))
    X = np.hstack([ones_col, covariates])

    # linear predictor -> success probability -> Bernoulli response
    linear_pred = X.dot(beta_vec)
    prob = 1 / (1 + np.exp(-linear_pred))
    y = np.random.binomial(1, prob)

    return X, y

In [3]:
# MLE
def logreg(X,y,tol=1e-6,kmax=20):
    """Fit a logistic regression by iteratively reweighted least squares.

    Starting from ``b = 0``, each iteration forms the weighted design
    ``U = diag(w) X`` and pseudo-response ``z = U b + (y - mu) / w`` with
    ``mu = sigmoid(X b)`` and ``w = sqrt(mu * (1 - mu))``, then solves the
    least-squares problem ``min ||U b - z||`` for the next ``b``.

    Parameters
    ----------
    X : array_like of shape (n, p)
        Design matrix (include a column of ones if an intercept is wanted).
    y : array_like of shape (n,)
        Binary responses.
    tol : float, optional
        Iteration stops once the largest absolute change in ``b`` is below
        this value.
    kmax : int, optional
        Maximum number of iterations; must be at least 1.

    Returns
    -------
    b : ndarray of shape (p,)
        Coefficient estimate after the last iteration.
    U : ndarray of shape (n, p)
        Weighted design matrix used in the last iteration.
    z : ndarray of shape (n,)
        Pseudo-response used in the last iteration.

    Raises
    ------
    ValueError
        If ``kmax`` is smaller than 1.

    Warns
    -----
    RuntimeWarning
        If the change in ``b`` is still >= ``tol`` after ``kmax`` iterations.
    """
    if kmax < 1:
        raise ValueError("kmax must be at least 1")

    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    # get dimension of X
    n, p = X.shape

    # initialize b
    b = np.zeros(p)

    # keep mu strictly inside (0, 1) so the weights never hit exactly zero
    # (a zero weight would turn (y - mu) / w into inf * 0 = nan)
    eps = np.finfo(float).eps

    # update b until convergence, running at most kmax iterations
    converged = False
    for _ in range(kmax):

        b0 = b

        eta = np.dot(X,b)
        # sigmoid(eta) written as exp(-log(1 + exp(-eta))): no overflow for
        # large |eta|, unlike exp(eta) / (1 + exp(eta)) which yields inf / inf
        mu = np.exp(-np.logaddexp(0, -eta))
        mu = np.clip(mu, eps, 1 - eps)
        w = np.sqrt(mu*(1-mu))

        # update b
        U = np.multiply(w.reshape((n,1)),X)
        z = np.dot(U,b) + (y - mu) / w
        b = np.linalg.lstsq(U,z,rcond=None)[0]

        # check convergence
        if np.max(np.abs(b-b0)) < tol:
            converged = True
            break

    if not converged:
        warnings.warn(
            "logreg did not converge within kmax iterations",
            RuntimeWarning,
        )

    return b, U, z


In [4]:
# Parametric programming with the pseudo-response
def logreg_pp(X,y,lam,threshold=20,ci_alpha=0.05):
    """Lasso on the logistic pseudo-response with parametric-programming inference.

    The function fits the logistic regression with ``logreg`` and takes the
    weighted design and pseudo-response it returns. Column 0 of the weighted
    design is treated as the intercept column and projected out of the
    remaining columns. A lasso without intercept is then fitted, and for every
    index in the selected set a p-value and an interval are obtained through
    the ``util``, ``parametric_lasso`` and ``ci`` helpers.

    Parameters
    ----------
    X : ndarray of shape (n, p + 1)
        Design matrix whose first column is treated as the intercept column.
    y : array_like of shape (n,)
        Binary responses.
    lam : float
        Passed as ``alpha`` to ``sklearn.linear_model.Lasso`` and as the
        penalty argument to ``parametric_lasso.run_parametric_lasso``.
    threshold : float, optional
        Forwarded unchanged as the last argument of
        ``parametric_lasso.run_parametric_lasso`` (default 20, the value used
        in Le Duy's code). NOTE(review): exact meaning is defined by that
        helper - confirm there.
    ci_alpha : float, optional
        Forwarded unchanged as the last argument of ``ci.compute_ci``
        (default 0.05). Presumably the significance level of the interval -
        TODO confirm against ``ci.compute_ci``.

    Returns
    -------
    A : first output of ``util.construct_A_XA_Ac_XAc_bhA``
        The indices iterated over below (used to index the result lists).
    bh : ndarray of shape (p,)
        Lasso coefficients for the non-intercept columns.
    p_val, CI_lo, CI_up : list of length p
        Per-coefficient p-value and interval end points; entries whose index
        is not in ``A`` stay ``None``.
    """
    # fit the regression
    bmle, Umle, zmle = logreg(X,y)

    # take the dimensions from the data instead of relying on notebook-level
    # globals: n rows, p non-intercept columns
    n = Umle.shape[0]
    p = Umle.shape[1] - 1

    # identity matrix handed to util.p_value / ci.compute_ci as `cov`
    # (presumably the covariance assumed for z - confirm in those helpers)
    cov = np.identity(n)

    # remove the effect of the intercept column u0 from the other columns of
    # Umle.  Only U is projected; z is passed on unchanged.  The lasso fit is
    # unaffected by that because the projected U is orthogonal to u0.
    # NOTE(review): confirm the util / parametric_lasso helpers do not need z
    # projected as well.
    z = zmle
    u0 = Umle[:,0]
    U = np.delete(Umle,0,1)
    # rank-one projection applied directly, without forming the n x n matrix
    U = U - np.outer(u0, np.dot(u0, U)) / np.dot(u0, u0)
    lasso = linear_model.Lasso(alpha=lam, fit_intercept=False)
    lasso.fit(U,z)
    bh = lasso.coef_

    p_val = [None]*p
    CI_lo = [None]*p
    CI_up = [None]*p

    # prepare for pp
    z = z.reshape((n,1))
    A, XA, Ac, XAc, bhA = util.construct_A_XA_Ac_XAc_bhA(U, bh, n, p)

    # obtain pp p-values and CIs for each coefficient in selected model
    for j_selected in A:
        etaj, etajTy = util.construct_test_statistic(j_selected, XA, z, A)
        list_zk, list_bhz, list_active_set = parametric_lasso.run_parametric_lasso(U, z, lam, etaj, n, p, threshold)
        p_val[j_selected] = util.p_value(A, bh, list_active_set, list_zk, list_bhz, etaj, etajTy, cov)
        CI = ci.compute_ci(A, bh, list_active_set, list_zk, list_bhz, etaj, etajTy, cov, bh[j_selected], ci_alpha)
        CI_lo[j_selected] = CI[0]
        CI_up[j_selected] = CI[1]

    return A, bh, p_val, CI_lo, CI_up

### Simulation

In [5]:
# Simulation example
# Seed NumPy's global RNG (the one gen_Xy draws from) so that
# Restart & Run All reproduces the same data set.
# NOTE: the outputs recorded in the cells below were produced before a seed
# was fixed, so a re-run will show different numbers.
np.random.seed(0)

n = 500
p = 20

# intercept followed by three non-zero slopes; pad with zeros so that there
# are p slopes in total (p + 1 coefficients including the intercept)
beta_vec = [-2, 2, 2, 1]
beta_vec = beta_vec + [0] * (p + 1 - len(beta_vec))

X, y = gen_Xy(n=n, beta_vec=beta_vec)
A, bh, p_val, CI_lo, CI_up = logreg_pp(X, y, lam=0.01)

In [6]:
# First output of logreg_pp: the indices it loops over when computing
# p-values and intervals (positions refer to the non-intercept columns).
A

[0, 1, 2, 6, 7, 8, 11, 13, 14, 19]

In [7]:
# p-values from logreg_pp, one slot per lasso coefficient; None marks the
# slots whose index is not in A.
p_val

[0.0153056673711911,
 0.00208832602176523,
 0.0027052210054154813,
 None,
 None,
 None,
 0.005502465557381875,
 0.9963352716726235,
 0.8023860871218704,
 None,
 None,
 0.9439996595221406,
 None,
 0.36409852040771573,
 0.2778673082810531,
 None,
 None,
 None,
 None,
 0.3242671832366377]