In [1]:
import numpy as np
import pandas as pd
import mpmath
import matplotlib
import sklearn
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn import linear_model
import parametric_lasso 
import util 
import ci 
from sklearn.linear_model import LogisticRegression
from scipy import stats
import warnings
import random

In [2]:
# Generate data
def gen_Xy(n, beta_vec, rng=None):
    """Simulate a Poisson regression data set with a log link.

    Parameters
    ----------
    n : int
        Number of observations to draw.
    beta_vec : sequence of float, length p + 1
        Regression coefficients. ``beta_vec[0]`` multiplies the intercept
        column of ones; the remaining ``p`` entries multiply the covariates.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, the global ``numpy.random``
        state is used (the historical behaviour), so results are only
        reproducible if the caller seeds that global state.

    Returns
    -------
    X : ndarray, shape (n, p + 1)
        Design matrix: a column of ones followed by ``p`` standard-normal
        covariates.
    y : ndarray of int, shape (n,)
        Poisson counts with mean ``exp(X @ beta_vec)``.
    """
    # The numpy.random module itself exposes standard_normal/poisson, so it
    # can stand in for a Generator; np.random.randn(n, p) is the same draw as
    # np.random.standard_normal(size=(n, p)).
    sampler = np.random if rng is None else rng

    p = len(beta_vec) - 1
    covariates = sampler.standard_normal(size=(n, p))
    X = np.hstack([np.ones((n, 1)), covariates])

    # Log link: the linear predictor is the log of the Poisson mean.
    mu = np.exp(np.dot(X, beta_vec))
    y = sampler.poisson(mu)

    return X, y

In [3]:
# MLE
def poireg(X, y):
    """Fit an unpenalized Poisson GLM and build the weighted pseudo-data.

    Parameters
    ----------
    X : ndarray, shape (n, p + 1)
        Design matrix (passed to statsmodels as-is; no intercept is added).
    y : ndarray, shape (n,)
        Observed counts.

    Returns
    -------
    b : ndarray
        Fitted coefficients (``params`` of the statsmodels GLM fit).
    U : ndarray, shape (n, p + 1)
        Design matrix with each row scaled by ``sqrt(mu_i)``, where
        ``mu = exp(X @ b)`` is the fitted mean.
    z : ndarray, shape (n,)
        Pseudo-response ``U @ b + (y - mu) / sqrt(mu)``.
    """
    poisson_model = sm.GLM(y, X, family=sm.families.Poisson())
    poisson_results = poisson_model.fit()
    b = poisson_results.params

    # Fitted linear predictor and mean under the log link.
    eta = np.dot(X, b)
    mu = np.exp(eta)

    # Square-root weights. Reshape with -1 so the row count comes from the
    # data rather than from a notebook-level global `n`.
    w = np.sqrt(mu)
    U = np.multiply(w.reshape(-1, 1), X)
    z = np.dot(U, b) + (1 / w) * (y - mu)

    return b, U, z

In [4]:
# Parametric programming with the pseudo-response
def poireg_pp(X, y, lam):
    """Lasso on the Poisson pseudo-data, then per-coefficient p-values and CIs.

    The Poisson MLE is fitted with ``poireg`` to obtain the weighted design
    ``Umle`` and pseudo-response ``zmle``. The intercept column is projected
    out of the remaining columns, a lasso is fitted on the result, and for
    every index returned in ``A`` a p-value and confidence interval are
    computed through the ``parametric_lasso`` / ``util`` / ``ci`` helpers.

    Parameters
    ----------
    X : ndarray, shape (n, p + 1)
        Design matrix whose first column is the intercept.
    y : ndarray, shape (n,)
        Observed counts.
    lam : float
        Lasso penalty; used as ``alpha`` of ``sklearn.linear_model.Lasso``
        and also passed to ``parametric_lasso.run_parametric_lasso``.

    Returns
    -------
    A : indices returned by ``util.construct_A_XA_Ac_XAc_bhA``
        (presumably the active set of the lasso fit -- confirm in ``util``).
    bh : ndarray, shape (p,)
        Lasso coefficients for the non-intercept columns.
    p_val, CI_lo, CI_up : list of length p
        Entry ``j`` is filled for every ``j`` in ``A`` and is ``None``
        otherwise.
    """
    # fit the regression
    bmle, Umle, zmle = poireg(X, y)

    # Problem dimensions come from the fitted design, not from notebook-level
    # globals: n rows, and p columns once the intercept column is removed.
    n = Umle.shape[0]
    p = Umle.shape[1] - 1

    # Identity matrix handed to util.p_value / ci.compute_ci as `cov`.
    cov = np.identity(n)

    # some tuning parameter set in Le Duy's code
    threshold = 20

    # Remove the effect of the intercept column u0 from the other columns of
    # Umle by projecting them onto its orthogonal complement. This is the
    # rank-one form of U - (u0 u0^T / u0^T u0) U and avoids building the
    # n x n projector. z is left unprojected: the projected U is orthogonal
    # to u0, so U^T z is the same with or without projecting z.
    z = zmle
    u0 = Umle[:, 0]
    U = np.delete(Umle, 0, axis=1)
    U = U - np.outer(u0, np.dot(u0, U)) / np.dot(u0, u0)

    lasso = linear_model.Lasso(alpha=lam, fit_intercept=False)
    lasso.fit(U, z)
    bh = lasso.coef_

    p_val = [None] * p
    CI_lo = [None] * p
    CI_up = [None] * p

    # prepare for pp: the helpers are given z as an (n, 1) column vector
    z = z.reshape((n, 1))
    A, XA, Ac, XAc, bhA = util.construct_A_XA_Ac_XAc_bhA(U, bh, n, p)

    # obtain pp p-values and CIs for each coefficient in selected model
    for j_selected in A:
        etaj, etajTy = util.construct_test_statistic(j_selected, XA, z, A)
        list_zk, list_bhz, list_active_set = parametric_lasso.run_parametric_lasso(
            U, z, lam, etaj, n, p, threshold
        )
        p_val[j_selected] = util.p_value(
            A, bh, list_active_set, list_zk, list_bhz, etaj, etajTy, cov
        )
        # NOTE(review): the trailing 0.05 is presumably the significance
        # level (95% intervals) -- confirm against ci.compute_ci.
        CI = ci.compute_ci(
            A, bh, list_active_set, list_zk, list_bhz, etaj, etajTy, cov,
            bh[j_selected], 0.05
        )
        CI_lo[j_selected] = CI[0]
        CI_up[j_selected] = CI[1]

    return A, bh, p_val, CI_lo, CI_up


### Simulation

In [5]:
# Fix the seed so that Restart Kernel -> Run All reproduces the same draw;
# gen_Xy samples from the global numpy.random state by default.
SEED = 0
np.random.seed(SEED)

n = 500
p = 20

# beta_vec[0] is the intercept (gen_Xy prepends a column of ones); three
# covariates carry signal and the remaining p - 3 coefficients are zero.
beta_vec = [-2, 1, 1, -1] + [0] * (p - 3)
assert len(beta_vec) == p + 1, "beta_vec must hold the intercept plus p coefficients"

X, y = gen_Xy(n=n, beta_vec=beta_vec)
A, bh, p_val, CI_lo, CI_up = poireg_pp(X=X, y=y, lam=0.02)

In [6]:
# First value returned by poireg_pp: the coefficient indices for which
# p-values and confidence intervals were computed.
A

[0, 1, 2, 3, 6, 9, 15, 18]

In [7]:
# p-value list returned by poireg_pp: one slot per non-intercept coefficient;
# slots whose index is not in A were never filled and stay None.
p_val

[0.0,
 0.008169507401538656,
 0.0005312800847090432,
 0.36454219618995587,
 None,
 None,
 0.3251924157744554,
 None,
 None,
 0.4010697635572129,
 None,
 None,
 None,
 None,
 None,
 0.4521839968357868,
 None,
 None,
 0.14108228068744016,
 None]