In [1]:
import numpy as np
import pandas as pd
import mpmath
import matplotlib
import sklearn
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn import linear_model
import parametric_lasso 
import crossvalidation_event
import util 
import ci 
from sklearn.linear_model import LogisticRegression
from scipy import stats
import warnings
import random

In [2]:
# Generate Data
def gen_Xy(n, beta_vec):
    p = len(beta_vec) - 1  
    X = np.random.randn(n, p)
    X = np.hstack([np.ones((n, 1)), X])
    eta = np.dot(X, beta_vec)
    pr = 1 / (1 + np.exp(-eta))
    y = np.random.binomial(1, pr)
    
    return X, y

In [3]:
# MLE
def logreg(X,y,tol=1e-6,kmax=20):

    # get dimension of X
    dm = X.shape
    n = dm[0] 
    p = dm[1]
    
    # initialize b 
    b = np.zeros(p)
    
    # update b until convergence
    conv = 0
    k = 1
    while conv == 0 and k < kmax:
        
        b0 = b

        eta = np.dot(X,b)
        mu = np.exp(eta)/(1 + np.exp(eta))
        w = np.sqrt(mu*(1-mu))
        
        # update b
        U = np.multiply(w.reshape((n,1)),X)
        z = np.dot(U,b) + (1 / w) * (y - mu)
        b = np.linalg.lstsq(U,z,rcond=None)[0]
        
        # check convergence and increment k
        conv = max(abs(b-b0)) < tol
        k = k + 1

    return b, U, z


In [4]:
def linreg_pp_cv(X,y,list_lam,train,cov_scale):
    
    threshold = 20
    cov = np.identity(n) * cov_scale 
    
    cutoff = int(train * n )
    
    X_train = X[:cutoff, :]
    y_train = y[:cutoff]
    
    X_val = X[cutoff:n, :]
    y_val = y[cutoff:n]
    
    min_cv_error = np.Inf
    lam = None
    lam_idx = None
    
    for i in range(len(list_lam)):
        
        each_lam = list_lam[i]
        clf_lam = linear_model.Lasso(alpha=each_lam, fit_intercept=False)
        clf_lam.fit(X_train, y_train)
        bh_lam = clf_lam.coef_
        bh_lam = bh_lam.reshape((len(bh_lam), 1))
        temp_cv_error = 0.5*sum((y_val - (np.dot(X_val, bh_lam)).flatten())**2)
        
        if temp_cv_error < min_cv_error:
            min_cv_error = temp_cv_error
            lam = each_lam
            lam_idx = i
    
    best_lam = list_lam[lam_idx]
    clf = linear_model.Lasso(alpha=lam, fit_intercept=False)
    clf.fit(X, y)
    bh = clf.coef_
    
    y = y.reshape((n, 1))
    
    
    A, XA, Ac, XAc, bhA = util.construct_A_XA_Ac_XAc_bhA(X, bh, n, p)

    p_val = [None]*p
    CI_lo_005 = [None]*p
    CI_up_005 = [None]*p
    
    for j_selected in A:
    
        etaj, etajTy = util.construct_test_statistic(j_selected, XA, y, A)
    
        a, b = crossvalidation_event.compute_a_b(y, etaj, n)
        a_flatten = a.flatten()
        b_flatten = b.flatten()
        a_train = (a_flatten[:cutoff]).reshape((cutoff, 1))
        b_train = (b_flatten[:cutoff]).reshape((cutoff, 1))
    
        a_val = (a_flatten[cutoff:n]).reshape((n - cutoff, 1))
        b_val = (b_flatten[cutoff:n]).reshape((n - cutoff, 1))
    
        list_zk_min_lam, list_bhz_min_lam, list_active_set_min_lam, list_etaAkz_min_lam, list_bhAz_min_lam = \
            parametric_lasso.run_parametric_lasso_cv(X_train, list_lam[lam_idx], X_train.shape[0], p, threshold, a_train, b_train)
    
        piecewise_quadratic_min_lam = crossvalidation_event.construct_piecewise_quadratic(a_val, b_val, X_val, list_zk_min_lam,
                                                                      list_active_set_min_lam, list_etaAkz_min_lam,
                                                                      list_bhAz_min_lam)
    
        set_piecewise_funct = [piecewise_quadratic_min_lam]
        set_list_zk = [list_zk_min_lam]
    
        for i in range(len(list_lam)):
            if i == lam_idx:
                continue
    
            list_zk_i, list_bhz_i, list_active_set_i, list_etaAkz_i, list_bhAz_i = \
                parametric_lasso.run_parametric_lasso_cv(X_train, list_lam[i], X_train.shape[0], p, threshold, a_train, b_train)
    
            piecewise_quadratic_i = crossvalidation_event.construct_piecewise_quadratic(a_val, b_val, X_val, list_zk_i,
                                                                  list_active_set_i, list_etaAkz_i, list_bhAz_i)
    
            set_piecewise_funct.append(piecewise_quadratic_i)
            set_list_zk.append(list_zk_i)
    
        z_interval_cv = crossvalidation_event.construct_z_interval_cv(set_piecewise_funct, set_list_zk)
    
        list_zk, list_bhz, list_active_set = parametric_lasso.run_parametric_lasso(X, y, lam, etaj, n, p, threshold)
    
        z_interval_m = crossvalidation_event.construct_m_z_interval(A, list_active_set, list_zk)
    
        z_interval = crossvalidation_event.construct_z_interval(z_interval_m, z_interval_cv)
    
        pivot = util.pivot_with_specified_interval(z_interval, etaj, etajTy, cov, 0)

        if pivot is None:
            p_val[j_selected] = None
        else: 
            p_val[j_selected] = 2 * min(1 - pivot, pivot)

        confidence_interval_005 = ci.compute_ci_with_specified_interval(z_interval, etaj, etajTy, cov, bh[j_selected], 0.05)
        if confidence_interval_005 is None:
            CI_lo_005[j_selected] = None
            CI_up_005[j_selected] = None
        else:
            CI_lo_005[j_selected] = confidence_interval_005[0]
            CI_up_005[j_selected] = confidence_interval_005[1]
        
    return best_lam, A, bh, p_val, CI_lo_005, CI_up_005

In [5]:
def logreg_pp_cv(X,y,list_lambda,train):
    
    # fit the logistic regression
    bmle, Umle, zmle = logreg(X,y)
    
    # remove effect of the intercept column from both zmle and Umle
    u0 = Umle[:,0]
    z = zmle - u0*bmle[0]
    Pu0 = 1 / sum(u0**2) * np.outer(u0,u0.T)
    U = np.delete(Umle,0,1)
    U = U - np.matmul(Pu0,U)
    
    # feed U and z into the linreg_pp_cv function
    best_lam, A, bh, p_val, CI_lo_005, CI_up_005 = linreg_pp_cv(U,z,list_lambda,train,cov_scale = 1)
  
    return best_lam, A, bh, p_val, CI_lo_005, CI_up_005

### Simulation

In [6]:
n = 500
p = 20
beta_vec = [-2, 2, 2, 1] + [0] * 17
X, y = gen_Xy(n=n, beta_vec=beta_vec)
lambdas_lo = 1/n
lambdas_hi = 6/n
n_lambdas = 20
lambdas = np.logspace(np.log10(lambdas_lo), np.log10(lambdas_hi), num=n_lambdas)
train = 0.7
best_lam, A, bh, p_val, CI_lo, CI_up = logreg_pp_cv(X=X, y=y, list_lambda=lambdas, train=train)

In [7]:
lambdas

array([0.002     , 0.00219779, 0.00241513, 0.00265397, 0.00291643,
       0.00320484, 0.00352178, 0.00387006, 0.00425278, 0.00467335,
       0.00513551, 0.00564337, 0.00620146, 0.00681474, 0.00748867,
       0.00822924, 0.00904306, 0.00993735, 0.01092008, 0.012     ])

In [8]:
best_lam

0.012000000000000002

In [9]:
p_val

[3.113331814574849e-11,
 5.8393788471278185e-08,
 8.89913427615241e-07,
 None,
 None,
 None,
 0.8925405570137085,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 0.796079186168132,
 None,
 None,
 None]