In [1]:
import numpy as np
import pandas as pd
import mpmath
import matplotlib
import sklearn
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn import linear_model
import parametric_lasso
import gen_data
import util
import ci
from sklearn.linear_model import LogisticRegression
from scipy import stats
import crossvalidation_event
import warnings

In [2]:
# Generate data
def gen_Xy(n, beta_vec):
    """Simulate a Poisson regression data set with a log link.

    Parameters
    ----------
    n : int
        Number of observations (rows) to draw.
    beta_vec : sequence of float
        Coefficients; the first entry is the intercept, the remaining
        ``len(beta_vec) - 1`` entries multiply the random features.

    Returns
    -------
    X : ndarray, shape (n, len(beta_vec))
        Design matrix: a leading column of ones followed by standard-normal
        features.
    y : ndarray, shape (n,)
        Poisson counts drawn with mean ``exp(X @ beta_vec)``.

    Notes
    -----
    Draws from NumPy's global random state, so results depend on the current
    seed.
    """
    n_features = len(beta_vec) - 1

    # The features are drawn first and the counts second; this order fixes
    # how the global random stream is consumed.
    features = np.random.randn(n, n_features)
    design = np.concatenate([np.ones((n, 1)), features], axis=1)

    rate = np.exp(design.dot(beta_vec))
    counts = np.random.poisson(rate)

    return design, counts

In [3]:
# Poisson regression MLE, plus the weighted design and response built from it
def poireg(X,y):
    """Fit an unpenalised Poisson GLM and return weighted least-squares quantities.

    With ``b`` the fitted coefficients, ``mu = exp(X b)`` and ``w = sqrt(mu)``,
    the function builds

    * ``U = diag(w) X``  (each row of ``X`` scaled by ``w``), and
    * ``z = U b + (y - mu) / w``.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_columns)
        Design matrix, passed to ``sm.GLM`` as given (no intercept is added
        here).
    y : ndarray, shape (n_samples,)
        Observed counts.

    Returns
    -------
    b : fitted coefficients (``params`` of the statsmodels result)
    U : ndarray, shape (n_samples, n_columns)
    z : ndarray, shape (n_samples,)
    """
    poisson_model = sm.GLM(y, X, family=sm.families.Poisson())
    poisson_results = poisson_model.fit()
    b = poisson_results.params

    eta = np.dot(X, b)
    mu = np.exp(eta)
    w = np.sqrt(mu)

    # Turn w into a column so it scales X row by row. The length is inferred
    # from w itself rather than from a notebook-level ``n``, so the function
    # does not depend on hidden global state.
    U = np.multiply(w.reshape((-1, 1)), X)
    z = np.dot(U, b) + (1 / w) * (y - mu)

    return b, U, z

In [4]:
# Parametric programming CV for linear regression
def linreg_pp_cv(X,y,list_lam,train,cov_scale):
    """Pick a lasso penalty on a hold-out split, then run inference on the selected features.

    Steps, as implemented below:

    1. Split the rows into the first ``int(train * n)`` (training) and the
       remainder (validation).
    2. For every candidate penalty fit ``Lasso(alpha, fit_intercept=False)`` on
       the training rows and keep the penalty with the smallest half squared
       validation error (the first candidate wins ties).
    3. Refit on all rows at the chosen penalty and obtain the active set ``A``
       from ``util.construct_A_XA_Ac_XAc_bhA``.
    4. For every ``j`` in ``A`` combine the intervals returned by the
       ``parametric_lasso`` / ``crossvalidation_event`` helpers and hand them
       to ``util.pivot_with_specified_interval`` and
       ``ci.compute_ci_with_specified_interval``.

    Parameters
    ----------
    X : ndarray, shape (n, p)
        Design matrix (no intercept is fitted).
    y : ndarray, shape (n,)
        Response vector.
    list_lam : sequence of float
        Candidate lasso penalties.
    train : float
        Fraction of rows (taken from the top of ``X``) used for training.
    cov_scale : float
        Scalar multiplying the identity covariance passed to the pivot and
        confidence-interval routines.

    Returns
    -------
    best_lam : float
        The selected penalty.
    A : active set returned by ``util.construct_A_XA_Ac_XAc_bhA``
    bh : ndarray, shape (p,)
        Lasso coefficients fitted on all rows at ``best_lam``.
    p_val : list of length p
        ``2 * min(pivot, 1 - pivot)`` for each ``j`` in ``A``; ``None`` for
        features not in ``A`` or when the pivot is ``None``.
    CI_lo_005, CI_up_005 : list of length p
        Lower/upper ends returned by the confidence-interval helper called
        with ``0.05`` (presumably the significance level - confirm in ``ci``);
        ``None`` where unavailable.

    Raises
    ------
    ValueError
        If no candidate penalty yields a validation error below infinity
        (for example when ``list_lam`` is empty).
    """
    # Read the problem size from the data instead of from notebook globals, so
    # the function works for any design matrix and on a freshly started kernel.
    n, p = X.shape

    # Forwarded unchanged to the parametric_lasso routines.
    # NOTE(review): presumably bounds the range of z that is explored - confirm.
    threshold = 20
    cov = np.identity(n) * cov_scale # so for beta regression put in cov_scale = 1/phi

    cutoff = int(train * n)

    X_train = X[:cutoff, :]
    y_train = y[:cutoff]

    X_val = X[cutoff:n, :]
    y_val = y[cutoff:n]

    # np.inf instead of np.Inf: the capitalised alias was removed in NumPy 2.0.
    min_cv_error = np.inf
    lam = None
    lam_idx = None

    for i, each_lam in enumerate(list_lam):
        clf_lam = linear_model.Lasso(alpha=each_lam, fit_intercept=False)
        clf_lam.fit(X_train, y_train)
        bh_lam = clf_lam.coef_.reshape((-1, 1))
        temp_cv_error = 0.5*sum((y_val - (np.dot(X_val, bh_lam)).flatten())**2)

        # Strict comparison: the earliest penalty wins ties.
        if temp_cv_error < min_cv_error:
            min_cv_error = temp_cv_error
            lam = each_lam
            lam_idx = i

    if lam_idx is None:
        raise ValueError(
            "no penalty could be selected: list_lam is empty or every "
            "validation error is non-finite"
        )

    best_lam = list_lam[lam_idx]
    clf = linear_model.Lasso(alpha=lam, fit_intercept=False)
    clf.fit(X, y)
    bh = clf.coef_

    y = y.reshape((n, 1))

    A, XA, _Ac, _XAc, _bhA = util.construct_A_XA_Ac_XAc_bhA(X, bh, n, p)

    p_val = [None]*p
    CI_lo_005 = [None]*p
    CI_up_005 = [None]*p

    # The selected penalty is processed first, then the others in grid order;
    # construct_z_interval_cv receives the lists in exactly this order.
    lam_order = [lam_idx] + [i for i in range(len(list_lam)) if i != lam_idx]

    for j_selected in A:

        etaj, etajTy = util.construct_test_statistic(j_selected, XA, y, A)

        a, b = crossvalidation_event.compute_a_b(y, etaj, n)
        a_flatten = a.flatten()
        b_flatten = b.flatten()

        # Split a and b with the same row split as the data.
        a_train = (a_flatten[:cutoff]).reshape((cutoff, 1))
        b_train = (b_flatten[:cutoff]).reshape((cutoff, 1))

        a_val = (a_flatten[cutoff:n]).reshape((n - cutoff, 1))
        b_val = (b_flatten[cutoff:n]).reshape((n - cutoff, 1))

        set_piecewise_funct = []
        set_list_zk = []

        for i in lam_order:
            list_zk_i, _list_bhz_i, list_active_set_i, list_etaAkz_i, list_bhAz_i = \
                parametric_lasso.run_parametric_lasso_cv(X_train, list_lam[i], X_train.shape[0], p, threshold, a_train, b_train)

            piecewise_quadratic_i = crossvalidation_event.construct_piecewise_quadratic(a_val, b_val, X_val, list_zk_i,
                                                                  list_active_set_i, list_etaAkz_i, list_bhAz_i)

            set_piecewise_funct.append(piecewise_quadratic_i)
            set_list_zk.append(list_zk_i)

        z_interval_cv = crossvalidation_event.construct_z_interval_cv(set_piecewise_funct, set_list_zk)

        list_zk, _list_bhz, list_active_set = parametric_lasso.run_parametric_lasso(X, y, lam, etaj, n, p, threshold)

        z_interval_m = crossvalidation_event.construct_m_z_interval(A, list_active_set, list_zk)

        z_interval = crossvalidation_event.construct_z_interval(z_interval_m, z_interval_cv)

        pivot = util.pivot_with_specified_interval(z_interval, etaj, etajTy, cov, 0)

        if pivot is None:
            p_val[j_selected] = None
        else:
            # Two-sided p-value from the pivot.
            p_val[j_selected] = 2 * min(1 - pivot, pivot)

        confidence_interval_005 = ci.compute_ci_with_specified_interval(z_interval, etaj, etajTy, cov, bh[j_selected], 0.05)
        if confidence_interval_005 is None:
            CI_lo_005[j_selected] = None
            CI_up_005[j_selected] = None
        else:
            CI_lo_005[j_selected] = confidence_interval_005[0]
            CI_up_005[j_selected] = confidence_interval_005[1]

    return best_lam, A, bh, p_val, CI_lo_005, CI_up_005

In [5]:
# Parametric programming CV for poisson regression
def poireg_pp_cv(X,y,list_lambda,train):
    """Run the linear-regression procedure on quantities derived from a Poisson fit.

    The Poisson model is fitted with ``poireg``; the first column of the
    returned ``U`` (the intercept column when ``X`` starts with a column of
    ones) is dropped, and the remaining columns are made orthogonal to it.
    The result is passed to ``linreg_pp_cv`` together with ``z`` and
    ``cov_scale = 1``.

    Parameters
    ----------
    X : ndarray
        Design matrix whose first column is taken to be the intercept.
    y : ndarray
        Observed counts.
    list_lambda : sequence of float
        Candidate lasso penalties, forwarded to ``linreg_pp_cv``.
    train : float
        Training fraction, forwarded to ``linreg_pp_cv``.

    Returns
    -------
    tuple
        ``(best_lam, A, bh, p_val, CI_lo_005, CI_up_005)`` exactly as returned
        by ``linreg_pp_cv``. Indices refer to the columns that remain after
        the first column has been removed.

    Notes
    -----
    NOTE(review): only ``U`` is projected; ``z`` is forwarded unchanged.
    Confirm whether ``z`` should also have the intercept direction removed.
    """
    # Only the weighted design and response are needed; the coefficient
    # vector returned by poireg is not used.
    _, U_full, z = poireg(X, y)

    # Rank-one projector onto the span of the first column of U_full.
    first_col = U_full[:, 0]
    projector = 1 / sum(first_col**2) * np.outer(first_col, first_col)

    # Drop that column and remove its component from the remaining ones.
    U_rest = np.delete(U_full, 0, axis=1)
    U_orth = U_rest - np.matmul(projector, U_rest)

    # cov_scale plays the role of 1/phi, which is 1 here.
    return linreg_pp_cv(U_orth, z, list_lambda, train, cov_scale=1)

### Simulation

In [6]:
# Seed NumPy's global generator (the one gen_Xy draws from) so that the
# simulated data, and every result computed from it, can be reproduced.
SEED = 0
np.random.seed(SEED)

n = 500   # number of observations
p = 20    # number of features, excluding the intercept

# True coefficients: intercept -2, three non-zero effects, and the remaining
# p - 3 features are null.
beta_vec = [-2,1,1,-1] + [0]*(p - 3)
X, y = gen_Xy(n=n, beta_vec=beta_vec)

# Candidate lasso penalties: n_lambdas values, log-spaced between 15/n and 28/n.
lambdas_lo = 15/n
lambdas_hi = 28/n
n_lambdas = 20
lambdas = np.logspace(np.log10(lambdas_lo), np.log10(lambdas_hi), num=n_lambdas)

# Fraction of rows used to fit each candidate; the rest are used to score it.
train = 0.7
best_lam, A, bh, p_val, CI_lo, CI_up = poireg_pp_cv(X=X, y=y, list_lambda=lambdas, train=train)

In [7]:
# Display the grid of candidate penalties that was passed to poireg_pp_cv.
lambdas

array([0.03      , 0.03100187, 0.0320372 , 0.03310711, 0.03421275,
       0.03535531, 0.03653602, 0.03775617, 0.03901707, 0.04032007,
       0.04166659, 0.04305808, 0.04449604, 0.04598201, 0.04751762,
       0.04910451, 0.05074439, 0.05243903, 0.05419027, 0.056     ])

In [8]:
# Display the penalty with the smallest validation error.
# NOTE(review): in the recorded run this equals the largest value of the grid,
# so the minimum may lie beyond the searched range - consider widening the grid.
best_lam

0.05600000000000001

In [9]:
# Display the p-values, one slot per non-intercept feature. None marks a
# feature that is not in the selected set A, or whose pivot was returned as None.
p_val

[1.421634632237101e-08,
 0.0,
 7.6398286751496335e-28,
 0.5077558081956315,
 None,
 None,
 0.8262788858349608,
 0.6127836253765926,
 0.7890453126209109,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]