In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import special
from sklearn import linear_model
import parametric_lasso 
import util 
import ci 
from statsmodels.othermod.betareg import BetaModel
import warnings

In [2]:
# Generate data
def gen_Xy(n,beta_vec,phi_true,rng=None):
    """Simulate a design matrix and beta-distributed responses under a logit link.

    Parameters
    ----------
    n : int
        Number of observations.
    beta_vec : array-like, length p + 1
        True coefficients; the first entry multiplies the intercept column.
    phi_true : float
        Precision parameter of the beta distribution.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global legacy state
        (``np.random``) is used, so ``np.random.seed`` still controls the draws.

    Returns
    -------
    X : ndarray, shape (n, p + 1)
        A column of ones followed by p standard-normal covariates.
    y : ndarray, shape (n,)
        Draws from Beta(mu * phi_true, (1 - mu) * phi_true) with
        mu = expit(X @ beta_vec).
    """
    if rng is None:
        # the np.random module exposes the same normal/beta call signatures
        rng = np.random

    beta_vec = np.asarray(beta_vec, dtype=float)
    p = beta_vec.shape[0] - 1

    X = np.column_stack((np.ones(n), rng.normal(0, 1, size=(n, p))))
    eta_true = X @ beta_vec
    # expit is the logistic function; unlike exp(eta)/(1+exp(eta)) it does not
    # overflow to inf/inf = nan when eta is large
    mu_true = special.expit(eta_true)
    y = rng.beta(mu_true * phi_true, (1 - mu_true) * phi_true, n)
    return X, y

In [3]:
# MLE
def betareg(X,y,tol=1e-6,kmax=20):
    """Fit a beta regression (logit link, constant precision) by alternating updates.

    Each pass performs one Fisher-scoring step for the coefficients ``b``,
    written as a least-squares problem on a weighted design ``U`` and working
    response ``z``, followed by one Newton step for the precision ``phi``.

    Parameters
    ----------
    X : ndarray, shape (n, p)
        Design matrix (include a column of ones if an intercept is wanted).
    y : array-like, shape (n,)
        Responses in [0, 1]; values are clipped to [1e-6, 1 - 1e-6].
    tol : float, default 1e-6
        Iteration stops once the largest absolute change in ``b`` is below ``tol``.
    kmax : int, default 20
        Iteration cap; at most ``kmax - 1`` passes are made.

    Returns
    -------
    b : ndarray, shape (p,)
        Coefficient estimate after the last pass.
    phi : float
        Precision estimate after the last pass.
    U : ndarray, shape (n, p)
        Weighted design matrix built in the last pass. It is evaluated at the
        coefficients and precision that *entered* that pass, not at the
        returned ``b`` and ``phi``.
    z : ndarray, shape (n,)
        Working (pseudo-)response built in the last pass, same caveat as ``U``.

    Raises
    ------
    ValueError
        If ``kmax < 2``, because then no pass runs and ``U``/``z`` do not exist.
    """
    X = np.asarray(X, dtype=float)
    n, p = X.shape

    if kmax < 2:
        raise ValueError("kmax must be at least 2 so that at least one update is performed")

    # bound y away from 0 and 1 for numerical stability; np.clip also keeps
    # values that sit exactly on a bound (a masked sum would map them to 0)
    eps = 1e-6
    y = np.clip(np.asarray(y, dtype=float), eps, 1 - eps)

    # transformed response and the loop-invariant log(1 - y)
    y_tilde = np.log(y/(1-y))
    log1my = np.log(1-y)

    # initialize b and phi
    b = np.zeros(p)
    phi = 1.0

    # update b and phi until convergence
    converged = False
    k = 1
    while not converged and k < kmax:

        b_prev = b

        eta = X @ b
        mu = special.expit(eta)  # overflow-safe logistic
        d = mu*(1-mu)            # d mu / d eta for the logit link
        # square root of the Fisher-scoring weights
        w = d * np.sqrt(phi * (special.polygamma(1,mu*phi) + special.polygamma(1,(1-mu)*phi)))
        # expected value of y_tilde under the current mean and precision
        mu_tilde = special.polygamma(0,mu*phi) - special.polygamma(0,(1-mu)*phi)

        # update b while holding phi fixed: one scoring step as least squares
        U = w[:, None] * X
        z = U @ b + d / w * (y_tilde - mu_tilde)
        b = np.linalg.lstsq(U,z,rcond=None)[0]

        # update phi with one Newton step; mu and mu_tilde are those computed
        # at the start of this pass (i.e. at b_prev)
        dphi = n*special.polygamma(0,phi) + np.sum(mu*(y_tilde - mu_tilde) + log1my - special.polygamma(0,(1-mu)*phi))
        d2phi = n*special.polygamma(1,phi) - np.sum(mu**2 * special.polygamma(1,mu*phi) + (1-mu)**2 * special.polygamma(1,(1-mu)*phi))
        step = dphi / d2phi
        phi_new = phi - step
        # phi must stay positive or the next sqrt(phi * ...) is NaN, so halve
        # the step until it does (the isfinite guard prevents an endless loop)
        while phi_new <= 0 and np.isfinite(step):
            step = step / 2
            phi_new = phi - step
        phi = phi_new

        # check convergence and increment k
        converged = np.max(np.abs(b - b_prev)) < tol
        k = k+1

    return b, phi, U, z


In [4]:
# Parametric programming with the pseudo-response
def betareg_pp(X,y,lam):
    """Lasso selection on the beta-regression pseudo-response, with
    parametric-programming p-values and confidence intervals.

    Parameters
    ----------
    X : ndarray, shape (n, p + 1)
        Design matrix whose first column is treated as the intercept column.
    y : array-like, shape (n,)
        Responses passed to ``betareg``.
    lam : float
        Penalty level, used both as ``alpha`` of ``sklearn.linear_model.Lasso``
        and as the ``lam`` argument of ``parametric_lasso.run_parametric_lasso``.

    Returns
    -------
    A : active set as returned by ``util.construct_A_XA_Ac_XAc_bhA``; its
        elements are used as indices into the p non-intercept columns.
    bh : ndarray, shape (p,)
        Lasso coefficients for the non-intercept columns.
    p_val, CI_lo, CI_up : list, length p
        Entry j holds the p-value / lower / upper confidence limit for
        coefficient j when j is in ``A`` and ``None`` otherwise.
    """
    # fit the beta regression; only the precision, the weighted design and the
    # pseudo-response from the final pass are needed here
    _, phimle, Umle, zmle = betareg(X,y)

    # iteration/range setting forwarded to run_parametric_lasso
    # NOTE(review): the value 20 is taken over from Le Duy's code — confirm its meaning there
    threshold = 20

    # Project the intercept column u0 out of the remaining weighted columns.
    # z itself is left untouched; only U is orthogonalised against u0.
    # U - u0 (u0' U) / (u0' u0) equals (I - P_u0) U without forming the n x n projector.
    z = zmle
    u0 = Umle[:,0]
    U = np.delete(Umle,0,1)
    U = U - np.outer(u0, u0 @ U) / (u0 @ u0)

    # dimensions come from the data, not from notebook-level globals
    n, p = U.shape

    # matrix handed to util.p_value / ci.compute_ci as `cov`
    # (presumably the covariance of the pseudo-response — confirm against util)
    cov = np.identity(n) / phimle

    lasso = linear_model.Lasso(alpha=lam, fit_intercept=False)
    lasso.fit(U,z)
    bh = lasso.coef_

    p_val = [None]*p
    CI_lo = [None]*p
    CI_up = [None]*p

    # prepare for pp: the helpers below receive z as an (n, 1) column
    z = z.reshape((n,1))
    A, XA, _, _, _ = util.construct_A_XA_Ac_XAc_bhA(U, bh, n, p)

    # obtain pp p-values and intervals for each coefficient in the selected model
    for j_selected in A:
        etaj, etajTy = util.construct_test_statistic(j_selected, XA, z, A)
        list_zk, list_bhz, list_active_set = parametric_lasso.run_parametric_lasso(U, z, lam, etaj, n, p, threshold)
        p_val[j_selected] = util.p_value(A, bh, list_active_set, list_zk, list_bhz, etaj, etajTy, cov)
        # last argument 0.05 is presumably the significance level (95% intervals) — confirm in ci.compute_ci
        CI = ci.compute_ci(A, bh, list_active_set, list_zk, list_bhz, etaj, etajTy, cov, bh[j_selected], 0.05)
        CI_lo[j_selected] = CI[0]
        CI_up[j_selected] = CI[1]

    return A, bh, p_val, CI_lo, CI_up

### Simulation

In [6]:
# Fix the seed so the simulated data, and hence the selected model, are
# reproducible under Restart Kernel -> Run All.
np.random.seed(0)

n = 500   # number of observations
p = 20    # number of non-intercept covariates

# intercept, three non-zero slopes, then zeros for the remaining p - 3 covariates
beta_vec = [-2, 1, -1/2, 1/2] + [0] * (p - 3)
phi_true = 10

X, y = gen_Xy(n=n, beta_vec=beta_vec, phi_true=phi_true)
A, bh, p_val, CI_lo, CI_up = betareg_pp(X=X, y=y, lam=0.005)
