In [3]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt; plt.style.use("ggplot")
import seaborn as sns
import sys; sys.path.append("/data/jerrylee/pjt/BIGFAM.v.2.0")
from src import obj2
from tqdm import tqdm
import statsmodels.formula.api as smf
from scipy.optimize import minimize

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)



# Effect of L2 (ridge) regularization

In [4]:
def _make_simul_reg(A, X, S, ra, rx, rs, n=1, noise_sd=0.01):
    """Make FR-reg coefficient."""
    by_A = ra * A
    by_X = rx * X
    by_S = rs * S
    by_E = np.random.normal(0, noise_sd, n)
    
    coef = by_A + by_X + by_S + by_E
    
    if len(coef) == 1:
        coef = coef[0]
        
    return coef

def _resamplingFRregCoefficients(df_lmbds, n_resample=100, n_block=10):
    """
    df_lmbds : summary of FR-reg
    """
    df_block = pd.DataFrame()

    for rel_type in df_lmbds["rel_type"].unique():
        df_rel = df_lmbds[df_lmbds["rel_type"] == rel_type].copy()
        dor, rel_type, sex_type, rx, slope, se_slope = df_rel.iloc[0].values

        resampled_slopes = np.random.normal(slope, se_slope, (n_block, n_resample // n_block))
        resampled_slopes = resampled_slopes.flatten()

        df_tmp = pd.DataFrame({
            "DOR": dor,
            "rel_type": rel_type,
            "sex_type": sex_type,
            "rx": rx,
            "slope": resampled_slopes,
            "block": np.repeat(np.arange(n_block), n_resample // n_block),
        })

        df_block = pd.concat([df_block, df_tmp], ignore_index=True)

    return df_block

def _regressOutMean(df_block):
    """Remove the per-group mean from ``slope`` and ``rx``.

    Rows are grouped by (``block``, ``DOR``). Every group gains three columns:

    * ``eta``      - ``2**DOR`` times the intercept of an intercept-only OLS
      fit of ``slope`` (DOR is taken from the group's first row)
    * ``residual`` - residuals of that fit
    * ``tl``       - ``rx`` minus the group mean of ``rx``

    The grouping runs on a copy, so ``df_block`` itself is not modified.
    """

    def _center_group(grp):
        fit = smf.ols(formula="slope ~ 1", data=grp).fit()
        return grp.assign(
            eta=2**grp["DOR"].iloc[0] * fit.params["Intercept"],
            residual=fit.resid,
            tl=grp["rx"] - grp["rx"].mean(),
        )

    return df_block.copy().groupby(["block", "DOR"]).apply(_center_group)

def _lossFuncX(x, df, alpha):
    # Fidelity term
    loss_fid = np.sum((df["residual"] - df["tl"] * x) ** 2)

    # L2 term
    loss_l2 = alpha * (x ** 2)

    return loss_fid + loss_l2

def _optToFindX(df_block, alpha):
    """Minimise ``_lossFuncX`` over one coefficient for the given block.

    The search starts at 0.01, is box-constrained to [-1, 1] and uses a
    tolerance of 1e-6; the solver itself is left to SciPy's default choice.

    Returns the object produced by ``scipy.optimize.minimize`` (the estimate
    is its ``x[0]``).
    """
    return minimize(
        _lossFuncX,
        [0.01],
        args=(df_block, alpha),
        bounds=[(-1, 1)],
        tol=1e-6,
    )

# Reference familial relationships

In [5]:
# Read the tab-separated FR-reg summary that serves as the reference list of
# familial relationships.
df_frreg = pd.read_csv(
    "/data/jerrylee/pjt/BIGFAM.v.2.0/data/UKB/obj2/frreg/Weight.REL.frreg",
    sep="\t",
)
# (disabled) df_frreg = obj2._matchType(df_frreg)

# Only the columns describing each relationship are needed downstream.
relations = df_frreg.loc[:, ["DOR", "relationship", "sex_type", "Erx"]]
relations

Unnamed: 0,DOR,relationship,sex_type,Erx
0,1,daughter-father,FM,0.707107
1,1,daughter-mother,FF,0.5
2,1,daughter-sister,FF,0.75
3,1,different-sex-sibling,FM,0.353553
4,1,son-brother,MM,0.5
5,1,son-father,MM,0.0
6,1,son-mother,FM,0.707107
7,2,daughter-father-brother,FM,0.353553
8,2,daughter-father-sister,FF,0.25
9,2,daughter-mother-brother,FM,0.176777


# Estimating X with vs. without the L2 penalty

In [6]:
# Settings for the ridge vs. no-ridge comparison

# The simulation draws from NumPy's global random state; seed it so that
# Restart & Run All reproduces the same numbers.
SEED = 0
np.random.seed(SEED)

As = [0.1, 0.2, 0.4, 0.6] #[0.6, 0.4, 0.2]   # values of A to simulate
Ss = [0.2, 0.1, 0.05] # DOR1, DOR2, DOR3      # looked up as Ss[DOR - 1]
se_frreg = 1e-2  # noise SD of the simulated slopes, also stored as their SE

# exponents tried for the ridge weight: alpha = (1 / eta) ** alp / n_rows
alps = [-2, -1, 0, 1, 2, 3, 4]
n_resample = 1
n_block = 1

In [7]:
res_cols = ["A_true", "X_true",
            "eta", "alp",
            "X_ridge", "X_noRidge"]
lmbd_cols = ["DOR", "rel_type", "sex_type", "rx", "slope", "se_slope"]

# Rows are gathered in plain lists and turned into DataFrames once;
# appending with ``df.loc[len(df)] = ...`` re-allocates the frame per row.
res_rows = []

for A in As:
    X = A / 20

    # 100 different shared environmental situations
    for idx_s in tqdm(range(100)):

        # simulate FR-reg coefficients, one per reference relationship
        lmbd_rows = []
        for dor, rel_type, sex_type, rx in relations.itertuples(index=False, name=None):

            # set true parameters
            ra = 0.5**dor
            rs = np.random.normal(0.5, 0.1)
            # rs = np.random.uniform(0, 1)
            S = Ss[dor-1]

            lmbd = _make_simul_reg(A, X, S, ra, rx, rs, n=1, noise_sd=se_frreg)

            lmbd_rows.append([dor, rel_type, sex_type, rx, lmbd, se_frreg])

        df_lmbds = pd.DataFrame(lmbd_rows, columns=lmbd_cols)

        # resampling for block-jackknife
        df_block = _resamplingFRregCoefficients(df_lmbds, n_resample, n_block)

        # regress out mean
        df_block = _regressOutMean(df_block)

        # estimate X with SE using resampled FR-reg coefficients
        for ib in range(n_block):
            cv_block = df_block[df_block["block"] == ib]

            # NO RIDGE: through-the-origin OLS of residual on tl
            ll_no_ridge = (smf.ols(formula="residual ~ 0 + tl",
                                   data=cv_block)
                           .fit())
            # same for every alp below, so look it up once
            x_no_ridge = ll_no_ridge.params["tl"]

            # RIDGE: penalty weight scaled by eta and the number of rows
            mean_eta = cv_block["eta"].mean()
            for alp in alps:
                alpha = ((1/mean_eta)**alp) / cv_block.shape[0]

                MODEL = _optToFindX(cv_block, alpha)
                res_rows.append([A, X, mean_eta, alp, MODEL.x[0], x_no_ridge])

df_res = pd.DataFrame(res_rows, columns=res_cols)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:09<00:00, 10.90it/s]
100%|██████████| 100/100 [00:08<00:00, 11.34it/s]
100%|██████████| 100/100 [00:08<00:00, 11.65it/s]
100%|██████████| 100/100 [00:08<00:00, 11.67it/s]


In [8]:
alp = 2


def _mean_and_interval(values):
    """Return (mean, 2.5th percentile, 97.5th percentile) of ``values``."""
    return np.mean(values), np.percentile(values, 2.5), np.percentile(values, 97.5)


for_plot = df_res[df_res["alp"] == alp]
xtrues = sorted(for_plot["X_true"].unique(), reverse=True)

for x_true in xtrues:
    tmp = for_plot[for_plot["X_true"] == x_true]
    print(f"{x_true:.3f}=========")

    # NOTE: the number shown in the ``med`` slot is the mean, not the median
    mean_nr, lower_nr, upper_nr = _mean_and_interval(tmp["X_noRidge"])
    mean_r, lower_r, upper_r = _mean_and_interval(tmp["X_ridge"])

    print("No Ridge : {med:.4f}({lower:.4f}, {upper:.4f})".
          format(med=mean_nr, lower=lower_nr, upper=upper_nr))
    print("Ridge : {med:.4f}({lower:.4f}, {upper:.4f})".
          format(med=mean_r, lower=lower_r, upper=upper_r))

    # how much narrower the ridge interval is than the no-ridge one, in percent
    percent_shrink = (1 - ((upper_r - lower_r) / (upper_nr - lower_nr))) * 100
    print(f"{percent_shrink:.3f}% CIs shrink")

No Ridge : 0.0326(-0.0121, 0.0858)
Ridge : 0.0298(-0.0110, 0.0783)
8.919% CIs shrink
No Ridge : 0.0198(-0.0279, 0.0702)
Ridge : 0.0169(-0.0238, 0.0598)
14.860% CIs shrink
No Ridge : 0.0026(-0.0400, 0.0458)
Ridge : 0.0019(-0.0288, 0.0329)
27.996% CIs shrink
No Ridge : 0.0122(-0.0360, 0.0653)
Ridge : 0.0071(-0.0221, 0.0369)
41.738% CIs shrink


In [9]:
# Write every simulated estimate to disk as a tab-separated file without the
# index column.
df_res.to_csv(
    "/data/jerrylee/pjt/BIGFAM.v.2.0/data/simulation/obj2/simulation.l2.tsv",
    sep="\t", index=False,
)