In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from tqdm import tqdm

In [2]:
def load_ldsc(fn, thred=1e-4):
    """Read a tab-separated LDSC table and pick out rows above a lower-bound cutoff.

    The columns ``ldsc``, ``lower_ldsc`` and ``upper_ldsc`` are renamed to
    ``LDSC``, ``lower_LDSC`` and ``upper_LDSC``; every other column is kept
    exactly as read.

    Parameters
    ----------
    fn : path or file-like
        Passed straight to ``pd.read_csv`` with ``sep="\\t"``.
    thred : float, default 1e-4
        A row is kept in the second result when ``lower_LDSC`` is strictly
        greater than this value.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The full renamed table, and the rows passing the cutoff re-indexed
        from 0.
    """
    upper_case_names = {
        "ldsc": "LDSC",
        "lower_ldsc": "lower_LDSC",
        "upper_ldsc": "upper_LDSC",
    }
    estimates = pd.read_csv(fn, sep="\t").rename(columns=upper_case_names)

    above_cutoff = estimates["lower_LDSC"] > thred
    significant = estimates.loc[above_cutoff].reset_index(drop=True)
    return estimates, significant

def load_ldpred(fn, thred=1e-4):
    """Read a tab-separated LDpred2 table and pick out rows above a lower-bound cutoff.

    No columns are renamed; the file is expected to already carry a
    ``lower_LDpred2`` column.

    Parameters
    ----------
    fn : path or file-like
        Passed straight to ``pd.read_csv`` with ``sep="\\t"``.
    thred : float, default 1e-4
        A row is kept in the second result when ``lower_LDpred2`` is strictly
        greater than this value.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The table as read, and the rows passing the cutoff re-indexed from 0.
    """
    estimates = pd.read_csv(fn, sep="\t")
    above_cutoff = estimates["lower_LDpred2"] > thred
    return estimates, estimates.loc[above_cutoff].reset_index(drop=True)

def load_rdr(fn, thred=1e-4):
    """Read a tab-separated RDR table, attach interval bounds, and filter by lower bound.

    Bounds are computed as ``estimate -/+ 1.96 * std_error``. The result keeps
    only five columns, in this order: ``cohort``, ``pheno`` (from
    ``phenotype``), ``RDR`` (from ``estimate``), ``lower_RDR``, ``upper_RDR``.

    Parameters
    ----------
    fn : path or file-like
        Passed straight to ``pd.read_csv`` with ``sep="\\t"``.
    thred : float, default 1e-4
        A row is kept in the second result when ``lower_RDR`` is strictly
        greater than this value.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The five-column table, and the rows passing the cutoff re-indexed
        from 0.
    """
    raw = pd.read_csv(fn, sep="\t")
    half_width = 1.96 * raw["std_error"]

    with_bounds = raw.assign(
        lower_RDR=raw["estimate"] - half_width,
        upper_RDR=raw["estimate"] + half_width,
    )
    kept_columns = ["cohort", "phenotype", "estimate", "lower_RDR", "upper_RDR"]
    table = with_bounds[kept_columns].rename(
        columns={"phenotype": "pheno", "estimate": "RDR"}
    )

    significant = table.loc[table["lower_RDR"] > thred].reset_index(drop=True)
    return table, significant

def _he_method_table(df, method, thred):
    """Build the (all rows, significant rows) pair for one ``Method`` label.

    The estimate column is renamed to ``method`` and the bounds are stored in
    ``lower_<method>`` / ``upper_<method>``.
    """
    lower_col = f"lower_{method}"
    upper_col = f"upper_{method}"

    # reset_index returns a fresh frame, so the column writes below do not
    # touch the caller's DataFrame.
    sub = df[df["Method"] == method].reset_index(drop=True)
    sub[lower_col] = sub["estimate"] - 1.96 * sub["std_error"]
    sub[upper_col] = sub["estimate"] + 1.96 * sub["std_error"]
    sub = (sub
           .rename(columns={"phenotype": "pheno", "estimate": method})
           .drop(columns=["Method", "std_error"]))

    sub_sig = sub[sub[lower_col] > thred].reset_index(drop=True)
    return sub, sub_sig


def load_he(fn, thred=1e-4):
    """Read a tab-separated HE table and lay HE-CP and HE-SD side by side.

    Rows are split on the ``Method`` column (``"HE-CP"`` / ``"HE-SD"``). For
    each method the bounds ``estimate -/+ 1.96 * std_error`` are added, then
    the two per-method tables are outer-merged on ``cohort`` and ``pheno``.

    Parameters
    ----------
    fn : path or file-like
        Passed straight to ``pd.read_csv`` with ``sep="\\t"``.
    thred : float, default 1e-4
        A method's row counts as significant when its lower bound is strictly
        greater than this value.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The merged table of all rows, and the merged table of significant
        rows. Because the significant merge is also an outer join, a
        cohort/phenotype pair that passes for only one method appears with
        NaN in the other method's columns.
    """
    df = pd.read_csv(fn, sep='\t')

    df_cp, df_cp_sig = _he_method_table(df, "HE-CP", thred)
    df_sd, df_sd_sig = _he_method_table(df, "HE-SD", thred)

    keys = ["cohort", "pheno"]
    df_all = pd.merge(df_cp, df_sd, on=keys, how="outer")
    df_sig = (pd.merge(df_cp_sig, df_sd_sig, on=keys, how="outer")
                .reset_index(drop=True))
    return df_all, df_sig

def load_bigfam(fn, thred=1e-4):
    """Read a tab-separated BIGFAM table and keep its ``V(g)`` rows.

    Only rows whose ``param`` equals ``"V(g)"`` are retained. ``median``,
    ``lower(2.5%)`` and ``upper(97.5%)`` are renamed to ``BIGFAM``,
    ``lower_BIGFAM`` and ``upper_BIGFAM``, and the ``param`` column is dropped.

    Parameters
    ----------
    fn : path or file-like
        Passed straight to ``pd.read_csv`` with ``sep="\\t"``.
    thred : float, default 1e-4
        A row is kept in the second result when ``lower_BIGFAM`` is strictly
        greater than this value.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The ``V(g)`` table, and the rows passing the cutoff; both are
        re-indexed from 0.
    """
    raw = pd.read_csv(fn, sep="\t")

    new_names = {
        "median": "BIGFAM",
        "lower(2.5%)": "lower_BIGFAM",
        "upper(97.5%)": "upper_BIGFAM",
    }
    vg_rows = raw.loc[raw["param"] == "V(g)"]
    vg = vg_rows.rename(columns=new_names)
    vg = vg.drop(columns=["param"]).reset_index(drop=True)

    above_cutoff = vg["lower_BIGFAM"] > thred
    return vg, vg.loc[above_cutoff].reset_index(drop=True)

def _bksk_param_table(df, param, label, thred):
    """Build the (all rows, significant rows) pair for one ``param`` value.

    ``median`` / ``lower(2.5%)`` / ``upper(97.5%)`` are renamed to ``label`` /
    ``lower_<label>`` / ``upper_<label>`` and the ``param`` column is dropped.
    """
    lower_col = f"lower_{label}"
    sub = (df[df["param"] == param]
           .rename(columns={
               "median": label,
               "lower(2.5%)": lower_col,
               "upper(97.5%)": f"upper_{label}"})
           .drop(columns=["param"])
           .reset_index(drop=True))
    sub_sig = sub[sub[lower_col] > thred].reset_index(drop=True)
    return sub, sub_sig


def load_bksk(fn, thred=1e-4):
    """Read a tab-separated GREML-bksk table and lay its two estimates side by side.

    Rows with ``param == "h2_snp"`` become the ``GCTA-snp`` columns and rows
    with ``param == "h2_ped"`` become the ``GCTA-ped`` columns; the two tables
    are then outer-merged on ``cohort`` and ``pheno``.

    Parameters
    ----------
    fn : path or file-like
        Passed straight to ``pd.read_csv`` with ``sep="\\t"``.
    thred : float, default 1e-4
        A parameter's row counts as significant when its lower bound is
        strictly greater than this value.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The merged table of all rows, and the merged table of significant
        rows. Because the significant merge is also an outer join, a
        cohort/phenotype pair that passes for only one parameter appears with
        NaN in the other parameter's columns.
    """
    df = pd.read_csv(fn, sep='\t')

    df_snp, df_snp_sig = _bksk_param_table(df, "h2_snp", "GCTA-snp", thred)
    df_ped, df_ped_sig = _bksk_param_table(df, "h2_ped", "GCTA-ped", thred)

    keys = ["cohort", "pheno"]
    df_all = pd.merge(df_snp, df_ped, on=keys, how="outer")
    df_sig = pd.merge(df_snp_sig, df_ped_sig, on=keys, how="outer")
    return df_all, df_sig

def load_sem(fn, thred=1e-4):
    """Read a tab-separated SEM table and keep its ``A`` rows.

    Only rows whose ``param`` equals ``"A"`` are retained. ``median``,
    ``lower(2.5%)`` and ``upper(97.5%)`` are renamed to ``SEM``,
    ``lower_SEM`` and ``upper_SEM``, and the ``param`` column is dropped.

    Parameters
    ----------
    fn : path or file-like
        Passed straight to ``pd.read_csv`` with ``sep="\\t"``.
    thred : float, default 1e-4
        A row is kept in the second result when ``lower_SEM`` is strictly
        greater than this value.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The ``A`` table, and the rows passing the cutoff; both are re-indexed
        from 0.
    """
    raw = pd.read_csv(fn, sep="\t")

    new_names = {
        "median": "SEM",
        "lower(2.5%)": "lower_SEM",
        "upper(97.5%)": "upper_SEM",
    }
    sem = raw.loc[raw["param"] == "A"].reset_index(drop=True)
    sem = sem.rename(columns=new_names)
    sem = sem.drop(columns=["param"])

    above_cutoff = sem["lower_SEM"] > thred
    return sem, sem.loc[above_cutoff].reset_index(drop=True)


In [3]:
# Load results
# Every per-method result file lives under this one directory; naming it once
# means relocating the data is a single-line change.
OTHER_METHODS_DIR = "/data/jerrylee/pjt/BIGFAM.v.2.0/data/other-methods"

# Each loader returns a pair: (all rows, rows whose lower bound exceeds the
# loader's default cutoff).
df_ldsc, df_ldsc_sig = load_ldsc(f"{OTHER_METHODS_DIR}/LDSC/UKB.ldsc.tsv")

df_ldpred, df_ldpred_sig = load_ldpred(f"{OTHER_METHODS_DIR}/LDpred2/UKB.LDpred2.tsv")

df_rdr, df_rdr_sig = load_rdr(f"{OTHER_METHODS_DIR}/RDR/RDR.ivw.tsv")

df_he, df_he_sig = load_he(f"{OTHER_METHODS_DIR}/HE/HE.ivw.tsv")

# NOTE(review): this file has a .csv extension but load_bigfam parses it with
# sep='\t' -- confirm the file really is tab-delimited.
df_bigfam, df_bigfam_sig = load_bigfam(f"{OTHER_METHODS_DIR}/BIGFAM/BIGFAM.csv")

df_bksk, df_bksk_sig = load_bksk(f"{OTHER_METHODS_DIR}/GREML/bksk/GREML-bksk.tsv")

df_sem, df_sem_sig = load_sem(f"{OTHER_METHODS_DIR}/SEM/SEM.tsv")

In [4]:
def merge_methods(dfs: list[pd.DataFrame]):
    """Outer-join a sequence of per-method tables on ``cohort`` and ``pheno``.

    The tables are folded left to right starting from ``dfs[0]``, so the
    non-key columns appear in the order the tables were given. A
    cohort/phenotype pair missing from a table gets NaN in that table's
    columns.

    Parameters
    ----------
    dfs : list of DataFrame
        Tables that each carry ``cohort`` and ``pheno`` columns. Must contain
        at least one table; an empty list raises ``IndexError``.

    Returns
    -------
    DataFrame
        The merged table. With a single input, that input is returned as-is.
    """
    combined = dfs[0]
    for position in range(1, len(dfs)):
        combined = combined.merge(dfs[position], on=["cohort", "pheno"], how="outer")
    return combined
    
# All rows from every method, outer-joined on cohort/pheno.
df_mrg = merge_methods([
    df_ldsc,
    df_ldpred,
    df_rdr,
    df_he,
    df_bigfam,
    df_bksk,
    df_sem,
])

# Same join, but built from each loader's lower-bound-filtered table, so a
# method that did not pass the cutoff for a cohort/pheno pair shows up as NaN.
df_mrg_sig = merge_methods([
    df_ldsc_sig,
    df_ldpred_sig,
    df_rdr_sig,
    df_he_sig,
    df_bigfam_sig,
    df_bksk_sig,
    df_sem_sig,
])

In [5]:
# Point-estimate columns: start from every column that is not an interval
# bound (lower_* / upper_*), then take out the two join keys.
methods = [
    name
    for name in df_mrg.columns
    if not name.startswith(("lower_", "upper_"))
]
methods.remove("cohort")
methods.remove("pheno")
methods


['LDSC',
 'LDpred2',
 'RDR',
 'HE-CP',
 'HE-SD',
 'BIGFAM',
 'GCTA-snp',
 'GCTA-ped',
 'SEM']

# save merged dataframe

In [8]:
# Persist the unfiltered merged table (all methods, all cohort/pheno pairs)
# as a tab-separated file without the row index.
df_mrg.to_csv(
    path_or_buf="/data/jerrylee/pjt/BIGFAM.v.2.0/data/other-methods/all.raw.tsv",
    sep="\t",
    index=False,
)

In [9]:
# Rows of the significance-filtered merge where cohort == "GS".
# The LDSC and LDpred2 columns are removed before dropna() so that they do not
# take part in the completeness check -- presumably those methods were only run
# on UKB (their input files are named UKB.*); confirm.
# dropna() then keeps only rows with no missing value in any remaining column.
df_gs = (
    df_mrg_sig.loc[df_mrg_sig["cohort"] == "GS"]
    .drop(columns=[
        "LDSC", "lower_LDSC", "upper_LDSC",
        "LDpred2", "lower_LDpred2", "upper_LDpred2",
    ])
    .dropna()
)

df_gs.to_csv(
    path_or_buf="/data/jerrylee/pjt/BIGFAM.v.2.0/data/other-methods/GS.sig.tsv",
    sep="\t",
    index=False,
)

In [10]:
# Restrict df_gs to the point-estimate columns it still has, then display the
# pairwise correlation matrix (DataFrame.corr defaults to Pearson), rounded to
# three decimals.
df_gs_mean = df_gs[[name for name in methods if name in df_gs.columns]]
df_gs_mean.corr().round(3)

Unnamed: 0,RDR,HE-CP,HE-SD,BIGFAM,GCTA-snp,GCTA-ped,SEM
RDR,1.0,0.986,0.987,0.967,0.993,0.975,0.975
HE-CP,0.986,1.0,1.0,0.986,0.983,0.998,0.995
HE-SD,0.987,1.0,1.0,0.986,0.984,0.997,0.995
BIGFAM,0.967,0.986,0.986,1.0,0.98,0.991,0.994
GCTA-snp,0.993,0.983,0.984,0.98,1.0,0.976,0.983
GCTA-ped,0.975,0.998,0.997,0.991,0.976,1.0,0.997
SEM,0.975,0.995,0.995,0.994,0.983,0.997,1.0


In [11]:
# Rows of the significance-filtered merge where cohort == "UKB", keeping only
# those with no missing value in any column (all methods retained here).
df_ukb = df_mrg_sig.loc[df_mrg_sig["cohort"] == "UKB"].dropna()

df_ukb.to_csv(
    path_or_buf="/data/jerrylee/pjt/BIGFAM.v.2.0/data/other-methods/UKB.sig.tsv",
    sep="\t",
    index=False,
)

In [12]:
# Restrict df_ukb to the point-estimate columns it has, then display the
# pairwise correlation matrix (DataFrame.corr defaults to Pearson), rounded to
# three decimals.
df_ukb_mean = df_ukb[[name for name in methods if name in df_ukb.columns]]
df_ukb_mean.corr().round(3)

Unnamed: 0,LDSC,LDpred2,RDR,HE-CP,HE-SD,BIGFAM,GCTA-snp,GCTA-ped,SEM
LDSC,1.0,0.976,0.82,0.858,0.855,0.866,0.879,0.825,0.742
LDpred2,0.976,1.0,0.854,0.884,0.875,0.894,0.888,0.832,0.762
RDR,0.82,0.854,1.0,0.97,0.935,0.859,0.927,0.774,0.849
HE-CP,0.858,0.884,0.97,1.0,0.989,0.867,0.958,0.902,0.861
HE-SD,0.855,0.875,0.935,0.989,1.0,0.848,0.958,0.939,0.831
BIGFAM,0.866,0.894,0.859,0.867,0.848,1.0,0.852,0.771,0.781
GCTA-snp,0.879,0.888,0.927,0.958,0.958,0.852,1.0,0.883,0.859
GCTA-ped,0.825,0.832,0.774,0.902,0.939,0.771,0.883,1.0,0.753
SEM,0.742,0.762,0.849,0.861,0.831,0.781,0.859,0.753,1.0
