In [1]:
import pandas as pd
import numpy as np
import os
from multiprocessing import Pool
from tqdm import tqdm

In [2]:
# Cohort to analyze; the RDR step further down has branches for "UKB" and "GS".
cohort = "GS"

In [3]:
# Load the tab-separated table of relative pairs for the selected cohort.
pair_info_path = (
    f"/data/jerrylee/pjt/BIGFAM.v.2.0/data/{cohort}"
    "/relationship_information/relatives.formatted.info"
)
df_pair = pd.read_csv(pair_info_path, sep='\t')
df_pair

Unnamed: 0,DOR,rcode,relationship,volid,relid,volage,relage,volsex,relsex,Erx
0,1,SB,daughter-sister,18826,21244,50,36,F,F,0.750000
1,1,SB,different-sex-sibling,34422,23884,33,35,F,M,0.353553
2,1,PC,daughter-mother,79198,67531,66,44,F,F,0.500000
3,1,SB,daughter-sister,20399,67531,38,44,F,F,0.750000
4,1,SB,daughter-sister,67267,67531,43,44,F,F,0.750000
...,...,...,...,...,...,...,...,...,...,...
38001,3,HAV,son-mother-father-daughter,34570,78069,50,39,F,M,0.353553
38002,3,HAV,son-mother-father-daughter,97449,79361,40,29,F,M,0.353553
38003,3,HAV,son-mother-father-daughter,97449,5360,40,22,F,M,0.353553
38004,3,HAV,son-mother-father-daughter,15442,83545,35,29,F,M,0.353553


In [4]:
def get_unique_ids(df):
    """Return the distinct IDs found in the ``volid`` or ``relid`` column.

    The result is a list built from a set union, so each ID appears once and
    the order is not specified.
    """
    vol_ids = set(df["volid"])
    rel_ids = set(df["relid"])
    return list(vol_ids | rel_ids)

In [None]:
def process_phenotype(args):
    """Run RELT.py on one phenotype file, once per degree of relatedness (DOR).

    For each DOR in 1, 2, 3 the phenotype table is restricted to individuals
    that appear in a relative pair of that DOR, written to a temporary
    FID/IID/trait file, and passed to RELT.py together with the cohort's GRM
    files. The temporary file is always removed afterwards.

    Parameters
    ----------
    args : tuple
        ``(pheno_fn, pheno_path, df_pair, get_unique_ids)``, packed into one
        tuple so the function can be mapped over a multiprocessing pool.
        ``pheno_fn`` is a tab-separated file inside ``pheno_path`` with at
        least the columns ``eid`` and ``pheno``; ``df_pair`` needs the columns
        ``DOR``, ``volid`` and ``relid``; ``get_unique_ids`` turns a pair table
        into a collection of individual IDs.

    Raises
    ------
    ValueError
        If the notebook-level ``cohort`` is neither ``"UKB"`` nor ``"GS"``.

    Notes
    -----
    Reads the notebook-level ``cohort`` variable. Results for both cohorts are
    written below ``.../other-methods/RDR/{cohort}/``, which is the directory
    the merge step of this notebook lists. A non-zero exit status of RELT.py
    is reported on stdout but does not abort the remaining DORs.
    """
    # Local import: subprocess is not part of the notebook's import cell.
    import subprocess

    pheno_fn, pheno_path, df_pair, get_unique_ids = args

    # Fail fast on an unsupported cohort. With two independent `if`s the
    # command was either undefined or silently reused from the previous
    # iteration (the `rm` command) in that case.
    if cohort not in ("UKB", "GS"):
        raise ValueError(f"Unsupported cohort: {cohort!r} (expected 'UKB' or 'GS')")

    pheno = pheno_fn.split(".")[0]
    df_pheno = pd.read_csv(f"{pheno_path}/{pheno_fn}", sep='\t')

    # Per-cohort output directory for both cohorts, matching what the merge
    # step reads (the UKB branch used to write one level higher).
    out_dir = f"/data/jerrylee/pjt/BIGFAM.v.2.0/data/other-methods/RDR/{cohort}"
    os.makedirs(out_dir, exist_ok=True)

    for DOR in [1, 2, 3]:
        target_pairs = df_pair[df_pair["DOR"] == DOR]
        target_ids = get_unique_ids(target_pairs)

        # "eid" is selected twice on purpose: it fills both FID and IID.
        tmp = df_pheno[df_pheno["eid"].isin(target_ids)][["eid", "eid", "pheno"]].copy()
        tmp.columns = ["FID", "IID", "trait"]

        # UKB has one GRM per DOR; GS uses a single GRM for every DOR.
        if cohort == "UKB":
            grm_prefix = f"/data/jerrylee/data/{cohort}/grm_rel/DOR{DOR}_chrALL.grm"
        else:
            grm_prefix = f"/data/jerrylee/data/{cohort}/GRM/GS_GWAS.grm"

        tmp_fn = f"/data/jerrylee/pjt/BIGFAM.v.2.0/tmp/{pheno}.txt"
        try:
            # save phenotype
            tmp.to_csv(tmp_fn, sep='\t', index=False)

            # run RDR; an argument list avoids shell quoting problems with
            # unusual characters in phenotype file names
            completed = subprocess.run([
                "python",
                "/data/jerrylee/pjt/BIGFAM.v.0.1/BIGFAM/RELT.py",
                f"{grm_prefix}.id",
                f"{grm_prefix}.bin",
                tmp_fn,
                f"{out_dir}/{pheno}.{DOR}",
            ])
            if completed.returncode != 0:
                print(
                    f"RELT.py exited with status {completed.returncode} "
                    f"for {pheno} (DOR {DOR})"
                )
        finally:
            # remove phenotype file, even when one of the steps above failed
            try:
                os.remove(tmp_fn)
            except FileNotFoundError:
                pass

# Main driver: run process_phenotype for every phenotype file of the cohort.
pheno_path = f"/data/jerrylee/pjt/BIGFAM.v.0.1/data/{cohort}/phenotype"
pheno_fns = os.listdir(pheno_path)

# Number of worker processes; adjust to the machine.
num_processes = 20

# Build one argument tuple per phenotype file.
args = [(pheno_fn, pheno_path, df_pair, get_unique_ids) for pheno_fn in pheno_fns]

# The context manager tears the workers down even if the loop raises or the
# cell is interrupted; previously the pool was leaked in that case.
with Pool(processes=num_processes) as pool:
    # Run in parallel (tqdm shows progress as results arrive).
    for _ in tqdm(pool.imap_unordered(process_phenotype, args), total=len(args)):
        pass

    # Normal shutdown: stop accepting work and wait for the workers to exit.
    pool.close()
    pool.join()

Computing regression estimate
-2.3459480334165624e-07
0.014311912467275098
Computing regression estimate
-2.235911192025749e-07
0.014843655539399474
Computing regression estimate
-2.388709138267211e-07
0.014029225374098536
v: 0.378
Phenotypic variance estimate: 1.015
Estimating heritability
468 pairs with relatedness >0.05 excluded
Computing regression estimate
1.2127136629977334e-05v: 0.598
Phenotypic variance estimate: 0.992

Estimating heritability
0.0702541500156487
456 pairs with relatedness >0.05 excluded
Computing regression estimate
Computing regression estimate
1.2616005963222027e-05
0.07259377001161403
-2.503712948353354e-07
0.014650000121768285
v_g: 0.468
Estimating standard error
v_g: 0.092
Estimating standard error
v_g: 0.092 (0.407)v_g: 0.468 (0.422)
Estimating variance of phenotypic variance estimate

Estimating variance of phenotypic variance estimate
Estimating covariance between genetic variance and phenotypic variance estimate
Estimating covariance between genetic va

# Merge RDR results across cohorts and DORs

In [29]:
# Collect the h2 estimate and its standard error from every RDR result file.
# Rows are gathered in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and is quadratic inside a loop.
result_columns = ["cohort", "phenotype", "DOR", "estimate", "std_error"]
result_records = []

# The loop variable is deliberately not called `cohort`, so the notebook-level
# cohort setting is not overwritten by running this cell.
for cohort_name in ["UKB", "GS"]:
    result_dir = f"/data/jerrylee/pjt/BIGFAM.v.2.0/data/other-methods/RDR/{cohort_name}"

    for result_fn in os.listdir(result_dir):
        result_table = pd.read_csv(f"{result_dir}/{result_fn}", sep='\t')

        # The file name is split on "."; the first part is taken as the
        # phenotype and the second-to-last part as the DOR (kept as a string).
        name_parts = result_fn.split(".")
        dor = name_parts[-2]
        pheno = name_parts[0]

        # First row labelled 'h2' in the unnamed leading column.
        h2_row = result_table.loc[
            result_table['Unnamed: 0'] == 'h2', ['Estimate', 'S.E.']
        ]
        estimate, se = h2_row.values[0]

        result_records.append({
            "cohort": cohort_name,
            "phenotype": pheno,
            "DOR": dor,
            "estimate": estimate,
            "std_error": se
        })

# `columns=` keeps the column order and yields the expected empty frame when
# no result file was found.
df_results = pd.DataFrame(result_records, columns=result_columns)


In [33]:
# Write the merged estimates, ordered by cohort, phenotype and DOR.
df_results_sorted = df_results.sort_values(by=["cohort", "phenotype", "DOR"])
df_results_sorted = df_results_sorted.reset_index(drop=True)
df_results_sorted.to_csv(
    "/data/jerrylee/pjt/BIGFAM.v.2.0/data/other-methods/RDR/RDR.raw.tsv",
    sep='\t',
    index=False,
)

In [34]:
# Display the merged h2 estimates (one row per result file).
df_results

Unnamed: 0,cohort,phenotype,DOR,estimate,std_error
0,UKB,Leg_predicted_mass__right_,2,0.431320,0.096937
1,UKB,Impedance_of_leg__left_,2,0.294856,0.092966
2,UKB,Neutrophill_count,2,0.197015,0.085134
3,UKB,Potassium_in_urine,2,0.020760,0.077551
4,UKB,Neutrophill_percentage,3,0.152399,0.021078
...,...,...,...,...,...
433,GS,expected,1,0.608415,0.082247
434,GS,QRS_duration,1,0.017542,0.019264
435,GS,PR_interval,3,0.022232,0.123828
436,GS,ECG_Count,1,0.022207,0.016893


In [35]:
# Per-group function used for the IVW calculation
def calculate_ivw(group):
    """Combine the rows of ``group`` with inverse-variance weights.

    Each row is weighted by ``1 / std_error**2``. Returns a Series with the
    weighted mean of ``estimate`` (``ivw_estimate``), its standard error
    ``sqrt(1 / sum(weights))`` (``ivw_se``) and the number of rows that were
    combined (``n_dor``).
    """
    inv_var = 1 / (group['std_error'] ** 2)
    weight_total = sum(inv_var)
    weighted_total = sum(group['estimate'] * inv_var)

    summary = {
        'ivw_estimate': weighted_total / weight_total,
        'ivw_se': np.sqrt(1 / weight_total),
        'n_dor': len(group),  # number of rows combined, kept as a sanity check
    }
    return pd.Series(summary)

# Run the IVW calculation once per (cohort, phenotype) group
groups_by_trait = df_results.groupby(['cohort', 'phenotype'])
df_ivw = groups_by_trait.apply(calculate_ivw).reset_index()

In [36]:
# Display the inverse-variance-weighted estimates per cohort and phenotype.
df_ivw

Unnamed: 0,cohort,phenotype,ivw_estimate,ivw_se,n_dor
0,GS,Creat_mgdl,0.173045,0.039223,3.0
1,GS,Creatinine,0.178608,0.040295,3.0
2,GS,ECG_Count,0.019802,0.015527,3.0
3,GS,FEF,0.244723,0.037647,3.0
4,GS,FEV,0.370123,0.045040,3.0
...,...,...,...,...,...
141,UKB,Weight,0.291792,0.022395,3.0
142,UKB,White_blood_cell__leukocyte__count,0.145200,0.016690,3.0
143,UKB,Whole_body_fat-free_mass,0.394194,0.025403,3.0
144,UKB,Whole_body_fat_mass,0.257473,0.021227,3.0


In [37]:
# Write the IVW table without the n_dor bookkeeping column.
ivw_out_path = "/data/jerrylee/pjt/BIGFAM.v.2.0/data/other-methods/RDR.ivw.tsv"
df_ivw_export = df_ivw.drop(columns=["n_dor"])
df_ivw_export.to_csv(ivw_out_path, sep='\t', index=False)