In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import sys; sys.path.append("/data/jerrylee/pjt/BIGFAM.v.2.0")
from src import frreg, tools
import importlib

In [2]:
cohort = "GS"

# Step 1. Load relatives phenotype data

In [3]:
# parameters
info_fn = f"/data/jerrylee/pjt/BIGFAM.v.0.1/data/{cohort}/relative_information/relatives.formatted.info"

In [4]:
# relative information format
df_pair = pd.read_csv(info_fn, sep='\t')
df_pair.head()

Unnamed: 0,DOR,rcode,relationship,volid,relid,volage,relage,volsex,relsex,Erx
0,1,SB,daughter-sister,18826,21244,50,36,F,F,0.75
1,1,SB,different-sex-sibling,34422,23884,33,35,F,M,0.353553
2,1,PC,daughter-mother,79198,67531,66,44,F,F,0.5
3,1,SB,daughter-sister,20399,67531,38,44,F,F,0.75
4,1,SB,daughter-sister,67267,67531,43,44,F,F,0.75


# Step 2. Do FR-reg

In [5]:
pheno_path = f"/data/jerrylee/pjt/BIGFAM.v.2.0/data/{cohort}/phenotype"
frreg_path = f"/data/jerrylee/pjt/BIGFAM.v.2.0/data/{cohort}/frreg"

In [9]:
pheno_fns = os.listdir(pheno_path)
len(pheno_fns)

40

## Step 2.1. DOR level

In [10]:
warning_dicts = {}

for fn in tqdm(pheno_fns):
    pheno = fn.split(".")[0]
    pheno_fn = f"{pheno_path}/{fn}"
    df_pheno = pd.read_csv(pheno_fn, sep="\t")
    df_pheno = frreg.remove_outliers(df_pheno, "pheno")
    
    # merge pheno with relatives
    df_mrg = frreg.merge_pheno_info(df_pheno, df_pair)
    try:
        df_frreg, msgs = frreg.familial_relationship_regression_DOR(
            df_mrg.drop(columns=["rcode", "relationship", "Erx"]),
            n_bootstrap=10
        )
        
        if len(msgs) > 0:
            warning_dicts[pheno] = msgs
            continue
        
        df_frreg.to_csv(
            f"{frreg_path}/{pheno}.DOR.frreg",
            sep='\t',
            index=False
        )
    except:
        continue
    
    break

100%|██████████| 10/10 [00:07<00:00,  1.42it/s]
100%|██████████| 10/10 [00:02<00:00,  4.10it/s]
100%|██████████| 10/10 [00:10<00:00,  1.04s/it]
  0%|          | 0/106 [00:44<?, ?it/s]


## Step 2.2. REL level

In [7]:
warning_dicts = {}

for fn in tqdm(pheno_fns):
    pheno = fn.split(".")[0]
    pheno_fn = f"{pheno_path}/{fn}"
    df_pheno = pd.read_csv(pheno_fn, sep="\t")
    df_pheno = frreg.remove_outliers(df_pheno, "pheno")
    
    # merge pheno with relatives
    df_mrg = frreg.merge_pheno_info(df_pheno, df_pair)
    
    try:
        df_frreg, msgs = frreg.familial_relationship_regression_REL(
            df_mrg,
            n_bootstrap=10
            )
        
        df_frreg_positive = (df_frreg[df_frreg["slope"] > 0]
                             .reset_index(drop=True))
        
        if len(msgs) > 0:
            print(fn, msgs)
            warning_dicts[pheno] = msgs
        
        if len(df_frreg_positive) < 10:
            print(fn, "not sufficient data")
            continue
    
        df_frreg_positive.to_csv(
            f"{frreg_path}/{pheno}.REL.frreg",
            sep='\t',
            index=False
        )
    except:
        continue

100%|██████████| 10/10 [00:02<00:00,  3.73it/s]
100%|██████████| 10/10 [00:02<00:00,  3.68it/s]
100%|██████████| 10/10 [00:02<00:00,  3.72it/s]
100%|██████████| 10/10 [00:02<00:00,  3.69it/s]
100%|██████████| 10/10 [00:02<00:00,  3.70it/s]
100%|██████████| 10/10 [00:02<00:00,  3.69it/s]
100%|██████████| 10/10 [00:02<00:00,  3.67it/s]
100%|██████████| 10/10 [00:01<00:00,  6.62it/s]
100%|██████████| 10/10 [00:01<00:00,  7.91it/s]
100%|██████████| 10/10 [00:01<00:00,  7.89it/s]
100%|██████████| 10/10 [00:01<00:00,  7.89it/s]
100%|██████████| 10/10 [00:01<00:00,  7.86it/s]
100%|██████████| 10/10 [00:01<00:00,  7.91it/s]
100%|██████████| 10/10 [00:01<00:00,  7.97it/s]
100%|██████████| 10/10 [00:01<00:00,  7.88it/s]
100%|██████████| 10/10 [00:01<00:00,  7.92it/s]
100%|██████████| 10/10 [00:01<00:00,  7.87it/s]
100%|██████████| 10/10 [00:01<00:00,  7.90it/s]
100%|██████████| 10/10 [00:00<00:00, 26.69it/s]
100%|██████████| 10/10 [00:00<00:00, 26.12it/s]
100%|██████████| 10/10 [00:00<00:00, 26.

  0%|          | 0/10 [00:00<?, ?it/s]
  8%|▊         | 3/40 [01:26<13:59, 22.70s/it]

# Step 3. PO-SIB


In [10]:
output_path = f"/data/jerrylee/pjt/BIGFAM.v.2.0/data/{cohort}/po-sib"
output_path

'/data/jerrylee/pjt/BIGFAM.v.2.0/data/GS/po-sib'

In [13]:
subgroups = ["PC", "SB"]

for fn in tqdm(pheno_fns):
    pheno = fn.split(".")[0]
    pheno_fn = f"{pheno_path}/{fn}"
    df_pheno = pd.read_csv(pheno_fn, sep="\t")
    df_pheno = frreg.remove_outliers(df_pheno, "pheno")
    
    for subgroup in subgroups:
        # DOR=1인 데이터 중 rcode="SB"만 남기고, 나머지 DOR은 유지
        df_pair_filtered = pd.concat([
            # DOR=1이면서 rcode="SB"인 데이터
            df_pair[(df_pair["DOR"] == 1) & (df_pair["rcode"] == subgroup)],
            # DOR이 1이 아닌 모든 데이터
            df_pair[df_pair["DOR"] != 1]
        ]).reset_index(drop=True)

        # merge pheno with relatives
        df_mrg = frreg.merge_pheno_info(df_pheno, df_pair_filtered)
        break
        try:
            df_frreg, msgs = frreg.familial_relationship_regression_DOR(
                df_mrg.drop(columns=["rcode", "relationship", "Erx"]),
                n_bootstrap=100
            )
            
            df_frreg.to_csv(
                f"{output_path}/{pheno}.{subgroup}.frreg",
                sep='\t',
                index=False
            )
                
        except:
            continue
    break

  0%|          | 0/40 [00:00<?, ?it/s]


In [17]:
df_pair.loc[df_pair["DOR"] == 1]["rcode"].unique()

array(['SB', 'PC'], dtype=object)

In [27]:
df_frreg

Unnamed: 0,DOR,slope,se,p,n
0,1,0.18923,0.008803,0.0,8724
1,2,0.092695,0.007556,0.0,17450
2,3,0.04582,0.003846,0.0,82132
