In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
import semopy
from semopy import Model, Optimizer
from semopy.inspector import inspect
from tqdm import tqdm
import statsmodels.formula.api as smf

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# UKB

In [70]:
# Load the UKB relative-pair table (tab-separated) and display it.
ukb_pair_file = "/data/jerrylee/pjt/BIGFAM.v.2.0/data/UKB/relationship_information/relatives.formatted.info"
df_pair = pd.read_csv(ukb_pair_file, sep='\t')
df_pair

Unnamed: 0,DOR,rcode,relationship,volid,relid,volage,relage,volsex,relsex,Erx
0,1,SB,daughter-sister,1000094,3653174,65,64,F,F,0.75
1,1,,,1000220,1691267,64,64,F,F,
2,1,,,1000286,1571411,53,70,F,F,
3,1,,,1000295,1045127,60,41,F,F,
4,1,,,1000476,3599303,50,51,F,M,
...,...,...,...,...,...,...,...,...,...,...
81321,3,1C,son-(father-brother)/(father-sister)-daughter,6023723,4863061,62,64,F,M,0.00
81322,3,,,6024211,1209127,53,60,M,F,
81323,3,,,6024384,1854265,62,44,M,M,
81324,3,1C,son-(father-sister)/(father-sister)/(mother-br...,6024486,3148753,58,56,M,M,0.00


In [71]:
# Regression covariates derived from the degree of relatedness (DOR):
#   A  = 0.5 ** DOR
#   Sk = 1 when DOR == k, otherwise 0   (k = 1, 2, 3)
df_pair["A"] = 0.5 ** df_pair["DOR"]
for degree in (1, 2, 3):
    df_pair[f"S{degree}"] = (df_pair["DOR"] == degree).astype("int64")

df_pair


Unnamed: 0,DOR,rcode,relationship,volid,relid,volage,relage,volsex,relsex,Erx,A,S1,S2,S3
0,1,SB,daughter-sister,1000094,3653174,65,64,F,F,0.75,0.500,1,0,0
1,1,,,1000220,1691267,64,64,F,F,,0.500,1,0,0
2,1,,,1000286,1571411,53,70,F,F,,0.500,1,0,0
3,1,,,1000295,1045127,60,41,F,F,,0.500,1,0,0
4,1,,,1000476,3599303,50,51,F,M,,0.500,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81321,3,1C,son-(father-brother)/(father-sister)-daughter,6023723,4863061,62,64,F,M,0.00,0.125,0,0,1
81322,3,,,6024211,1209127,53,60,M,F,,0.125,0,0,1
81323,3,,,6024384,1854265,62,44,M,M,,0.125,0,0,1
81324,3,1C,son-(father-sister)/(father-sister)/(mother-br...,6024486,3148753,58,56,M,M,0.00,0.125,0,0,1


In [72]:
# read phenotype
# Directory holding the UKB phenotype tables that are read one file at a time.
pheno_path = "/data/jerrylee/pjt/BIGFAM.v.0.1/data/UKB/phenotype"
# os.listdir() returns entries in arbitrary order: sort so phenotypes are always
# processed in the same order, and skip hidden entries (e.g. .ipynb_checkpoints)
# that are not phenotype tables.
pheno_fns = sorted(fn for fn in os.listdir(pheno_path) if not fn.startswith("."))
len(pheno_fns)

106

In [92]:
# Bootstrap the regression  cp ~ 0 + A + S1 + S2 + S3  for every UKB phenotype,
# where cp is the product of the two relatives' standardised phenotypes.
# One file of bootstrap coefficients ({pheno}.sem) is written per phenotype.
N_BOOTSTRAP = 1000     # bootstrap replicates per phenotype
BOOTSTRAP_SEED = 42    # fixed seed so the resampling is reproducible

for pheno_fn in tqdm(pheno_fns):
    # phenotype name = part of the file name before the first "."
    pheno = pheno_fn.split(".")[0]
    tmp_pair = df_pair[["DOR", "volid", "relid", "A", "S1", "S2", "S3"]].copy()

    # load phenotypes and z-score them
    tmp_pheno = pd.read_csv(f"{pheno_path}/{pheno_fn}", sep='\t')
    tmp_pheno["pheno"] = (tmp_pheno["pheno"] - np.mean(tmp_pheno["pheno"])) / np.std(tmp_pheno["pheno"])
    # remove outliers (|z| >= 3)
    tmp_pheno = tmp_pheno[np.abs(tmp_pheno["pheno"]) < 3]
    tmp_pheno = tmp_pheno.astype({"eid": int, "pheno": float})

    # merge relationship information and phenotype; the inner joins keep only
    # pairs in which both individuals have a (non-outlier) phenotype value
    tmp_pair = tmp_pair.merge(
        tmp_pheno.rename(columns={"eid":"volid", "pheno":"volphen"}),
        on=["volid"])
    tmp_pair = tmp_pair.merge(
        tmp_pheno.rename(columns={"eid":"relid", "pheno":"relphen"}),
        on=["relid"])

    # cross-product of the pair's standardised phenotypes
    tmp_pair["cp"] = tmp_pair["volphen"] * tmp_pair["relphen"]

    # remove outliers (|cp| >= 3)
    tmp_pair = tmp_pair[np.abs(tmp_pair["cp"]) < 3]

    # Re-seed per phenotype so every output file is reproducible on its own,
    # independent of processing order or of an interrupted/resumed run.
    rng = np.random.default_rng(BOOTSTRAP_SEED)

    # NOTE(review): if DOR only takes the values 1-3, A = 0.5**DOR is an exact
    # linear combination of S1..S3 and the design is rank-deficient; the OLS fit
    # would then presumably return a pseudo-inverse solution - confirm intended.
    boot_params = []   # one dict of fitted coefficients per replicate
    for _ in range(N_BOOTSTRAP):
        df_resampled = tmp_pair.sample(len(tmp_pair), replace=True, random_state=rng)
        model = smf.ols("cp ~ 0 + A + S1 + S2 + S3", data=df_resampled).fit()
        boot_params.append(model.params.to_dict())

    # Build the result frame once instead of concatenating inside the loop
    # (repeated pd.concat copies the growing frame on every iteration).
    df_res = pd.DataFrame(boot_params)

    df_res.to_csv(
        f"/data/jerrylee/pjt/BIGFAM.v.2.0/data/other-methods/SEM/UKB/{pheno}.sem",
        sep='\t',
        index=False
    )

  2%|▏         | 2/106 [01:39<1:26:31, 49.92s/it]


KeyboardInterrupt: 

# GS

In [None]:
# Load the GS relative-pair table (tab-separated).
gs_pair_file = "/data/jerrylee/pjt/BIGFAM.v.2.0/data/GS/relationship_information/relatives.formatted.info"
df_pair = pd.read_csv(gs_pair_file, sep='\t')

# Regression covariates derived from the degree of relatedness (DOR):
#   A  = 0.5 ** DOR
#   Sk = 1 when DOR == k, otherwise 0   (k = 1, 2, 3)
df_pair["A"] = 0.5 ** df_pair["DOR"]
for degree in (1, 2, 3):
    df_pair[f"S{degree}"] = (df_pair["DOR"] == degree).astype("int64")

df_pair


In [32]:
# read phenotype
# Directory holding the GS phenotype tables that are read one file at a time.
pheno_path = "/data/jerrylee/pjt/BIGFAM.v.0.1/data/GS/phenotype"
# os.listdir() returns entries in arbitrary order: sort so phenotypes are always
# processed in the same order, and skip hidden entries (e.g. .ipynb_checkpoints)
# that are not phenotype tables.
pheno_fns = sorted(fn for fn in os.listdir(pheno_path) if not fn.startswith("."))
len(pheno_fns)

40

In [None]:
# Bootstrap the regression  cp ~ 0 + A + S1 + S2 + S3  for every GS phenotype,
# where cp is the product of the two relatives' standardised phenotypes.
# One file of bootstrap coefficients ({pheno}.sem) is written per phenotype.
N_BOOTSTRAP = 1000     # bootstrap replicates per phenotype
BOOTSTRAP_SEED = 42    # fixed seed so the resampling is reproducible

for pheno_fn in tqdm(pheno_fns):
    # phenotype name = part of the file name before the first "."
    pheno = pheno_fn.split(".")[0]
    tmp_pair = df_pair[["DOR", "volid", "relid", "A", "S1", "S2", "S3"]].copy()

    # load phenotypes and z-score them
    tmp_pheno = pd.read_csv(f"{pheno_path}/{pheno_fn}", sep='\t')
    tmp_pheno["pheno"] = (tmp_pheno["pheno"] - np.mean(tmp_pheno["pheno"])) / np.std(tmp_pheno["pheno"])
    # remove outliers (|z| >= 3)
    tmp_pheno = tmp_pheno[np.abs(tmp_pheno["pheno"]) < 3]
    tmp_pheno = tmp_pheno.astype({"eid": int, "pheno": float})

    # merge relationship information and phenotype; the inner joins keep only
    # pairs in which both individuals have a (non-outlier) phenotype value
    tmp_pair = tmp_pair.merge(
        tmp_pheno.rename(columns={"eid":"volid", "pheno":"volphen"}),
        on=["volid"])
    tmp_pair = tmp_pair.merge(
        tmp_pheno.rename(columns={"eid":"relid", "pheno":"relphen"}),
        on=["relid"])

    # cross-product of the pair's standardised phenotypes
    tmp_pair["cp"] = tmp_pair["volphen"] * tmp_pair["relphen"]

    # remove outliers (|cp| >= 3)
    tmp_pair = tmp_pair[np.abs(tmp_pair["cp"]) < 3]

    # Re-seed per phenotype so every output file is reproducible on its own,
    # independent of processing order or of an interrupted/resumed run.
    rng = np.random.default_rng(BOOTSTRAP_SEED)

    # NOTE(review): if DOR only takes the values 1-3, A = 0.5**DOR is an exact
    # linear combination of S1..S3 and the design is rank-deficient; the OLS fit
    # would then presumably return a pseudo-inverse solution - confirm intended.
    boot_params = []   # one dict of fitted coefficients per replicate
    for _ in range(N_BOOTSTRAP):
        df_resampled = tmp_pair.sample(len(tmp_pair), replace=True, random_state=rng)
        model = smf.ols("cp ~ 0 + A + S1 + S2 + S3", data=df_resampled).fit()
        boot_params.append(model.params.to_dict())

    # Build the result frame once instead of concatenating inside the loop
    # (repeated pd.concat copies the growing frame on every iteration).
    df_res = pd.DataFrame(boot_params)

    df_res.to_csv(
        f"/data/jerrylee/pjt/BIGFAM.v.2.0/data/other-methods/SEM/GS/{pheno}.sem",
        sep='\t',
        index=False
    )

# Merge: summarise bootstrap estimates across cohorts

In [3]:
# Cohort sub-directories (under `path`) whose per-phenotype result files are summarised.
cohorts = ["UKB", "GS"]
# Root directory that contains one sub-directory per cohort.
path = "/data/jerrylee/pjt/BIGFAM.v.2.0/data/other-methods/SEM"

In [14]:
# Summarise every phenotype's bootstrap distribution: for each coefficient,
# record the median and the 2.5% / 97.5% quantiles of the bootstrap replicates.
summary_rows = []   # one dict per (cohort, phenotype, parameter)

for cohort in cohorts:
    path_cohort = f"{path}/{cohort}"
    # Keep only the per-phenotype bootstrap tables (*.sem); sort because
    # os.listdir() returns entries in arbitrary order.
    fns = sorted(fn for fn in os.listdir(path_cohort) if fn.endswith(".sem"))

    for fn in tqdm(fns):
        tmp = pd.read_csv(f"{path_cohort}/{fn}", sep='\t')
        # phenotype name = part of the file name before the first "."
        pheno = fn.split(".")[0]

        for param in ["A", "S1", "S2", "S3"]:
            summary_rows.append({
                "cohort": cohort,
                "pheno": pheno,
                "param": param,
                "median": tmp[param].median(),
                "lower(2.5%)": tmp[param].quantile(0.025),
                "upper(97.5%)": tmp[param].quantile(0.975),
            })

# Build the frame once (appending with df.loc[len(df)] re-allocates on every
# row); explicit columns keep the layout stable even when no files are found.
df = pd.DataFrame(summary_rows, columns=[
    "cohort", "pheno", "param", "median", "lower(2.5%)", "upper(97.5%)"
])
            

100%|██████████| 106/106 [00:01<00:00, 53.09it/s]
100%|██████████| 40/40 [00:00<00:00, 52.98it/s]


In [18]:
# Persist the combined summary as a tab-separated file (row index omitted).
summary_file = f"{path}/SEM.tsv"
df.to_csv(summary_file, sep='\t', index=False)

In [10]:
# Display the combined summary table.
df