In [1]:
# GOAL : make sample info file with relative information

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
# from pandas_plink import read_grm
import sys; sys.path.append("/data/jerrylee/pjt/BIGFAM.v.0.1")
from BIGFAM.pedigree import ped
import BIGFAM.tools as tools
import importlib

# UKB

## Step 1. Load sample info

In [3]:
# [Arguments]
basic_info_path = '/data/jerrylee/data/UKB/pheno/pheno_basic' 
# basic_info includes [sex, age, ethnicitiy, kinship, PCA]
eur_info_fn = "/data02/jerrylee/xChrom/data/info_family/sample.eur.info"
kinship_fn = "/data01/UKB202005/data/genotype/ukbgene_rel/ukb59688_rel_s488249.dat"
fam_fn = '/data/jerrylee/data/UKB/geno/ukb_chr{}.qc.fam'

In [4]:
# load files
df_info_raw = pd.read_csv(eur_info_fn, sep='\t')
df_kinship_raw = pd.read_csv(kinship_fn, sep='\s+')

# make kinship long (concat with flipped dataframe)
df_kinship_flip = df_kinship_raw.copy()
df_kinship_flip.columns = ["ID2" ,"ID1", "HetHet", "IBS0", "Kinship"]
df_kinship = pd.concat([df_kinship_raw, df_kinship_flip]).reset_index(drop=True)
df_kinship.columns = ["eid", "eid_rel", "HetHet", "IBS0", "Kinship"]

# merge with EUR info
df_info = df_info_raw[["eid", "age", "sex"]]
df_tmp = pd.merge(df_info, df_kinship, on=["eid"])
df_info_tmp = df_info.copy()
df_info_tmp.columns = ["eid_rel", "age_rel", "sex_rel"]
df_merged = pd.merge(df_tmp, df_info_tmp, on=["eid_rel"])
df_merged["kinship"] = 2 * df_merged["Kinship"]

n_unique_eid = len(df_merged["eid"].unique())
print(n_unique_eid)
df_merged

142217


Unnamed: 0,eid,age,sex,eid_rel,HetHet,IBS0,Kinship,age_rel,sex_rel,kinship
0,1000031,52.0,0.0,4977365,0.046,0.0139,0.0684,45.0,0.0,0.1368
1,1000073,54.0,1.0,1468150,0.047,0.0131,0.0753,60.0,0.0,0.1506
2,1000094,65.0,0.0,3653174,0.077,0.0052,0.2471,64.0,0.0,0.4942
3,1000122,68.0,0.0,1390862,0.051,0.0094,0.1215,63.0,0.0,0.2430
4,1000174,69.0,0.0,3765836,0.052,0.0092,0.1252,45.0,0.0,0.2504
...,...,...,...,...,...,...,...,...,...,...
205987,6024822,41.0,1.0,3535828,0.044,0.0150,0.0504,50.0,1.0,0.1008
205988,6024988,60.0,0.0,4993350,0.046,0.0153,0.0568,64.0,0.0,0.1136
205989,6024988,60.0,0.0,1871086,0.044,0.0157,0.0467,56.0,0.0,0.0934
205990,6024988,60.0,0.0,3891070,0.045,0.0156,0.0478,58.0,0.0,0.0956


## Step 2. Add `DOR` annotation using kinship coefficient

In [5]:
# ad-hoc functions
def get_unique_eid(df):
    """Count unique `eid` from given dataframe."""
    eids = set(df["eid"])
    eid_rels = set(df["eid_rel"])
    return list(eids | eid_rels)

def save_relation(df, path, fn):
    df_tmp = (df[["eid", "age", "sex", "eid_rel", "age_rel", "sex_rel", "Kinship"]]
              .astype({"age": int, "sex": int, "age_rel": int, "sex_rel": int})
    )
    df_tmp.to_csv(path + "/" + fn, sep='\t', index=False)
    
def drop_dupicate_eid(df):
    "left unique eid pairs (remove swapped eid-eid_rel pairs)"
    tmp = df.copy()
    tmp['key'] = tmp.apply(lambda row: map(str, sorted([row['eid'], row['eid_rel']])), axis=1)
    tmp["key"] = tmp.apply(lambda row: "_".join(row["key"]), axis=1)
    
    return tmp.drop_duplicates(subset="key").drop(columns="key")

In [6]:
# Check Degree of Relativeness (DOR)
# number of sample in each relatives -- check
kinship_coefficients = {1: 0.5,
                        2: 0.5 / 2,
                        3: 0.5 / 4,
                        4: 0.5 / 8}

for dor, kinship_coeff in kinship_coefficients.items():
    lower_lim = 0.8 * kinship_coeff
    upper_lim = 1.2 * kinship_coeff
    is_kinship_near_coeff = (df_merged["kinship"] > lower_lim) & (df_merged["kinship"] < upper_lim)
    df_merged.loc[is_kinship_near_coeff, "DOR"] = dor
    print(
        "{} DOR (n_pair = {}, n={}): kinship coefficient within ({:.3f}, {:.3f})".format(
        dor,
        int(df_merged[df_merged["DOR"] == dor].shape[0] / 2),
        len(get_unique_eid(df_merged[df_merged["DOR"] == dor])),
        lower_lim,
        upper_lim
    ),
          flush=True)
    

1 DOR (n_pair = 27365, n=49083): kinship coefficient within (0.400, 0.600)


2 DOR (n_pair = 9447, n=17240): kinship coefficient within (0.200, 0.300)
3 DOR (n_pair = 44514, n=72944): kinship coefficient within (0.100, 0.150)
4 DOR (n_pair = 0, n=0): kinship coefficient within (0.050, 0.075)


In [7]:
df = (df_merged[["DOR", "eid", "age", "sex", "eid_rel", "age_rel", "sex_rel", "kinship"]]
      .dropna())
df

Unnamed: 0,DOR,eid,age,sex,eid_rel,age_rel,sex_rel,kinship
0,3.0,1000031,52.0,0.0,4977365,45.0,0.0,0.1368
2,1.0,1000094,65.0,0.0,3653174,64.0,0.0,0.4942
3,2.0,1000122,68.0,0.0,1390862,63.0,0.0,0.2430
4,2.0,1000174,69.0,0.0,3765836,45.0,0.0,0.2504
5,2.0,1000174,69.0,0.0,4593489,42.0,1.0,0.2868
...,...,...,...,...,...,...,...,...
205985,2.0,6024699,65.0,1.0,5945059,40.0,0.0,0.2660
205986,1.0,6024769,69.0,0.0,4102199,44.0,0.0,0.5082
205987,3.0,6024822,41.0,1.0,3535828,50.0,1.0,0.1008
205988,3.0,6024988,60.0,0.0,4993350,64.0,0.0,0.1136


## Step 3. add X correlation

In [45]:
df["xcor"] = np.nan

# add sex_type column
df["sex_type"] = np.nan

is_sex = {"mm": df["sex"] + df["sex_rel"] == 2,
          "mf": df["sex"] + df["sex_rel"] == 1,
          "ff": df["sex"] + df["sex_rel"] == 0,}

for sex_type in ["mm", "mf", "ff"]:
    df.loc[is_sex[sex_type], "sex_type"] = sex_type
    
df.head()

Unnamed: 0,DOR,eid,age,sex,eid_rel,age_rel,sex_rel,kinship,xcor,sex_type
0,3.0,1000031,52.0,0.0,4977365,45.0,0.0,0.1368,,ff
3,1.0,1000094,65.0,0.0,3653174,64.0,0.0,0.4942,,ff
4,2.0,1000122,68.0,0.0,1390862,63.0,0.0,0.243,,ff
5,2.0,1000174,69.0,0.0,3765836,45.0,0.0,0.2504,,ff
6,3.0,1950469,41.0,1.0,3765836,45.0,0.0,0.1022,,mf


In [46]:
path = "/data/jerrylee/data/UKB/grm_rel"

### DOR1

In [11]:
# DOR1
rel_to_sp = {"FS": "mm", 
             "MS": "mf", 
             "FD": "mf", 
             "MD": "ff", 
             "SS": "mm", 
             "SD": "mf", 
             "DD": "ff"}

for rel in tqdm(rel_to_sp.keys()):
    is_cond = (df["DOR"] == 1) & (df["sex_type"] == rel_to_sp[rel])
    df_subset = df[is_cond]
    
    # load GRM
    filepath = f"{path}/dor1_grms/rel/{rel}.grm.bin"
    id_filepath = f"{path}/dor1_grms/rel/{rel}.grm.id"
    n_snps_filepath = f"{path}/dor1_grms/rel/{rel}.grm.N.bin"
    
    (K, _) = read_grm(filepath, id_filepath, n_snps_filepath)
    
    # for each pairs
    for index, row in df_subset.iterrows():
        id1, id2 = row[["eid", "eid_rel"]].values
        try:
            cr = float(K.loc[id1, id2].values)
            # add c_r
            this_rel = (df["eid"] == id1) & (df["eid_rel"] == id2)
            
            # check cryptic relatedness
            if sum(this_rel) > 1:
                print(id1, id2)
            df.loc[this_rel, "xcor"] = cr
        except:
            pass

100%|██████████| 7/7 [11:47<00:00, 101.04s/it]


### DOR2

In [13]:
# DOR2
for sex_type in tqdm(["mm", "mf", "ff"]):
    is_cond = (df["DOR"] == 2) & (df["sex_type"] == sex_type)
    df_subset = df[is_cond]
    
    # load GRM
    filepath = f"{path}/dor2_grms/dor2_{sex_type}.grm.bin"
    id_filepath = f"{path}/dor2_grms/dor2_{sex_type}.grm.id"
    n_snps_filepath = f"{path}/dor2_grms/dor2_{sex_type}.grm.N.bin"
    
    (K, _) = read_grm(filepath, id_filepath, n_snps_filepath)
    
    # for each pairs
    for _, row in df_subset.iterrows():
        id1, id2 = row[["eid", "eid_rel"]].values
        try:
            cr = float(K.loc[id1, id2].values)
            # add c_r
            this_rel = (df["eid"] == id1) & (df["eid_rel"] == id2)
            # check cryptic relatedness
            if sum(this_rel) > 1:
                print(id1, id2)
            df.loc[this_rel, "xcor"] = cr
        except:
            pass

100%|██████████| 3/3 [04:38<00:00, 92.76s/it] 


### DOR3

In [14]:
# DOR3
for sex_type in tqdm(["mm", "mf", "ff"]):
    is_cond = (df["DOR"] == 3) & (df["sex_type"] == sex_type)
    df_subset = df[is_cond]
    
    # load GRM
    filepath = f"{path}/dor3_grms/dor3_{sex_type}.grm.bin"
    id_filepath = f"{path}/dor3_grms/dor3_{sex_type}.grm.id"
    n_snps_filepath = f"{path}/dor3_grms/dor3_{sex_type}.grm.N.bin"
    
    (K, _) = read_grm(filepath, id_filepath, n_snps_filepath)
    
    # for each pairs
    for _, row in df_subset.iterrows():
        id1, id2 = row[["eid", "eid_rel"]].values
        try:
            cr = float(K.loc[id1, id2].values)
            # add c_r
            this_rel = (df["eid"] == id1) & (df["eid_rel"] == id2)
            # check cryptic relatedness
            if sum(this_rel) > 1:
                print(id1, id2)
            df.loc[this_rel, "xcor"] = cr
        except:
            pass

100%|██████████| 3/3 [22:40<00:00, 453.38s/it]


In [16]:
df

Unnamed: 0,DOR,eid,age,sex,eid_rel,age_rel,sex_rel,kinship,xcor,sex_type
0,3.0,1000031,52.0,0.0,4977365,45.0,0.0,0.1368,0.040435,ff
3,1.0,1000094,65.0,0.0,3653174,64.0,0.0,0.4942,0.632092,ff
4,2.0,1000122,68.0,0.0,1390862,63.0,0.0,0.2430,0.501473,ff
5,2.0,1000174,69.0,0.0,3765836,45.0,0.0,0.2504,0.332779,ff
6,3.0,1950469,41.0,1.0,3765836,45.0,0.0,0.1022,0.012638,mf
...,...,...,...,...,...,...,...,...,...,...
205986,2.0,6024607,43.0,1.0,4730713,63.0,0.0,0.2502,-0.012986,mf
205988,1.0,6024638,68.0,1.0,5832823,64.0,1.0,0.5940,0.386101,mm
205989,3.0,6024646,46.0,0.0,1244265,42.0,1.0,0.1178,0.719852,mf
205990,1.0,6024769,69.0,0.0,4102199,44.0,0.0,0.5082,0.472529,ff


### Step 3.1 Formatting dataframe

In [17]:
def check_coln(df):
    coln_type = {
        "eid": int, "eid_rel": int,
        "age": int, "age_rel": int,
        "sex": int, "sex_rel": int,
        "sex_type": str,
        "DOR": int, 
        "kinship": float, "xcor": float,
    }
    
    for coln, dtype in coln_type.items():
        assert coln in df.columns, f"No {coln} in dataframe."
        
        df[coln] = df[coln].astype(dtype)
    
    return df

In [28]:
df_relInfo = check_coln(df)
df_relInfo["age_diff"] = np.abs(df_relInfo["age"] - df_relInfo["age_rel"])
df_relInfo

Unnamed: 0,DOR,eid,age,sex,eid_rel,age_rel,sex_rel,kinship,xcor,sex_type,age_diff
0,3,1000031,52,0,4977365,45,0,0.1368,0.040435,ff,7
3,1,1000094,65,0,3653174,64,0,0.4942,0.632092,ff,1
4,2,1000122,68,0,1390862,63,0,0.2430,0.501473,ff,5
5,2,1000174,69,0,3765836,45,0,0.2504,0.332779,ff,24
6,3,1950469,41,1,3765836,45,0,0.1022,0.012638,mf,4
...,...,...,...,...,...,...,...,...,...,...,...
205986,2,6024607,43,1,4730713,63,0,0.2502,-0.012986,mf,20
205988,1,6024638,68,1,5832823,64,1,0.5940,0.386101,mm,4
205989,3,6024646,46,0,1244265,42,1,0.1178,0.719852,mf,4
205990,1,6024769,69,0,4102199,44,0,0.5082,0.472529,ff,25


In [29]:
# remove duplicated rows
df_relInfo['eid_tuple'] = df_relInfo[['eid', 'eid_rel']].apply(lambda x: tuple(sorted(x)), axis=1)

# Identify and drop rows with unique (eid, eid_rel)
unique_indices = ~df_relInfo.duplicated(subset='eid_tuple', keep='first')
df_sample = df_relInfo[unique_indices].drop(columns=['eid_tuple'])

## Step 4. Add `rel_type` annotation

## DOR1

In [30]:
# add relationship annotation
is_dor = df_sample["DOR"] == 1
is_sib = (df_sample["age_diff"] >= 1) & (df_sample["age_diff"] <= 5)
is_po = (df_sample["age_diff"] >= 20) & (df_sample["age_diff"] <= 28)

# male-male
is_mm = df_sample["sex_type"] == "mm"

exps = {"SB": 1/2,
        "FS": 0}

for rel, xcor in exps.items():
    if xcor == 0:
        is_rel = (df_sample["xcor"] >= -0.05) & (df_sample["xcor"] <= 0.05)
    else:
        is_rel = (df_sample["xcor"] >= 0.8*xcor) & (df_sample["xcor"] <= 1.2*xcor)

    if rel == "FS":
        df_sample.loc[is_dor & is_mm & is_po & is_rel, "rel_type"] = rel
        df_sample.loc[is_dor & is_mm & is_po & is_rel, "cx"] = xcor
    elif rel == "SB":
        df_sample.loc[is_dor & is_mm & is_sib & is_rel, "rel_type"] = rel
        df_sample.loc[is_dor & is_mm & is_sib & is_rel, "cx"] = xcor

# df_sample.loc[is_dor & is_mm & is_sib, "rel_type"] = "SB"
# df_sample.loc[is_dor & is_mm & is_sib, "cx"] = 1/2
# df_sample.loc[is_dor & is_mm & is_po, "rel_type"] = "FS"
# df_sample.loc[is_dor & is_mm & is_po, "cx"] = 0

# female-female
is_ff = df_sample["sex_type"] == "ff"

exps = {"MD": 1/2,
        "DS": 3/4}

for rel, xcor in exps.items():
    if xcor == 0:
        is_rel = (df_sample["xcor"] >= -0.05) & (df_sample["xcor"] <= 0.05)
    else:
        is_rel = (df_sample["xcor"] >= 0.8*xcor) & (df_sample["xcor"] <= 1.2*xcor)

    if rel == "MD":
        df_sample.loc[is_dor & is_ff & is_po & is_rel, "rel_type"] = rel
        df_sample.loc[is_dor & is_ff & is_po & is_rel, "cx"] = xcor
    elif rel == "DS":
        df_sample.loc[is_dor & is_ff & is_sib & is_rel, "rel_type"] = rel
        df_sample.loc[is_dor & is_ff & is_sib & is_rel, "cx"] = xcor
        
# df_sample.loc[is_dor & is_ff & is_sib, "rel_type"] = "DS"
# df_sample.loc[is_dor & is_ff & is_sib, "cx"] = 3/4
# df_sample.loc[is_dor & is_ff & is_po, "rel_type"] = "MD"
# df_sample.loc[is_dor & is_ff & is_po, "cx"] = 1/2

# male-female
is_mf = df_sample["sex_type"] == "mf"
is_female_older = ((df_sample["age"] > df_sample["age_rel"]) & (df_sample["sex"] == 0)) | ((df_sample["age"] < df_sample["age_rel"]) & (df_sample["sex_rel"] == 0))
is_female_younger = ((df_sample["age"] > df_sample["age_rel"]) & (df_sample["sex_rel"] == 0)) | ((df_sample["age"] < df_sample["age_rel"]) & (df_sample["sex"] == 0))

exps = {"SS_DB": np.sqrt(2)/4,
        "MS": np.sqrt(2)/2,
        "FD": np.sqrt(2)/2}

for rel, xcor in exps.items():
    if xcor == 0:
        is_rel = (df_sample["xcor"] >= -0.05) & (df_sample["xcor"] <= 0.05)
    else:
        is_rel = (df_sample["xcor"] >= 0.8*xcor) & (df_sample["xcor"] <= 1.2*xcor)

    if rel == "SS_DB":
        df_sample.loc[is_dor & is_mf & is_sib & is_rel, "rel_type"] = rel
        df_sample.loc[is_dor & is_mf & is_sib & is_rel, "cx"] = xcor
    elif rel == "MS":
        df_sample.loc[is_dor & is_mf & is_po & is_female_older & is_rel, "rel_type"] = rel
        df_sample.loc[is_dor & is_mf & is_po & is_female_older & is_rel, "cx"] = xcor
    elif rel == "FD":
        df_sample.loc[is_dor & is_mf & is_po & is_female_younger & is_rel, "rel_type"] = rel
        df_sample.loc[is_dor & is_mf & is_po & is_female_younger & is_rel, "cx"] = xcor
        
        
# df_sample.loc[is_dor & is_mf & is_sib, "rel_type"] = "SS_DB"
# df_sample.loc[is_dor & is_mf & is_sib, "cx"] = np.sqrt(2)/4

# is_female_older = ((df_sample["age"] > df_sample["age_rel"]) & (df_sample["sex"] == 0)) | ((df_sample["age"] < df_sample["age_rel"]) & (df_sample["sex_rel"] == 0))
# is_female_younger = ((df_sample["age"] > df_sample["age_rel"]) & (df_sample["sex_rel"] == 0)) | ((df_sample["age"] < df_sample["age_rel"]) & (df_sample["sex"] == 0))
# df_sample.loc[is_dor & is_mf & is_po & is_female_older, "rel_type"] = "MS" # FD or MS
# df_sample.loc[is_dor & is_mf & is_po & is_female_older, "cx"] = np.sqrt(2)/2
# df_sample.loc[is_dor & is_mf & is_po & is_female_younger, "rel_type"] = "FD"
# df_sample.loc[is_dor & is_mf & is_po & is_female_younger, "cx"] = np.sqrt(2)/2

In [37]:
for_sums = df_sample[is_dor]

tmp_sums = (for_sums[["rel_type", "age_diff", "kinship", "xcor", "cx"]]
            .dropna(subset=["rel_type"])
            .groupby("rel_type")
            .mean()
            .round(3))
tmp_size = for_sums.dropna(subset=["rel_type"]).groupby("rel_type").size()
pd.merge(pd.DataFrame(tmp_size, columns=["size"]), 
         tmp_sums,
         on="rel_type")

Unnamed: 0_level_0,size,age_diff,kinship,xcor,cx
rel_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DS,3137,2.582,0.498,0.743,0.75
FD,949,24.335,0.497,0.68,0.707
FS,637,24.425,0.496,-0.0,0.0
MD,1874,23.776,0.497,0.485,0.5
MS,1281,23.754,0.496,0.678,0.707
SB,665,2.615,0.498,0.497,0.5
SS_DB,1861,3.012,0.499,0.354,0.354


## DOR2

In [38]:
is_dor = df_sample["DOR"] == 2

# add relationship annotation
is_parent_s_sib = (df_sample["age_diff"] >= 15) & (df_sample["age_diff"] <= 29)

is_mm = df_sample["sex_type"] == "mm"
exps = {"SFB": 0,
        "SMB": 1/4}

for rel, xcor in exps.items():
    if xcor == 0:
        is_rel = (df_sample["xcor"] >= -0.05) & (df_sample["xcor"] <= 0.05)
    else:
        is_rel = (df_sample["xcor"] >= 0.8*xcor) & (df_sample["xcor"] <= 1.2*xcor)

    df_sample.loc[is_dor & is_mm & is_parent_s_sib & is_rel, "rel_type"] = rel
    df_sample.loc[is_dor & is_mm & is_parent_s_sib & is_rel, "cx"] = xcor


# female-female
is_ff = df_sample["sex_type"] == "ff"
exps = {"DFS": 1/4,
        "DMS": 3/8}

for rel, xcor in exps.items():
    if xcor == 0:
        is_rel = (df_sample["xcor"] >= -0.05) & (df_sample["xcor"] <= 0.05)
    else:
        is_rel = (df_sample["xcor"] >= 0.8*xcor) & (df_sample["xcor"] <= 1.2*xcor)

    df_sample.loc[is_dor & is_ff & is_parent_s_sib & is_rel, "rel_type"] = rel
    df_sample.loc[is_dor & is_ff & is_parent_s_sib & is_rel, "cx"] = xcor

# male-female
is_mf = df_sample["sex_type"] == "mf"
exp_SFS = 0
exp_SMS = 3 * np.sqrt(2) / 8
exp_DFB = np.sqrt(2) / 4
exp_DMB = np.sqrt(2) / 8

is_SFS = (df_sample["xcor"] >= 0.8*exp_SFS) & (df_sample["xcor"] <= 1.2*exp_SFS)
is_SMS = (df_sample["xcor"] >= 0.8*exp_SMS) & (df_sample["xcor"] <= 1.2*exp_SMS)
is_DFB = (df_sample["xcor"] >= 0.8*exp_DFB) & (df_sample["xcor"] <= 1.2*exp_DFB)
is_DMB = (df_sample["xcor"] >= 0.8*exp_DMB) & (df_sample["xcor"] <= 1.2*exp_DMB)

is_female_older = ((df_sample["age"] > df_sample["age_rel"]) & (df_sample["sex"] == 0)) | ((df_sample["age"] < df_sample["age_rel"]) & (df_sample["sex_rel"] == 0))
is_female_younger = ((df_sample["age"] > df_sample["age_rel"]) & (df_sample["sex_rel"] == 0)) | ((df_sample["age"] < df_sample["age_rel"]) & (df_sample["sex"] == 0))

df_sample.loc[is_dor & is_mf & is_parent_s_sib & is_SFS & is_female_older, "rel_type"] = "SFS"
df_sample.loc[is_dor & is_mf & is_parent_s_sib & is_SFS & is_female_older, "cx"] = exp_SFS

df_sample.loc[is_dor & is_mf & is_parent_s_sib & is_SMS & is_female_older, "rel_type"] = "SMS"
df_sample.loc[is_dor & is_mf & is_parent_s_sib & is_SMS & is_female_older, "cx"] = exp_SMS

df_sample.loc[is_dor & is_mf & is_parent_s_sib & is_DFB & is_female_younger, "rel_type"] = "DFB"
df_sample.loc[is_dor & is_mf & is_parent_s_sib & is_DFB & is_female_younger, "cx"] = exp_DFB

df_sample.loc[is_dor & is_mf & is_parent_s_sib & is_DMB & is_female_younger, "rel_type"] = "DMB"
df_sample.loc[is_dor & is_mf & is_parent_s_sib & is_DMB & is_female_younger, "cx"] = exp_DMB

In [39]:
for_sums = df_sample[is_dor]

tmp_sums = (for_sums[["rel_type", "age_diff", "kinship", "xcor", "cx"]]
            .dropna(subset=["rel_type"])
            .groupby("rel_type")
            .mean()
            .round(3))
tmp_size = for_sums.dropna(subset=["rel_type"]).groupby("rel_type").size()
pd.merge(pd.DataFrame(tmp_size, columns=["size"]), 
         tmp_sums,
         on="rel_type")

Unnamed: 0_level_0,size,age_diff,kinship,xcor,cx
rel_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DFB,286,20.052,0.249,0.346,0.354
DFS,330,20.206,0.249,0.252,0.25
DMB,229,20.31,0.249,0.175,0.177
DMS,579,20.352,0.248,0.378,0.375
SFB,586,20.444,0.247,-0.0,0.0
SMB,98,20.378,0.25,0.246,0.25
SMS,346,20.049,0.251,0.538,0.53


## DOR3

In [40]:
is_dor = df_sample["DOR"] == 3
# add relationship annotation
is_offlevel = (df_sample["age_diff"] >= 0) & (df_sample["age_diff"] <= 5)

# male-male
is_mm = df_sample["sex_type"] == "mm"
exps = {"SFBS_SFSS_SMBS": 0,
        "SMSS": 3/8}

for rel, xcor in exps.items():
    if xcor == 0:
        is_rel = (df_sample["xcor"] >= -0.05) & (df_sample["xcor"] <= 0.05)
    else:
        is_rel = (df_sample["xcor"] >= 0.8*xcor) & (df_sample["xcor"] <= 1.2*xcor)

    df_sample.loc[is_dor & is_mm & is_offlevel & is_rel, "rel_type"] = rel
    df_sample.loc[is_dor & is_mm & is_offlevel & is_rel, "cx"] = xcor


# female-female
is_ff = df_sample["sex_type"] == "ff"
exps = {"DFBD_DMBD": 1/4,
        "DFSD": 1/8,
        "DMSD": 3/16}

for rel, xcor in exps.items():
    if xcor == 0:
        is_rel = (df_sample["xcor"] >= -0.05) & (df_sample["xcor"] <= 0.05)
    else:
        is_rel = (df_sample["xcor"] >= 0.8*xcor) & (df_sample["xcor"] <= 1.2*xcor)

    df_sample.loc[is_dor & is_ff & is_offlevel & is_rel, "rel_type"] = rel
    df_sample.loc[is_dor & is_ff & is_offlevel & is_rel, "cx"] = xcor


# male-female
is_mf = df_sample["sex_type"] == "mf"

exps = {"SFBD_SFSD_DFBS_DMBS": 0,
        "SMBD_DFSS" : np.sqrt(2)/8,
        "SMSD_DMSS" : 3*np.sqrt(2)/16,}

for rel, xcor in exps.items():
    if xcor == 0:
        is_rel = (df_sample["xcor"] >= -0.05) & (df_sample["xcor"] <= 0.05)
    else:
        is_rel = (df_sample["xcor"] >= 0.8*xcor) & (df_sample["xcor"] <= 1.2*xcor)

    df_sample.loc[is_dor & is_mf & is_offlevel & is_rel, "rel_type"] = rel
    df_sample.loc[is_dor & is_mf & is_offlevel & is_rel, "cx"] = xcor


In [41]:
for_sums = df_sample[is_dor]

tmp_sums = (for_sums[["rel_type", "age_diff", "kinship", "xcor", "cx"]]
            .dropna(subset=["rel_type"])
            .groupby("rel_type")
            .mean()
            .round(3))
tmp_size = for_sums.dropna(subset=["rel_type"]).groupby("rel_type").size()
pd.merge(pd.DataFrame(tmp_size, columns=["size"]), 
         tmp_sums,
         on="rel_type")

Unnamed: 0_level_0,size,age_diff,kinship,xcor,cx
rel_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DFBD_DMBD,921,2.494,0.125,0.261,0.25
DFSD,935,2.536,0.124,0.125,0.125
DMSD,1274,2.572,0.124,0.186,0.188
SFBD_SFSD_DFBS_DMBS,5970,2.56,0.124,0.001,0.0
SFBS_SFSS_SMBS,3369,2.588,0.124,-0.001,0.0
SMBD_DFSS,947,2.571,0.124,0.176,0.177
SMSD_DMSS,1051,2.523,0.124,0.263,0.265
SMSS,287,2.7,0.124,0.372,0.375


In [43]:
df_sample.dropna()

Unnamed: 0,DOR,eid,age,sex,eid_rel,age_rel,sex_rel,kinship,xcor,sex_type,age_diff,rel_type,cx
3,1,1000094,65,0,3653174,64,0,0.4942,0.632092,ff,1,DS,0.750000
5,2,1000174,69,0,3765836,45,0,0.2504,0.332779,ff,24,DMS,0.375000
6,3,1950469,41,1,3765836,45,0,0.1022,0.012638,mf,4,SFBD_SFSD_DFBS_DMBS,0.000000
9,3,1950469,41,1,4593489,42,1,0.1154,-0.000106,mm,1,SFBS_SFSS_SMBS,0.000000
24,3,1000243,64,1,2489078,62,1,0.1070,0.049882,mm,2,SFBS_SFSS_SMBS,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
203475,1,5903106,52,0,5935693,55,0,0.5042,0.609106,ff,3,DS,0.750000
203680,3,5912328,58,0,5959541,58,1,0.1290,0.296907,mf,0,SMSD_DMSS,0.265165
203747,3,5915997,69,1,5970575,69,1,0.1154,-0.042444,mm,0,SFBS_SFSS_SMBS,0.000000
204312,1,5943063,42,0,6007495,64,1,0.4972,0.617095,mf,22,FD,0.707107


## Step 5. save results

In [55]:
path_to_save = "/home/jerrylee/data/pjt/BIGFAM.v.0.1/data/UKB/relative_information"

In [45]:
def check_coln(df):
    coln_type = {
        "eid": int, "eid_rel": int,
        "age": int, "age_rel": int,
        "sex": int, "sex_rel": int,
        "sex_type": str, "rel_type": str,
        "DOR": int, 
        "kinship": float, 
        "xcor": float, "cx": float,
    }
    
    for coln, dtype in coln_type.items():
        assert coln in df.columns, f"No {coln} in dataframe."
        
        df[coln] = df[coln].astype(dtype)
    
    return df

In [71]:
df_to_save = (check_coln(df_sample)
              .rename(columns={
                  "kinship": "ra",
                  "cx": "rx",
                  "xcor": "Erx"
              })
              .drop(columns=["age_diff"]))
df_to_save["Era"] = 0.5 ** df_to_save["DOR"]

(df_to_save[["DOR", "eid", "age", "sex", "eid_rel", "age_rel", "sex_rel", "sex_type", "rel_type", "Era", "ra", "Erx", "rx"]]
 .sort_values(by=["DOR", "eid", "eid_rel"])
 .reset_index(drop=True)
 .to_csv(
     f"{path_to_save}/relatives.info",
     sep='\t',
     index=False
 ))

In [72]:
df_to_save[["DOR", "eid", "age", "sex", "eid_rel", "age_rel", "sex_rel", "sex_type", "rel_type", "Era", "ra", "Erx", "rx"]].head(3)

Unnamed: 0,DOR,eid,age,sex,eid_rel,age_rel,sex_rel,sex_type,rel_type,Era,ra,Erx,rx
0,3,1000031,52,0,4977365,45,0,ff,,0.125,0.1368,0.040435,
3,1,1000094,65,0,3653174,64,0,ff,DS,0.5,0.4942,0.632092,0.75
4,2,1000122,68,0,1390862,63,0,ff,,0.25,0.243,0.501473,


# GS

## Step 1. Load data

In [73]:
df_kinship = pd.read_csv("/data/jerrylee/data/GS/kinpair.xls", sep='\t')
df_pedigree = pd.read_csv("/data/jerrylee/data/GS/pedigree.xls", sep='\t',
                          dtype={"volid": int, "father": int, "mother": int})
df_pheno = pd.read_csv("/data/jerrylee/data/GS/phenotypes.xls", sep='\t')

In [76]:
tt = tools.remove_duplicate_relpair(df_kinship.copy(), ["volid", "relid"])
n_pair = len(tt)
n_indiv = len((set(tt["volid"]) | set(tt["relid"])))
n_pair, n_indiv

(40254, 18236)

## Step 2. DOR

### Step 2.1. Check the number of relationships in each DOR

In [71]:
rels = {
    1: ["PC", "SB"],
    2: ["AV", "GP", "HSB"],
    3: ["1C", "HAV", "GG"],
    4: ["H1C"],
}

In [72]:
df_kinship["DOR"] = 0

for dor in rels.keys():
    rels_in_dor = rels[dor]
    df_kinship.loc[df_kinship["rcode"].isin(rels_in_dor), "DOR"] = int(dor)

df_kinship.head()

Unnamed: 0,volid,relid,rcode,famid,DOR
0,21244,18826,SB,2,1
1,18826,21244,SB,2,1
2,34422,23884,SB,3,1
3,23884,34422,SB,3,1
4,67267,67531,SB,4,1


In [67]:
df_kinship.groupby("DOR").size() / 2

DOR
0    11377.0
1    18259.0
2     7796.0
3     2747.0
4       75.0
dtype: float64

In [68]:
sum(df_kinship.groupby("DOR").size() / 2)

40254.0

In [6]:
for d in sorted(rels.keys()):
    print(f"{d} degree")
    
    for rel in sorted(rels[d]):
        n_pair = int(len(df_kinship[df_kinship["rcode"] == rel])/2)
        print(f"- {rel} : {n_pair}")

1 degree
- PC : 9812
- SB : 8447
2 degree
- AV : 6576
- GP : 842
- HSB : 378
3 degree
- 1C : 2431
- GG : 3
- HAV : 313
4 degree
- H1C : 75


In [7]:
len(df_kinship["famid"].unique())

4830

### Step 2.2 make info

In [8]:
# relationship (volid, relid, rcode, famid, dor)
df_dor123 = (df_kinship[df_kinship["DOR"].isin([1, 2, 3])]
             .copy()
             .reset_index(drop=True))
# sex info (volid, sex)
df_sex = (df_pheno[["id", "sex"]]
          .copy()
          .rename(columns={"id": "volid"}))

In [9]:
# validate dataframe

# relationship in `df_dor123` has two row in one pair 
# example. (id1, id2) and (id2, id1)
# remove duplicated sets

# is volid-relid duplicated in `df_dor123`?
unique_relpairs = tools.get_unique_set(df_dor123, ["volid", "relid"])
print(len(unique_relpairs)) # 28802 unique relative pairs

# Is all ids in `df_pair` have sex info in `df_sex`?
ids_have_rel = set([int(item) for inner_list in [item.split("_") for item in unique_relpairs] for item in inner_list])
ids_have_sex = set(df_sex["volid"].values)

print(len(ids_have_rel)) # 18009 id have relatives
print(len(ids_have_sex)) # 19982 id have sex info
if len(ids_have_rel - ids_have_sex) == 0:
    print("all ids in `df_pairs` have sex info in `df_sex`")
else:
    print(ids_have_rel - ids_have_sex)



28802
18009
19982
all ids in `df_pairs` have sex info in `df_sex`


## Step 3. Add X correlation & rel_type annotation

In [10]:
dict_rel_rx = ped.dict_rel_rx

In [11]:
df_pairs = (pd.merge(df_sex, df_dor123, on="volid")
            .sort_values(by="famid")
            .reset_index(drop=True))

In [12]:
df_pairs # [r1, r2] and [r2, r1]

Unnamed: 0,volid,sex,relid,rcode,famid,DOR
0,18826,F,21244,SB,2,1
1,21244,F,18826,SB,2,1
2,34422,F,23884,SB,3,1
3,23884,M,34422,SB,3,1
4,79198,F,67531,PC,4,1
...,...,...,...,...,...,...
57599,18001,M,79563,SB,9007,1
57600,79563,F,80945,PC,9007,1
57601,18001,M,80945,AV,9007,2
57602,80945,F,79563,PC,9007,1


In [35]:
def rcode_len_matching(rcode, path):
    if (rcode in ["PC", "SB"]) & (len(path) == 2):
        return True
    if (rcode in ["HSB", "GP", "AV"]) & (len(path) == 3):
        return True
    if (rcode in ["1C", "GG", "HAV"]) & (len(path) == 4):
        return True
    return False

In [51]:
df_new = pd.DataFrame()

# stop=False
# path_lens = pd.DataFrame(columns=["rcode", "len"])

for famid in tqdm(df_pairs["famid"].unique()):
    
    if famid == 4091: # strange..
        continue
    
    df_pair = df_pairs[(df_pairs["famid"] == famid)]
    df_parent = df_pedigree[df_pedigree["famid"] == famid]
    
    # left offspring which has sex info
    df_ped = (pd.merge(df_parent, 
                       df_sex[df_sex["volid"].isin(df_parent[["volid", "father", "mother"]].values.flatten())],
                       on="volid", 
                       how="outer")
              .rename(columns={"volid": "offspring"}))
    df_ped = df_ped[~((df_ped["father"] == 0) & (df_ped["mother"] == 0))].copy()
    
    # add sex which does not exist in `df_sex`, but can infer from `df_pedigree`
    for _, ambig_row in df_ped[df_ped.isna().any(axis=1)].iterrows():
        ambig_id = ambig_row["offspring"]
        
        if ambig_id in df_ped["father"].unique():
            ambig_sex = "M"
        elif ambig_id in df_ped["mother"].unique():
            ambig_sex = "F"
        else:
            ambig_sex = np.nan
        
        if ambig_sex == np.nan:
            pass
        else:
            df_ped.loc[df_ped["offspring"] == ambig_id, "sex"] = ambig_sex

    df_ped = df_ped.copy().dropna()
    
    # make edge
    df_edges = ped.make_direct_edges(df_ped)
    graph, idx_id = ped.df2graph(df_edges)
    
    # get relationship and rx
    for ii, row in df_pair.iterrows():
        rcode = row["rcode"]
        try:
            ids = row[["volid", "relid"]].to_list()
            idx1 = idx_id.loc[idx_id["id"] == ids[0], "idx"].values[0]
            idx2 = idx_id.loc[idx_id["id"] == ids[1], "idx"].values[0]
            
            shortest_path = ped.get_shortest_path(graph, idx1, idx2)
            
            if not rcode_len_matching(rcode, shortest_path):
                continue
            
            # path_lens.loc[len(path_lens)] = [row["rcode"], len(shortest_path)]
            # convert index to id
            id_path = []
            for idx in shortest_path:
                id_path += [idx_id.loc[idx_id["idx"] == idx, "id"].values[0]]
            
            # iteratively compute rel & rx
            rels = []
            rxs = 1
            i = 0
            for i in range(len(id_path)-1):
                id1 = id_path[i]
                id2 = id_path[i+1]
                relationship = df_edges.loc[((df_edges["id1"] == id1) & (df_edges["id2"] == id2)) | ((df_edges["id1"] == id2) & (df_edges["id2"] == id1)), "relationship"].values[0]
                rel, rx = dict_rel_rx[relationship]
                rels += [rel]
                rxs *= rx
                
            row["rel_type"] = "_".join(rels)
            row["rx"] = rxs
            df_new = pd.concat([df_new, row.to_frame().T], axis=0)
        except:
            print(row, flush=True)
        
    #     if row["rcode"] == "AV":
    #         stop = True
    #         break
    # if stop:
    #     break
    
    # df_new.to_csv(
    #     "/data/jerrylee/pjt/BIGFAM.v.0.1/data/GS/relative_information/tmp.info", 
    #     sep='\t',
    #     index=False)

100%|██████████| 4757/4757 [09:43<00:00,  8.15it/s]


In [52]:
df_pairs[df_pairs["volid"] == 93961]

Unnamed: 0,volid,sex,relid,rcode,famid,DOR
40634,93961,M,50162,AV,4091,2
40635,93961,M,77470,1C,4091,3


In [53]:
df_pairs[df_pairs["famid"] == 4091]

Unnamed: 0,volid,sex,relid,rcode,famid,DOR
40630,50162,F,77470,PC,4091,1
40631,50162,F,93961,AV,4091,2
40632,77470,F,50162,PC,4091,1
40633,77470,F,93961,1C,4091,3
40634,93961,M,50162,AV,4091,2
40635,93961,M,77470,1C,4091,3


In [55]:
# remove famid=4091 (strange...)
df_gs = (df_new[df_new["famid"] != 4091]
         .copy()
         .reset_index(drop=True))
df_gs

Unnamed: 0,volid,sex,relid,rcode,famid,DOR,rel_type,rx
0,18826,F,21244,SB,2,1,DS,0.75
1,21244,F,18826,SB,2,1,DS,0.75
2,34422,F,23884,SB,3,1,SS+DB,0.353553
3,23884,M,34422,SB,3,1,SS+DB,0.353553
4,79198,F,67531,PC,4,1,DM,0.5
...,...,...,...,...,...,...,...,...
57589,18001,M,79563,SB,9007,1,SS+DB,0.353553
57590,79563,F,80945,PC,9007,1,DM,0.5
57591,18001,M,80945,AV,9007,2,SS+DB_DM,0.176777
57592,80945,F,79563,PC,9007,1,DM,0.5


## Step 4. Save results

In [22]:
# left only unique relative pairs
# df_gs = tools.remove_duplicate_relpair(df_gs, ["volid", "relid"])

In [58]:
df_agesex = (df_pheno[["id", "sex", "age"]]
             .copy()
             .rename(columns={
                 "id": "volid",
                 "sex": "volsex",
                 "age": "volage",
                 }))

df_mrg = pd.merge(
    df_gs,
    df_agesex,
    on=["volid"]
)

df_mrg = pd.merge(
    df_mrg, 
    df_agesex.rename(
        columns={
            "volid": "relid", 
            "volsex": "relsex",
            "volage": "relage"
            }
        ),
    on="relid")
df_mrg

Unnamed: 0,volid,sex,relid,rcode,famid,DOR,rel_type,rx,volsex,volage,relsex,relage
0,18826,F,21244,SB,2,1,DS,0.75,F,50,F,36
1,21244,F,18826,SB,2,1,DS,0.75,F,36,F,50
2,34422,F,23884,SB,3,1,SS+DB,0.353553,F,33,M,35
3,23884,M,34422,SB,3,1,SS+DB,0.353553,M,35,F,33
4,79198,F,67531,PC,4,1,DM,0.5,F,66,F,44
...,...,...,...,...,...,...,...,...,...,...,...,...
57589,80945,F,18001,AV,9007,2,DM_SS+DB,0.176777,F,30,M,66
57590,79563,F,80945,PC,9007,1,DM,0.5,F,60,F,30
57591,18001,M,80945,AV,9007,2,SS+DB_DM,0.176777,M,66,F,30
57592,18001,M,79563,SB,9007,1,SS+DB,0.353553,M,66,F,60


In [59]:
df_mrg["Era"] = 0.5**df_mrg["DOR"]
df_mrg["Erx"] = df_mrg["rx"]

# sex_type
# df_mrg["sex_type"] = "mf"
# df_mrg.loc[(df_mrg["sex"] == "M") & (df_mrg["sex_rel"] == "M"), "sex_type"] = "mm"
# df_mrg.loc[(df_mrg["sex"] == "F") & (df_mrg["sex_rel"] == "F"), "sex_type"] = "ff"

# recode sex
# df_mrg["sex"] = df_mrg["sex"].replace({"M": 1, "F": 0})

In [60]:
target_colns = ["DOR", "volid", "volage", "volsex", "relid", "relage", "relsex", "rcode", "rel_type", "Era", "Erx"]

df_mrg = df_mrg[target_colns]

In [61]:
df_mrg

Unnamed: 0,DOR,volid,volage,volsex,relid,relage,relsex,rcode,rel_type,Era,Erx
0,1,18826,50,F,21244,36,F,SB,DS,0.5,0.75
1,1,21244,36,F,18826,50,F,SB,DS,0.5,0.75
2,1,34422,33,F,23884,35,M,SB,SS+DB,0.5,0.353553
3,1,23884,35,M,34422,33,F,SB,SS+DB,0.5,0.353553
4,1,79198,66,F,67531,44,F,PC,DM,0.5,0.5
...,...,...,...,...,...,...,...,...,...,...,...
57589,2,80945,30,F,18001,66,M,AV,DM_SS+DB,0.25,0.176777
57590,1,79563,60,F,80945,30,F,PC,DM,0.5,0.5
57591,2,18001,66,M,80945,30,F,AV,SS+DB_DM,0.25,0.176777
57592,1,18001,66,M,79563,60,F,SB,SS+DB,0.5,0.353553


In [62]:
df_mrg.to_csv(
    "/data/jerrylee/pjt/BIGFAM.v.0.1/data/GS/relative_information/relatives.raw.info",
    sep='\t',
    index=False
)