In [5]:
from pathlib import Path
import pandas as pd
import re
import numpy as np
from yaml import load
from yaml import CLoader as Loader
import sys
from collections import defaultdict

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [24]:
# Input
result_tsv = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01d.genotype.manual/results/IFNL3_4.consolidate.tsv")
id_ref = Path("/home/_shared/jscliu/project/2025/Flagship/reference/sample_info_annot.2024-11-12.csv")
allele_func = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01a.genotype.aldy/script/aldy_custom_allele_functions.yml")

# Output: every summary file shares sumDir and differs only in its prefix stem
sumDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01d.genotype.manual/summary")
all_perGene_prefix, perGene_prefix, af_prefix, pheno_prefix = (
    str(sumDir.joinpath(stem))
    for stem in ("all_perGenes", "perGenes", "allele_frequency", "phenotype")
)

In [25]:
# Load the consolidated calls (tab-separated)
result_df = pd.read_table(result_tsv)

# Build a "chr.pos.ref.alt" key for every record
result_df['CPRA'] = [
    f"{chrom}.{pos}.{ref}.{alt}"
    for chrom, pos, ref, alt in zip(
        result_df['chr'], result_df['pos'], result_df['ref'], result_df['alt']
    )
]

# CPRA -> rsID label; rs11322783 is stored as two records (_1 and _2)
rsid_map:dict = {
    'chr19.39252525.T.G': 'rs8099917', 
    'chr19.39248513.CT.C': 'rs11322783_1', 
    'chr19.39248515.T.G': 'rs11322783_2'
}
# Direct indexing on purpose: a CPRA missing from rsid_map raises KeyError
result_df['rsID'] = [rsid_map[cpra] for cpra in result_df['CPRA']]

In [26]:
# Display the calls together with the CPRA and rsID columns added above
result_df

Unnamed: 0,participant_id,chr,pos,ref,alt,GT,CPRA,rsID
0,SRE009873,chr19,39248513,CT,C,0|1,chr19.39248513.CT.C,rs11322783_1
1,SRE009873,chr19,39248515,T,G,0|1,chr19.39248515.T.G,rs11322783_2
2,SRE019319,chr19,39248513,CT,C,0|1,chr19.39248513.CT.C,rs11322783_1
3,SRE019319,chr19,39248515,T,G,0|1,chr19.39248515.T.G,rs11322783_2
4,SRE019319,chr19,39252525,T,G,0/1,chr19.39252525.T.G,rs8099917
...,...,...,...,...,...,...,...,...
8241,SRE026509,chr19,39248515,T,G,0|1,chr19.39248515.T.G,rs11322783_2
8242,SRE026509,chr19,39252525,T,G,0/1,chr19.39252525.T.G,rs8099917
8243,SRE018821,chr19,39248513,CT,C,0|1,chr19.39248513.CT.C,rs11322783_1
8244,SRE018821,chr19,39248515,T,G,0|1,chr19.39248515.T.G,rs11322783_2


In [27]:
# Read id reference and subset for useful columns
def read_id_ref(id_ref, u_chi=True):
    """Load the sample annotation CSV and keep one row per participant.

    Parameters
    ----------
    id_ref : path-like
        CSV readable by ``pd.read_csv``. It must contain the columns
        sre_participant_id, founder_status, sre_lab_id and hkgp_id, plus
        treated_ethnicity when ``u_chi`` is true.
    u_chi : bool
        When true, keep only rows with founder_status == 'Founder' and
        treated_ethnicity == 'Chinese'.

    Returns
    -------
    pd.DataFrame
        Columns sre_participant_id, founder_status, sample_id (renamed from
        sre_lab_id) and hkgp_id; the first row is kept for each
        sre_participant_id and the original row index is preserved.
    """
    wanted_cols = ["sre_participant_id", "founder_status", "sre_lab_id", "hkgp_id"]
    annot = pd.read_csv(id_ref)
    if u_chi:
        is_founder = annot['founder_status'] == 'Founder'
        is_chinese = annot['treated_ethnicity'] == 'Chinese'
        annot = annot[is_founder & is_chinese]
    subset = annot[wanted_cols].copy()
    # keep='first' is the default, matching ~duplicated(...)
    subset = subset.drop_duplicates(subset=['sre_participant_id'])
    return subset.rename(columns={"sre_lab_id": "sample_id"})

# id_ref_df: founder / Chinese subset (u_chi=True); all_id_ref_df: every participant (u_chi=False)
id_ref_df, all_id_ref_df = (
    read_id_ref(id_ref, u_chi=only_unrelated_chinese)
    for only_unrelated_chinese in (True, False)
)

In [28]:
# Lookup from hkgp_id to sre_participant_id across all annotated participants
id_map_d:dict = all_id_ref_df.set_index('hkgp_id')['sre_participant_id'].to_dict()

In [30]:
def annotate_haplotype(gene, haplotype, allele_map_d):
    """Translate a haplotype into its annotated function for ``gene``.

    ``allele_map_d[gene]`` is either the string "all_mapped" (the haplotype is
    returned unchanged) or a mapping of haplotype -> annotation; a haplotype
    that is not in the mapping is returned unchanged.
    Prints a message and exits with status 1 when ``gene`` is not a key of
    ``allele_map_d``.
    """
    if gene not in allele_map_d:
        print(f"{gene} not defined. Exiting")
        sys.exit(1)

    gene_map = allele_map_d[gene]
    if gene_map == "all_mapped":
        return haplotype

    # Fall back to the haplotype itself when no annotation is defined
    return gene_map.get(haplotype, haplotype)

def count_zygosity(called_df):
    """Count homozygous and heterozygous carriers per haplotype.

    ``called_df`` needs the columns haplotype_1 and haplotype_2. A row whose
    two haplotypes are equal adds one homozygous carrier for that haplotype;
    any other row adds one heterozygous carrier for each of its two haplotypes.
    Returns ``(homo_dict, hetero_dict)``, both ``defaultdict(int)``.
    """
    homo_dict = defaultdict(int)
    hetero_dict = defaultdict(int)
    for hap_a, hap_b in zip(called_df['haplotype_1'], called_df['haplotype_2']):
        if hap_a != hap_b:
            # Heterozygous: both haplotypes gain a carrier
            hetero_dict[hap_a] += 1
            hetero_dict[hap_b] += 1
        else:
            # Homozygous
            homo_dict[hap_a] += 1
    return homo_dict, hetero_dict

In [31]:
# Parse the allele-function YAML into allele_func_d; annotate_haplotype() reads it as
# gene -> "all_mapped" or gene -> {haplotype: annotation}.
# NOTE(review): Loader is yaml.CLoader (see the import cell), which is PyYAML's full,
# non-"safe" loader. That is acceptable for a trusted project file; consider CSafeLoader
# if this YAML could ever come from an untrusted source.
with open(allele_func) as handle:
    allele_func_d = load(handle, Loader=Loader)

# Summarize IFNL4 rs11322783

In [32]:
# Per-gene result frames, keyed by gene label
perGene_df_d = {}

In [33]:
# participant -> consolidated GT, and the participants whose split records disagree
complete_rs11322783_genotypes:dict = {}
ambiguous_rs11322783_genotypes:list = []
# Only the two split records of rs11322783 (rs11322783_1 / rs11322783_2)
rs11322783_split_df = result_df[result_df['rsID'].str.startswith('rs11322783')].copy()

In [36]:
# Consolidate the two split records (rs11322783_1 / rs11322783_2) into one genotype per participant.
# One groupby pass replaces re-filtering the whole table for every participant;
# sort=False keeps first-appearance order and dropna=False keeps rows with a missing participant_id.
for sre_patient_id, participant_df in rs11322783_split_df.groupby('participant_id', sort=False, dropna=False):
    if sorted(participant_df['rsID']) != ['rs11322783_1', 'rs11322783_2']:
        # Not exactly one _1 and one _2 record. This also catches duplicated records,
        # which used to raise ValueError because .loc returned a Series, not a scalar.
        ambiguous_rs11322783_genotypes.append(sre_patient_id)
        continue

    gt_by_rsid = dict(zip(participant_df['rsID'], participant_df['GT']))
    if gt_by_rsid['rs11322783_1'] != gt_by_rsid['rs11322783_2']:
        # Genotype unmatched (two missing GTs also land here, since NaN != NaN)
        ambiguous_rs11322783_genotypes.append(sre_patient_id)
    else:
        # Complete and consistent genotype
        complete_rs11322783_genotypes[sre_patient_id] = gt_by_rsid['rs11322783_1']

In [37]:
# Stop when any participant has incomplete or inconsistent split records
if ambiguous_rs11322783_genotypes:
    print(f"ERROR: ambiguous genotype identify: {ambiguous_rs11322783_genotypes}")
    sys.exit(1)

In [41]:
# One row per participant with a single 'GT' column
rs11322783_df = pd.Series(complete_rs11322783_genotypes, name='GT').to_frame()

# Left-join onto every annotated participant (matched on the participant-ID index);
# participants without a record keep GT = NaN
ifnl4_rs11322783_df = pd.merge(
    all_id_ref_df.set_index("sre_participant_id"),
    rs11322783_df,
    how='left',
    left_index=True,
    right_index=True,
)

In [42]:
def interpret_gt(GT):
    """Turn an rs11322783 genotype call into (major, haplotype_1, haplotype_2, genotype).

    Parameters
    ----------
    GT : str or NaN
        Diploid biallelic call such as "0|1"; "|" and "/" separators are both
        accepted. NaN (no record for the participant) is assumed homozygous
        reference and reported as "0|0".

    Returns
    -------
    pd.Series of length 4
        [major, haplotype_1, haplotype_2, genotype], where major is the call as
        given, the haplotypes are "TT" (ref) or "G" (alt) and genotype is
        "hap_1/hap_2". Heterozygous calls are always reported ref-first, so
        "0|1" and "1|0" give the same genotype string. Any call that is not two
        alleles drawn from {0, 1} (e.g. "./." or "1|2") yields four NaN.
    """
    ref = "TT"
    alt = "G"
    if GT != GT:
        # NaN is the only value that differs from itself: assumed homozygous ref
        return pd.Series(["0|0", ref, ref, f"{ref}/{ref}"])
    if not isinstance(GT, str):
        return pd.Series([np.nan]*4)

    calls = GT.replace("|", "/").split("/")
    if len(calls) != 2 or any(call not in ("0", "1") for call in calls):
        return pd.Series([np.nan]*4)

    # The number of alt copies picks the haplotype pair; this also covers the explicit
    # "0|0" and the swapped "1|0" calls, which previously fell through to NaN
    hap_1, hap_2 = [(ref, ref), (ref, alt), (alt, alt)][calls.count("1")]
    genotype = f"{hap_1}/{hap_2}"
    return pd.Series([GT, hap_1, hap_2, genotype])
# Expand each GT into the four interpreted columns, then drop the rows that
# interpret_gt could not interpret (all four values NaN)
interpreted_cols = ['major', 'haplotype_1', 'haplotype_2', 'genotype']
ifnl4_rs11322783_df[interpreted_cols] = ifnl4_rs11322783_df['GT'].apply(interpret_gt)
ifnl4_rs11322783_df = ifnl4_rs11322783_df.dropna(subset=interpreted_cols)

In [43]:
# Tag every row with the gene label
ifnl4_rs11322783_df = ifnl4_rs11322783_df.assign(gene='IFNL4_rs11322783')

In [46]:
# Annotate the haplotype functions
gene = "IFNL4_rs11322783"
keep_col = ['founder_status', 'gene', 'sample_id', 'major', 'haplotype_1', 'haplotype_2', 'genotype']
df = ifnl4_rs11322783_df[keep_col].copy()
# Look up the annotated function of each haplotype column in allele_func_d
for hap_col, func_col in (('haplotype_1', 'haplotype_1_func'), ('haplotype_2', 'haplotype_2_func')):
    df[func_col] = df[hap_col].map(lambda hap: annotate_haplotype(gene, hap, allele_func_d))

In [47]:
# Annotate the phenotype
def annotate_ifnl4_1b_phenotype(hap_1_func, hap_2_func):
    """Return pd.Series([phenotype, NaN]) for one pair of haplotype functions.

    The phenotype is the "Decreased response" text when either function equals
    "Carrier of rs11322783 variant (G)", otherwise the "Improved response" text.
    The second element is always NaN.
    """
    carrier_label = "Carrier of rs11322783 variant (G)"
    if carrier_label in (hap_1_func, hap_2_func):
        return pd.Series([
            "Decreased response (including sustained virological response (svr)) to pegIFN-alpha/ribavirin",
            np.nan,
        ])
    return pd.Series([
        "Improved response (including sustained virological response (svr)) to pegIFN-alpha/ribavirin",
        np.nan,
    ])
# Row-wise: pass the two haplotype functions, receive (phenotype, EHR_priority_notation)
df[['phenotype', 'EHR_priority_notation']] = df[['haplotype_1_func', 'haplotype_2_func']].apply(
    lambda funcs: annotate_ifnl4_1b_phenotype(*funcs), axis=1
)

In [48]:
# Register the annotated frame under the current gene label
perGene_df_d.update({gene: df})

# Summarize IFNL3 rs8099917

In [49]:
# Index the calls by participant_id. result_df carries a plain integer index, so without
# this an index-on-index merge against sre_participant_id matches no rows, every GT
# becomes NaN, and every participant is treated as homozygous reference.
rs8099917_df = (
    result_df.loc[result_df['rsID'].str.startswith('rs8099917'), ['participant_id', 'GT']]
    .set_index('participant_id')
)
# More than one record per participant would duplicate that participant in a later join
assert rs8099917_df.index.is_unique, "rs8099917: more than one record for a participant"

In [50]:
# Left-join rs8099917_df onto every annotated participant, matching the two frames on
# their indexes; participants with no matching index entry keep GT = NaN
ifnl3_rs8099917_df = pd.merge(
    all_id_ref_df.set_index("sre_participant_id"),
    rs8099917_df,
    how='left',
    left_index=True,
    right_index=True,
)

In [51]:
def interpret_rs8099917_gt(GT):
    """Turn an rs8099917 genotype call into (major, haplotype_1, haplotype_2, genotype).

    Parameters
    ----------
    GT : str or NaN
        Diploid biallelic call such as "0/1"; "/" and "|" separators are both
        accepted. NaN (no record for the participant) is assumed homozygous
        reference and reported as "0/0".

    Returns
    -------
    pd.Series of length 4
        [major, haplotype_1, haplotype_2, genotype], where major is the call as
        given, the haplotypes are "T" (ref) or "G" (alt) and genotype is
        "hap_1/hap_2". Heterozygous calls are always reported ref-first. Any
        call that is not two alleles drawn from {0, 1} (e.g. "./." or "1/2")
        yields four NaN.
    """
    ref = "T"
    alt = "G"
    if GT != GT:
        # NaN is the only value that differs from itself: assumed homozygous ref
        return pd.Series(["0/0", ref, ref, f"{ref}/{ref}"])
    if not isinstance(GT, str):
        return pd.Series([np.nan]*4)

    calls = GT.replace("|", "/").split("/")
    if len(calls) != 2 or any(call not in ("0", "1") for call in calls):
        return pd.Series([np.nan]*4)

    # The number of alt copies picks the haplotype pair; this also covers the explicit
    # "0/0", the swapped "1/0" and phased calls, which previously fell through to NaN
    hap_1, hap_2 = [(ref, ref), (ref, alt), (alt, alt)][calls.count("1")]
    genotype = f"{hap_1}/{hap_2}"
    return pd.Series([GT, hap_1, hap_2, genotype])
ifnl3_rs8099917_df[['major', 'haplotype_1', 'haplotype_2', 'genotype']] = ifnl3_rs8099917_df['GT'].apply(interpret_rs8099917_gt)
# Drop rows whose GT could not be interpreted (all four values NaN), as is done for
# rs11322783. Otherwise they carry NaN haplotypes into the phenotype step, where a row
# without the carrier annotation is labelled "Increased response".
ifnl3_rs8099917_df = ifnl3_rs8099917_df.dropna(subset=['major', 'haplotype_1', 'haplotype_2', 'genotype'])

In [52]:
# Tag every row with the gene label
ifnl3_rs8099917_df = ifnl3_rs8099917_df.assign(gene='IFNL3_rs8099917')

In [53]:
# Annotate the haplotype functions
gene = "IFNL3_rs8099917"
keep_col = ['founder_status', 'gene', 'sample_id', 'major', 'haplotype_1', 'haplotype_2', 'genotype']
df = ifnl3_rs8099917_df[keep_col].copy()
# Look up the annotated function of each haplotype column in allele_func_d
for hap_col, func_col in (('haplotype_1', 'haplotype_1_func'), ('haplotype_2', 'haplotype_2_func')):
    df[func_col] = df[hap_col].map(lambda hap: annotate_haplotype(gene, hap, allele_func_d))

In [54]:
# Annotate the phenotype
def annotate_ifnl3_1b_phenotype(hap_1_func, hap_2_func):
    """Return pd.Series([phenotype, NaN]) for one pair of haplotype functions.

    The phenotype is the "Decreased response" text when either function equals
    "Carrier of rs8099917 variant (G)", otherwise the "Increased response" text.
    The second element is always NaN.
    """
    carrier_label = "Carrier of rs8099917 variant (G)"
    if carrier_label in (hap_1_func, hap_2_func):
        return pd.Series([
            "Decreased response (lower SVR) to peginterferon alfa and ribavirin therapy",
            np.nan,
        ])
    return pd.Series([
        "Increased response (higher SVR) to peginterferon alfa and ribavirin therapy",
        np.nan,
    ])
# Row-wise: pass the two haplotype functions, receive (phenotype, EHR_priority_notation)
df[['phenotype', 'EHR_priority_notation']] = df[['haplotype_1_func', 'haplotype_2_func']].apply(
    lambda funcs: annotate_ifnl3_1b_phenotype(*funcs), axis=1
)

In [55]:
# Register the annotated frame under the current gene label
perGene_df_d.update({gene: df})

In [59]:
# Founder IDs do not depend on the gene, so they are computed once instead of per iteration.
founder_id:list = id_ref_df['sre_participant_id'].tolist()
founder_id_set = set(founder_id)

# For every gene: restrict to founders, then export the per-participant tables,
# the allele-frequency table and the phenotype table as CSV.
for gene, all_df in perGene_df_d.items():
    called_id:list = all_df.index.tolist()
    # Keep founders in all_df order. A bare set intersection has no stable order,
    # which made the row order of the exported CSV differ between runs.
    called_founder_id:list = [pid for pid in dict.fromkeys(called_id) if pid in founder_id_set]
    df = all_df.loc[called_founder_id, :].copy()

    # Stack both haplotype columns so every allele is one row, then collect per variant
    af_df = pd.concat([
        df.loc[:, ['haplotype_1', 'haplotype_1_func']].rename(columns={'haplotype_1': 'variant', 'haplotype_1_func': 'allele_function'}),
        df.loc[:, ['haplotype_2', 'haplotype_2_func']].rename(columns={'haplotype_2': 'variant', 'haplotype_2_func': 'allele_function'})
    ]).groupby('variant').agg(list)

    # Get general information from af_df
    af_df['allele_count'] = af_df['allele_function'].apply(len)
    af_df['gene'] = gene
    # Every entry of the list is read from the same variant, so the first one is taken
    af_df['allele_function'] = af_df['allele_function'].apply(lambda x: x[0])
    af_df = (
        af_df.dropna(subset=['allele_function'])
        .sort_values(by=['allele_function', 'allele_count'], ascending=False)
        .copy()
    )
    af_df['variant'] = af_df.index
    # allele_number is the total over the alleles that remain after the dropna above
    af_df['allele_number'] = af_df['allele_count'].sum()
    af_df['allele_frequency'] = af_df['allele_count'] / af_df['allele_number']

    # Count zygosity and merge it to af_df; variants missing from one side get 0
    homo_d, hetero_d = count_zygosity(df)
    homo_df = pd.DataFrame(homo_d, index=["no_homozygous_carriers"]).T
    hetero_df = pd.DataFrame(hetero_d, index=["no_heterozygous_carriers"]).T
    zygosity_df = hetero_df.merge(homo_df, left_index=True, right_index=True, how='outer').fillna(0).astype(int)
    af_df = af_df.merge(zygosity_df, left_index=True, right_index=True, how='left')

    # Subset for required columns
    af_colnames = [
        "gene", "variant", "allele_function", 
        "allele_frequency", "allele_count", "allele_number", 
        "no_heterozygous_carriers", "no_homozygous_carriers"
    ]
    af_df = af_df.loc[:, af_colnames].copy()

    # For phenotype CSV: one row per phenotype
    pheno_df = df.groupby('phenotype').agg(list)
    pheno_df['pharmacogene'] = gene
    # sorted() makes the joined genotype string reproducible (set order is not)
    pheno_df['genotype'] = pheno_df['genotype'].apply(lambda x: ",".join(sorted(set(x))))
    pheno_df['phenotype'] = pheno_df.index
    # Number of samples in each phenotype group
    pheno_df['no_carriers_with_risk_alleles'] = pheno_df['sample_id'].apply(len)
    pheno_df['all_genotyped_individuals'] = pheno_df['no_carriers_with_risk_alleles'].sum()
    pheno_df['carrier_frequency'] = pheno_df['no_carriers_with_risk_alleles'] / pheno_df['all_genotyped_individuals']
    pheno_df = pheno_df.sort_values(by=['carrier_frequency'], ascending=False)

    # Subset of columns to be exported
    keep_colnames = [
        'pharmacogene', 'genotype', 'phenotype', 'carrier_frequency', 
        'no_carriers_with_risk_alleles', 'all_genotyped_individuals'
    ]
    pheno_df = pheno_df.loc[:, keep_colnames].copy()

    # Export to CSV
    all_perGene_csv = f"{all_perGene_prefix}.{gene}.csv"
    perGene_csv = f"{perGene_prefix}.{gene}.csv"
    af_csv = f"{af_prefix}.{gene}.csv"
    pheno_csv = f"{pheno_prefix}.{gene}.csv"
    # Create the output directories up front so a missing folder cannot fail the export
    for out_csv in (all_perGene_csv, perGene_csv, af_csv, pheno_csv):
        Path(out_csv).parent.mkdir(parents=True, exist_ok=True)
    all_df.to_csv(all_perGene_csv, index=True, index_label='sre_participant_id')
    df.to_csv(perGene_csv, index=True, index_label='sre_participant_id')
    af_df.to_csv(af_csv, index=False)
    pheno_df.to_csv(pheno_csv, index=False)