In [1]:
import multiprocessing
import re
import sys
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm
from yaml import CLoader as Loader
from yaml import load

In [2]:
# Summary directory: the two Aldy call tables are read from here and every
# output prefix below is derived from it.
sumDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01a.genotype.aldy/summary")
allParticipant_aldy_out_tsv = sumDir.joinpath("allParticipants_aldy_out.tsv")
founder_aldy_out_tsv = sumDir.joinpath("founder_aldy_out.tsv")

# YAML resources kept next to the analysis scripts.
allele_map = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01a.genotype.aldy/script/aldy_allele_map.yml")
allele_func = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01a.genotype.aldy/script/aldy_custom_allele_functions.yml")

# Output filename prefixes; the export loop appends ".<gene>.csv" to each.
all_perGene_prefix: str = str(sumDir.joinpath("all_perGenes"))
perGene_prefix: str = str(sumDir.joinpath("perGenes"))
af_prefix: str = str(sumDir.joinpath("allele_frequency"))
pheno_prefix: str = str(sumDir.joinpath("phenotype"))

# Genes processed by this notebook.
genes_need_custom_def = [
    "CFTR",
    "CYP3A4",
    "CYP4F2",
    "IFNL3",
    "VKORC1",
]
# gene -> annotated per-gene DataFrame, filled in by the per-gene sections.
perGene_df_d: dict = {}

In [3]:
# Parse the custom allele-function YAML into a Python object.
# Presumably a gene -> {haplotype: function label} mapping, given how it is
# passed to annotate_haplotype later on; confirm against the YAML file.
with allele_func.open() as handle:
    allele_func_d = load(handle, Loader=Loader)

In [4]:
def annotate_haplotype(gene, haplotype, allele_map_d):
    """Translate one haplotype label through a per-gene lookup table.

    Parameters
    ----------
    gene
        Key into ``allele_map_d``.
    haplotype
        Haplotype / allele label to translate.
    allele_map_d : dict
        Maps each gene either to the sentinel string ``"all_mapped"`` or to a
        ``{haplotype: annotation}`` dict.

    Returns
    -------
    The annotation stored for ``haplotype``. The unchanged ``haplotype`` is
    returned when the gene is flagged ``"all_mapped"`` or when the haplotype
    has no entry for that gene.

    Raises
    ------
    SystemExit
        With exit code 1, after printing a message, when ``gene`` is not a
        key of ``allele_map_d``.
    """
    # Unknown gene: report and stop. This relies on the module-level
    # ``import sys``; without it this branch raised NameError instead.
    if gene not in allele_map_d:
        print(f"{gene} not defined. Exiting")
        sys.exit(1)

    gene_map = allele_map_d[gene]

    # Sentinel meaning that every allele of this gene is already in its final form.
    if gene_map == "all_mapped":
        return haplotype

    # Fall back to the raw label when no annotation is defined.
    return gene_map.get(haplotype, haplotype)

def count_zygosity(called_df):
    """Tally homozygous and heterozygous carriers per haplotype.

    Every row of ``called_df`` is read through its ``haplotype_1`` and
    ``haplotype_2`` fields.

    Returns
    -------
    (homo_dict, hetero_dict) : tuple of defaultdict(int)
        ``homo_dict[h]`` is the number of rows whose two haplotypes are both
        ``h``. ``hetero_dict[h]`` is the number of rows in which ``h`` is
        paired with a different haplotype.
    """
    homo_dict = defaultdict(int)
    hetero_dict = defaultdict(int)
    for _, call in called_df.iterrows():
        first, second = call.haplotype_1, call.haplotype_2
        if first != second:
            # Heterozygous: each of the two alleles gains one carrier.
            hetero_dict[first] += 1
            hetero_dict[second] += 1
            continue
        # Homozygous: a single carrier for the shared allele.
        homo_dict[first] += 1
    return homo_dict, hetero_dict

In [5]:
# Read the all-participant Aldy call table; the first column becomes the row index.
allParticipant_aldy_out = pd.read_table(allParticipant_aldy_out_tsv, index_col=[0])
# Independent copy restricted to the genes listed in genes_need_custom_def.
custom_allParticipant_aldy_out = allParticipant_aldy_out[
    allParticipant_aldy_out['gene'].isin(genes_need_custom_def)
].copy()

In [6]:
# Read the founder-only Aldy call table; the first column becomes the row index.
founder_aldy_out = pd.read_table(founder_aldy_out_tsv, index_col=[0])
# Independent copy restricted to the genes listed in genes_need_custom_def.
custom_founder_aldy_out = founder_aldy_out[
    founder_aldy_out['gene'].isin(genes_need_custom_def)
].copy()

In [7]:
# Build one call table per gene, leaving out every sample that has any row
# with solutionID == 2 (presumably samples with more than one Aldy solution;
# confirm against the Aldy output schema).
gene_df_d: dict = {}
for gene in genes_need_custom_def:
    df = allParticipant_aldy_out[allParticipant_aldy_out['gene'] == gene].copy()
    sample_w_sol2: list = df.loc[df['solutionID'] == 2, 'sample_id'].unique().tolist()
    gene_df_d[gene] = df[~df['sample_id'].isin(sample_w_sol2)].copy()

# CFTR

In [8]:
gene = "CFTR"
# Output paths for this gene. NOTE: the export loop at the end of the notebook
# recomputes all three names before anything is written.
perGene_csv, af_csv, pheno_csv = (
    f"{prefix}.{gene}.csv" for prefix in (perGene_prefix, af_prefix, pheno_prefix)
)
# Work on a private copy so the cached per-gene frame stays untouched.
df = gene_df_d[gene].copy(deep=True)

In [9]:
# Rename *WT+rs76151804 to 3272-26A->G (PharmGKB annotation) in both haplotype columns
df[['haplotype_1', 'haplotype_2']] = df[['haplotype_1', 'haplotype_2']].replace(
    "*WT+rs76151804", "3272-26A->G"
)

In [10]:
# Genotype label is "<haplotype_1>/<haplotype_2>" for every row.
df['genotype'] = [
    f"{hap_1}/{hap_2}" for hap_1, hap_2 in zip(df['haplotype_1'], df['haplotype_2'])
]

In [11]:
# Look up the function label of each allele in the custom CFTR definitions.
df['haplotype_1_func'] = [
    annotate_haplotype('CFTR', hap, allele_func_d) for hap in df['haplotype_1']
]
df['haplotype_2_func'] = [
    annotate_haplotype('CFTR', hap, allele_func_d) for hap in df['haplotype_2']
]

In [12]:
# For perGene csv: derive the phenotype and EHR_priority_notation columns
def annotate_cftr_phenotype(hap_1_func, hap_2_func):
    """Return ``pd.Series([phenotype, NaN])`` for a pair of CFTR allele functions.

    The phenotype is "Non response to ivacaftor if homozygous" only when both
    alleles carry exactly that function label; otherwise it is "Responsive".
    The second element (EHR priority notation) is always NaN.
    """
    non_response = "Non response to ivacaftor if homozygous"
    both_non_response = (hap_1_func == non_response) and (hap_2_func == non_response)
    phenotype = non_response if both_non_response else "Responsive"
    return pd.Series([phenotype, np.nan])
# Row-wise: each returned two-element Series fills the two new columns in order.
df[['phenotype', 'EHR_priority_notation']] = df.apply(
    lambda row: annotate_cftr_phenotype(row['haplotype_1_func'], row['haplotype_2_func']),
    axis=1,
)
# Register the annotated frame for the export loop.
perGene_df_d[gene] = df

# CYP3A4

In [13]:
# Start the CYP3A4 section from a private copy of the filtered calls.
gene = "CYP3A4"
df = gene_df_d[gene].copy(deep=True)

In [14]:
# Prep perGene CSV: genotype label plus the function label of each allele
df['genotype'] = [
    f"{hap_1}/{hap_2}" for hap_1, hap_2 in zip(df['haplotype_1'], df['haplotype_2'])
]
df['haplotype_1_func'] = [
    annotate_haplotype('CYP3A4', hap, allele_func_d) for hap in df['haplotype_1']
]
df['haplotype_2_func'] = [
    annotate_haplotype('CYP3A4', hap, allele_func_d) for hap in df['haplotype_2']
]
# Drop genotypes in which either allele's function is the string 'NA' (undefined)
df = df[~(df['haplotype_1_func'].isin(['NA']) | df['haplotype_2_func'].isin(['NA']))].copy()

In [15]:
# Add the phenotype and EHR_priority_notation
def annotate_cyp3a4_phenotype(hap_1_func, hap_2_func):
    """Return ``pd.Series([phenotype, NaN])`` for a pair of CYP3A4 allele functions.

    The phenotype depends on how many of the two function labels contain the
    substring "Decreased function": two give Poor, one gives Intermediate and
    none gives Normal Metabolizer. The second element (EHR priority notation)
    is always NaN.
    """
    marker = "Decreased function"
    n_decreased = int(marker in hap_1_func) + int(marker in hap_2_func)
    # Indexed by the number of decreased-function alleles (0, 1 or 2).
    by_count = (
        "CYP3A4 Normal Metabolizer",
        "CYP3A4 Intermediate Metabolizer",
        "CYP3A4 Poor Metabolizer",
    )
    return pd.Series([by_count[n_decreased], np.nan])
    
# Row-wise: each returned two-element Series fills the two new columns in order.
df[['phenotype', 'EHR_priority_notation']] = df.apply(
    lambda row: annotate_cyp3a4_phenotype(row['haplotype_1_func'], row['haplotype_2_func']),
    axis=1,
)
# Register the annotated frame for the export loop.
perGene_df_d[gene] = df

# CYP4F2

In [16]:
# Start the CYP4F2 section from a private copy of the filtered calls.
gene = "CYP4F2"
df = gene_df_d[gene].copy(deep=True)

In [17]:
# Prep perGene CSV: genotype label plus the function label of each allele
df['genotype'] = [
    f"{hap_1}/{hap_2}" for hap_1, hap_2 in zip(df['haplotype_1'], df['haplotype_2'])
]
df['haplotype_1_func'] = [
    annotate_haplotype('CYP4F2', hap, allele_func_d) for hap in df['haplotype_1']
]
df['haplotype_2_func'] = [
    annotate_haplotype('CYP4F2', hap, allele_func_d) for hap in df['haplotype_2']
]

In [18]:
# Add the phenotype and EHR_priority_notation
def annotate_cyp4f2_phenotype(hap_1_func, hap_2_func):
    """Return ``pd.Series([phenotype, NaN])`` for a pair of CYP4F2 allele functions.

    The phenotype is 'Decreased function' when at least one allele's function
    label equals 'Carrier of rs2108622 (A)', otherwise 'Normal function'.
    The second element (EHR priority notation) is always NaN.
    """
    is_carrier = 'Carrier of rs2108622 (A)' in (hap_1_func, hap_2_func)
    phenotype = 'Decreased function' if is_carrier else 'Normal function'
    return pd.Series([phenotype, np.nan])
# Row-wise: each returned two-element Series fills the two new columns in order.
df[['phenotype', 'EHR_priority_notation']] = df.apply(
    lambda row: annotate_cyp4f2_phenotype(row['haplotype_1_func'], row['haplotype_2_func']),
    axis=1,
)
# Register the annotated frame for the export loop.
perGene_df_d[gene] = df

# IFNL3

In [19]:
# Start the IFNL3 section from a private copy of the filtered calls.
gene = "IFNL3"
df = gene_df_d[gene].copy(deep=True)

In [20]:
# Prep perGene CSV: genotype label plus the function label of each allele
df['genotype'] = [
    f"{hap_1}/{hap_2}" for hap_1, hap_2 in zip(df['haplotype_1'], df['haplotype_2'])
]
df['haplotype_1_func'] = [
    annotate_haplotype('IFNL3', hap, allele_func_d) for hap in df['haplotype_1']
]
df['haplotype_2_func'] = [
    annotate_haplotype('IFNL3', hap, allele_func_d) for hap in df['haplotype_2']
]

In [21]:
# Add the phenotype and EHR_priority_notation
def annotate_ifnl3_phenotype(hap_1_func, hap_2_func):
    """Return ``pd.Series([phenotype, NaN])`` for a pair of IFNL3 allele functions.

    The phenotype is 'Unfavourable response' when at least one allele's
    function label equals 'Carrier of rs12979860 (T)', otherwise
    'Favourable response'. The second element (EHR priority notation) is
    always NaN.
    """
    is_carrier = 'Carrier of rs12979860 (T)' in (hap_1_func, hap_2_func)
    phenotype = 'Unfavourable response' if is_carrier else 'Favourable response'
    return pd.Series([phenotype, np.nan])
# Row-wise: each returned two-element Series fills the two new columns in order.
df[['phenotype', 'EHR_priority_notation']] = df.apply(
    lambda row: annotate_ifnl3_phenotype(row['haplotype_1_func'], row['haplotype_2_func']),
    axis=1,
)
# Register the annotated frame for the export loop.
perGene_df_d[gene] = df

# VKORC1

In [22]:
# Start the VKORC1 section from a private copy of the filtered calls.
gene = "VKORC1"
df = gene_df_d[gene].copy(deep=True)

In [23]:
# Prep perGene CSV: genotype label plus the function label of each allele
df['genotype'] = [
    f"{hap_1}/{hap_2}" for hap_1, hap_2 in zip(df['haplotype_1'], df['haplotype_2'])
]
df['haplotype_1_func'] = [
    annotate_haplotype('VKORC1', hap, allele_func_d) for hap in df['haplotype_1']
]
df['haplotype_2_func'] = [
    annotate_haplotype('VKORC1', hap, allele_func_d) for hap in df['haplotype_2']
]

In [24]:
# Add the phenotype and EHR_priority_notation
def annotate_vkorc1_phenotype(hap_1_func, hap_2_func):
    """Return ``pd.Series([phenotype, NaN])`` for a pair of VKORC1 allele functions.

    The phenotype is 'Increased risk of over-anticoagulation' when at least
    one allele's function label equals 'Carrier of rs9923231 variant (T)',
    otherwise 'Normal risk'. The second element (EHR priority notation) is
    always NaN.
    """
    is_carrier = 'Carrier of rs9923231 variant (T)' in (hap_1_func, hap_2_func)
    phenotype = 'Increased risk of over-anticoagulation' if is_carrier else 'Normal risk'
    return pd.Series([phenotype, np.nan])
# Row-wise: each returned two-element Series fills the two new columns in order.
df[['phenotype', 'EHR_priority_notation']] = df.apply(
    lambda row: annotate_vkorc1_phenotype(row['haplotype_1_func'], row['haplotype_2_func']),
    axis=1,
)
# Register the annotated frame for the export loop.
perGene_df_d[gene] = df

# Calculate the allele frequencies and phenotype carrier frequencies

In [25]:
# For every annotated gene: build the founder allele-frequency table and the
# founder phenotype carrier table, then write four CSVs (all participants,
# founders only, allele frequencies, phenotype frequencies).
for gene, all_df in perGene_df_d.items():
    # Frequencies are computed on founders only; all_df is exported unchanged.
    df = all_df.loc[all_df['founder_status'] == 'Founder', :].copy()

    # ---- Allele frequency table ----
    # Stack both haplotype columns so that each row is one observed allele,
    # then collect the function labels seen for every variant.
    af_df = pd.concat([
        df.loc[:, ['haplotype_1', 'haplotype_1_func']].rename(
            columns={'haplotype_1': 'variant', 'haplotype_1_func': 'allele_function'}
        ),
        df.loc[:, ['haplotype_2', 'haplotype_2_func']].rename(
            columns={'haplotype_2': 'variant', 'haplotype_2_func': 'allele_function'}
        ),
    ]).groupby('variant').agg(list)

    # One list entry per observed allele, so the list length is the allele count.
    af_df['allele_count'] = af_df['allele_function'].apply(len)
    af_df['gene'] = gene
    # Keep the first label of each variant (presumably all labels of a variant
    # are identical, since they come from the same per-haplotype lookup).
    af_df['allele_function'] = af_df['allele_function'].apply(lambda funcs: funcs[0])
    af_df = (
        af_df
        .dropna(subset=['allele_function'])
        .sort_values(by=['allele_function', 'allele_count'], ascending=False)
    )
    af_df['variant'] = af_df.index
    # The denominator is taken after variants without a function label were dropped.
    af_df['allele_number'] = af_df['allele_count'].sum()
    af_df['allele_frequency'] = af_df['allele_count'] / af_df['allele_number']

    # Count zygosity per variant and attach it to the allele table.
    homo_d, hetero_d = count_zygosity(df)
    homo_df = pd.DataFrame(homo_d, index=["no_homozygous_carriers"]).T
    hetero_df = pd.DataFrame(hetero_d, index=["no_heterozygous_carriers"]).T
    # Outer merge: a variant may be missing from one of the two tallies; count it as 0.
    zygosity_df = (
        hetero_df
        .merge(homo_df, left_index=True, right_index=True, how='outer')
        .fillna(0)
        .astype(int)
    )
    af_df = af_df.merge(zygosity_df, left_index=True, right_index=True, how='left')

    # Subset for required columns, in export order.
    af_colnames = [
        "gene", "variant", "allele_function",
        "allele_frequency", "allele_count", "allele_number",
        "no_heterozygous_carriers", "no_homozygous_carriers"
    ]
    af_df = af_df.loc[:, af_colnames].copy()

    # ---- Phenotype carrier table ----
    # Only the genotype and sample_id lists are needed downstream.
    pheno_df = df.groupby('phenotype')[['genotype', 'sample_id']].agg(list)
    pheno_df['pharmacogene'] = gene
    # Sort the distinct genotypes: bare set iteration order for strings differs
    # between interpreter sessions, which made the exported column irreproducible.
    pheno_df['genotype'] = pheno_df['genotype'].apply(
        lambda genotypes: ",".join(sorted(set(genotypes)))
    )
    pheno_df['phenotype'] = pheno_df.index
    # Number of founder rows carrying each phenotype.
    pheno_df['no_carriers_with_risk_alleles'] = pheno_df['sample_id'].apply(len)
    pheno_df['all_genotyped_individuals'] = pheno_df['no_carriers_with_risk_alleles'].sum()
    pheno_df['carrier_frequency'] = (
        pheno_df['no_carriers_with_risk_alleles'] / pheno_df['all_genotyped_individuals']
    )
    pheno_df = pheno_df.sort_values(by=['carrier_frequency'], ascending=False)

    # Subset of columns to be exported, in export order.
    keep_colnames = [
        'pharmacogene', 'genotype', 'phenotype', 'carrier_frequency',
        'no_carriers_with_risk_alleles', 'all_genotyped_individuals'
    ]
    pheno_df = pheno_df.loc[:, keep_colnames].copy()

    # ---- Export to CSV ----
    all_perGene_csv = f"{all_perGene_prefix}.{gene}.csv"
    perGene_csv = f"{perGene_prefix}.{gene}.csv"
    af_csv = f"{af_prefix}.{gene}.csv"
    pheno_csv = f"{pheno_prefix}.{gene}.csv"
    all_df.to_csv(all_perGene_csv, index=True, index_label='sre_patient_id')
    df.to_csv(perGene_csv, index=True, index_label='sre_patient_id')
    af_df.to_csv(af_csv, index=False)
    pheno_df.to_csv(pheno_csv, index=False)