In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import sys
from tqdm import tqdm
import multiprocessing
from collections import defaultdict
from yaml import load
from yaml import CLoader as Loader

In [2]:
# --- Inputs -----------------------------------------------------------------
# Summary directory: holds the aldy result tables read below and receives the
# per-gene CSVs written at the end of the notebook.
sumDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01a.genotype.aldy/summary")
all_aldy_out_tsv = sumDir.joinpath("allParticipants_aldy_out.tsv")
founder_aldy_out_tsv = sumDir.joinpath("founder_aldy_out.tsv")

# YAML resources shipped with the analysis scripts.
allele_map = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01a.genotype.aldy/script/aldy_allele_map.yml")
allele_func = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01a.genotype.aldy/script/aldy_custom_allele_functions.yml")

# --- Output prefixes ----------------------------------------------------------
# Kept as plain strings because ".<gene>.csv" is appended with f-strings later.
all_perGene_prefix: str = str(sumDir.joinpath("all_perGenes"))
perGene_prefix: str = str(sumDir.joinpath("perGenes"))
af_prefix: str = str(sumDir.joinpath("allele_frequency"))
pheno_prefix: str = str(sumDir.joinpath("phenotype"))

# --- Analysis state -----------------------------------------------------------
# Genes handled by this notebook, and the gene -> annotated DataFrame registry
# that the per-gene sections fill in.
genes_need_custom_def = ["CYP2A6", "NAT2", "VKORC1"]
perGene_df_d: dict = {}

In [3]:
# Parse the custom allele-function definitions into a plain Python object.
# NOTE(review): CLoader is the C build of PyYAML's full Loader, which can
# construct arbitrary Python objects. Fine for this project-owned file; switch
# to CSafeLoader if the YAML ever comes from an untrusted source.
with allele_func.open() as yaml_handle:
    allele_func_d = load(yaml_handle, Loader=Loader)

In [4]:
def annotate_haplotype(gene, haplotype, allele_map_d):
    """Translate a haplotype through the mapping stored for `gene`.

    Parameters
    ----------
    gene : key looked up in `allele_map_d`.
    haplotype : haplotype label to translate.
    allele_map_d : dict whose value for `gene` is either the string
        "all_mapped" or a dict of haplotype -> annotation.

    Returns
    -------
    The mapped annotation when one exists for `haplotype`; otherwise the
    haplotype itself (also when the gene's entry is "all_mapped").

    Notes
    -----
    Prints a message and calls ``sys.exit(1)`` when `gene` has no entry.
    """
    # A missing gene is treated as fatal.
    if gene not in allele_map_d:
        print(f"{gene} not defined. Exiting")
        sys.exit(1)

    gene_entry = allele_map_d[gene]

    # "all_mapped" means the haplotype is used as-is.
    if gene_entry == "all_mapped":
        return haplotype

    # Fall back to the raw haplotype when the mapping has no entry for it.
    return gene_entry.get(haplotype, haplotype)

def count_zygosity(called_df):
    """Tally homozygous and heterozygous occurrences of each haplotype.

    Each row of `called_df` must expose `haplotype_1` and `haplotype_2`.
    A row whose two haplotypes are equal adds one to that haplotype's
    homozygous tally; otherwise each of the two haplotypes gets one
    heterozygous count.

    Returns
    -------
    (homozygous_counts, heterozygous_counts) : two ``defaultdict(int)``
        objects keyed by haplotype.
    """
    homozygous_counts = defaultdict(int)
    heterozygous_counts = defaultdict(int)
    for _, call in called_df.iterrows():
        first, second = call.haplotype_1, call.haplotype_2
        if first == second:
            homozygous_counts[first] += 1
            continue
        # Different haplotypes: both are carried heterozygously.
        for allele in (first, second):
            heterozygous_counts[allele] += 1
    return homozygous_counts, heterozygous_counts

In [5]:
# Read the all-participant table (first column becomes the index) and keep a
# separate copy restricted to the genes listed in genes_need_custom_def.
all_aldy_out = pd.read_table(all_aldy_out_tsv, index_col=[0])
custom_all_aldy_out = all_aldy_out[all_aldy_out['gene'].isin(genes_need_custom_def)].copy()

In [6]:
# Read the founder table (first column becomes the index) and keep a separate
# copy restricted to the genes listed in genes_need_custom_def.
founder_aldy_out = pd.read_table(founder_aldy_out_tsv, index_col=[0])
custom_founder_aldy_out = founder_aldy_out[founder_aldy_out['gene'].isin(genes_need_custom_def)].copy()

In [8]:
# Build one DataFrame per gene. Any sample that has a row with solutionID == 2
# for that gene is dropped entirely (all of its rows, not just that one).
gene_df_d: dict = {}
for gene in genes_need_custom_def:
    df = all_aldy_out[all_aldy_out['gene'].eq(gene)].copy()
    has_sol2 = df['solutionID'].eq(2)
    sample_w_sol2: list = df.loc[has_sol2, 'sample_id'].unique().tolist()
    gene_df_d[gene] = df[~df['sample_id'].isin(sample_w_sol2)].copy()

# NAT2

In [9]:
gene = 'NAT2'
# Output paths follow "<prefix>.<gene>.csv" for each of the four prefixes.
all_perGene_csv, perGene_csv, af_csv, pheno_csv = (
    f"{prefix}.{gene}.csv"
    for prefix in (all_perGene_prefix, perGene_prefix, af_prefix, pheno_prefix)
)
# Work on a copy so the filtered table in gene_df_d stays untouched.
df = gene_df_d[gene].copy()

In [10]:
# Genotype label: the two haplotypes joined as "<haplotype_1>/<haplotype_2>".
df['genotype'] = df.apply(
    lambda call: "/".join(map(str, (call.haplotype_1, call.haplotype_2))),
    axis=1,
)

In [11]:
# Annotate each haplotype column with its allele function, writing the result
# to "<column>_func". `col=hap_col` binds the column name per iteration.
for hap_col in ('haplotype_1', 'haplotype_2'):
    df[f"{hap_col}_func"] = df.apply(
        lambda call, col=hap_col: annotate_haplotype(gene, call[col], allele_func_d),
        axis=1,
    )

In [12]:
# For perGene csv: Annotate the phenotype and EHR_priority_notation
def annotate_nat2_phenotype(hap_1_func, hap_2_func):
    """Derive the NAT2 phenotype from the two haplotype function annotations.

    Rules, evaluated in order:
      1. either haplotype is annotated with the "Decreased risk ..." text
         -> that text is the phenotype;
      2. both haplotypes are annotated with the "Increased risk ..." text
         -> that text is the phenotype;
      3. anything else -> "Indeterminate".

    Parameters
    ----------
    hap_1_func, hap_2_func : function annotations of the two haplotypes.

    Returns
    -------
    pd.Series of length 2: ``[phenotype, NaN]``. The NaN is the placeholder
    that callers store in the ``EHR_priority_notation`` column.
    """
    decreased_risk = "Decreased risk of developing toxic liver disease when treated with isoniazid regimens"
    increased_risk = "Increased risk of developing toxic liver disease when treated with isoniazid regimens"

    if hap_1_func == decreased_risk or hap_2_func == decreased_risk:
        phenotype = decreased_risk
    # Must be `elif`: as a separate `if`, its `else` branch would overwrite
    # the decreased-risk result above with "Indeterminate".
    elif hap_1_func == increased_risk and hap_2_func == increased_risk:
        phenotype = increased_risk
    else:
        phenotype = "Indeterminate"
    return pd.Series([phenotype, np.nan])

# Row-wise phenotype call; each row yields a 2-element Series that fills the
# phenotype and EHR_priority_notation columns.
nat2_annotations = df.apply(
    lambda call: annotate_nat2_phenotype(call['haplotype_1_func'], call['haplotype_2_func']),
    axis=1,
)
df[['phenotype', 'EHR_priority_notation']] = nat2_annotations
# Register the annotated table for the frequency/export step.
perGene_df_d[gene] = df

# CYP2A6

In [13]:
gene = 'CYP2A6'
# Output paths follow "<prefix>.<gene>.csv" for each of the four prefixes.
all_perGene_csv, perGene_csv, af_csv, pheno_csv = (
    f"{prefix}.{gene}.csv"
    for prefix in (all_perGene_prefix, perGene_prefix, af_prefix, pheno_prefix)
)
# Work on a copy so the filtered table in gene_df_d stays untouched.
df = gene_df_d[gene].copy()

In [14]:
# Genotype label: the two haplotypes joined as "<haplotype_1>/<haplotype_2>".
df['genotype'] = df.apply(
    lambda call: "/".join(map(str, (call.haplotype_1, call.haplotype_2))),
    axis=1,
)

In [15]:
# Annotate each haplotype column with its allele function, writing the result
# to "<column>_func". `col=hap_col` binds the column name per iteration.
for hap_col in ('haplotype_1', 'haplotype_2'):
    df[f"{hap_col}_func"] = df.apply(
        lambda call, col=hap_col: annotate_haplotype(gene, call[col], allele_func_d),
        axis=1,
    )

In [16]:
# For perGene csv: Annotate the phenotype and EHR_priority_notation
def annotate_cyp2a6_phenotype(hap_1_func, hap_2_func):
    """Derive the CYP2A6 phenotype from the two haplotype function annotations.

    * "Indeterminate" if either haplotype is annotated "Indeterminate";
    * "Normal metabolism of nicotine" if both haplotypes carry that annotation;
    * "Decreased metabolism of nicotine" in every other case.

    Returns
    -------
    pd.Series of length 2: ``[phenotype, NaN]``; the NaN is the placeholder
    that callers store in the ``EHR_priority_notation`` column.
    """
    normal = "Normal metabolism of nicotine"

    def _as_result(phenotype):
        # Second slot is the empty EHR_priority_notation value.
        return pd.Series([phenotype, np.nan])

    if hap_1_func == "Indeterminate" or hap_2_func == "Indeterminate":
        return _as_result("Indeterminate")
    if hap_1_func == hap_2_func and hap_2_func == normal:
        return _as_result(normal)
    return _as_result("Decreased metabolism of nicotine")

# Row-wise phenotype call; each row yields a 2-element Series that fills the
# phenotype and EHR_priority_notation columns.
cyp2a6_annotations = df.apply(
    lambda call: annotate_cyp2a6_phenotype(call['haplotype_1_func'], call['haplotype_2_func']),
    axis=1,
)
df[['phenotype', 'EHR_priority_notation']] = cyp2a6_annotations
# Register the annotated table for the frequency/export step.
perGene_df_d[gene] = df

# VKORC1: 4 variants with clinical annotation level 1B

In [17]:
gene = 'VKORC1'
# Output paths follow "<prefix>.<gene>.csv" for each of the four prefixes.
all_perGene_csv, perGene_csv, af_csv, pheno_csv = (
    f"{prefix}.{gene}.csv"
    for prefix in (all_perGene_prefix, perGene_prefix, af_prefix, pheno_prefix)
)
# Kept under its own name: the per-variant loop copies from this table.
vkorc1_df = gene_df_d[gene].copy()

In [18]:
def annotate_vkorc1_1b_phenotype(var, hap_1_func, hap_2_func):
    """Derive the warfarin-dose phenotype for one VKORC1 variant.

    A sample counts as a carrier when either haplotype annotation equals the
    variant's carrier label. The phenotype text depends on the variant and on
    carrier status.

    Parameters
    ----------
    var : one of "rs7294", "rs2359612", "rs8050894", "rs9934438".
    hap_1_func, hap_2_func : function annotations of the two haplotypes.

    Returns
    -------
    pd.Series of length 2: ``[phenotype, NaN]``; the NaN is the placeholder
    that callers store in the ``EHR_priority_notation`` column.

    Raises
    ------
    ValueError
        If `var` is not one of the supported variants. The earlier if/elif
        chain left `phenotype` unassigned and failed with UnboundLocalError.
    """
    # variant -> (carrier label, phenotype if carrier, phenotype if non-carrier)
    # NOTE(review): the phenotype wording differs between variants
    # ("higher"/"increased", with/without "a"); kept verbatim because these
    # strings are written to the exported CSVs.
    rules = {
        "rs7294": (
            "Carrier of rs7294 variant (A)",
            "May require higher dose of warfarin",
            "May require lower dose of warfarin",
        ),
        "rs2359612": (
            "Carrier of rs2359612 variant (T)",
            "May require decreased dose of warfarin",
            "May require increased dose of warfarin",
        ),
        "rs8050894": (
            "Carrier of rs8050894 variant (C)",
            "May require a lower dose of warfarin",
            "May require a higher dose of warfarin",
        ),
        "rs9934438": (
            "Carrier of rs9934438 variant (T)",
            "May require lower dose of warfarin",
            "May require higher dose of warfarin",
        ),
    }
    if var not in rules:
        raise ValueError(
            f"Unsupported VKORC1 variant {var!r}; expected one of {sorted(rules)}"
        )

    carrier_label, carrier_phenotype, non_carrier_phenotype = rules[var]
    is_carrier = carrier_label in (hap_1_func, hap_2_func)
    phenotype = carrier_phenotype if is_carrier else non_carrier_phenotype
    return pd.Series([phenotype, np.nan])

In [19]:
# Annotate the VKORC1 table once per variant; each result is registered in
# perGene_df_d under "<gene>_<variant>", the same key used to look up the
# variant's allele functions in allele_func_d.
vkorc1_1b_variants: list = ['rs7294', 'rs2359612', 'rs8050894', 'rs9934438']
for var in vkorc1_1b_variants:
    variant_key = f"{gene}_{var}"
    df = vkorc1_df.copy()
    df['genotype'] = df.apply(
        lambda call: "/".join(map(str, (call.haplotype_1, call.haplotype_2))),
        axis=1,
    )
    # `col=hap_col` binds the column name per iteration.
    for hap_col in ('haplotype_1', 'haplotype_2'):
        df[f"{hap_col}_func"] = df.apply(
            lambda call, col=hap_col: annotate_haplotype(variant_key, call[col], allele_func_d),
            axis=1,
        )
    variant_annotations = df.apply(
        lambda call: annotate_vkorc1_1b_phenotype(var, call['haplotype_1_func'], call['haplotype_2_func']),
        axis=1,
    )
    df[['phenotype', 'EHR_priority_notation']] = variant_annotations
    perGene_df_d[variant_key] = df

# Calculate the allele frequencies and phenotype carrier frequencies

In [21]:
# For every annotated table in perGene_df_d, compute founder-only allele and
# phenotype frequencies and write four CSVs:
#   <all_perGene_prefix>.<gene>.csv  all rows of the annotated table
#   <perGene_prefix>.<gene>.csv      founder rows only
#   <af_prefix>.<gene>.csv           allele frequency table
#   <pheno_prefix>.<gene>.csv        phenotype carrier frequency table
for gene, all_df in perGene_df_d.items():
    # Get df of founders
    df = all_df.loc[all_df['founder_status']=="Founder", :].copy()

    # Groupby allele and calculate the frequencies
    # Initialize af_df: stack both haplotype columns into one
    # (variant, allele_function) table so each row is one observed allele,
    # then collect the function annotations per variant into a list.
    af_df = pd.concat([
        df.loc[:, ['haplotype_1', 'haplotype_1_func']].rename(columns={'haplotype_1': 'variant', 'haplotype_1_func': 'allele_function'}),
        df.loc[:, ['haplotype_2', 'haplotype_2_func']].rename(columns={'haplotype_2': 'variant', 'haplotype_2_func': 'allele_function'})
    ]).groupby('variant').agg(list)

    # Get general information from af_df
    # The list length is the number of observed copies of the variant.
    af_df['allele_count'] = af_df['allele_function'].apply(len)
    af_df['gene'] = gene
    # Collapse the list to its first element as the variant's function.
    af_df['allele_function'] = af_df['allele_function'].apply(lambda x: x[0])
    af_df.dropna(subset=['allele_function'], inplace=True)
    af_df.sort_values(by=['allele_function', 'allele_count'], ascending=False, inplace=True)
    af_df['variant'] = af_df.index
    # Denominator is computed after the dropna above, so variants without an
    # allele function do not contribute to allele_number.
    af_df['allele_number'] = af_df['allele_count'].sum()
    # Column-wise division: unlike a row-wise apply, this also works when
    # af_df has no rows (gene without founder calls).
    af_df['allele_frequency'] = af_df['allele_count'] / af_df['allele_number']

    # Count zygosity and merge it to df
    homo_d, hetero_d = count_zygosity(df)
    homo_df = pd.DataFrame(homo_d, index=["no_homozygous_carriers"]).T
    hetero_df = pd.DataFrame(hetero_d, index=["no_heterozygous_carriers"]).T
    # Outer merge keeps variants seen in only one zygosity state; the missing
    # side becomes 0.
    zygosity_df = hetero_df.merge(homo_df, left_index=True, right_index=True, how='outer').fillna(0).astype(int)
    af_df = af_df.merge(zygosity_df, left_index=True, right_index=True, how='left')

    # Subset for required columns and export to CSV
    af_colnames = [
        "gene", "variant", "allele_function",
        "allele_frequency", "allele_count", "allele_number",
        "no_heterozygous_carriers", "no_homozygous_carriers"
    ]
    af_df = af_df.loc[:, af_colnames].copy()

    # For phenotype CSV
    pheno_df = df.groupby('phenotype').agg(list)
    pheno_df['pharmacogene'] = gene
    # sorted(): set iteration order of strings varies between interpreter
    # runs, which made the exported genotype list non-reproducible.
    pheno_df['genotype'] = pheno_df['genotype'].apply(lambda x: ",".join(sorted(set(x))))
    pheno_df['phenotype'] = pheno_df.index
    # One list entry per founder row with this phenotype.
    pheno_df['no_carriers_with_risk_alleles'] = pheno_df['sample_id'].apply(len)
    pheno_df['all_genotyped_individuals'] = pheno_df['no_carriers_with_risk_alleles'].sum()
    # Column-wise division for the same empty-frame reason as above.
    pheno_df['carrier_frequency'] = pheno_df['no_carriers_with_risk_alleles'] / pheno_df['all_genotyped_individuals']
    pheno_df.sort_values(by=['carrier_frequency'], ascending=False, inplace=True)

    # Subset of columns to be exported
    keep_colnames = [
        'pharmacogene', 'genotype', 'phenotype', 'carrier_frequency',
        'no_carriers_with_risk_alleles', 'all_genotyped_individuals'
    ]
    pheno_df = pheno_df.loc[:, keep_colnames].copy()

    # Export to CSV
    all_perGene_csv = f"{all_perGene_prefix}.{gene}.csv"
    perGene_csv = f"{perGene_prefix}.{gene}.csv"
    af_csv = f"{af_prefix}.{gene}.csv"
    pheno_csv = f"{pheno_prefix}.{gene}.csv"
    all_df.to_csv(all_perGene_csv, index=True, index_label='sre_patient_id')
    df.to_csv(perGene_csv, index=True, index_label='sre_patient_id')
    af_df.to_csv(af_csv, index=False)
    pheno_df.to_csv(pheno_csv, index=False)