# Summarize output from aldy

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import multiprocessing
from collections import defaultdict
from yaml import load
from yaml import CLoader as Loader
import matplotlib.pyplot as plt

In [2]:
# Input
# Directory searched for each participant's Aldy output ("<participant id>*.aldy")
resultDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01a.genotype.aldy/results")
# Pharmacogenes kept from each Aldy output by process_aldy_out
pgx_genes = [
    "CFTR", "CYP2A6", "CYP2B6", "CYP2C19", 
    "CYP2C9", "CYP3A4", "CYP3A5", "CYP4F2", 
    "DPYD", "G6PD", "IFNL3", "IFNL4", 
    "NAT2", "NUDT15", "SLCO1B1", "TPMT", 
    "UGT1A1", "VKORC1"
]

# Reference
# Sample information CSV read by read_id_ref
all_id_ref = "/home/_shared/jscliu/project/2025/Flagship/reference/sample_info_annot.2024-11-12.csv"
# Root of the per-gene PharmGKB tables:
#   <gene>/<gene>_allele_functionality_reference.csv and <gene>/<gene>_Diplotype_Phenotype_Table.csv
referenceDir = Path("/home/_shared/jscliu/project/2025/Flagship/reference/PharmGKB_ref")
# YAML file loaded into allele_map_d and consulted by annotate_haplotype
allele_map = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01a.genotype.aldy/script/aldy_allele_map.yml")

# Output
sumDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01a.genotype.aldy/summary")
# Filename prefixes; the export cells append ".<gene>.csv" to each of them
all_per_gene_prefix = sumDir/"all_perGenes"
per_gene_prefix = sumDir/"perGenes"
af_prefix = sumDir/"allele_frequency"
pheno_prefix = sumDir/"phenotype"

# Concatenated genotype tables (all participants / founders only)
all_aldy_out_tsv = sumDir/"allParticipants_aldy_out.tsv"
founder_aldy_out_tsv = sumDir/"founder_aldy_out.tsv"

# Create the output directory (and any missing parents) if it does not exist yet
sumDir.mkdir(exist_ok=True, parents=True)

In [5]:
# Column names applied to every Aldy output table (passed as `names=` to pd.read_table
# in process_aldy_out, which drops "Code" and "Status" immediately after reading).
aldy_colnames = [
    "sample_id", "gene", "solutionID", "major", "minor", 
    "copy", "allele", "location", "mutation", "coverage", 
    "effect", "dbSNP", "Code", "Status"
]

# Read sample info table as df

In [None]:
# Read id reference and subset for useful columns
def read_id_ref(id_ref, read_sex=False):
    """Load the sample-information CSV and return the Chinese participants.

    Parameters
    ----------
    id_ref : path or file-like object accepted by ``pd.read_csv``.
    read_sex : bool, default False
        When True the 'inferred_sex' column is kept as well.

    Returns
    -------
    pd.DataFrame
        Indexed by 'sre_participant_id' (first occurrence kept when an id is
        repeated), restricted to rows whose 'treated_ethnicity' is 'Chinese'.
    """
    wanted = ["sre_participant_id", "founder_status", "is_proband", "healthStatus", "treated_ethnicity"]
    if read_sex:
        wanted.append('inferred_sex')

    sample_info = pd.read_csv(id_ref)[wanted]
    # Keep only the first row seen for each participant id
    first_rows = sample_info.drop_duplicates(subset=['sre_participant_id'], keep='first')
    indexed = first_rows.set_index('sre_participant_id')
    is_chinese = indexed['treated_ethnicity'] == 'Chinese'
    return indexed.loc[is_chinese]

# Load the sample table once, keeping 'inferred_sex' (read later by validate_G6PD_in_male)
id_ref_df = read_id_ref(all_id_ref, read_sex=True)

# Process each aldy output and concatenate

In [6]:
# Collect one Aldy output file per participant for the processing step.
# The first glob hit is taken; a participant with no matching file raises IndexError.
aldy_out_ls = []
for participant_id in tqdm(id_ref_df.index):
    matching_files = list(resultDir.glob(f"{participant_id}*.aldy"))
    aldy_out_ls.append(matching_files[0])


In [7]:
def process_aldy_out(aldy_out, pgx_genes):
    """Read one Aldy output file and reduce it to its unambiguous genotype calls.

    Parameters
    ----------
    aldy_out : path of the Aldy output table (tab-separated; '#' comments skipped).
    pgx_genes : iterable of gene names to keep.

    Returns
    -------
    pd.DataFrame or None
        Columns gene, sample_id, solutionID, major, minor with duplicate rows
        removed. None is returned (after printing the error) when the file
        cannot be read or lacks the expected columns.
    """
    try:
        calls = pd.read_table(aldy_out, comment="#", names=aldy_colnames).drop(columns=["Code", "Status"])
    except Exception as e:
        print(f"{e} - {aldy_out}")
        return None

    # A gene counts as ambiguous for this sample when a second solution was reported for it
    second_solution = calls['solutionID'] == 2
    ambiguous = set(calls.loc[second_solution, 'gene'].unique().tolist())
    retained_genes = list(set(pgx_genes) - ambiguous)
    calls = calls.loc[calls['gene'].isin(retained_genes), :].copy()

    # One row per (gene, sample, solution, major, minor), in the column order used downstream
    key_columns = ['gene', 'sample_id', 'solutionID', 'major', 'minor']
    return calls.drop_duplicates(subset=key_columns)[key_columns]

In [8]:
# Parse every Aldy output in parallel with at most 10 worker processes.
# starmap blocks until all files have been processed, so leaving the `with` block
# (which shuts the pool down) cannot cut work short; the context manager also
# guarantees the workers are released if a task raises.
with multiprocessing.Pool(processes=10) as pool:
    results = pool.starmap(process_aldy_out, [(aldy_out, pgx_genes) for aldy_out in aldy_out_ls])

# process_aldy_out returns None for files it could not read; pd.concat silently skips None entries.
concat_aldy_out:pd.DataFrame = pd.concat(results).reset_index(drop=True)    # Remove index from the master df

In [9]:
# Split the 'major' diplotype string on "/" into one column per haplotype.
# Vectorised .str.split replaces building a pd.Series per row, which is far slower on a large table.
# Assumes every 'major' value has the form "<hap1>/<hap2>"; any other number of parts
# still fails on the two-column assignment, as before.
concat_aldy_out[['haplotype_1', 'haplotype_2']] = concat_aldy_out['major'].str.split('/', expand=True)

# Append sample info to the dataframes

In [11]:
# Derive the participant id (leading "SRE<digits>" of sample_id) and index the table by it.
# A sample_id that does not start with that pattern raises TypeError.
participant_id_pattern = re.compile(r"SRE[0-9]+")
concat_aldy_out['sre_participant_id'] = concat_aldy_out['sample_id'].map(
    lambda sample_id: participant_id_pattern.match(sample_id)[0]
)
concat_aldy_out.set_index('sre_participant_id', inplace=True)

# Match concat_aldy_out to id_ref_df to get all participants' and founders' genotypes

## All participants

In [12]:
# Attach sample info to every genotype call. Participants without any call
# (NaN 'gene' after the left merge) are dropped, then solutionID is cast back to int.
all_aldy_out = (
    id_ref_df
    .merge(concat_aldy_out, left_index=True, right_index=True, how='left')
    .dropna(subset=['gene'])
    .astype({'solutionID': int})
)

In [13]:
# Write the all-participant genotype table as TSV, with the participant id as the first column
all_aldy_out.to_csv(all_aldy_out_tsv, index=True, index_label='sre_participant_id', sep='\t')

## Unrelated individuals only

In [14]:
# Independent copy of the rows whose founder_status is exactly "Founder"
founder_aldy_out = all_aldy_out[all_aldy_out['founder_status'].eq("Founder")].copy()

In [15]:
# Write the founder-only genotype table as TSV, with the participant id as the first column
founder_aldy_out.to_csv(founder_aldy_out_tsv, index=True, index_label='sre_participant_id', sep='\t')

# Annotate the allele function with PharmGKB's allele definitions

In [16]:
# For each gene, report which founder haplotypes are / are not listed in PharmGKB's allele table
for gene in pgx_genes:
    allele_func:Path = referenceDir/f"{gene}/{gene}_allele_functionality_reference.csv"
    if allele_func.is_file():
        # First line of the reference CSV is skipped; the header sits on the second line
        known_alleles:set = set(pd.read_csv(allele_func, skiprows=1)['Allele/cDNA/rsID'].tolist())
        gene_rows = founder_aldy_out.loc[founder_aldy_out['gene']==gene, ['haplotype_1', 'haplotype_2']]
        observed:set = set(gene_rows['haplotype_1'].tolist() + gene_rows['haplotype_2'].tolist())
        print(f"{gene}_defined: {observed & known_alleles}")
        print(f"{gene}_undefined: {observed - known_alleles}\n")
    else:
        print(f"{gene}: PharmGKB definition not exist\n")

In [17]:
# Genes whose haplotypes are annotated from the PharmGKB allele-function and diplotype tables below
# (presumably chosen from the defined/undefined printout of the previous cell — TODO confirm)
genes_with_pharmgkb_def = ["CYP2B6", "CYP2C19", "CYP2C9", "CYP3A5", "DPYD", "G6PD", "NUDT15", "SLCO1B1", "TPMT", "UGT1A1"]
# Remaining genes; in this notebook only their call counts are computed
genes_need_custom_def = ["CFTR", "CYP2A6", "CYP3A4", "CYP4F2", "IFNL3", "IFNL4", "NAT2", "VKORC1"]

# Separate concat_aldy_out by gene

In [20]:
# Load the allele mapping consumed by annotate_haplotype: per gene either the string
# "all_mapped" or a {haplotype: replacement} mapping.
# NOTE(review): CLoader is PyYAML's full (non-safe) loader. That is fine for a trusted
# project file, but CSafeLoader would be the safer choice if this file could ever come
# from outside the project — confirm.
with open(allele_map) as handle:
    allele_map_d = load(handle, Loader=Loader)

In [22]:
def annotate_haplotype(gene, haplotype, allele_map_d):
    """Translate a called haplotype through the per-gene allele map.

    Parameters
    ----------
    gene : gene name; must be a key of ``allele_map_d``.
    haplotype : haplotype label to translate.
    allele_map_d : dict mapping each gene to either the string "all_mapped"
        or a ``{haplotype: replacement}`` dict.

    Returns
    -------
    The mapped label when the gene's map defines one for ``haplotype``;
    otherwise ``haplotype`` unchanged (including the "all_mapped" case).

    Raises
    ------
    KeyError
        If ``gene`` has no entry in ``allele_map_d``. (The previous code called
        ``sys.exit`` without importing ``sys``, which surfaced as a NameError;
        exiting the interpreter is also not appropriate inside a notebook.)
    """
    if gene not in allele_map_d:
        raise KeyError(f"{gene} not defined in allele map")

    gene_map = allele_map_d[gene]

    # Every allele of this gene already carries its final name
    if gene_map == "all_mapped":
        return haplotype

    # Fall back to the original label when no replacement is defined
    return gene_map.get(haplotype, haplotype)

## All participants

In [23]:
# Split the all-participant calls by how their genes are annotated (independent copies)
pharmgkb_all_aldy_out = all_aldy_out[all_aldy_out['gene'].isin(genes_with_pharmgkb_def)].copy()
custom_all_aldy_out = all_aldy_out[all_aldy_out['gene'].isin(genes_need_custom_def)].copy()

In [24]:
# Translate both haplotypes through the allele map, then rebuild the genotype string from them
for hap_col in ('haplotype_1', 'haplotype_2'):
    pharmgkb_all_aldy_out[hap_col] = pharmgkb_all_aldy_out.apply(
        lambda row, col=hap_col: annotate_haplotype(row['gene'], row[col], allele_map_d),
        axis=1,
    )
pharmgkb_all_aldy_out['genotype'] = pharmgkb_all_aldy_out.apply(
    lambda row: f"{row['haplotype_1']}/{row['haplotype_2']}",
    axis=1,
)
pharmgkb_all_aldy_out.head()

In [26]:
# Per gene (all participants): add the clinical function of each haplotype and the
# phenotype / EHR notation of the genotype, using the PharmGKB reference tables.
allParticipants_gene_df_d:dict = {}
for gene in pharmgkb_all_aldy_out['gene'].unique().tolist():
    # Allele -> clinical functional status lookup (first CSV line skipped, first column is the allele)
    allele_func_df:pd.DataFrame = pd.read_csv(
        referenceDir/f"{gene}/{gene}_allele_functionality_reference.csv",
        skiprows=1,
        index_col=[0],
    )
    allele_func_d:dict = allele_func_df['Allele Clinical Functional Status (Required)'].to_dict()

    # Diplotype -> phenotype lookup, indexed by the table's first column
    diplotype_df = (
        pd.read_csv(referenceDir/f"{gene}/{gene}_Diplotype_Phenotype_Table.csv", index_col=[0])
        .loc[:, ["Coded Diplotype/Phenotype Summary", "EHR Priority Notation"]]
        .rename(columns={"Coded Diplotype/Phenotype Summary": "phenotype", "EHR Priority Notation": "EHR_priority_notation"})
    )

    # Function of each haplotype; NaN when the allele is not in the reference
    gene_df = pharmgkb_all_aldy_out[pharmgkb_all_aldy_out['gene']==gene].copy()
    for hap in ('haplotype_1', 'haplotype_2'):
        gene_df[f'{hap}_func'] = gene_df[hap].map(lambda allele: allele_func_d.get(allele, np.nan))

    # Phenotype by genotype; genotypes absent from the table keep NaN
    gene_df = gene_df.merge(diplotype_df, left_on='genotype', right_index=True, how='left')

    allParticipants_gene_df_d[gene] = gene_df

## Founder only

In [27]:
# Split the founder calls by how their genes are annotated (independent copies)
pharmgkb_founder_aldy_out = founder_aldy_out[founder_aldy_out['gene'].isin(genes_with_pharmgkb_def)].copy()
custom_founder_aldy_out = founder_aldy_out[founder_aldy_out['gene'].isin(genes_need_custom_def)].copy()

In [28]:
# Translate both founder haplotypes through the allele map, then rebuild the genotype string
for hap_col in ('haplotype_1', 'haplotype_2'):
    pharmgkb_founder_aldy_out[hap_col] = pharmgkb_founder_aldy_out.apply(
        lambda row, col=hap_col: annotate_haplotype(row['gene'], row[col], allele_map_d),
        axis=1,
    )
pharmgkb_founder_aldy_out['genotype'] = pharmgkb_founder_aldy_out.apply(
    lambda row: f"{row['haplotype_1']}/{row['haplotype_2']}",
    axis=1,
)
pharmgkb_founder_aldy_out.head()

In [30]:
# Per gene (founders only): add the clinical function of each haplotype and the
# phenotype / EHR notation of the genotype, using the PharmGKB reference tables.
gene_df_d:dict = {}
for gene in pharmgkb_founder_aldy_out['gene'].unique().tolist():
    # Allele -> clinical functional status lookup (first CSV line skipped, first column is the allele)
    allele_func_df:pd.DataFrame = pd.read_csv(
        referenceDir/f"{gene}/{gene}_allele_functionality_reference.csv",
        skiprows=1,
        index_col=[0],
    )
    allele_func_d:dict = allele_func_df['Allele Clinical Functional Status (Required)'].to_dict()

    # Diplotype -> phenotype lookup, indexed by the table's first column
    diplotype_df = (
        pd.read_csv(referenceDir/f"{gene}/{gene}_Diplotype_Phenotype_Table.csv", index_col=[0])
        .loc[:, ["Coded Diplotype/Phenotype Summary", "EHR Priority Notation"]]
        .rename(columns={"Coded Diplotype/Phenotype Summary": "phenotype", "EHR Priority Notation": "EHR_priority_notation"})
    )

    # Function of each haplotype; NaN when the allele is not in the reference
    gene_df = pharmgkb_founder_aldy_out[pharmgkb_founder_aldy_out['gene']==gene].copy()
    for hap in ('haplotype_1', 'haplotype_2'):
        gene_df[f'{hap}_func'] = gene_df[hap].map(lambda allele: allele_func_d.get(allele, np.nan))

    # Phenotype by genotype; genotypes absent from the table keep NaN
    gene_df = gene_df.merge(diplotype_df, left_on='genotype', right_index=True, how='left')

    gene_df_d[gene] = gene_df

# Count the breakdown of genotyped individuals

In [31]:
# Consolidate the numbers of genotypes called
count_colnames = ['gene', 'all_individuals', 'aldy_called', 'aldy_called_unique_sol', 'aldy_called_unique_defined']
count_ser_ls:list=  list()
for gene, df in gene_df_d.items():
    aldy_called:int = len(df['sample_id'].unique())
    sample_w_sol2:list = df.loc[df['solutionID']==2, 'sample_id'].unique().tolist()
    aldy_called_unique:int = len(df.loc[~df['sample_id'].isin(sample_w_sol2), 'sample_id'].unique())
    aldy_called_unique_defined:int = len(df.loc[(~df['sample_id'].isin(sample_w_sol2)) & (~df['phenotype'].isna()), :])
    count_ser_ls.append(pd.DataFrame([gene, len(id_ref_df.loc[id_ref_df['founder_status']=='Founder']), aldy_called, aldy_called_unique, aldy_called_unique_defined], index=count_colnames).T)

In [32]:
# Append the "custom" genes to the count list; no phenotype is defined for them,
# so the last count is left as NaN.
founder_total:int = len(id_ref_df.loc[id_ref_df['founder_status']=='Founder'])
for gene in custom_founder_aldy_out['gene'].unique().tolist():
    custom_gene_df = custom_founder_aldy_out[custom_founder_aldy_out['gene']==gene].copy()
    multi_solution_custom:list = custom_gene_df.loc[custom_gene_df['solutionID']==2, 'sample_id'].unique().tolist()
    has_single_solution = ~custom_gene_df['sample_id'].isin(multi_solution_custom)

    n_called:int = len(custom_gene_df['sample_id'].unique())
    n_called_unique:int = len(custom_gene_df.loc[has_single_solution, 'sample_id'].unique())

    custom_counts = [gene, founder_total, n_called, n_called_unique, np.nan]
    count_ser_ls.append(pd.DataFrame(custom_counts, index=count_colnames).T)

In [33]:
# One row per gene; missing_rate is the fraction of called samples that had more than one solution
count_df = pd.concat(count_ser_ls).set_index('gene')
count_df['missing_rate'] = count_df.apply(
    lambda row: (row['aldy_called'] - row['aldy_called_unique_sol'])/(row['aldy_called']),
    axis=1,
)
count_df

# Exclude samples with multiple solutions, then export individual gene_df to perGene CSV

In [37]:
def validate_G6PD_in_male(df):
    """Validate and reformat the X-linked G6PD genotypes in males.

    Male rows whose two haplotypes differ are discarded. For the remaining
    males, 'haplotype_2' and 'haplotype_2_func' are set to NaN and 'genotype'
    is shortened to the part before the first '/'. Female rows are returned
    untouched; rows with any other 'inferred_sex' value are not returned.

    Returns the male rows followed by the female rows.
    """
    females = df[df['inferred_sex'] == 'Female'].copy()
    males = df[df['inferred_sex'] == 'Male'].copy()

    # Keep only males whose two called haplotypes agree
    concordant = [bool(hap_1 == hap_2) for hap_1, hap_2 in zip(males['haplotype_1'], males['haplotype_2'])]
    males = males.loc[concordant, :].copy()

    # A male carries one copy: blank the redundant second haplotype
    for redundant_col in ('haplotype_2', 'haplotype_2_func'):
        males[redundant_col] = np.nan
    males['genotype'] = males['genotype'].map(lambda genotype: genotype.split('/')[0])

    return pd.concat([males, females])

## All participants

In [38]:
# Per gene (all participants): keep samples with a single solution, validate G6PD in males,
# store the result back in allParticipants_gene_df_d and export it to "<all_per_gene_prefix>.<gene>.csv".
all_gene_df_pairs = list(allParticipants_gene_df_d.items())    # snapshot, because the dict is updated inside the loop
for gene, df in all_gene_df_pairs:
    # Drop samples with multiple solutions
    sample_w_sol2:list = df.loc[df['solutionID']==2, 'sample_id'].unique().tolist()
    unique_sol_df = df.loc[~df['sample_id'].isin(sample_w_sol2), :].copy()
    if gene == "G6PD":
        allParticipants_gene_df_d[gene] = validate_G6PD_in_male(unique_sol_df)
    else:
        allParticipants_gene_df_d[gene] = unique_sol_df.copy()

    # Export to perGene CSV.
    # The index column is labelled 'sre_participant_id', matching the index name and the TSV exports;
    # the previous label "sre_patient_id " carried a trailing space and a different name.
    per_gene_csv = f"{str(all_per_gene_prefix)}.{gene}.csv"
    allParticipants_gene_df_d[gene].to_csv(per_gene_csv, index=True, index_label="sre_participant_id")

## Founder only

In [39]:
# Per gene (founders only): keep samples with a single solution, validate G6PD in males,
# store the result back in gene_df_d and export it to "<per_gene_prefix>.<gene>.csv".
gene_df_pairs = list(gene_df_d.items())    # snapshot, because the dict is updated inside the loop
for gene, df in gene_df_pairs:
    # Drop samples with multiple solutions
    sample_w_sol2:list = df.loc[df['solutionID']==2, 'sample_id'].unique().tolist()
    unique_sol_df = df.loc[~df['sample_id'].isin(sample_w_sol2), :].copy()
    if gene == "G6PD":
        gene_df_d[gene] = validate_G6PD_in_male(unique_sol_df)
    else:
        gene_df_d[gene] = unique_sol_df.copy()

    # Export to perGene CSV.
    # The index column is labelled 'sre_participant_id', matching the index name and the TSV exports;
    # the previous label "sre_patient_id " carried a trailing space and a different name.
    per_gene_csv = f"{str(per_gene_prefix)}.{gene}.csv"
    gene_df_d[gene].to_csv(per_gene_csv, index=True, index_label="sre_participant_id")

# Calculate allele frequencies
Criteria: Both haplotype 1 & haplotype 2 were defined by PharmGKB (CPIC)

In [40]:
def count_zygosity(called_df):
    """Count homozygous, heterozygous and hemizygous carriers per allele.

    Parameters
    ----------
    called_df : pd.DataFrame
        One row per genotyped individual with 'haplotype_1' and 'haplotype_2'
        columns. A missing 'haplotype_2' (NaN / None) marks a hemizygous call.

    Returns
    -------
    tuple of three ``defaultdict(int)``: (homo_dict, hetero_dict, hemi_dict),
    each mapping allele -> number of carriers. A heterozygous individual is
    counted once under each of their two alleles.
    """
    homo_dict = defaultdict(int)
    hetero_dict = defaultdict(int)
    hemi_dict = defaultdict(int)
    for hap_1, hap_2 in zip(called_df['haplotype_1'], called_df['haplotype_2']):
        if pd.isna(hap_2):
            # Hemizygous: only one haplotype is present.
            # Tested first and with pd.isna so None is handled as well as NaN
            # (the former `x != x` self-comparison only caught NaN).
            hemi_dict[hap_1] += 1
        elif hap_1 == hap_2:
            # Homozygous
            homo_dict[hap_1] += 1
        else:
            # Heterozygous
            hetero_dict[hap_1] += 1
            hetero_dict[hap_2] += 1
    return homo_dict, hetero_dict, hemi_dict

In [41]:
# For every gene in gene_df_d: build a per-allele frequency table (with zygosity counts)
# and write it to "<af_prefix>.<gene>.csv".
# NOTE(review): the section text says both haplotypes must be PharmGKB-defined, but here alleles
# are filtered one by one (an individual with one defined and one undefined haplotype still
# contributes the defined allele) — confirm this is intended.
for gene, df in gene_df_d.items():
    # Initialize af_df
    # Stack haplotype_1 and haplotype_2 into a single 'variant' column (one row per allele copy),
    # then collect each variant's function annotations into a list.
    # groupby leaves out rows whose variant is missing (pandas default dropna=True).
    af_df = pd.concat([
        df.loc[:, ['haplotype_1', 'haplotype_1_func']].rename(columns={'haplotype_1': 'variant', 'haplotype_1_func': 'allele_function'}), 
        df.loc[:, ['haplotype_2', 'haplotype_2_func']].rename(columns={'haplotype_2': 'variant', 'haplotype_2_func': 'allele_function'})
    ]).groupby('variant').agg(list)
    
    # Get general information from af_df
    # Length of the collected list = number of copies of the variant
    af_df['allele_count'] = af_df['allele_function'].apply(len)
    af_df['gene'] = gene
    # The first collected annotation is used as the variant's function
    af_df['allele_function'] = af_df['allele_function'].apply(lambda x: x[0])
    # Variants without a function annotation are removed here, i.e. BEFORE allele_number is summed,
    # so the frequency denominator only covers annotated alleles.
    af_df.dropna(subset=['allele_function'], inplace=True)
    af_df.sort_values(by=['allele_function', 'allele_count'], ascending=False, inplace=True)
    af_df['variant'] = af_df.index
    af_df['allele_number'] = af_df['allele_count'].sum()
    af_df['allele_frequency'] = af_df.apply(lambda r: r.allele_count / r.allele_number, axis=1)
    
    # Count zygosity and merge it to df
    # Zygosity is counted over all rows of df; each dict becomes a one-column frame indexed by allele.
    homo_d, hetero_d, hemi_d = count_zygosity(df)
    homo_df = pd.DataFrame(homo_d, index=["no_homozygous_carriers"]).T
    hetero_df = pd.DataFrame(hetero_d, index=["no_heterozygous_carriers"]).T
    hemi_df = pd.DataFrame(hemi_d, index=["no_hemizygous_carriers"]).T
    # Outer merges keep alleles seen in any zygosity class; absent combinations become 0
    zygosity_df = hetero_df.merge(homo_df, left_index=True, right_index=True, how='outer').merge(hemi_df, left_index=True, right_index=True, how='outer').fillna(0).astype(int)
    af_df = af_df.merge(zygosity_df, left_index=True, right_index=True, how='left')
    
    # Subset for required columns and export to CSV
    af_colnames = [
        "gene", "variant", "allele_function", 
        "allele_frequency", "allele_count", "allele_number", 
        "no_heterozygous_carriers", "no_homozygous_carriers", "no_hemizygous_carriers"
    ]
    af_df = af_df.loc[:, af_colnames].copy()
    af_csv = f"{str(af_prefix)}.{gene}.csv"
    af_df.to_csv(af_csv, index=False)

# Calculate the carrier frequency by phenotype

In [42]:
# For every gene in gene_df_d: count individuals per phenotype, convert the counts to
# frequencies and write the table to "<pheno_prefix>.<gene>.csv".
for gene, df in gene_df_d.items():
    # One row per phenotype, every other column collected into a list
    pheno_df = df.groupby('phenotype').agg(list)
    pheno_df['phenotype'] = pheno_df.index
    pheno_df['pharmacogene'] = gene
    # Distinct genotypes that produce this phenotype, comma-separated
    pheno_df['genotype'] = pheno_df['genotype'].map(lambda genotypes: ",".join(list(set(genotypes))))
    pheno_df['no_carriers_with_risk_alleles'] = pheno_df['sample_id'].map(len)
    # Denominator: all individuals that received a phenotype for this gene
    pheno_df['all_genotyped_individuals'] = pheno_df['no_carriers_with_risk_alleles'].sum()
    pheno_df['carrier_frequency'] = pheno_df.apply(
        lambda row: row['no_carriers_with_risk_alleles'] / row['all_genotyped_individuals'],
        axis=1,
    )
    pheno_df = pheno_df.sort_values(by=['carrier_frequency'], ascending=False)

    # Subset of columns to be exported
    keep_colnames = [
        'pharmacogene', 'genotype', 'phenotype', 'carrier_frequency',
        'no_carriers_with_risk_alleles', 'all_genotyped_individuals'
    ]
    pheno_df = pheno_df.loc[:, keep_colnames].copy()
    pheno_csv = Path(f"{str(pheno_prefix)}.{gene}.csv")
    pheno_df.to_csv(pheno_csv, index=False)