In [17]:
from pathlib import Path
import pandas as pd
import numpy as np
from yaml import load
from yaml import CLoader as Loader
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')

In [18]:
# Project layout.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
projectDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/02.consolidate_genotypes")
summaryDir = projectDir / "summary"

# Inputs: master CSV and the phenotype category YAML
master_csv = summaryDir / "pgx_allParticipants_allGenes.csv"
pheno_cat = projectDir / "script" / "phenotype_category.yml"

# Output tables
per_gene_actionability = summaryDir / "per_gene_actionability.tsv"
per_indiv_actionability = summaryDir / "per_idv_actionability.tsv"

# Create the summary directory if missing (its parent directory must already exist)
summaryDir.mkdir(exist_ok=True)

# Read master table (CSV) as df

In [19]:
# Load the master table (first CSV column becomes the index) and keep only the
# rows flagged as 'Founder' whose treated ethnicity is 'Chinese'.
master_df = (
    pd.read_csv(master_csv, index_col=[0])
    .loc[
        lambda df: df['founder_status'].eq('Founder')
        & df['treated_ethnicity'].eq('Chinese')
    ]
    .copy()
)

In [20]:
# Number of rows (individuals) remaining after the filter; used later as a denominator
n_samples = master_df.shape[0]

# Read the phenotype categories (YAML) as a Python dict

In [21]:
# Parse the phenotype category YAML into a Python object.
# NOTE(review): `Loader` is PyYAML's `CLoader` (imported at the top of the file),
# i.e. the full, non-safe loader that can construct arbitrary Python objects from
# tagged YAML. If this file could ever come from an untrusted source, switch to
# `CSafeLoader` / `yaml.safe_load`.
# NOTE(review): `pheno_cat_d` is not referenced by any other cell visible here —
# confirm whether it is still needed.
with open(pheno_cat) as handle:
    pheno_cat_d = load(handle, Loader=Loader)

In [22]:
# Prefix (text before the last underscore) of every column whose name contains "_phenotype"
genes = [
    column.rsplit('_', maxsplit=1)[0]
    for column in master_df.columns
    if "_phenotype" in column
]

# Calculate the number of actionable genes per individual

In [23]:
# Column prefixes (column name minus the trailing "_actionability") that
# count_actionables treats as level 1B; every other prefix is treated as 1A.
gene_1b: list = [
    "NAT2",
    "CYP2A6",
    "IFNL3_rs8099917",
    "IFNL4_rs11322783",
    "VKORC1_rs2359612",
    "VKORC1_rs7294",
    "VKORC1_rs8050894",
    "VKORC1_rs9934438",
]

In [24]:
def count_actionables(r:pd.Series, gene_1b:list):
    """Summarise the actionable level 1A / 1B genes of one individual.

    Only entries of ``r`` whose label contains ``"_actionability"`` and whose
    value is exactly ``"Actionable"`` are considered. The label minus its last
    ``_``-separated token is the column prefix; a prefix listed in ``gene_1b``
    is counted as level 1B, any other prefix as level 1A. The gene name is the
    part of the prefix before its first underscore, so several 1B entries of
    the same gene (e.g. ``VKORC1_rs7294`` and ``VKORC1_rs9934438``) count once.

    Parameters
    ----------
    r : pd.Series
        One row of the master table (column label -> value).
    gene_1b : list
        Column prefixes to be treated as level 1B.

    Returns
    -------
    pd.Series
        Five positional values:
        ``"|"``-joined 1A genes, number of 1A genes,
        ``"|"``-joined 1B genes, number of 1B genes,
        number of distinct genes across 1A and 1B.
        Gene names are sorted alphabetically so the joined strings are
        reproducible between runs (plain ``set`` iteration order is not).
    """
    actionable_1a, actionable_1b = set(), set()
    for col, value in r.items():
        if "_actionability" not in col:
            continue    # Skip unrelated columns
        if value != "Actionable":
            continue    # Skip if not actionable

        # Determine whether the actionable gene is class 1A or 1B
        prefix = col.rsplit("_", maxsplit=1)[0]
        gene = prefix.split('_')[0]
        if prefix in gene_1b:
            actionable_1b.add(gene)
        else:
            actionable_1a.add(gene)

    # Deterministic ordering for the exported strings
    genes_1a = sorted(actionable_1a)
    genes_1b = sorted(actionable_1b)

    # Consolidate and return
    return pd.Series([
        "|".join(genes_1a),
        len(genes_1a),
        "|".join(genes_1b),
        len(genes_1b),
        len(actionable_1a | actionable_1b)])

# Row-wise summary of actionable genes; the five positional values returned by
# count_actionables are stored in the five columns below, in this order.
master_df[[
    'actionable_1A_gene',
    'actionable_1A_gene_count',
    'actionable_1B_gene',
    'actionable_1B_gene_count',
    'actionable_gene_count'
]] = master_df.apply(count_actionables, axis=1, gene_1b=gene_1b)

## Per sample: number of pharmacogenes with clinical annotation level 1A + 1B

In [28]:
# Row count per distinct number of actionable genes, ordered by that number.
# Every column of the result holds the group size; the group key is also
# exposed as a regular column.
count_df = master_df.groupby('actionable_gene_count').agg(len).sort_index()
count_df['n_actionable_genes'] = count_df.index

  count_df['n_actionable_genes'] = count_df.index


In [276]:
# Export the per-individual actionable gene count as a TSV keyed by the frame index
count_export_df = master_df[['actionable_gene_count']].copy()
count_export_df.to_csv(
    per_indiv_actionability,
    sep='\t',
    index=True,
    index_label='sre_participant_id',
)

## Per pharmacogene

In [285]:
# Column prefixes used to build the per-gene table of call rate and
# percentage of individuals with an actionable phenotype.
col_prefix_1A = [
    "CYP4F2", "CYP2C19", "CYP2C9", "NUDT15",
    "TPMT", "VKORC1", "CYP3A5", "CFTR",
    "CYP2B6", "DPYD", "G6PD", "IFNL3",
    "SLCO1B1", "UGT1A1", "CYP3A4", "CYP2D6",
    "HLA-A", "HLA-B", "CACNA1S", "RYR1",
    "MT-RNR1", "ABCG2",
]
# Same entries, in the same order, as the gene_1b list defined earlier in this file
col_prefix_1B = [
    "NAT2",
    "CYP2A6",
    "IFNL3_rs8099917",
    "IFNL4_rs11322783",
    "VKORC1_rs2359612",
    "VKORC1_rs7294",
    "VKORC1_rs8050894",
    "VKORC1_rs9934438",
]

## Level 1A

In [315]:
# One single-row frame per level 1A gene:
#   call_rate        = share of rows with a non-missing <gene>_phenotype
#   actionable_count = rows whose <gene>_actionability equals "Actionable"
#   actionable_rate  = actionable_count / number of rows
per_gene_df_ls = []
for gene in col_prefix_1A:
    called_cnt = int(master_df[f"{gene}_phenotype"].notna().sum())
    call_rate = called_cnt / len(master_df)
    actionable_cnt = int(master_df[f"{gene}_actionability"].eq("Actionable").sum())
    actionable_rate = actionable_cnt / len(master_df)
    per_gene_df_ls.append(
        pd.Series(
            ['1A', call_rate, actionable_cnt, actionable_rate],
            index=['annotation_lv', 'call_rate', 'actionable_count', 'actionable_rate'],
            name=gene,
        ).to_frame().T
    )

In [316]:
# Stack the per-gene rows into one table.
# NOTE(review): per_gene_df_ls is reused for level 1B further down, so this cell
# must be run right after the level 1A loop.
level_1a_perGene_df = pd.concat(per_gene_df_ls, axis=0)

## Level 1B

In [318]:
# One single-row frame per level 1B entry, labelled by the gene name (text
# before the first underscore). VKORC1 entries are skipped here and handled
# separately in a later cell.
per_gene_df_ls = []
for gene in col_prefix_1B:
    if "VKORC1" in gene:
        continue
    called_cnt = int(master_df[f"{gene}_phenotype"].notna().sum())
    call_rate = called_cnt / len(master_df)
    actionable_cnt = int(master_df[f"{gene}_actionability"].eq("Actionable").sum())
    actionable_rate = actionable_cnt / len(master_df)
    per_gene_df_ls.append(
        pd.Series(
            ['1B', call_rate, actionable_cnt, actionable_rate],
            index=['annotation_lv', 'call_rate', 'actionable_count', 'actionable_rate'],
            name=gene.split('_')[0],
        ).to_frame().T
    )

In [319]:
# Stack the level 1B per-gene rows into one table
level_1b_perGene_df = pd.concat(per_gene_df_ls, axis=0)

In [320]:
# VKORC1 call rate: share of rows with a non-missing VKORC1_genotype.
# NOTE(review): the per-gene loops use <gene>_phenotype instead — confirm the
# genotype column is the intended one here.
called_cnt = int(master_df["VKORC1_genotype"].notna().sum())
call_rate = called_cnt / len(master_df)
def vkorc1_1b_actionable(r, col_prefix_1B):
    """Return True if any VKORC1 level 1B entry of the row is "Actionable".

    For every prefix in ``col_prefix_1B`` containing ``"VKORC1"``, the value of
    ``r["<prefix>_actionability"]`` is compared with ``"Actionable"``.

    Parameters
    ----------
    r : row of the master table (label -> value)
    col_prefix_1B : iterable of level 1B column prefixes

    Returns
    -------
    bool
        True when at least one of those values equals "Actionable".
    """
    snp_cols = [
        f"{prefix}_actionability" for prefix in col_prefix_1B if "VKORC1" in prefix
    ]
    return any(status == "Actionable" for status in r[snp_cols])
# Flag rows with at least one actionable VKORC1 1B entry, then count them
master_df['VKORC1_1B_is_actionable'] = master_df.apply(
    vkorc1_1b_actionable, axis=1, col_prefix_1B=col_prefix_1B
)
actionable_cnt = int(master_df['VKORC1_1B_is_actionable'].sum())
actionable_rate = actionable_cnt / len(master_df)

In [321]:
# Append the combined VKORC1 level 1B row.
# Any VKORC1 row left by a previous execution of this cell is dropped first, so
# re-running the cell does not accumulate duplicate rows (the level 1B loop
# itself never creates a VKORC1 row).
level_1b_perGene_df = pd.concat([
    level_1b_perGene_df.drop(index='VKORC1', errors='ignore'),
    pd.DataFrame({'VKORC1': ['1B', call_rate, actionable_cnt, actionable_rate]}, index=['annotation_lv', 'call_rate', 'actionable_count', 'actionable_rate']).T
])

## Concatenate

In [323]:
# Combine all level 1A rows with the NAT2, CYP2A6 and IFNL4 level 1B rows.
# NOTE(review): the IFNL3 and VKORC1 level 1B rows are not included —
# presumably because those gene labels already appear among the 1A rows; confirm.
gene_actionable_df = pd.concat(
    [
        level_1a_perGene_df,
        level_1b_perGene_df.loc[['NAT2', 'CYP2A6', 'IFNL4'], :],
    ]
)

In [325]:
# Order by annotation level (ascending), then by actionable rate (descending)
gene_actionable_df = gene_actionable_df.sort_values(
    by=['annotation_lv', 'actionable_rate'],
    ascending=[True, False],
)

In [326]:
# Expose the index as a 'gene' column and add a 0-based positional index
gene_actionable_df['gene'] = gene_actionable_df.index
gene_actionable_df['gene_idx'] = list(range(len(gene_actionable_df)))

## Considering AMP alleles 

In [31]:
# Actionability rate if only reference and the AMP alleles were considered.
# Per gene: the FIRST entry is treated as the reference allele by the cell
# that consumes this dict; the remaining entries are the AMP alleles.
hkgp_amp = {
    "CYP4F2": ["*1", "*3"],
    "VKORC1": [
        "*H6", "*H1", "*H2", "*T",
        "*H3+rs9934438+rs9923231", "*H3+rs9923231",
    ],
    "CYP2D6": [
        "*1", "*2", "*3", "*4", "*5", "*6", "*7", "*8", "*9", "*10",
        "*12", "*14", "*15", "*17", "*21", "*29", "*31", "*40", "*41", "*42",
        "*49", "*56", "*59",
    ],
    "NUDT15": ["*1", "*2", "*3", "*4", "*6", "*9", "*14"],
    "TPMT": ["*1", "*2", "*3A", "*3B", "*3C", "*11", "*29", "*42"],
    "CYP2C19": [
        "*1", "*2", "*3", "*4", "*5", "*6",
        "*7", "*8", "*9", "*10", "*17", "*35",
    ],
    "CYP3A4": ["*1", "*20", "*22"],
    "CYP3A5": ["*1", "*3", "*6", "*7"],
    "CYP2C9": [
        "*1", "*2", "*3", "*5", "*6",
        "*8", "*11", "*12", "*13", "*15",
    ],
    "DPYD": [
        "Reference",
        "c.1314T>G",
        "c.1774C>T",
        "c.1905+1G>A (*2A)",
        "c.1129-5923C>G, c.1236G>A (HapB3)",
    ],
}

In [None]:
# For every gene in hkgp_amp, compute the fraction of samples (out of n_samples)
# for which BOTH haplotypes are in the gene's allele list (reference + AMP) AND
# at least one haplotype is a non-reference AMP allele.
#
# The masks are computed column-wise with Series.isin instead of copying the
# whole master table and running two row-wise apply passes per gene; missing
# haplotypes never match, exactly as with the former `value in list` test.
amp_actionable_rates = dict()
for gene, ref_amp_alleles in hkgp_amp.items():
    # First entry of each list is the reference allele; the rest are AMP alleles
    amp_alleles = ref_amp_alleles[1:]

    hap_1 = master_df[f'{gene}_haplotype_1']
    hap_2 = master_df[f'{gene}_haplotype_2']

    # At least one haplotype is a non-reference AMP allele
    carry_amp = hap_1.isin(amp_alleles) | hap_2.isin(amp_alleles)
    # Both haplotypes are covered by the reference + AMP allele list
    matches_amp = hap_1.isin(ref_amp_alleles) & hap_2.isin(ref_amp_alleles)

    amp_actionable_cnt = int((carry_amp & matches_amp).sum())
    amp_actionable_rates[gene] = amp_actionable_cnt / n_samples

# One row per gene, single column 'AMP_nonRef_rate'
amp_rate_df = pd.DataFrame(amp_actionable_rates, index=['AMP_nonRef_rate']).T

## Merge amp_rate_df to gene_actionable_df

In [332]:
# Left-join the AMP rates onto the per-gene table by gene label (index).
# Columns coming from amp_rate_df are dropped first if they already exist, so
# re-running this cell does not create suffixed duplicates
# (AMP_nonRef_rate_x / AMP_nonRef_rate_y) that would end up in the export.
gene_actionable_df = (
    gene_actionable_df
    .drop(columns=amp_rate_df.columns, errors='ignore')
    .merge(amp_rate_df, left_index=True, right_index=True, how='left')
)

In [333]:
# Export the per-gene actionability table as a TSV; the index is written as "Gene"
gene_actionable_df.to_csv(
    per_gene_actionability,
    sep='\t',
    index=True,
    index_label="Gene",
)