In [15]:
from pathlib import Path
import pandas as pd
import numpy as np
from yaml import load
from yaml import CLoader as Loader
import re
import multiprocessing
import sys
from tqdm import tqdm
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import math

In [4]:
# --- Inputs ---
projectDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01d.genotype.manual")
resultsDir = projectDir.joinpath("results")    # Per-sample result TSVs are globbed from here
gene = "MT-RNR1"                                # Gene processed by this notebook

# --- Reference files ---
id_ref = "/home/_shared/jscliu/project/2025/Flagship/reference/sample_info_annot.2024-11-12.csv"
referenceDir_master = Path("/home/_shared/jscliu/project/2025/Flagship/reference/PharmGKB_ref")
star_allele_yml = projectDir.joinpath("script/resource/star_allele_variants.yml")
allele_func_ref = referenceDir_master.joinpath(f"{gene}/{gene}_allele_functionality_reference.csv")

# --- Outputs (everything is written under <projectDir>/summary) ---
summaryDir = projectDir.joinpath("summary")
summaryDir.mkdir(exist_ok=True)    # Parent directories are not created; projectDir must already exist
all_idv_geno_csv = summaryDir.joinpath(f"all_perGenes.{gene}.csv")
idv_geno_csv = summaryDir.joinpath(f"perGenes.{gene}.csv")
af_csv = summaryDir.joinpath(f"allele_frequency.{gene}.csv")
pheno_csv = summaryDir.joinpath(f"phenotype.{gene}.csv")

# Read allele definitions as pd.DataFrame

In [5]:
# Parse the star-allele definition YAML into a dict keyed by gene.
# NOTE(review): `Loader` is yaml.CLoader (not a safe loader). That is fine for a
# trusted project resource; prefer CSafeLoader if this file could ever come from outside.
with star_allele_yml.open() as handle:
    star_alleles: dict = load(handle, Loader=Loader)

In [6]:
# Flatten star_alleles into one record per (gene, allele, mutation), ready to become a DataFrame
column_names: list = ['gene', 'pharmgkb_allele', 'rsID', 'chr', 'start', 'end', 'allele_ref', 'allele_alt']
allele_ls: list = []    # List of pd.Series, one per mutation
for tmp_gene, gene_info in star_alleles.items():
    # 'location' is split on ':' and '-' into three parts; only the first (chromosome) is used below
    gene_chr, gene_start, gene_end = re.split(":|-", gene_info['location'])
    for allele, mutations in gene_info['alleles'].items():
        # Alleles with an empty mutation list contribute no records
        if len(mutations['mutations']) > 0:
            for mut in mutations['mutations']:
                allele_entry = [
                    tmp_gene, allele,
                    mut[0],      # rsID
                    gene_chr,
                    mut[1],      # start
                    mut[2],      # end
                    mut[3],      # allele_ref
                    mut[4],      # allele_alt
                ]
                allele_ls.append(pd.Series(allele_entry, index=column_names))

In [7]:
# Build the allele definition table, key it by CPRA ("chr.start.ref.alt"),
# and keep only the rows for the gene processed by this notebook
allele_def_df = (
    pd.DataFrame(allele_ls)
    .assign(CPRA=lambda d: d.apply(lambda rec: f"{rec.chr}.{rec.start}.{rec.allele_ref}.{rec.allele_alt}", axis=1))
    .set_index('CPRA')
    .loc[lambda d: d['gene'] == gene, :]
    .copy()
)
allele_def_df.head()

Unnamed: 0_level_0,gene,pharmgkb_allele,rsID,chr,start,end,allele_ref,allele_alt
CPRA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chrM.663.A.G,MT-RNR1,m.663A>G,rs56489998,chrM,663,663,A,G
chrM.669.T.C,MT-RNR1,m.669T>C,669T>C,chrM,669,669,T,C
chrM.747.A.G,MT-RNR1,m.747A>G,747A>G,chrM,747,747,A,G
chrM.786.G.A,MT-RNR1,m.786G>A,786G>A,chrM,786,786,G,A
chrM.807.A.C,MT-RNR1,m.807A>C,807A>C,chrM,807,807,A,C


# Read allele function reference CSV

In [8]:
# Read the first and fourth CSV columns as allele -> function (the first two lines are skipped)
allele_func_df = pd.read_csv(
    allele_func_ref,
    skiprows=2,
    names=['allele', 'function'],
    usecols=[0, 3],
    index_col=[0],
)
# Lower-case the leading "Normal" so the labels match the phenotype strings used later
allele_func_df['function'] = allele_func_df['function'].map(lambda text: text.replace('Normal', 'normal'))
# Plain dict lookup: allele name -> function label
allele_func_d = allele_func_df['function'].to_dict()

# Read id_ref as pd.DataFrame

In [9]:
# Load the sample annotation table (first column as index), keep participants whose
# treated_ethnicity is 'Chinese', and retain only the founder_status column
id_ref_df = (
    pd.read_csv(id_ref, index_col=[0])
    .loc[lambda d: d['treated_ethnicity'] == 'Chinese', ['founder_status']]
    .copy()
)
id_ref_df.head()

Unnamed: 0_level_0,founder_status
sre_participant_id,Unnamed: 1_level_1
SRE029440,Non-founder
SRE025838,Founder
SRE003784,Non-founder
SRE026979,Non-founder
SRE004787,Founder


# Read per-sample gene TSVs as DataFrames

In [11]:
# Collect the per-sample result TSVs for the configured gene.
# The pattern follows `gene` instead of a hard-coded name, and the list is sorted
# so the file order is the same on every run (glob order is filesystem-dependent).
result_tsvs: list = sorted(resultsDir.glob(f"*{gene}.tsv"))

In [21]:
def read_tsv(tsv: Path) -> pd.DataFrame:
    """Read one per-sample variant TSV and index it by CPRA.

    Parameters
    ----------
    tsv : Path
        Tab-separated file with at least CHROM, POS, REF and ALT columns.
        The sample ID is taken from the file name up to the first '.'.

    Returns
    -------
    pd.DataFrame
        The table indexed by 'CPRA' ("CHROM.POS.REF.ALT") with an added
        'sample_id' column. A header-only TSV yields an empty frame.
    """
    df = pd.read_table(tsv)
    # Vectorised key construction: same strings as per-row formatting, but it also
    # works on a header-only file (row-wise apply returns a DataFrame there)
    df['CPRA'] = (
        df['CHROM'].astype(str) + '.'
        + df['POS'].astype(str) + '.'
        + df['REF'].astype(str) + '.'
        + df['ALT'].astype(str)
    )
    df['sample_id'] = tsv.name.split('.')[0]
    return df.set_index('CPRA')

In [22]:
# Read all per-sample TSVs in parallel with at most 10 worker processes.
if not result_tsvs:
    # pd.concat would otherwise fail later with a less helpful message
    raise ValueError(f"No result TSVs found in {resultsDir}")

# The context manager tears the workers down even if read_tsv raises.
# imap_unordered yields frames as workers finish, so `results` is not in `result_tsvs` order.
with multiprocessing.Pool(processes=10) as pool:
    results = list(tqdm(pool.imap_unordered(read_tsv, result_tsvs), total=len(result_tsvs)))
    pool.close()    # No further tasks will be submitted
    pool.join()     # Wait for the worker processes to exit
all_sample_gene_var_df = pd.concat(results)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24108/24108 [00:17<00:00, 1370.53it/s]


# Call star alleles per sample

In [24]:
# Alleles reported individually with the with_m.* flags and treated as increased-risk
# when annotating combined genotypes
risk_allele_ls = ["m.1095T>C", "m.1494C>T", "m.1555A>G"]

In [25]:
# Phenotype labels, in reporting priority order: increased > uncertain > normal
INCREASED_RISK = 'increased risk of aminoglycoside-induced hearing loss'
UNCERTAIN_RISK = 'uncertain risk of aminoglycoside-induced hearing loss'
NORMAL_RISK = 'normal risk of aminoglycoside-induced hearing loss'

genotype_sr_ls = list()    # One pd.Series per sample
af_cutoff = 0.8            # Minimum allele fraction for a variant to be called
cnt = 0                    # Samples with defined variants detected but none reaching af_cutoff

for sample_gene_df in tqdm(results):
    sample_id = sample_gene_df['sample_id'].unique().tolist()[0]
    # NOTE(review): this deletes any existing per-sample genotype TSV in resultsDir and
    # nothing in this cell writes it back - confirm the cleanup is intended.
    out_genotype_tsv = resultsDir/f"{sample_id}.autosome_gene_manual.genotype.tsv"
    out_genotype_tsv.unlink(missing_ok=True)

    # Keep only the sample's variants that match a defined allele (inner join on the CPRA index)
    paired_df = sample_gene_df.merge(allele_def_df, left_index=True, right_index=True)

    # Annotate the allele function; an allele missing from allele_func_d raises KeyError
    paired_df['allele_func'] = paired_df['pharmgkb_allele'].apply(lambda x: allele_func_d[x])

    # Call every matched allele whose allele fraction reaches the cutoff
    tmp_allele, tmp_allele_func, tmp_af = list(), list(), list()
    if len(paired_df) > 0:
        called_df = paired_df.loc[paired_df['AF'].astype(float) >= af_cutoff]
        tmp_allele = called_df['pharmgkb_allele'].tolist()
        tmp_allele_func = called_df['allele_func'].tolist()
        tmp_af = called_df['AF'].tolist()
        if not tmp_allele:
            cnt += 1

    # No called allele -> Reference. Both "nothing matched" and "nothing passed the
    # cutoff" now record af_ls as [nan], so the column has one consistent shape.
    if not tmp_allele:
        tmp_allele = ["Reference"]
        tmp_allele_func = [NORMAL_RISK]
        tmp_af = [np.nan]

    # The highest-priority function among the called alleles decides the phenotype
    if INCREASED_RISK in tmp_allele_func:
        tmp_phenotype = INCREASED_RISK
    elif UNCERTAIN_RISK in tmp_allele_func:
        tmp_phenotype = UNCERTAIN_RISK
    else:
        tmp_phenotype = NORMAL_RISK

    with_1095 = "m.1095T>C" in tmp_allele
    with_1494 = "m.1494C>T" in tmp_allele
    with_1555 = "m.1555A>G" in tmp_allele
    tmp_genotype = '+'.join(tmp_allele)

    genotype_sr_ls.append(pd.Series(
        [sample_id, tmp_genotype, tmp_allele, tmp_af, with_1095, with_1494, with_1555, tmp_phenotype],
        index=['sample_id', 'genotype', 'allele_ls', 'af_ls', 'with_m.1095T>C', 'with_m.1494C>T', 'with_m.1555A>G', 'phenotype']
    ))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24108/24108 [00:22<00:00, 1053.82it/s]


In [26]:
# One row per sample; column names come from the index labels of each Series
all_genotype_df = pd.DataFrame(data=genotype_sr_ls)
all_genotype_df.head(n=5)

Unnamed: 0,sample_id,genotype,allele_ls,af_ls,with_m.1095T>C,with_m.1494C>T,with_m.1555A>G,phenotype
0,SRE015111,Reference,[Reference],[nan],False,False,False,normal risk of aminoglycoside-induced hearing ...
1,SRE002135,Reference,[Reference],[nan],False,False,False,normal risk of aminoglycoside-induced hearing ...
2,SRE013011,Reference,[Reference],[nan],False,False,False,normal risk of aminoglycoside-induced hearing ...
3,SRE001984,Reference,[Reference],[nan],False,False,False,normal risk of aminoglycoside-induced hearing ...
4,SRE023752,Reference,[Reference],[nan],False,False,False,normal risk of aminoglycoside-induced hearing ...


# Archive: To determine the cut-off of variants' allele fractions

In [26]:
# Box and whisker diagram of allele fraction
# all_genotype_df holds one row per sample and has no 'pharmgkb_allele' / 'AF' columns,
# so take the per-variant allele fractions from the concatenated sample table
# annotated with the allele definitions (same join as used for genotyping).
af_by_allele_df = all_sample_gene_var_df.merge(allele_def_df, left_index=True, right_index=True)
m1095 = af_by_allele_df.loc[af_by_allele_df['pharmgkb_allele']=="m.1095T>C", "AF"].astype(float).tolist()
m1494 = af_by_allele_df.loc[af_by_allele_df['pharmgkb_allele']=="m.1494C>T", "AF"].astype(float).tolist()
m1555 = af_by_allele_df.loc[af_by_allele_df['pharmgkb_allele']=="m.1555A>G", "AF"].astype(float).tolist()

In [None]:
# Use 0.8 as the allele fraction cutoff
def apply_cutoff(allele, af, cutoff):
    """Replace alleles whose allele fraction is below the cutoff with 'Reference'.

    Parameters
    ----------
    allele : str
        Comma-separated allele names.
    af : str or float
        Comma-separated allele fractions paired positionally with `allele`,
        a single numeric fraction, or NaN when no fraction is available.
    cutoff : float
        Minimum allele fraction (inclusive) for an allele to be kept.

    Returns
    -------
    str
        Comma-separated alleles with each sub-cutoff entry replaced by
        'Reference'; `allele` unchanged when `af` is NaN.
    """
    if af != af:    # NaN is the only value that is not equal to itself
        return allele

    out_allele_ls = list()
    # str() lets a single numeric fraction be handled like a one-item list
    for tmp_allele, tmp_af in zip(allele.split(','), str(af).split(',')):
        # Inclusive comparison, matching the cutoff test used when genotypes are called
        if float(tmp_af) >= cutoff:
            out_allele_ls.append(tmp_allele)
        else:
            out_allele_ls.append('Reference')
    return ','.join(out_allele_ls)
af_cutoff = 0.8
# The genotyping loop already applies af_cutoff, and the all_genotype_df it produces has
# no 'pharmgkb_allele' / 'AF' columns. Only re-derive genotypes when the table has that
# older layout, so running the notebook top to bottom does not fail here.
if {'pharmgkb_allele', 'AF'}.issubset(all_genotype_df.columns):
    all_genotype_df['genotype'] = all_genotype_df.apply(lambda r: apply_cutoff(r.pharmgkb_allele, r.AF, af_cutoff), axis=1)

In [None]:
# Discard sample calls if multiple pharmgkb_allele is identified
# (a comma inside `genotype` is what marks more than one allele here)
all_genotype_df['multiple_allele'] = all_genotype_df['genotype'].map(lambda g: "," in g)
single_allele_mask = ~all_genotype_df['multiple_allele']
all_genotype_df = all_genotype_df.loc[single_allele_mask, :].copy()

# Merge id_ref_df to all_genotype_df, then export to CSV

In [28]:
# Index the genotype table by sample_id under the index name sre_patient_id
# (the sample_id column itself is kept)
all_genotype_df = all_genotype_df.set_index(all_genotype_df['sample_id'].rename('sre_patient_id'))

In [31]:
# Join sample annotation with genotypes on the participant index (inner join)
merged_df = id_ref_df.merge(all_genotype_df, left_index=True, right_index=True)

# Drop repeated (participant, genotype, phenotype) rows, keeping the first occurrence.
# The index is copied into a temporary column only so it can take part in the comparison.
dup_mask = merged_df.assign(tmp=merged_df.index).duplicated(subset=['tmp', 'genotype', 'phenotype'])
per_gene_df = merged_df.loc[~dup_mask.to_numpy()].copy()

per_gene_df['gene'] = gene
per_gene_df['EHR_priority_notation'] = np.nan    # Placeholder column, left empty

In [33]:
# Export 1: all merged participants, restricted to the reporting columns
keep_cols = [
    'founder_status', 'gene', 'sample_id',
    'genotype', 'phenotype', 'EHR_priority_notation',
]
all_per_gene_df = per_gene_df[keep_cols].copy()
all_per_gene_df.to_csv(all_idv_geno_csv, index=True, index_label='sre_patient_id')

# Export 2: founders only, with every column. per_gene_df is rebound to this subset,
# so all later cells work on founders only.
is_founder = per_gene_df['founder_status'] == 'Founder'
per_gene_df = per_gene_df[is_founder].copy()
per_gene_df.to_csv(idv_geno_csv, index=True, index_label='sre_patient_id')

# Calculate allele frequency

In [35]:
# Group per_gene_df by genotype: one row per distinct genotype string,
# with every other column collected into a list of per-sample values
af_df = per_gene_df.groupby('genotype').agg(list)
af_df['gene'] = gene
af_df['variant'] = af_df.index
af_df['allele_count'] = af_df['sample_id'].apply(len)    # Number of samples carrying the genotype
af_df['allele_number'] = len(per_gene_df)                # Total number of samples in per_gene_df
# Column-wise division instead of a row-wise apply: same values, and safe on an empty table
af_df['allele_frequency'] = af_df['allele_count'] / af_df['allele_number']
af_df['no_heterozygous_carriers'] = np.nan    # Left empty for this gene
af_df['no_homozygous_carriers'] = np.nan      # Left empty for this gene

In [37]:
# Annotate allele function
def det_variant_func(var, allele_func_d, risk_allele_ls):
    """Return the function label for a genotype string.

    Parameters
    ----------
    var : str
        A single allele name or several allele names joined by '+'.
    allele_func_d : dict
        Allele name -> function label; used directly when `var` is a key.
    risk_allele_ls : list
        Allele names that make a combined genotype increased-risk.

    Returns
    -------
    str
        The label from `allele_func_d` when `var` is listed there; otherwise
        the increased-risk label if any '+'-separated component is in
        `risk_allele_ls`, else the uncertain-risk label.
    """
    # NOTE(review): a combination made only of normal-risk alleles is labelled
    # "uncertain" here, while the per-sample phenotype call labels it "normal" -
    # confirm this difference is intended.
    if var in allele_func_d:
        return allele_func_d[var]

    components = var.split('+')
    has_risk_allele = not set(risk_allele_ls).isdisjoint(components)
    return (
        "increased risk of aminoglycoside-induced hearing loss"
        if has_risk_allele
        else "uncertain risk of aminoglycoside-induced hearing loss"
    )

# Label every genotype row; the lookup dict and risk list are passed as extra positional arguments
af_df['allele_function'] = af_df['variant'].apply(det_variant_func, args=(allele_func_d, risk_allele_ls))

In [39]:
# Keep the reporting columns in their output order and write the allele frequency CSV
# (the genotype index is not written; it is already present as 'variant')
keep_cols = [
    'gene', 'variant', 'allele_function',
    'allele_frequency', 'allele_count', 'allele_number',
    'no_heterozygous_carriers', 'no_homozygous_carriers',
]
af_df = af_df[keep_cols]
af_df.to_csv(path_or_buf=af_csv, index=False)

# Calculate the carrier frequencies of actionable phenotypes

In [40]:
# Prepare the phenotype table: one row per phenotype, other columns collected into lists
phenotype_df = per_gene_df.groupby('phenotype').agg(list)
phenotype_df['pharmacogene'] = phenotype_df['gene'].apply(lambda x: x[0])    # Same gene on every row
# Distinct genotypes per phenotype. sorted() makes the order reproducible:
# iteration order of a set of strings can change between interpreter runs.
phenotype_df['genotype'] = phenotype_df['genotype'].apply(lambda x: ','.join(sorted(set(x))))
phenotype_df['phenotype'] = phenotype_df.index
phenotype_df['no_carriers_with_risk_alleles'] = phenotype_df['sample_id'].apply(len)    # Samples per phenotype
phenotype_df['all_genotyped_individuals'] = len(per_gene_df)
phenotype_df['carrier_frequency'] = phenotype_df['no_carriers_with_risk_alleles'] / phenotype_df['all_genotyped_individuals']
keep_col = ['pharmacogene', 'genotype', 'phenotype', 'carrier_frequency', 'no_carriers_with_risk_alleles', 'all_genotyped_individuals']
phenotype_df = phenotype_df.loc[:, keep_col].copy()

In [42]:
# Write the phenotype summary; the index is omitted because 'phenotype' is already a column
phenotype_df.to_csv(path_or_buf=pheno_csv, index=False)