In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import re
from collections import defaultdict
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns
plt.style.use('ggplot')

In [2]:
# Project layout for the genotyping analysis (absolute paths; adjust when running elsewhere)
projectDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01b.genotype.Cyrius")
resultDir = projectDir/"results"
sumDir = projectDir/"summary"
referenceDir = Path('/home/_shared/jscliu/project/2025/Flagship/reference/PharmGKB_ref/CYP2D6')

# Input: manifest of per-sample calls, read below as sample_id / diplotype / filter
all_tsv = resultDir/"manifest.all.tsv"

# Reference: sample annotation table (a plain string, unlike the Path objects above)
id_ref = "/home/_shared/jscliu/project/2025/Flagship/reference/sample_info_annot.2024-11-12.csv"

# Output: summary tables written under sumDir, each file name carrying the gene symbol
gene = 'CYP2D6'
perGene_csv = sumDir/f"perGenes.{gene}.csv"
all_perGene_csv = sumDir/f"all_perGenes.{gene}.csv"
af_csv = sumDir/f"allele_frequency.{gene}.csv"
pheno_csv = sumDir/f"phenotype.{gene}.csv"

# Reading references

## ID reference

In [4]:
# Load the sample annotation table (first column as index), keep only rows whose
# treated_ethnicity is 'Chinese', and retain just the founder_status column
id_ref_df = (
    pd.read_csv(id_ref, index_col=[0])
    .loc[lambda annot: annot['treated_ethnicity'] == 'Chinese', ['founder_status']]
    .copy()
)
print(f"{len(id_ref_df) = :,}")
id_ref_df.head()

len(id_ref_df) = 23,149


Unnamed: 0_level_0,founder_status
sre_participant_id,Unnamed: 1_level_1
SRE029440,Non-founder
SRE025838,Founder
SRE003784,Non-founder
SRE026979,Non-founder
SRE004787,Founder


## Assign scores to alleles

In [5]:
# Location of the allele functionality reference table
allele_functionality = referenceDir/'CYP2D6_allele_functionality_reference.csv'

# Load columns 0, 1 and 3 only, skipping the first two lines of the file and
# supplying our own column names; the allele label becomes the index
allele_func_df = pd.read_csv(
    allele_functionality,
    index_col=[0],
    names=['allele', 'activity_value', 'status'],
    skiprows=2,
    usecols=[0, 1, 3],
)

# Drop multi-copy entries (labels containing "x"); copy number is handled when
# a haplotype is broken into its single alleles for scoring
keep_alleles = list(filter(lambda label: "x" not in label, allele_func_df.index.tolist()))
allele_func_df = allele_func_df.loc[keep_alleles, :].copy()

# Read and preprocess manifest.all.tsv

In [6]:
# Load the manifest, discarding its first line and naming the three columns explicitly
all_df = pd.read_table(
    all_tsv,
    skiprows=1,
    names=['sample_id', 'diplotype', 'filter'],
)
# Keep only the calls whose filter value is "PASS"
pass_df = all_df.loc[all_df['filter'].eq("PASS"), ['sample_id', 'diplotype']].copy()

In [7]:
# Split each "hapA/hapB" diplotype string into its two haplotype columns.
# The vectorised string accessor avoids constructing one pd.Series per row and
# leaves a missing diplotype as NaN instead of raising AttributeError.
# A diplotype with more than one "/" still fails loudly (column count mismatch).
pass_df[['haplotype_1', 'haplotype_2']] = pass_df['diplotype'].str.split('/', expand=True)

In [9]:
# Remove redundant samples and duplicated samples with contradicting haplotypes.
# First derive the participant ID: the "SRE<digits>" prefix at the start of sample_id
pass_df['sre_patient_id'] = [
    re.match(r'SRE[0-9]+', sample).group(0) for sample in pass_df['sample_id']
]
def remove_duplicates(df):
    """
    Collapse repeated samples of the same participant.

    Rows are compared on "sre_patient_id", "haplotype_1" and "haplotype_2":
    - samples of a multi-sample participant whose call is not shared by any
      other sample of that participant are dropped (contradicting calls);
    - among the samples whose call is repeated, only the first row per
      participant is kept (redundant calls).

    Returns a copy of `df` without the dropped rows; row order and index are preserved.

    NOTE(review): a participant carrying two *different* calls that are each
    repeated keeps the first of those rows — confirm this is intended.
    """
    id_key = ['sre_patient_id']
    call_key = ['sre_patient_id', 'haplotype_1', 'haplotype_2']

    # Every row of a participant that appears more than once
    repeated_id_rows = df.loc[df.duplicated(subset=id_key, keep=False), :].copy()
    # Every row whose (participant, haplotype pair) appears more than once
    repeated_call_rows = df.loc[df.duplicated(subset=call_key, keep=False), :].copy()

    # Samples of repeated participants whose call nobody else confirms
    contradicting = set(repeated_id_rows['sample_id']) - set(repeated_call_rows['sample_id'])
    # All but the first confirmed sample of each participant
    redundant = repeated_call_rows.loc[repeated_call_rows.duplicated(subset=id_key), 'sample_id'].tolist()

    to_drop = list(contradicting) + redundant
    return df.loc[~df['sample_id'].isin(to_drop), :].copy()
# De-duplicate the PASS calls, then index the table by participant ID
pass_df = pass_df.pipe(remove_duplicates).set_index("sre_patient_id")

# perGene

## Define functions

In [24]:
def standardize_haplotype(hap:str)->str:
    """
    Put the star alleles of a "+"-joined haplotype into a canonical order.

    - Missing values are returned unchanged.
    - Haplotypes containing "x" (multiple copies) are returned unchanged.
    - Otherwise the alleles are sorted numerically by their star number
      (e.g. "*10+*4" -> "*4+*10"); labels that are not plain numbers sort
      after the numeric ones, alphabetically.

    Returns the re-joined haplotype string, each allele prefixed with "*".
    """
    if pd.isna(hap):
        return hap

    if "x" in hap:
        return hap    # Nothing to do for multiple copies of same allele

    def _allele_sort_key(label:str):
        # Compare star numbers as numbers, not text, so that "4" sorts before "10"
        if label.replace(".", "", 1).isdecimal():
            return (0, float(label), label)
        return (1, 0.0, label)

    # Remove asterisks so the bare star numbers can be sorted numerically
    labels = hap.replace("*", "").split('+')
    return "+".join(f"*{label}" for label in sorted(labels, key=_allele_sort_key))

def break_allele_to_list(allele:str)->list:
    '''
    Break a haplotype string into a flat list of single alleles.

    The string is split on "+"; a component written as "<allele>x<N>" is
    expanded into N copies of <allele>. Handling each component separately
    lets mixed forms such as "*36x2+*10" be parsed, which previously failed
    because the copy number was read from everything after the first "x".

    Raises ValueError if the text after "x" is not an integer.
    '''
    allele_ls:list = list()
    for component in allele.split('+'):
        name, has_copies, copy_number = component.partition('x')
        if has_copies:
            # Multiple copies of the same allele
            allele_ls.extend([name] * int(copy_number))
        else:
            # Single allele
            allele_ls.append(component)
    return allele_ls

def score_allele(haplotype:str, score_df:pd.DataFrame)->float:
    '''
    Assign an activity score to a haplotype.

    Parameters
    - haplotype: haplotype string, or a missing value.
    - score_df: table indexed by allele label with an 'activity_value' column.

    Returns
    - NaN when the haplotype is missing, or when every allele in it has a NaN
      activity value (uncertain function);
    - otherwise the sum of the allele activity values, counting NaN as 0.0.

    Raises KeyError if an allele is not present in score_df.
    '''
    if pd.isna(haplotype):
        return np.nan

    # Look up the activity value of every single allele in the haplotype
    allele_ls = break_allele_to_list(haplotype)
    score_ls = [ float(score_df.loc[allele, 'activity_value']) for allele in allele_ls ]

    # Activity value is nan if the allele has uncertain function.
    # NaN values never compare equal, so they do not collapse inside a set();
    # test every score explicitly so that several uncertain alleles still
    # yield NaN rather than a misleading 0.0.
    if score_ls and all(np.isnan(score) for score in score_ls):
        return np.nan

    # Calculate the sum of score. nan will be treated as 0.0
    return sum(0.0 if np.isnan(score) else score for score in score_ls)

In [28]:
def annot_clinical_function(score:float)->str:
    '''
    Given the cumulative allele score, annotate its clinical function.
    The annotation is referenced to PharmGKB's guidelines, i.e.
    - 0: "No function"
    - >0 & <1: "Decreased function"
    - 1: "Normal function"
    - >1 & <2: NOT STATED
    - >=2: "Increased function"
    *For our case the unstated range is split at 1.5:
    1 <= score <= 1.5 is "Normal function", score > 1.5 is "Increased function".

    A NaN score returns "Uncertain function".
    Raises ValueError for a negative score, which has no defined category.
    '''
    if score != score:
        # NaN is the only value that differs from itself
        return "Uncertain function"
    if score < 0:
        raise ValueError(f"Activity score must be non-negative, got {score!r}")
    if score > 1.5:
        return "Increased function"
    if score >= 1.0:
        return "Normal function"
    if score > 0:
        return "Decreased function"
    return "No function"
    
# Assign phenotype
def assign_phenotype(activity_score:float)->str:
    '''
    Annotate phenotype based on diplotype activity score (S), using the
    thresholds applied below:
    - S >= 2.5: Ultrarapid Metabolizer
    - 1.25 <= S < 2.5: Normal Metabolizer
    - 0 < S < 1.25: Intermediate Metabolizer
    - S = 0: Poor Metabolizer
    - na: Indeterminate

    Raises ValueError for a negative score, which has no defined phenotype.
    '''
    if activity_score != activity_score:
        return "Indeterminate"     # nan
    if activity_score < 0:
        raise ValueError(f"Activity score must be non-negative, got {activity_score!r}")
    if activity_score >= 2.5:
        return "UM (ultrarapid metabolizer)"
    if activity_score >= 1.25:
        return "NM (normal metabolizer)"
    if activity_score > 0:
        return "IM (intermediate metabolizer)"
    return "PM (poor metabolizer)"

# Columns, in export order, that are selected for the per-gene table.
# 'minor' and 'EHR_priority_notation' are filled with NaN where that table is built.
keep_colnames:list = [
    'founder_status', 'gene', 'sample_id',
    'major', 'minor', 'haplotype_1', 'haplotype_2', 'genotype',
    'haplotype_1_func', 'haplotype_2_func', 'phenotype', 'EHR_priority_notation', 
    'haplotype_1_activity_value', 'haplotype_2_activity_value', 'activity_score'
]

## perGene - all_samples

In [16]:
# Left-join the PASS calls (pass_df) onto the participant reference (id_ref_df) by index,
# so participants without a PASS call are kept with NaN in the genotype columns
sample_annot_df = id_ref_df.join(pass_df, how='left')

In [23]:
# Bring both haplotype columns into the canonical allele order
sample_annot_df['haplotype_1'] = sample_annot_df['haplotype_1'].map(standardize_haplotype)
sample_annot_df['haplotype_2'] = sample_annot_df['haplotype_2'].map(standardize_haplotype)

In [25]:
# Score each haplotype column against the allele functionality table.
# Mapping over the single column avoids materialising a row object per sample
# (DataFrame.apply with axis=1) just to read one field.
sample_annot_df['haplotype_1_activity_value'] = sample_annot_df['haplotype_1'].map(lambda hap: score_allele(hap, allele_func_df))
sample_annot_df['haplotype_2_activity_value'] = sample_annot_df['haplotype_2'].map(lambda hap: score_allele(hap, allele_func_df))

In [29]:
# Build the per-gene table for all samples from sample_annot_df;
# the raw diplotype call is exposed under the column name 'major'
all_perGene_df = sample_annot_df.rename(columns={'diplotype': 'major'})
all_perGene_df['gene'] = 'CYP2D6'

# Genotype string from the two standardized haplotypes.
# NOTE(review): samples without a call end up as the literal string "nan/nan" here.
all_perGene_df['genotype'] = [
    f"{hap_1}/{hap_2}"
    for hap_1, hap_2 in zip(all_perGene_df['haplotype_1'], all_perGene_df['haplotype_2'])
]

# Clinical function of each haplotype from its activity value
all_perGene_df['haplotype_1_func'] = all_perGene_df['haplotype_1_activity_value'].map(annot_clinical_function)
all_perGene_df['haplotype_2_func'] = all_perGene_df['haplotype_2_activity_value'].map(annot_clinical_function)

# Diplotype activity score; NaN in either haplotype propagates through the addition
all_perGene_df['activity_score'] = (
    all_perGene_df['haplotype_1_activity_value'] + all_perGene_df['haplotype_2_activity_value']
)
all_perGene_df['phenotype'] = all_perGene_df['activity_score'].map(assign_phenotype)

# Placeholder columns, left empty
all_perGene_df['minor'] = np.nan
all_perGene_df['EHR_priority_notation'] = np.nan

# Keep the export columns in their defined order
all_perGene_df = all_perGene_df.loc[:, keep_colnames].copy()

In [31]:
# Export all_perGene_df to all_perGene_csv; the index is written as a column labelled 'sre_patient_id'
all_perGene_df.to_csv(all_perGene_csv, index=True, index_label='sre_patient_id')

## perGene - founders only

In [34]:
# Restrict the per-gene table to rows whose founder_status is 'Founder'
perGene_df = all_perGene_df[all_perGene_df['founder_status'].eq('Founder')].copy()

In [39]:
# Export perGene_df (founders only) to perGene_csv; the index is written as a column labelled 'sre_patient_id'
perGene_df.to_csv(perGene_csv, index=True, index_label='sre_patient_id')

## Calculate allele frequencies

In [41]:
def count_zygosity(called_df):
    """
    Count homozygous and heterozygous carriers of each haplotype.

    Parameters
    - called_df: table with 'haplotype_1' and 'haplotype_2' columns.

    Returns
    - (homo_dict, hetero_dict): two defaultdict(int) keyed by haplotype.
      A row with identical haplotypes adds one homozygous carrier for that
      haplotype; a row with different haplotypes adds one heterozygous
      carrier for each of the two.

    Rows where either haplotype is missing are skipped: NaN never equals
    itself, so such rows would otherwise be counted as heterozygous carriers
    of a NaN haplotype.
    """
    homo_dict = defaultdict(int)
    hetero_dict = defaultdict(int)
    for hap_1, hap_2 in zip(called_df['haplotype_1'], called_df['haplotype_2']):
        if pd.isna(hap_1) or pd.isna(hap_2):
            # No call for this sample; nothing to count
            continue
        if hap_1 == hap_2:
            # Homozygous
            homo_dict[hap_1] += 1
        else:
            # Heterozygous
            hetero_dict[hap_1] += 1
            hetero_dict[hap_2] += 1
    return homo_dict, hetero_dict

In [42]:
gene = 'CYP2D6'
df = perGene_df.copy()

# Stack both haplotype columns into one long table (one row per observed allele)
# and gather the function labels of every variant into a list
af_df = (
    pd.concat([
        df[['haplotype_1', 'haplotype_1_func']].rename(columns={'haplotype_1': 'variant', 'haplotype_1_func': 'allele_function'}),
        df[['haplotype_2', 'haplotype_2_func']].rename(columns={'haplotype_2': 'variant', 'haplotype_2_func': 'allele_function'}),
    ])
    .groupby('variant')
    .agg(list)
)

# Per-variant counts and labels: the list length is the allele count and its
# first element is taken as the variant's function label
af_df['allele_count'] = af_df['allele_function'].map(len)
af_df['gene'] = gene
af_df['allele_function'] = af_df['allele_function'].map(lambda labels: labels[0])
af_df = (
    af_df
    .dropna(subset=['allele_function'])
    .sort_values(by=['allele_function', 'allele_count'], ascending=False)
)
af_df['variant'] = af_df.index
# Total number of observed alleles, shared by every row as the denominator
af_df['allele_number'] = af_df['allele_count'].sum()
af_df['allele_frequency'] = af_df['allele_count'] / af_df['allele_number']

# Carrier counts per variant, split by zygosity, joined onto the frequency table
homo_d, hetero_d = count_zygosity(df)
homo_df = pd.DataFrame(homo_d, index=["no_homozygous_carriers"]).transpose()
hetero_df = pd.DataFrame(hetero_d, index=["no_heterozygous_carriers"]).transpose()
zygosity_df = (
    hetero_df
    .merge(homo_df, left_index=True, right_index=True, how='outer')
    .fillna(0)
    .astype(int)
)
af_df = af_df.merge(zygosity_df, left_index=True, right_index=True, how='left')

# Columns to export, in order
af_colnames = [
    "gene", "variant", "allele_function",
    "allele_frequency", "allele_count", "allele_number",
    "no_heterozygous_carriers", "no_homozygous_carriers"
]
af_df = af_df[af_colnames].copy()

In [43]:
# Export the allele frequency table to af_csv; the index is not written
# (the variant label is already a column)
af_df.to_csv(af_csv, index=False)

## Calculate by phenotype

In [44]:
gene = 'CYP2D6'
# Only founders with a genotype call count as genotyped individuals. Rows
# without haplotypes would otherwise be tallied under "Indeterminate" and
# inflate the denominator.
df = perGene_df.dropna(subset=['haplotype_1', 'haplotype_2']).copy()

# One row per phenotype: the distinct genotypes observed and the number of carriers.
# Genotypes are sorted so the exported string is identical between runs
# (iteration order of a set of strings is not stable across interpreter sessions).
pheno_df = df.groupby('phenotype').agg(
    genotype=('genotype', lambda genotypes: ",".join(sorted(set(genotypes)))),
    no_carriers_with_risk_alleles=('genotype', 'size'),
)
pheno_df['pharmacogene'] = gene
pheno_df['phenotype'] = pheno_df.index
# Denominator shared by every row: all individuals with a genotype call
pheno_df['all_genotyped_individuals'] = pheno_df['no_carriers_with_risk_alleles'].sum()
pheno_df['carrier_frequency'] = (
    pheno_df['no_carriers_with_risk_alleles'] / pheno_df['all_genotyped_individuals']
)
pheno_df = pheno_df.sort_values(by=['carrier_frequency'], ascending=False)

# Subset of columns to be exported. Kept under its own name so the column list
# used for the per-gene table (keep_colnames) is not overwritten.
pheno_colnames = [
    'pharmacogene', 'genotype', 'phenotype', 'carrier_frequency',
    'no_carriers_with_risk_alleles', 'all_genotyped_individuals'
]
pheno_df = pheno_df.loc[:, pheno_colnames].copy()

In [45]:
# Export the phenotype table to pheno_csv; the index is not written
# (the phenotype label is already a column)
pheno_df.to_csv(pheno_csv, index=False)