In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import re
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

In [3]:
# Directory that holds the consolidated HLA-HD tables; every output below is written here as well
projectDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01c.genotype.HLA-HD/summary")

# Input: one consolidated TSV per gene
consolidated_hlaa = projectDir.joinpath("HLA-A.results_all.tsv")
consolidated_hlab = projectDir.joinpath("HLA-B.results_all.tsv")

# Reference: sample sheet, founder list, and the alleles treated as high risk for each gene
sample_info = Path("/home/_shared/jscliu/project/2025/Flagship/reference/sample_info_annot.2024-11-12.csv")
cohort_founder = Path("/home/_shared/jscliu/project/2025/Flagship/reference/cohort_founder_list.2024-11-12.csv")
hla_a_risk_alleles = ['HLA-A*31:01']
hla_b_risk_alleles = ['HLA-B*15:02', 'HLA-B*57:01', 'HLA-B*58:01']

# Output: per-gene tables for all participants ("all_perGenes.*") and for founders only ("perGenes.*")
all_hla_a_perGene = projectDir.joinpath("all_perGenes.HLA-A.csv")
all_hla_b_perGene = projectDir.joinpath("all_perGenes.HLA-B.csv")
hla_a_perGene = projectDir.joinpath("perGenes.HLA-A.csv")
hla_b_perGene = projectDir.joinpath("perGenes.HLA-B.csv")
# Output: allele-frequency summaries
hla_a_af = projectDir.joinpath("allele_frequency.HLA-A.csv")
hla_b_af = projectDir.joinpath("allele_frequency.HLA-B.csv")
# Output: phenotype-frequency summaries
hla_a_pheno = projectDir.joinpath("phenotype.HLA-A.csv")
hla_b_pheno = projectDir.joinpath("phenotype.HLA-B.csv")

# Read sample information

In [4]:
# The second comma-separated field of every line (any header line included) is collected as a lab ID
with open(cohort_founder, 'r') as fh:
    cohort_founders:list = [row.strip().split(',')[1] for row in fh]

sample_info_df = pd.read_csv(sample_info)

# Sample-sheet rows whose lab ID appears in the founder list
_is_founder = sample_info_df['sre_lab_id'].isin(cohort_founders)
founder_info_df = sample_info_df.loc[_is_founder, :].copy()

# Read HLA-HD results

In [5]:
def _read_hla_hd(tsv_path):
    """Load one consolidated table indexed by its first column, renaming 'diplotype' to 'major'."""
    calls = pd.read_table(tsv_path, index_col=[0])
    return calls.rename(columns={'diplotype': 'major'})

hla_a_df = _read_hla_hd(consolidated_hlaa)
hla_b_df = _read_hla_hd(consolidated_hlab)

# Merge hla_a_df/hla_b_df with sample_info_df

In [6]:
# Inner join (the pandas default): only lab IDs present in both the sample sheet and the HLA table are kept
hla_a_result_df = pd.merge(sample_info_df, hla_a_df, how='inner', left_on='sre_lab_id', right_index=True)
hla_b_result_df = pd.merge(sample_info_df, hla_b_df, how='inner', left_on='sre_lab_id', right_index=True)

In [8]:
def _first_two_fields(allele):
    """Keep the first two colon-separated fields of an allele name; missing values stay NaN."""
    if pd.isna(allele):
        return np.nan
    return ":".join(allele.split(":")[:2])


def _haplotype_pair(row):
    """Return "haplotype_1/haplotype_2" for a row, or NaN when either haplotype is missing."""
    if pd.isna(row.haplotype_1) or pd.isna(row.haplotype_2):
        return np.nan
    return f"{row.haplotype_1}/{row.haplotype_2}"


for _gene, _frame in (('HLA-A', hla_a_result_df), ('HLA-B', hla_b_result_df)):
    # Add column: 'gene'
    _frame['gene'] = _gene

    # Refine columns: 'haplotype_1', 'haplotype_2'
    for _hap in ('haplotype_1', 'haplotype_2'):
        _frame[_hap] = _frame[_hap].apply(_first_two_fields)

    # Add columns: 'minor' and 'genotype' (both hold the same haplotype pair)
    _frame['minor'] = _frame.apply(_haplotype_pair, axis=1)
    _frame['genotype'] = _frame['minor']

In [9]:
# Annotate the allele function
def annotate_allele_func(allele:str, risk_alleles:list):
    """Classify a single allele by membership in `risk_alleles`.

    Returns "high_risk" when `allele` is in `risk_alleles`, "low_risk" when it
    is not, and NaN when `allele` is missing.
    """
    if pd.isna(allele):
        return np.nan
    return "high_risk" if allele in risk_alleles else "low_risk"

# Each gene's haplotypes are classified against that gene's own risk-allele list
for _frame, _risk_alleles in (
    (hla_a_result_df, hla_a_risk_alleles),
    (hla_b_result_df, hla_b_risk_alleles),
):
    for _hap in ('haplotype_1', 'haplotype_2'):
        _frame[f'{_hap}_func'] = _frame[_hap].apply(
            lambda allele: annotate_allele_func(allele, _risk_alleles)
        )

In [10]:
# Annotate the phenotype
def annotate_phenotype(hap_1, hap_2):
    """Derive a carrier label from the two per-haplotype function labels.

    Returns NaN when either label is missing, "Carrier_of_risk_alleles" when at
    least one label is "high_risk", and "Non_carrier_of_risk_alleles" otherwise.
    """
    labels = [hap_1, hap_2]
    if any(pd.isna(label) for label in labels):
        return np.nan
    if "high_risk" in labels:
        return "Carrier_of_risk_alleles"
    return "Non_carrier_of_risk_alleles"

def _row_phenotype(row):
    """Apply annotate_phenotype to the two haplotype function labels of one row."""
    return annotate_phenotype(row.haplotype_1_func, row.haplotype_2_func)

for _frame in (hla_a_result_df, hla_b_result_df):
    _frame['phenotype'] = _frame.apply(_row_phenotype, axis=1)

In [11]:
# Placeholder column: 'EHR_priority_notation' is left empty (NaN) for both genes
for _frame in (hla_a_result_df, hla_b_result_df):
    _frame['EHR_priority_notation'] = np.nan

# Subset for required columns and export to CSV

In [12]:
# Columns exported to the per-gene tables, in output order
keep_cols:list = [
    # identifiers
    'sre_participant_id', 'sre_lab_id', 'gene',
    # genotype calls
    'major', 'minor', 'haplotype_1', 'haplotype_2', 'genotype',
    # annotations
    'haplotype_1_func', 'haplotype_2_func', 'phenotype', 'EHR_priority_notation',
]
hla_a_allResults_df = hla_a_result_df[keep_cols].copy()
hla_b_allResults_df = hla_b_result_df[keep_cols].copy()

In [14]:
# All participants: written before the founder filter is applied
for _frame, _destination in (
    (hla_a_allResults_df, all_hla_a_perGene),
    (hla_b_allResults_df, all_hla_b_perGene),
):
    _frame.to_csv(_destination, index=False)

In [17]:
# Keep only samples whose lab ID is in the cohort founder list.
# NOTE: the *_allResults_df names are rebound here, so from this point on they hold founders only.
_a_is_founder = hla_a_allResults_df['sre_lab_id'].isin(cohort_founders)
_b_is_founder = hla_b_allResults_df['sre_lab_id'].isin(cohort_founders)
hla_a_allResults_df = hla_a_allResults_df.loc[_a_is_founder, :].copy()
hla_b_allResults_df = hla_b_allResults_df.loc[_b_is_founder, :].copy()

In [115]:
# Per-gene tables as held in the frames at this point
for _frame, _destination in (
    (hla_a_allResults_df, hla_a_perGene),
    (hla_b_allResults_df, hla_b_perGene),
):
    _frame.to_csv(_destination, index=False)

# Calculate the allele frequencies

In [123]:
def allResults_to_af(in_df, ploidy="diploid"):
    """Summarise per-sample haplotype calls into a per-allele frequency table.

    Parameters
    ----------
    in_df : pandas.DataFrame
        One row per sample with columns 'gene', 'haplotype_1', 'haplotype_2',
        'haplotype_1_func' and 'haplotype_2_func'. The frame is not modified.
    ploidy : str, default "diploid"
        "diploid" counts two alleles per sample and only uses samples with both
        haplotypes called. Any other value counts one allele per sample and
        uses samples with at least one called haplotype.

    Returns
    -------
    pandas.DataFrame
        One row per distinct allele with columns 'gene', 'variant',
        'allele_function', 'allele_frequency', 'allele_count', 'allele_number',
        'no_heterozygous_carriers' and 'no_homozygous_carriers', sorted by
        allele_function (descending), allele_frequency (descending) and
        variant (ascending). An empty frame with these columns is returned
        when no sample has a usable call.

    Notes
    -----
    Samples without a usable call are excluded from 'allele_number', so the
    frequencies are relative to called alleles and sum to 1 across variants.
    For fully called input the result equals 2*len(in_df) (diploid) or
    len(in_df) (otherwise).
    """
    af_df_cols:list = ["gene", "variant", "allele_function", "allele_frequency", "allele_count", "allele_number", "no_heterozygous_carriers", "no_homozygous_carriers"]
    hap_cols = ['haplotype_1', 'haplotype_2']

    # Restrict to samples that contribute alleles, and derive the denominator from them
    if ploidy == 'diploid':
        typed = in_df.dropna(subset=hap_cols, how='any')
        allele_number = 2 * len(typed)
    else:
        typed = in_df.dropna(subset=hap_cols, how='all')
        allele_number = len(typed)

    if typed.empty:
        return pd.DataFrame(columns=af_df_cols)

    # Long format: one row per called allele together with its function label
    alleles = pd.concat(
        [
            typed.loc[:, [hap, f"{hap}_func"]].rename(
                columns={hap: "variant", f"{hap}_func": "allele_function"}
            )
            for hap in hap_cols
        ],
        ignore_index=True,
    ).dropna(subset=['variant'])

    by_variant = alleles.groupby('variant')['allele_function']
    af_df = (
        pd.DataFrame({
            # First label seen for the allele (haplotype_1 rows come first)
            'allele_function': by_variant.agg(lambda funcs: funcs.iloc[0]),
            'allele_count': by_variant.size(),
        })
        .rename_axis('variant')
        .reset_index()
    )
    af_df['gene'] = in_df['gene'].iloc[0]
    af_df['allele_number'] = allele_number
    af_df['allele_frequency'] = af_df['allele_count'] / allele_number

    # Count zygosity without a per-variant scan: a homozygous sample is counted
    # once for its allele; any other sample is counted once for each distinct
    # called allele it carries.
    is_hom = typed['haplotype_1'] == typed['haplotype_2']
    hom_counts = typed.loc[is_hom, 'haplotype_1'].value_counts()
    het_counts = pd.concat([typed.loc[~is_hom, hap] for hap in hap_cols]).value_counts()
    af_df['no_heterozygous_carriers'] = af_df['variant'].map(het_counts).fillna(0).astype(int)
    af_df['no_homozygous_carriers'] = af_df['variant'].map(hom_counts).fillna(0).astype(int)

    # Subset columns, sort and reset index, then return
    af_df = (
        af_df.loc[:, af_df_cols]
        .sort_values(by=["allele_function", "allele_frequency", "variant"], ascending=[False, False, True])
        .reset_index(drop=True)
    )
    return af_df

In [113]:
# Allele-frequency tables for both genes, computed from the current per-gene frames
hla_a_af_df, hla_b_af_df = (
    allResults_to_af(_frame) for _frame in (hla_a_allResults_df, hla_b_allResults_df)
)

In [116]:
# Write the allele-frequency tables
for _frame, _destination in ((hla_a_af_df, hla_a_af), (hla_b_af_df, hla_b_af)):
    _frame.to_csv(_destination, index=False)

# (Not executed) Calculate the phenotype frequencies

In [135]:
def allResults_to_pheno(in_df):
    """Summarise per-sample phenotypes into a per-phenotype frequency table.

    Rows of `in_df` with a missing 'phenotype' are ignored. For each remaining
    phenotype the result holds the number of samples with that phenotype
    ('no_carriers_with_risk_alleles'), the number of phenotyped samples
    ('all_genotyped_individuals'), their ratio ('carrier_frequency'), the gene
    of the first sample in the group ('pharmacogene'), and every sample's
    genotype in the group joined with "/" ('genotype'). Rows are sorted by
    phenotype and carrier_frequency, both descending. `in_df` is not modified.
    """
    # Only phenotyped samples contribute to the counts and to the denominator
    phenotyped = in_df.copy().dropna(subset=["phenotype"])

    # One row per phenotype; 'gene' and 'genotype' become per-group lists
    grouped = (
        phenotyped[['gene', 'genotype', 'phenotype']]
        .groupby('phenotype', as_index=False)
        .agg(list)
    )
    grouped['no_carriers_with_risk_alleles'] = grouped['gene'].apply(len)
    grouped['all_genotyped_individuals'] = len(phenotyped)
    grouped['carrier_frequency'] = (
        grouped['no_carriers_with_risk_alleles'] / grouped['all_genotyped_individuals']
    )
    grouped['pharmacogene'] = grouped['gene'].apply(lambda genes: genes[0])
    # NOTE(review): genotypes already contain "/", so the joined string cannot be split back unambiguously
    grouped['genotype'] = grouped['genotype'].apply(lambda genotypes: "/".join(genotypes))

    # Subset columns, sort, reset the index, then return
    pheno_df_cols:list = ["pharmacogene", "genotype", "phenotype", "carrier_frequency", "no_carriers_with_risk_alleles", "all_genotyped_individuals"]
    return (
        grouped.loc[:, pheno_df_cols]
        .copy()
        .sort_values(by=["phenotype", "carrier_frequency"], ascending=False)
        .reset_index(drop=True)
    )

In [136]:
# Phenotype-frequency tables for both genes, computed from the current per-gene frames
hla_a_pheno_df, hla_b_pheno_df = map(
    allResults_to_pheno, (hla_a_allResults_df, hla_b_allResults_df)
)

In [139]:
# Write the phenotype-frequency tables
for _frame, _destination in ((hla_a_pheno_df, hla_a_pheno), (hla_b_pheno_df, hla_b_pheno)):
    _frame.to_csv(_destination, index=False)