In [7]:
from pathlib import Path
import pandas as pd
import numpy as np
from yaml import load
from yaml import CLoader as Loader
import re

In [8]:
# --- Inputs ---------------------------------------------------------------
# Root of the PGx secondary analysis; every other analysis path hangs off it.
projectDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx")
summaryDir = projectDir / "02.consolidate_genotypes/summary"
pheno_cat = projectDir / "02.consolidate_genotypes/script/phenotype_category.yml"
sample_info = Path("/home/_shared/jscliu/project/2025/Flagship/reference/sample_info_annot.2024-11-12.csv")

# Per-tool summary directories (one per genotyping step 01a-01d).
aldy_summary_dir = projectDir / "01a.genotype.aldy/summary"
cyrius_summary_dir = projectDir / "01b.genotype.Cyrius/summary"
hlahd_summary_dir = projectDir / "01c.genotype.HLA-HD/summary"
manual_summary_dir = projectDir / "01d.genotype.manual/summary"

# --- Output ---------------------------------------------------------------
consolidate_perGene = summaryDir / "pgx_allParticipants_allGenes.csv"

In [9]:
# Parse the phenotype-category YAML into a Python object. Later cells index it
# as pheno_cat_d[gene][phenotype].
# NOTE(review): `Loader` is yaml.CLoader, i.e. PyYAML's full (non-safe) loader.
# That is acceptable for a project-owned file; consider CSafeLoader if this
# file could ever come from an untrusted source.
with pheno_cat.open() as yaml_handle:
    pheno_cat_d = load(yaml_handle, Loader=Loader)

In [10]:
# Genes (and single-variant entries) to consolidate, keyed by genotyping tool.
# Each key must also be a key of the tool -> summary-directory mapping; each
# entry is used both in the per-gene CSV file name and as a column prefix.
level_1AB_genes: dict = {
    "aldy": [
        "CYP4F2",
        "CYP2C19",
        "CYP2C9",
        "NUDT15",
        "TPMT",
        "VKORC1",
        "CYP3A5",
        "CFTR",
        "CYP2B6",
        "DPYD",
        "G6PD",
        "IFNL3",
        "SLCO1B1",
        "UGT1A1",
        "CYP3A4",
        "NAT2",
        "CYP2A6",
        # VKORC1 entries tracked per rsID
        "VKORC1_rs2359612",
        "VKORC1_rs7294",
        "VKORC1_rs8050894",
        "VKORC1_rs9934438",
    ],
    "Cyrius": [
        "CYP2D6",
    ],
    "HLA-HD": [
        "HLA-A",
        "HLA-B",
    ],
    "manual": [
        "CACNA1S",
        "RYR1",
        "MT-RNR1",
        "ABCG2",
        # IFNL3 / IFNL4 entries tracked per rsID
        "IFNL3_rs8099917",
        "IFNL4_rs11322783",
    ],
}

# Genotyping tool -> directory holding that tool's per-gene summary CSVs.
# Keys mirror the keys of level_1AB_genes.
tool_summary_d: dict = dict(
    zip(
        ("aldy", "Cyrius", "HLA-HD", "manual"),
        (aldy_summary_dir, cyrius_summary_dir, hlahd_summary_dir, manual_summary_dir),
    )
)

# Read and extract sample info

In [11]:
# Load the sample sheet; the first CSV column becomes the DataFrame index.
# Later cells merge the per-gene tables onto this index.
sample_info_df = pd.read_csv(sample_info, index_col=[0])

In [12]:
# Sample-sheet columns that are not carried into the consolidated PGx table.
# NOTE(review): "mother ethnicity" contains a space while "father_ethnicity"
# uses an underscore - presumably this mirrors the CSV header; confirm.
drop_cols = [
    "sample_nature", "sex", "X_logratio", "Y_logratio",
    "birth_date", "father_ethnicity", "mother ethnicity",
    "inferred_ethnicity", "chineseOrNotPredicted"
]
# Rebind instead of mutating in place: `inplace=True` gives no performance
# benefit and hides which cell last changed the frame. A missing column still
# raises KeyError, so a typo in drop_cols is not silently ignored.
sample_info_df = sample_info_df.drop(columns=drop_cols)

# Read each per-gene summary df and merge it into master_perGene_df (a copy of sample_info_df)

In [13]:
def read_pgx_csv(csv, gene):
    """Load one per-gene summary CSV and return its call columns, gene-prefixed.

    Parameters
    ----------
    csv : path or file-like
        Passed straight to ``pd.read_csv``; the first column is used as the
        row index.
    gene : str
        Label prepended to every kept column name. ``"MT-RNR1"`` keeps only
        ``genotype`` and ``phenotype``; any other gene additionally keeps the
        two haplotypes and their function columns.

    Returns
    -------
    pd.DataFrame
        One row per distinct index value (the first occurrence wins), index
        named ``sre_patient_id``, columns renamed to ``{gene}_{column}``.

    Raises
    ------
    KeyError
        Raised by pandas when one of the expected columns is absent.
    """
    if gene != "MT-RNR1":
        wanted = [
            'genotype',
            'haplotype_1', 'haplotype_2',
            'haplotype_1_func', 'haplotype_2_func',
            'phenotype',
        ]
    else:
        wanted = ['genotype', 'phenotype']

    table = pd.read_csv(csv, index_col=[0])

    # Keep only the first row seen for each index value.
    first_seen = ~table.index.duplicated(keep='first')
    calls = table.loc[first_seen, wanted].rename_axis('sre_patient_id')

    return calls.rename(columns=lambda name: f"{gene}_{name}")

In [14]:
# Start from the trimmed sample sheet and left-join one block of columns per
# gene onto it, keyed on the participant index.
master_perGene_df = sample_info_df.copy()
for tool, genes in level_1AB_genes.items():
    summary_dir = tool_summary_d[tool]
    for gene in genes:
        # Per-gene summary written by the genotyping step for this tool.
        gene_csv = summary_dir / f"all_perGenes.{gene}.csv"
        gene_df = read_pgx_csv(gene_csv, gene)

        # Map each phenotype to its actionability category. NaN is the only
        # value that differs from itself, so `p != p` marks a missing
        # phenotype, which is recorded as the string "NA".
        gene_df[f"{gene}_actionability"] = gene_df[f'{gene}_phenotype'].apply(
            lambda p: "NA" if p != p else pheno_cat_d[gene][p]
        )

        # Left join keeps every participant of the sample sheet; participants
        # absent from this gene's CSV get NaN in all of its columns.
        master_perGene_df = pd.merge(
            master_perGene_df,
            gene_df,
            how='left',
            left_index=True,
            right_index=True,
        )

In [15]:
# Treat VKORC1 and IFNL3 on gene level
def _combined_actionability(frame, gene_prefix):
    """Collapse all ``<gene_prefix>*_actionability`` columns into one call per row.

    Parameters
    ----------
    frame : pd.DataFrame
        Table holding the per-entry ``*_actionability`` columns.
    gene_prefix : str
        Column-name prefix selecting the entries to combine (e.g. "VKORC1").

    Returns
    -------
    pd.Series
        Aligned to ``frame.index``: "Actionable" when any selected column
        equals "Actionable" in that row, otherwise "Normal".

    NOTE(review): rows whose selected values are all missing ("NA"/NaN) are
    also labelled "Normal" - confirm this is intended.
    NOTE(review): the "IFNL3" prefix does not match
    "IFNL4_rs11322783_actionability" - confirm that entry is meant to be
    excluded from the IFNL3 gene-level call.
    """
    combined_col = f"{gene_prefix}_1AB_combined_actionability"
    # Select the columns once (not once per row) and skip the output column
    # itself so that re-running the cell never feeds the result back in.
    member_cols = [
        col for col in frame.columns
        if col.startswith(gene_prefix)
        and col.endswith("_actionability")
        and col != combined_col
    ]
    any_actionable = frame[member_cols].eq("Actionable").any(axis=1)
    return any_actionable.map({True: "Actionable", False: "Normal"})


for gene_prefix in ("VKORC1", "IFNL3"):
    master_perGene_df[f"{gene_prefix}_1AB_combined_actionability"] = _combined_actionability(
        master_perGene_df, gene_prefix
    )

In [45]:
# Write the consolidated table. Create the output directory first so that a
# fresh run does not fail when the summary folder does not exist yet.
consolidate_perGene.parent.mkdir(parents=True, exist_ok=True)
master_perGene_df.to_csv(consolidate_perGene, index=True, index_label="sre_participant_id")