In [11]:
from pathlib import Path
import pandas as pd
import numpy as np
import sys
import multiprocessing
from tqdm import tqdm

In [12]:
# Input: directory that is searched recursively for per-sample *_final.result.txt files.
resultDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01c.genotype.HLA-HD/results")
# Output: directory that receives the consolidated per-gene summary tables.
summaryDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/PGx/01c.genotype.HLA-HD/summary")
# parents=True: also create missing parent directories instead of raising
# FileNotFoundError; exist_ok=True keeps re-running this cell harmless.
summaryDir.mkdir(parents=True, exist_ok=True)

In [18]:
def extract_allele(result_tab, gene):
    """Return the two haplotype calls and the diplotype of one gene.

    The tab-separated file ``result_tab`` is scanned line by line. The first
    line whose first field equals ``gene`` is used; its second and third
    fields are taken as the two haplotype calls.

    Parameters
    ----------
    result_tab : str or path-like
        Path to the tab-separated result file to read.
    gene : str
        Gene label to look up in the first column (e.g. ``"A"``).

    Returns
    -------
    tuple
        ``(haplotype_1, haplotype_2, diplotype)``.

        * If either call is the literal ``"Not typed"``, that call and the
          diplotype are ``np.nan``; a call that is not ``"Not typed"`` is
          returned unchanged.
        * Otherwise a second call of ``"-"`` is replaced by the first call
          (presumably the file's notation for a homozygous call; confirm
          against the file format) and the diplotype is
          ``"<haplotype_1>/<haplotype_2>"``.

    Raises
    ------
    ValueError
        If the line for ``gene`` has fewer than three tab-separated fields,
        or if the file contains no line for ``gene``.
    """
    not_typed = "Not typed"

    with open(result_tab, 'r') as f:
        for line in f:
            call_ls = line.strip().split('\t')
            if call_ls[0] != gene:
                continue

            # Fail with a message naming the file and gene instead of an
            # opaque tuple-unpacking error on a truncated line.
            if len(call_ls) < 3:
                raise ValueError(
                    f"Malformed line for gene {gene!r} in {result_tab}: "
                    f"expected at least 3 tab-separated fields, got {len(call_ls)}"
                )

            haplotype_1, haplotype_2 = call_ls[1:3]
            if (haplotype_1 == not_typed) or (haplotype_2 == not_typed):
                # No diplotype can be formed when a call is missing.
                diplotype = np.nan
                haplotype_1 = np.nan if haplotype_1 == not_typed else haplotype_1
                haplotype_2 = np.nan if haplotype_2 == not_typed else haplotype_2
            else:
                # "-" in the second column stands for "same as the first call".
                haplotype_2 = haplotype_1 if haplotype_2 == "-" else haplotype_2
                diplotype = f'{haplotype_1}/{haplotype_2}'
            return haplotype_1, haplotype_2, diplotype

    # Raise a descriptive, catchable error rather than calling sys.exit(1),
    # which in a notebook surfaces only as a bare "SystemExit: 1".
    raise ValueError(f"Gene {gene!r} not found in {result_tab}")

In [19]:
# Consolidate every *_final.result.txt under resultDir into one table per gene
# (HLA-A and HLA-B), with one row per result file indexed by sample ID.
allele_columns = ["haplotype_1", "haplotype_2", "diplotype"]

# Sorted so the row order of the resulting tables is reproducible across runs;
# rglob alone yields files in filesystem-dependent order.
final_result_ls:list = sorted(resultDir.rglob("*_final.result.txt"))

sample_ids = []
hla_a_records = []
hla_b_records = []
for result_tab in tqdm(final_result_ls):
    # The sample ID is the name of the result file's grandparent directory.
    sample_id = result_tab.parent.parent.name
    sample_ids.append(sample_id)

    # Each record is a (haplotype_1, haplotype_2, diplotype) tuple.
    hla_a_records.append(extract_allele(result_tab, "A"))  # HLA-A
    hla_b_records.append(extract_allele(result_tab, "B"))  # HLA-B

# Build each frame once from the collected records instead of calling
# pd.concat per file, which re-copies the growing frame on every iteration.
# With no input files this yields empty frames that still carry the expected
# columns, rather than leaving the variables as None.
hla_a_df = pd.DataFrame(hla_a_records, columns=allele_columns, index=sample_ids)
hla_b_df = pd.DataFrame(hla_b_records, columns=allele_columns, index=sample_ids)

100%|████████████████████████████████████| 24122/24122 [00:28<00:00, 841.72it/s]


# Export hla_a_df and hla_b_df to TSV

In [21]:
# Destination files for the per-gene summary tables, both inside summaryDir.
hla_a_tsv = summaryDir / 'HLA-A.results_all.tsv'
hla_b_tsv = summaryDir / 'HLA-B.results_all.tsv'

# Write each table tab-separated, keeping the index as a "sample_id" column.
for gene_df, gene_tsv in ((hla_a_df, hla_a_tsv), (hla_b_df, hla_b_tsv)):
    gene_df.to_csv(gene_tsv, sep='\t', index=True, index_label="sample_id")