In [28]:
from pathlib import Path
import pandas as pd
import numpy as np
import multiprocessing
from collections import defaultdict
from datetime import datetime
from itertools import chain, islice
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import shutil
import re

# Apply seaborn's "whitegrid" style once, right after the imports.
sns.set_style(style="whitegrid")

In [58]:
# Base directories; every other path in this cell is derived from these two.
projectDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/recessive/01.known_variants")
referenceDir = Path("/home/_shared/jscliu/project/2025/Flagship/reference")

# Input: cohort membership, sample annotation and the per-sample result TSVs
cohort_list = referenceDir.joinpath("cohort_founder_list.2024-11-12.csv")
sample_info_csv = referenceDir.joinpath("sample_info_annot.2024-11-12.csv")
resultDir = projectDir.joinpath("results")

# Reference: gene panel BED, known-variant table and the VCF header template
recessive_genes_bed = referenceDir.joinpath("hkgi_recessive.ext200bp.bed")
known_var_tsv = projectDir.joinpath("data/recessive.known_variants.hgnc.tsv")
vcf_header = projectDir.joinpath("script/vcf_header.txt")

# Output: the summary directory is created here if it does not exist yet
summaryDir = projectDir.joinpath("summary")
summaryDir.mkdir(parents=True, exist_ok=True)
clinvar_plp_tsv = summaryDir.joinpath("recessive.known_variants.tsv")
clinvar_plp_vcf = summaryDir.joinpath("recessive.known_variants.vcf")

# Read and pre-process the ClinVar known-variants reference TSV

In [75]:
# Load the known-variant table; column names are supplied here, so the first line is read as data
clinvar_ref_df = pd.read_table(known_var_tsv, names=['CHROM', 'POS', 'REF', 'ALT', 'clinVar_ID', 'GENEINFO', 'CLNSIG', 'CLNREVSTAT'])

# Construct CPRA ("CHROM-POS-REF-ALT") and set as index.
# Column-wise string concatenation gives the same text as the row-wise f-string,
# avoids a Python call per row, and also works when the table has no rows.
clinvar_ref_df['CPRA'] = (
    clinvar_ref_df['CHROM'].astype(str)
    + "-" + clinvar_ref_df['POS'].astype(str)
    + "-" + clinvar_ref_df['REF'].astype(str)
    + "-" + clinvar_ref_df['ALT'].astype(str)
)
clinvar_ref_df = clinvar_ref_df.set_index("CPRA")

In [77]:
# Keep only the ClinVar annotation columns, in the order wanted after the merge
keep_cols = ['clinVar_ID', 'GENEINFO', 'CLNSIG', 'CLNREVSTAT']
clinvar_ref_df = clinvar_ref_df[keep_cols].copy()

# Subset result TSV for participants/samples belonging to this cohort only

In [59]:
# Two-column CSV read without a header row; only the first row per participant_id is kept
cohort_list_df = (
    pd.read_csv(cohort_list, names=['participant_id', 'sample_id'])
    .drop_duplicates(subset=['participant_id'])
)

In [None]:
# Expected result path for every cohort sample, keeping only the files that exist on disk
tsv_ls:list = [
    tsv
    for tsv in (
        resultDir/f"{sample}.smallvar.recessive.tsv"
        for sample in cohort_list_df['sample_id'].tolist()
    )
    if tsv.is_file()
]
print(f"Expected N TSV: {len(cohort_list_df):,}")
print(f"Found N TSV: {len(tsv_ls):,}")

In [65]:
def split_list(lst, chunk_size):
    """Split ``lst`` into consecutive chunks of at most ``chunk_size`` items.

    Returns a list of lists; the last chunk may be shorter, and an empty
    input gives an empty list.
    """
    remaining = iter(lst)
    chunks = []
    # One pass per chunk start offset; each pass drains the next chunk_size items
    for _ in range(0, len(lst), chunk_size):
        chunks.append(list(islice(remaining, chunk_size)))
    return chunks

# Process the result TSVs in batches of at most 2000 files
chunk_tsv_ls = split_list(lst=tsv_ls, chunk_size=2000)

# Read and concatenate result TSV as master pd.DataFrame

In [66]:
def read_result_tsv(tsv:Path):
    """Load one per-sample result TSV as a (CPRA, sample_id, genotype) table.

    Parameters
    ----------
    tsv : Path or str
        Header-less, tab-separated file with the columns chrom, pos, ref,
        alt and genotype. The sample ID is the part of the file name before
        the first ".".

    Returns
    -------
    pd.DataFrame
        Columns ``CPRA`` ("chrom-pos-ref-alt"), ``sample_id`` and
        ``genotype`` (kept as text). A file without any rows gives an empty
        frame with these same columns.
    """
    tsv = Path(tsv)
    tsv_header:list = ['chrom', 'pos', 'ref', 'alt', 'genotype']
    df_header:list = ['CPRA', 'sample_id', 'genotype']
    try:
        # Genotype is read as text: a file holding only numeric-looking calls
        # (e.g. "1") would otherwise be parsed as integers, which breaks the
        # string join used when the report is exported.
        df = pd.read_table(tsv, names=tsv_header, dtype={'genotype': str})
    except pd.errors.EmptyDataError:
        # Defensive: treat "nothing to parse" the same as "no variants"
        return pd.DataFrame(columns=df_header)

    # Column-wise concatenation produces the same text as a per-row f-string,
    # but unlike row-wise apply it also works on a frame with zero rows.
    df['CPRA'] = (
        df['chrom'].astype(str)
        + "-" + df['pos'].astype(str)
        + "-" + df['ref'].astype(str)
        + "-" + df['alt'].astype(str)
    )
    df['sample_id'] = tsv.name.split(".")[0]
    return df.loc[:, df_header].copy()

In [None]:
group_df_ls = list()
for working_tsv_ls in tqdm(chunk_tsv_ls):
    # Read this chunk's TSVs in parallel with up to 20 worker processes.
    # The context manager shuts the pool down even when a worker raises, so a
    # failed read does not leave worker processes behind; map() has already
    # collected every result by the time the block exits.
    with multiprocessing.Pool(processes=20) as pool:
        results = pool.map(read_result_tsv, working_tsv_ls)

    # Stack the per-sample tables of this chunk
    result_df = pd.concat(results)

    # Collapse to one row per CPRA; sample_id and genotype become parallel lists
    group_df = result_df.groupby('CPRA').agg(list)

    # Keep the per-chunk summary; the chunks are combined in a later cell
    group_df_ls.append(group_df)

In [68]:
# Stack the per-chunk summaries and group once more by CPRA to get the master df.
# After this step every cell holds a list of the per-chunk lists.
recessive_df = (
    pd.concat(group_df_ls)
    .groupby('CPRA')
    .agg(list)
)

In [69]:
# Flatten each cell by one level (list of per-chunk lists -> one flat list).
# Items that are not lists are kept as they are, so running this cell a second
# time leaves the data unchanged instead of splitting strings such as sample IDs
# into single characters.
for col in recessive_df.columns:
    recessive_df[col] = recessive_df[col].apply(
        lambda parts: list(chain.from_iterable(
            part if isinstance(part, list) else [part] for part in parts
        ))
    )

## Merge clinvar_ref_df to recessive_df

In [139]:
# Index-on-index (CPRA) join that keeps every observed variant; variants missing
# from the ClinVar reference table get NaN in the ClinVar columns.
recessive_clinvar_df = pd.merge(
    clinvar_ref_df,
    recessive_df,
    left_index=True,
    right_index=True,
    how='right',
)

In [140]:
# Keep only the variants that matched a ClinVar record, then store the
# ClinVar ID as int (it is only castable once the NaN rows are gone).
recessive_clinvar_df = (
    recessive_clinvar_df[recessive_clinvar_df['clinVar_ID'].notna()]
    .copy()
    .astype({'clinVar_ID': int})
)

# Extract panel gene names from the "GENEINFO" column (falling back to the BED location)

In [143]:
# Read the gene panel BED; lines starting with '#' are skipped and column names are supplied here
gene_panel_df = pd.read_table(recessive_genes_bed, comment='#', names=['chr', 'start', 'end', 'gene', 'score', 'strand'])

# A panel entry containing "aka" is split on ",aka": the text before it is kept as
# the panel gene name, and the text after it is recorded as a key that maps back
# to that name.
recessive_gene_dict = {}
recessive_gene_ls = []
for panel_entry in gene_panel_df['gene'].tolist():
    if "aka" in panel_entry:
        entry_parts = re.split(",aka", panel_entry)
        recessive_gene_dict[entry_parts[1]] = entry_parts[0]
        recessive_gene_ls.append(entry_parts[0])
    else:
        recessive_gene_ls.append(panel_entry)

In [146]:
def identify_panel_genes(cpra, gene_info, gene_panel_df, recessive_gene_ls, recessive_gene_dict):
    """Find the panel genes for one variant, by ClinVar GENEINFO and by location.

    Parameters
    ----------
    cpra : str
        Variant key in the form "chrom-pos-ref-alt".
    gene_info : str
        ClinVar GENEINFO text: "|"-separated entries whose gene name is the
        part before ":". A non-string value (e.g. NaN) is treated as having
        no gene names.
    gene_panel_df : pd.DataFrame
        Panel intervals with the columns 'start', 'end', 'gene' and a
        chromosome column named 'chr' (or '#chr').
    recessive_gene_ls : list
        Panel gene names.
    recessive_gene_dict : dict
        Alternative gene name -> panel gene name.

    Returns
    -------
    pd.Series
        Two items: the genes matched by name, then the genes matched by
        location. Each item is a non-empty list, or NaN when nothing matched.
    """
    chrom, pos, _ref, _alt = cpra.split('-')
    pos = int(pos)

    # The panel is read with names=['chr', ...]; '#chr' is still accepted in case
    # the frame was loaded with the BED's own header line.
    chrom_col = '#chr' if '#chr' in gene_panel_df.columns else 'chr'

    # Gene by locations in BED files
    # NOTE(review): both interval ends are treated as inclusive here; confirm this
    # is intended for 0-based BED starts.
    in_interval = (
        (gene_panel_df[chrom_col] == chrom)
        & (gene_panel_df['start'] <= pos)
        & (gene_panel_df['end'] >= pos)
    )
    gene_by_loc = gene_panel_df.loc[in_interval, 'gene'].tolist()
    gene_by_loc = gene_by_loc if gene_by_loc else np.nan

    # Gene by GENEINFO in clinvar VCF
    if isinstance(gene_info, str):
        info_genes = [entry.split(':')[0] for entry in gene_info.split('|')]
    else:
        info_genes = []
    gene_by_name:list = []
    for gene in info_genes:
        if gene in recessive_gene_ls:
            gene_by_name.append(gene)
        elif gene in recessive_gene_dict:
            # Alternative name: report the panel's own gene name instead
            gene_by_name.append(recessive_gene_dict[gene])
    gene_by_name = gene_by_name if gene_by_name else np.nan
    return pd.Series([gene_by_name, gene_by_loc])

def _panel_genes_for_row(row):
    """Run identify_panel_genes on one row; the row label is the variant's CPRA."""
    return identify_panel_genes(row.name, row.GENEINFO, gene_panel_df, recessive_gene_ls, recessive_gene_dict)

# The two items of each returned Series become the two new columns
recessive_clinvar_df[['recessive_genes_byinfo', 'recessive_genes_byloc']] = recessive_clinvar_df.apply(
    _panel_genes_for_row,
    axis=1
)

In [148]:
# Determine the "Actual" recessive genes
def _choose_recessive_genes(row):
    """Prefer the genes matched through GENEINFO; use the location match when that is NaN."""
    by_info = row['recessive_genes_byinfo']
    # NaN is the only value here that is not equal to itself
    if by_info != by_info:
        return row['recessive_genes_byloc']
    return by_info

recessive_clinvar_df['recessive_genes'] = recessive_clinvar_df.apply(_choose_recessive_genes, axis=1)

In [151]:
# Discard variants that could not be assigned to a panel gene by either method
recessive_clinvar_df = recessive_clinvar_df.loc[recessive_clinvar_df['recessive_genes'].notna(), :].copy()

# Subset for clinvar P/LP variants only through interpreting CLNSIG

In [None]:
# Map clinvar.vcf.gz_CLNSIG to clinical_relevance
def map_clinical_relevance(clnsig):
    """Translate a ClinVar CLNSIG string into a coarse clinical-relevance label.

    The terms are tried in order as case-sensitive substrings and the first hit
    decides, so e.g. "Pathogenic/Likely_pathogenic" is reported as
    "likely_pathogenic". Returns one of "likely_pathogenic", "pathogenic",
    "likely_benign", "benign", "vus" or "other". When no term matches, an
    error line is printed and None is returned.
    """
    ordered_terms = (
        # P/LP/B/LB by text match ("Likely_" variants must come first)
        ("Likely_pathogenic", "likely_pathogenic"),
        ("Pathogenic", "pathogenic"),
        ("Likely_benign", "likely_benign"),
        ("Benign", "benign"),
        # VUS
        ("Uncertain_significance", "vus"),
        ("Conflicting_classifications_of_pathogenicity", "vus"),
        ("Uncertain_risk_allele", "vus"),
        ("Likely_risk_allele", "vus"),
        # "others"
        ("not_provided", "other"),
        ("drug_response", "other"),
        ("no_classification_for_the_single_variant", "other"),
        ("other", "other"),
        ("risk_factor", "other"),
        ("association", "other"),
        ("Affects", "other"),
        ("protective", "other"),
    )
    for term, relevance in ordered_terms:
        if term in clnsig:
            return relevance

    print(f"[ERROR] Cannot interpret {clnsig}")
    return None
    
# Label every variant by translating its CLNSIG value element-wise
recessive_clinvar_df['clinical_relevance'] = recessive_clinvar_df['CLNSIG'].map(map_clinical_relevance)

In [157]:
# Subset for P/LP only
plp_labels = ['pathogenic', 'likely_pathogenic']
is_plp = recessive_clinvar_df['clinical_relevance'].isin(plp_labels)
recessive_clinvar_plp_df = recessive_clinvar_df[is_plp].copy()

# Subset for variants with at least one gold star (review status) by interpreting CLNREVSTAT

In [160]:
def classify_goldstar(clnrevstat:str)->int:
    """Return the number of ClinVar gold stars (0-4) for a CLNREVSTAT value.

    The status may use "," (as written in the ClinVar VCF) or "&" between its
    parts; both forms are accepted. Review-status terms from older and newer
    ClinVar releases are recognised alongside the original set.

    Raises
    ------
    KeyError
        If the review status is not a known term.
    """
    map_d = {
        'practice_guideline': 4,
        'reviewed_by_expert_panel': 3,
        'criteria_provided&_multiple_submitters&_no_conflicts': 2,
        'criteria_provided&_conflicting_classifications': 1,
        'criteria_provided&_single_submitter': 1,
        'no_assertion_criteria_provided': 0,
        'no_classification_provided': 0,
        'no_classification_for_the_single_variant': 0,
        'no_classifications_from_unflagged_records': 0,
        # Terms used by older ClinVar releases ("interpretation"/"assertion" wording)
        'criteria_provided&_conflicting_interpretations': 1,
        'no_interpretation_for_the_single_variant': 0,
        'no_assertion_provided': 0,
    }
    # Normalise the separator so the raw VCF form can be looked up directly
    return map_d[clnrevstat.replace(",", "&")]
# Swap the commas inside each review status for "&", then score it in gold stars
recessive_clinvar_plp_df['CLNREVSTAT'] = [
    status.replace(",", "&") for status in recessive_clinvar_plp_df['CLNREVSTAT']
]
recessive_clinvar_plp_df['goldstars_n'] = recessive_clinvar_plp_df['CLNREVSTAT'].map(classify_goldstar)

In [161]:
# Keep only the variants whose review status earns at least one gold star
has_goldstar = recessive_clinvar_plp_df['goldstars_n'].gt(0)
recessive_clinvar_plp_filtered_df = recessive_clinvar_plp_df[has_goldstar].copy()

# Filter columns

In [166]:
# Expose the CPRA index as a regular column so it can be selected and exported
recessive_clinvar_plp_filtered_df = recessive_clinvar_plp_filtered_df.assign(
    **{'Chr-Pos-Ref-Alt': recessive_clinvar_plp_filtered_df.index}
)

In [167]:
# Report columns, in output order: variant identity, classification, then carriers
filtered_col = [
    'Chr-Pos-Ref-Alt',
    'clinVar_ID',
    'recessive_genes',
    'CLNSIG',
    'clinical_relevance',
    'goldstars_n',
    'sample_id',
    'genotype',
]
recessive_clinvar_plp_filtered_df = recessive_clinvar_plp_filtered_df[filtered_col].copy()

# Export report

In [169]:
# Export recessive_clinvar_plp_filtered_df to TSV
export_df = recessive_clinvar_plp_filtered_df.copy()
listed_cols:list = ["recessive_genes", "sample_id", "genotype"]
for col in listed_cols:
    # List-valued cells are written as one "."-joined string. Items are converted
    # to str first so that a non-string item (e.g. a genotype parsed as a number)
    # does not raise TypeError.
    export_df[col] = export_df[col].apply(lambda values: ".".join(map(str, values)))
export_df.to_csv(clinvar_plp_tsv, index=False, sep='\t')

# Export the CPRA as VCF
export_vcf_df = export_df.rename(columns={"Chr-Pos-Ref-Alt": "CPRA", "clinVar_ID": "ID"})
vcf_cols:list = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]

# Split "chrom-pos-ref-alt" into its four fields in one vectorised call
cpra_parts = export_vcf_df['CPRA'].str.split('-', expand=True)
if not cpra_parts.empty and cpra_parts.shape[1] != 4:
    raise ValueError("Every CPRA must have exactly four '-'-separated fields")
# With no variants to export the split yields no columns; make the four fields explicit
cpra_parts = cpra_parts.reindex(columns=range(4))
for position, vcf_col in enumerate(["#CHROM", "POS", "REF", "ALT"]):
    export_vcf_df[vcf_col] = cpra_parts[position]

# QUAL, FILTER and INFO carry no information here
for col in vcf_cols[-3:]:
    export_vcf_df[col] = "."
export_vcf_df = export_vcf_df.loc[:, vcf_cols].copy()

# Start the output from the header template, then append the column line and the records.
# NOTE(review): this assumes vcf_header ends with a newline and does not already
# contain the "#CHROM" line - confirm against the template file.
shutil.copy(vcf_header, clinvar_plp_vcf)
export_vcf_df.to_csv(clinvar_plp_vcf, index=False, sep='\t', mode='a')