In [4]:
from pathlib import Path
import glob
import pandas as pd
import numpy as np
import multiprocessing
from itertools import chain, islice
from tqdm import tqdm
import shutil

In [7]:
# Directory layout. resultDir points at the results of the sibling
# "01.known_small_variants" step; this notebook's outputs go to summaryDir.
projectDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/dominant/02.novel_small_variants")
referenceDir = Path("/home/_shared/jscliu/project/2025/Flagship/reference")
resultDir = projectDir/"../01.known_small_variants/results"
summaryDir = projectDir/"summary"

# Reference
# af_bed: table whose 'name' column supplies the list of genes of interest
af_bed = referenceDir/"acmg_sf3_2.ext200bp.bed"
# cohort_founder_list: header-less CSV read as (participant_id, sample_id)
cohort_founder_list = referenceDir/"cohort_founder_list.2024-11-12.csv"
# sample_info_csv: not read anywhere in the cells shown here
sample_info_csv = referenceDir/"sample_info_annot.2024-11-12.csv"
# vcf_headers: presumably the header text to be placed at the top of the output VCF
vcf_headers = referenceDir/"hkgi_dominant.vcf_headers.txt"

# Output
out_tsv = summaryDir/"dominant.novel_variants.tsv"
out_vcf = summaryDir/"dominant.novel_variants.vcf"

In [9]:
# The founder list has no header row, so the column names are supplied explicitly
founder_columns = ['participant_id', 'sample_id']
cohort_founder_df = pd.read_csv(cohort_founder_list, header=None, names=founder_columns)

In [10]:
# One result TSV per founder participant, named "<participant_id>.split.rare.vep.additionalFinding.tsv"
result_tsv_suffix = ".split.rare.vep.additionalFinding.tsv"
tsv_ls:list = [
    resultDir / f"{pid}{result_tsv_suffix}"
    for pid in cohort_founder_df['participant_id']
]

In [5]:
def split_list(lst, chunk_size):
    """Split `lst` into consecutive chunks of at most `chunk_size` items.

    Returns a list of lists; the last chunk holds the remainder and may be
    shorter. An empty input gives an empty list.
    """
    n_chunks = len(range(0, len(lst), chunk_size))
    remaining = iter(lst)
    chunks = []
    for _ in range(n_chunks):
        # Each islice call consumes the next `chunk_size` items of the shared iterator
        chunks.append(list(islice(remaining, chunk_size)))
    return chunks

# Process the result TSVs in batches of at most 200 files
chunk_tsv_ls = split_list(tsv_ls, 200)

# Read reference af_bed

In [6]:
# Tab-separated read with the first line taken as the header; the 'name' column is used below
af_gene_df = pd.read_table(af_bed)

In [7]:
# Gene list used to restrict variants (matched against the 'Gene' column of the result TSVs)
af_genes:list = af_gene_df['name'].tolist()

# Read and concatenate result TSV as master pd.DataFrame

In [17]:
def read_result_tsv(tsv:Path, af_genes:list):
    """Load one participant's result TSV and keep its novel variants in `af_genes`.

    Parameters
    ----------
    tsv : Path or str
        Result TSV. The part of the file name before the first "." is used as
        the participant ID. The file must provide the columns CHROM, POS, REF,
        ALT, Gene and clinvar.vcf.gz.
    af_genes : list
        Gene names to keep (matched against the 'Gene' column).

    Returns
    -------
    pd.DataFrame
        Rows whose 'clinvar.vcf.gz' value is missing ("." in the file) and whose
        'Gene' is in `af_genes`, with two added columns: 'CPRA'
        ("CHROM-POS-REF-ALT") and 'participant_id'. Original row labels are kept.
    """
    tsv = Path(tsv)
    df = pd.read_table(tsv, na_values=".", low_memory=False)

    # Build the variant key column-wise rather than with a per-row apply: it is
    # much faster on large TSVs and does not depend on how DataFrame.apply
    # treats a frame with zero rows.
    df['CPRA'] = (
        df['CHROM'].astype(str) + "-" + df['POS'].astype(str)
        + "-" + df['REF'].astype(str) + "-" + df['ALT'].astype(str)
    )
    df['participant_id'] = tsv.name.split(".")[0]

    # "Novel" here means no ClinVar annotation on the row
    is_novel = df['clinvar.vcf.gz'].isna()
    in_af_gene = df['Gene'].isin(af_genes)
    return df.loc[is_novel & in_af_gene, :].copy()

In [19]:
# Read the result TSVs batch by batch and collapse each batch to one row per variant,
# so that only one batch of per-participant frames is held in memory at a time.
group_df_ls = list()

# A single pool of up to 40 worker processes serves all batches. The `with` block
# shuts the workers down on exit, including when reading a TSV raises.
with multiprocessing.Pool(processes=40) as pool:
    for working_tsv_ls in tqdm(chunk_tsv_ls):
        # starmap blocks until every TSV of the batch has been read;
        # each call receives (tsv, af_genes)
        results = pool.starmap(read_result_tsv, zip(working_tsv_ls, [af_genes]*len(working_tsv_ls)))

        # Concatenate the batch and gather, per CPRA, every column's values into a list
        result_df = pd.concat(results)
        group_df = result_df.groupby('CPRA').agg(list)

        # Append to group_df_ls
        group_df_ls.append(group_df)

In [20]:
# Concatenate the grouped df, final groupby to get a master_df.
# Every cell of a per-batch frame is already a list (from the per-batch agg(list)), so this
# second agg(list) produces a list of lists: one inner list per batch containing the variant.
af_df = pd.concat(group_df_ls).groupby('CPRA').agg(list)

In [22]:
# The ClinVar columns are not carried into the summary
clinvar_cols = ["clinvar.vcf.gz", "clinvar.vcf.gz_CLNSIG", "clinvar.vcf.gz_CLNREVSTAT"]
af_df = af_df.drop(columns=clinvar_cols)

In [24]:
# Collapse the nested list of variant-specific columns
# variant_cols: one value per variant, reduced below to the first stored value
variant_cols:list = [
    "CHROM", "POS", "REF", "ALT", "Gene", 
    "HGVSc", "HGVSp", "Consequence", "REVEL", "SpliceAI"
]
# individual_cols: one value per carrier, flattened below into a single list per variant
individual_cols:list = ["participant_id", "GT", "AD"]

In [25]:
# Each cell is a list of lists; keep the first element of the first inner list
for variant_col in variant_cols:
    af_df[variant_col] = af_df[variant_col].map(lambda nested: nested[0][0])

In [26]:
# Each cell is a list of lists; flatten it into one list holding every inner value in order
for individual_col in individual_cols:
    af_df[individual_col] = af_df[individual_col].map(
        lambda nested: list(chain.from_iterable(nested))
    )

In [27]:
# Keep only variants whose (collapsed) Gene annotation is in the sf v3.2 gene list,
# reporting the variant count on either side of the filter
n_before = len(af_df)
print(f"No. of variants before filtering: {n_before}")
in_sf_gene = af_df['Gene'].isin(af_genes)
af_df = af_df[in_sf_gene].copy()
n_after = len(af_df)
print(f"No. of variants after filtering: {n_after}")

# Filter variant by consequence

In [32]:
# Turn the '&'-joined consequence string into a list of individual terms
af_df['Consequence'] = af_df['Consequence'].map(lambda joined: joined.split('&'))

In [35]:
# Consequence terms grouped by how keep_variant treats them. They are built once
# here rather than on every call, because keep_variant runs once per variant.
_CONSEQUENCES_TO_KEEP = frozenset({
    'coding_sequence_variant', 'protein_altering_variant',
    'inframe_insertion', 'inframe_deletion', 'missense_variant', 'frameshift_variant',
    'start_lost', 'start_retained_variant', 'stop_gained', 'stop_lost', 'stop_retained_variant',
    'splice_acceptor_variant', 'splice_polypyrimidine_tract_variant', 'splice_region_variant',
    'splice_donor_5th_base_variant', 'splice_donor_region_variant', 'splice_donor_variant'
})
_CONSEQUENCES_TO_DISCARD = frozenset({
    'intergenic_variant', 'non_coding_transcript_exon_variant', 'non_coding_transcript_variant'
})
_CONSEQUENCES_NONCODING_KEEP_CONDITIONALLY = frozenset({
    '5_prime_UTR_variant', '3_prime_UTR_variant',
    'upstream_gene_variant', 'downstream_gene_variant'
})
_CONSEQUENCES_OTHER_KEEP_CONDITIONALLY = frozenset({
    'intron_variant', 'synonymous_variant'
})


def keep_variant(ref:str, alt:str, consequences:list, spliceai:str):
    """Determine whether to keep this variant or not.

    A variant is kept when at least one of its consequence terms qualifies:

    * always-kept terms (coding, protein-altering, splice-site ...) qualify;
    * always-discarded terms (intergenic, non-coding transcript) never qualify;
    * UTR / upstream / downstream terms qualify only with a SpliceAI annotation;
    * intron / synonymous terms qualify only for a SNP with a SpliceAI annotation.

    Parameters
    ----------
    ref, alt : str
        Reference and alternate alleles; the variant is a SNP when both have length 1.
    consequences : list
        Consequence terms of the variant.
    spliceai
        SpliceAI annotation; a missing value (as judged by ``pd.isna``) means "no annotation".

    Returns
    -------
    bool
        True if any term qualifies, False otherwise (including for an empty list).

    Raises
    ------
    ValueError
        If a term belongs to none of the four groups. Every term is checked, so an
        unknown term is reported even when another term already qualifies.
    """
    is_snp = len(ref) == len(alt) == 1

    to_keep:list = list()
    for c in consequences:
        if c in _CONSEQUENCES_TO_DISCARD:
            keep = False
        elif c in _CONSEQUENCES_TO_KEEP:
            keep = True
        elif c in _CONSEQUENCES_NONCODING_KEEP_CONDITIONALLY:
            # Keep only if there is a SpliceAI annotation
            keep = not pd.isna(spliceai)
        elif c in _CONSEQUENCES_OTHER_KEEP_CONDITIONALLY:
            # Keep only if it is a SNP and there is a SpliceAI annotation
            keep = is_snp and not pd.isna(spliceai)
        else:
            # Raise instead of calling sys.exit: `sys` is not imported in this
            # notebook, and an exception names the offending term in the traceback
            # without trying to terminate the kernel.
            raise ValueError(f"Undefined consequence: {c}")
        to_keep.append(keep)

    return any(to_keep)

In [36]:
# Flag every variant with the consequence filter, then keep the flagged rows
def _passes_consequence_filter(row):
    """Evaluate keep_variant on one row of af_df."""
    return keep_variant(row.REF, row.ALT, row.Consequence, row.SpliceAI)

af_df['keep_variant'] = af_df.apply(_passes_consequence_filter, axis=1)
af_filtered_df = af_df[af_df['keep_variant']].copy()

# Label the prevalence of each variant

In [41]:
# Count het_n, hom_n, hem_n from the per-carrier genotype strings
_HET_GTS = ('0/1', '0|1', '1/0', '1|0')
_HOM_GTS = ('1/1', '1|1')

def _count_gts(genotypes, labels):
    """Number of entries of `genotypes` equal to any of `labels`."""
    return sum(genotypes.count(label) for label in labels)

af_filtered_df['het_n'] = af_filtered_df['GT'].map(lambda gts: _count_gts(gts, _HET_GTS))
af_filtered_df['hom_n'] = af_filtered_df['GT'].map(lambda gts: _count_gts(gts, _HOM_GTS))
# A bare '1' genotype is counted as hemizygous
af_filtered_df['hem_n'] = af_filtered_df['GT'].map(lambda gts: gts.count('1'))

# NOTE(review): 'AN' holds the number of alternate alleles observed (het + 2*hom + hem),
# and 'AF' divides it by the number of result TSVs, i.e. participants rather than
# chromosomes. Confirm that this is the intended definition of the frequency.
af_filtered_df['AN'] = af_filtered_df.apply(
    lambda row: row.het_n + row.hom_n*2 + row.hem_n, axis=1
)
n_result_tsvs = len(tsv_ls)
af_filtered_df['AF'] = af_filtered_df['AN'] / n_result_tsvs
# Number of carriers of the variant
af_filtered_df['N_All'] = af_filtered_df['participant_id'].map(len)

In [42]:
# Classify the prevalence of each variant by the number of carriers and the AF
def classify_prevelence(participant_id:list, AF:float)->str:
    """Return 'private', 'rare' or 'common' for one variant.

    'private' when exactly one participant carries the variant; otherwise
    'rare' when AF <= 0.01 and 'common' in every other case.
    """
    if len(participant_id) == 1:
        return 'private'
    return 'rare' if AF <= 0.01 else 'common'
def _prevalence_of_row(row):
    """Classify one variant row from its carrier list and AF."""
    return classify_prevelence(row.participant_id, row.AF)

af_filtered_df['prevalence'] = af_filtered_df.apply(_prevalence_of_row, axis=1)

# Classify the consequences

In [48]:
# Consequence class -> consequence terms belonging to it
consequence_classes:dict = {
    'coding': [
        'coding_sequence_variant', 'protein_altering_variant',
        'start_retained_variant', 'stop_retained_variant', 'synonymous_variant',
    ],
    'missense': ['missense_variant'],
    'frameshift': ['frameshift_variant', 'start_lost', 'stop_gained', 'stop_lost'],
    'inframe': ['inframe_deletion', 'inframe_insertion'],
    'noncoding': [
        'upstream_gene_variant', 'downstream_gene_variant',
        '3_prime_UTR_variant', '5_prime_UTR_variant',
        'non_coding_transcript_exon_variant', 'non_coding_transcript_variant',
    ],
    'splicing': [
        'splice_donor_5th_base_variant', 'splice_donor_region_variant', 'splice_donor_variant',
        'splice_acceptor_variant', 'splice_polypyrimidine_tract_variant', 'splice_region_variant',
    ],
    'intronic': ['intron_variant'],
}

# Inverted lookup: consequence term -> its class
consequence_map_d:dict = {
    term: class_name
    for class_name, terms in consequence_classes.items()
    for term in terms
}

In [49]:
# Translate each variant's list of consequence terms into the matching list of classes
def _classes_of(terms):
    """Look up the class of every consequence term, preserving order."""
    return [consequence_map_d[term] for term in terms]

af_filtered_df['consequence_class'] = af_filtered_df['Consequence'].map(_classes_of)

# Reformat and export af_filtered_df

In [68]:
# Work on a copy so that the reformatting below leaves af_filtered_df untouched
export_df = af_filtered_df.copy()

In [70]:
# Serialise the list-valued columns: the items of each list are joined with "."
listed_cols:list = [
    'participant_id', 'GT', 'AD',
    'Consequence', 'consequence_class',
]
for listed_col in listed_cols:
    export_df[listed_col] = export_df[listed_col].map(".".join)

In [71]:
# Select the exported columns in their final order: variant description first,
# then the derived counts, then the per-carrier details
keep_cols:list = [
    'CHROM', 'POS', 'REF', 'ALT',
    'Gene', 'HGVSc', 'HGVSp', 'REVEL', 'SpliceAI',
    'Consequence', 'consequence_class',
    'het_n', 'hom_n', 'hem_n', 'AN', 'AF', 'N_All', 'prevalence',
    'participant_id', 'GT', 'AD',
]
export_df = export_df[keep_cols].copy()

# Export to TSV

In [72]:
# Tab-separated export; the index is written as the first column under the header 'CPRA'
export_df.to_csv(out_tsv, index=True, index_label='CPRA', sep='\t')

# Export to VCF

In [73]:
# New frame for the VCF export, with every missing value replaced by "."
export_csv_df = export_df.fillna(".")

In [75]:
# Generate the missing fields: count columns renamed to their INFO keys,
# a NOTE combining carriers, genotypes and allele depths, and AF rounded to 5 decimals
info_key_by_count_col = {
    'het_n': 'N_Het',
    'hom_n': 'N_Hom',
    'hem_n': 'N_Hem',
}
export_csv_df = export_csv_df.rename(columns=info_key_by_count_col)
export_csv_df['NOTE'] = export_csv_df.apply(
    lambda row: f"{row.participant_id}-{row.GT}-{row.AD}", axis=1
)
export_csv_df['AF'] = export_csv_df['AF'].map(lambda freq: float(format(freq, ".5f")))

In [76]:
# Prepare the VCF columns
vcf_cols:list = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "FIRST_SAMPLE"]
export_csv_df['ID'] = export_csv_df.index
export_csv_df['QUAL'] = "."
export_csv_df['FILTER'] = export_csv_df['prevalence'].map(
    lambda label: "PRIVATE" if label == 'private' else "NON_PRIVATE"
)

## INFO column: "Gene=<value>" followed by one ";key=value" pair per entry of info_cols
info_cols:list = [
    'HGVSc', 'HGVSp', 'REVEL', 'SpliceAI', 'Consequence', 'consequence_class',
    'N_Het', 'N_Hom', 'N_Hem', 'N_All', 'AF', 'prevalence', 'NOTE'
]
info_keys = ['Gene'] + info_cols
export_csv_df['INFO'] = export_csv_df.apply(
    lambda row: ";".join(f"{key}={row[key]}" for key in info_keys), axis=1
)

## Sample column: only the first "."-separated GT and AD entry of each variant is written
export_csv_df['FORMAT'] = "GT:AD"

def _first_sample_field(row):
    """Return "<GT>:<AD>" built from the first entry of the joined GT and AD strings."""
    first_gt = row['GT'].split('.')[0]
    first_ad = row['AD'].split('.')[0]
    return f"{first_gt}:{first_ad}"

export_csv_df['FIRST_SAMPLE'] = export_csv_df.apply(_first_sample_field, axis=1)

In [86]:
# Filter columns for VCF: rename CHROM to the VCF spelling and keep the columns in vcf_cols order
export_csv_df.rename(columns={'CHROM': '#CHROM'}, inplace=True)
export_csv_df = export_csv_df.loc[:, vcf_cols].copy()

# Write header to VCF: start the output file as a copy of the header file.
# The path variable is `vcf_headers` (plural); the former `vcf_header` was undefined.
shutil.copy(vcf_headers, out_vcf)

# Export to VCF: append the column-name line and the records below the copied header
export_csv_df.to_csv(out_vcf, index=False, sep='\t', mode='a')