In [38]:
from pathlib import Path
import pandas as pd
import numpy as np
import multiprocessing
from tqdm import tqdm
from datetime import datetime
from itertools import chain, islice
from collections import defaultdict
import shutil
import re
import sys

In [39]:
# Root directories: every input and output path below is derived from these two
referenceDir = Path("/home/_shared/jscliu/project/2025/Flagship/reference")
projectDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/recessive/02.novel_small_variants")
resultsDir = projectDir.joinpath("results")
summaryDir = projectDir.joinpath("summary")

# Reference inputs
cohort_csv = referenceDir.joinpath("sample_info_annot.2024-11-12.csv")
cohort_founder_csv = referenceDir.joinpath("cohort_founder_list.2024-11-12.csv")
recessive_bed = referenceDir.joinpath("hkgi_recessive.ext200bp.bed")
vcf_header = projectDir.joinpath("script", "vcf_headers.txt")

# Outputs written by this notebook
output_tsv = summaryDir.joinpath("recessive.novel.allResults.tsv")
output_vcf = summaryDir.joinpath("recessive.novel.allResults.vcf")

# Read and consolidate all TSV belonging to this cohort

In [3]:
# Load the annotated sample sheet, one row per sre_participant_id,
# then keep only the rows whose treated_ethnicity is "Chinese"
cohort_all_df = pd.read_csv(cohort_csv)
cohort_all_df = cohort_all_df.drop_duplicates(subset=['sre_participant_id'])
cohort_all_df = cohort_all_df.loc[cohort_all_df['treated_ethnicity'].eq("Chinese"), :].copy()

In [4]:
# Founder list: header-less two-column CSV; keep the first row seen for each participant_id
cohort_df = pd.read_csv(
    cohort_founder_csv,
    header=None,
    names=['participant_id', 'sample_id'],
)
cohort_df = cohort_df.drop_duplicates(subset=['participant_id'])

In [7]:
# One expected result TSV per participant in the founder list
tsv_ls:list = [
    resultsDir.joinpath(f"{participant}.split.rare.vep.recessive.tsv")
    for participant in cohort_df['participant_id'].tolist()
]
print(f"{len(tsv_ls) = }")

In [8]:
def split_list(lst, chunk_size):
    """Split ``lst`` into consecutive chunks of at most ``chunk_size`` items.

    Returns a list of lists; the final chunk holds the remainder and may be shorter.
    An empty input gives an empty list.
    """
    remaining = iter(lst)
    chunks = []
    # One pass per chunk boundary; each pass consumes the next chunk_size items
    for _ in range(0, len(lst), chunk_size):
        chunks.append(list(islice(remaining, chunk_size)))
    return chunks

# Process the TSVs in batches of 200 files so each batch can be consolidated before the next is read
chunk_tsv_ls = split_list(lst=tsv_ls, chunk_size=200)

In [9]:
def read_recessive_tsv(tsv:Path):
    """Read one participant's recessive-variant TSV and return its variants without a ClinVar entry.

    Parameters
    ----------
    tsv : Path
        Tab-separated file whose name starts with the participant id followed by a ".".
        It must provide the columns CHROM, POS, REF, ALT, 'clinvar.vcf.gz' and every
        column listed in ``keep_col`` below. "." is read as a missing value.

    Returns
    -------
    pd.DataFrame
        Rows whose 'clinvar.vcf.gz' value is missing, restricted to ``keep_col`` in that order.
        A file with a header but no rows yields an empty frame with the same columns.
    """
    df = pd.read_table(tsv, na_values=["."])

    # Build the CHROM-POS-REF-ALT key with vectorised string concatenation.
    # A row-wise apply() returns a DataFrame (not a Series) on an empty table,
    # which makes the column assignment fail for participants with no variants.
    df['CPRA'] = (
        df['CHROM'].astype(str) + "-" + df['POS'].astype(str) + "-"
        + df['REF'].astype(str) + "-" + df['ALT'].astype(str)
    )
    # File names look like "<participant_id>.split.rare.vep.recessive.tsv"
    df['participant_id'] = tsv.name.split(".")[0]

    # Drop known variants (i.e. with clinvar entries)
    # Rearrange columns, discard unneeded cols at the same time
    keep_col:list = [
        'CPRA', 'Gene', 'HGVSc', 'HGVSp',
        'Consequence', 'REVEL', 'SpliceAI',
        'participant_id', 'GT', 'AD'
    ]
    return df.loc[df['clinvar.vcf.gz'].isna(), keep_col].copy()

In [10]:
consol_df_ls:list = list()
# A single pool of at most 10 workers serves every chunk. The context manager
# shuts the workers down even when a read raises (e.g. a missing TSV), so no
# worker processes are left behind in the notebook kernel.
with multiprocessing.Pool(processes=10) as pool:
    for sub_tsv_ls in tqdm(chunk_tsv_ls):
        # Read the TSVs of this chunk in parallel; map() blocks until all are done
        # and returns the frames in the same order as sub_tsv_ls
        results = pool.map(read_recessive_tsv, sub_tsv_ls)

        # Consolidate chunked tsv dfs: one row per CPRA, every column gathered into a list
        concat_df = pd.concat(results)
        consol_df = concat_df.groupby('CPRA').agg(list)
        # All columns except the last three (participant_id, GT, AD) are reduced
        # to the first value observed for that variant
        for c in consol_df.columns.tolist()[:-3]:
            consol_df[c] = consol_df[c].apply(lambda x: x[0])

        consol_df_ls.append(consol_df)    # Append to consol_df_ls

In [16]:
# Merge the per-chunk tables: one row per CPRA, each cell a list with one entry per chunk
master_df = (
    pd.concat(consol_df_ls)
    .groupby('CPRA')
    .agg(list)
)

# Variant-specific columns (everything but the last three): keep the first chunk's value
for variant_col in master_df.columns[:-3]:
    master_df[variant_col] = master_df[variant_col].map(lambda per_chunk: per_chunk[0])

# Sample-specific columns
def flatten(nested_list):
    """Flatten one level of nesting: list items are spliced in, anything else is kept as a single element."""
    flat = []
    for item in nested_list:
        if isinstance(item, list):
            flat.extend(item)
        else:
            flat.append(item)
    return flat
# The last three columns hold one list per chunk; merge them into a single flat list per variant
for sample_col in master_df.columns[-3:]:
    master_df[sample_col] = master_df[sample_col].map(flatten)

# Filter variants by Consequence

In [17]:
# Split the '&'-joined consequence string into a list of terms.
# Values that are already lists are left as they are, so re-running this cell
# does not fail on the output of a previous run.
master_df['Consequence'] = master_df['Consequence'].apply(
    lambda x: x if isinstance(x, list) else x.split('&')
)

In [19]:
# To validate: collect every distinct consequence term present in this dataset
consequence_set:set = {
    term
    for terms in master_df['Consequence'].tolist()
    for term in terms
}
consequence_set

In [21]:
def keep_variant(cpra:str, consequences:list, spliceai:str):
    """Decide whether a variant is retained, based on its consequence terms.

    Parameters
    ----------
    cpra : str
        Variant key whose last two '-'-separated fields are REF and ALT.
    consequences : list
        Consequence terms of the variant.
    spliceai : str
        SpliceAI annotation; a missing value (NaN/None) means "no annotation".

    Returns
    -------
    bool
        True if at least one consequence qualifies the variant, False otherwise
        (including when ``consequences`` is empty).

    Raises
    ------
    ValueError
        If a consequence term is not covered by any of the four categories below.
        Every term is checked, even after one has already qualified the variant.
        An exception is used rather than ``sys.exit`` so that the notebook shows
        a normal traceback pointing at the offending term.
    """
    ref, alt = cpra.split('-')[-2:]
    is_snp:bool = len(ref) == len(alt) == 1

    # Always retained
    consequences_to_keep:set = {
        'coding_sequence_variant', 'protein_altering_variant',
        'inframe_insertion', 'inframe_deletion', 'missense_variant', 'frameshift_variant',
        'start_lost', 'start_retained_variant', 'stop_gained', 'stop_lost', 'stop_retained_variant',
        'splice_acceptor_variant', 'splice_polypyrimidine_tract_variant', 'splice_region_variant',
        'splice_donor_5th_base_variant', 'splice_donor_region_variant', 'splice_donor_variant'
    }
    # Never sufficient on their own
    consequences_to_discard:set = {
        'intergenic_variant', 'non_coding_transcript_exon_variant', 'non_coding_transcript_variant'
    }
    # Retained only when a SpliceAI annotation is present
    consequences_nonCoding_keep_conditionally:set = {
        '5_prime_UTR_variant', '3_prime_UTR_variant',
        'upstream_gene_variant', 'downstream_gene_variant'
    }
    # Retained only for SNPs with a SpliceAI annotation
    consequences_other_keep_conditionally:set = {
        'intron_variant', 'synonymous_variant'
    }

    keep_any:bool = False
    for c in consequences:
        if c in consequences_to_discard:
            keep = False
        elif c in consequences_to_keep:
            keep = True
        elif c in consequences_nonCoding_keep_conditionally:
            keep = not pd.isna(spliceai)
        elif c in consequences_other_keep_conditionally:
            keep = is_snp and not pd.isna(spliceai)
        else:
            raise ValueError(f"Undefined consequence: {c}")
        keep_any = keep_any or keep

    return keep_any

In [22]:
# Flag every variant; the row label (r.name) is the CPRA key
master_df['keep_variant'] = master_df.apply(
    lambda row: keep_variant(
        cpra=row.name,
        consequences=row.Consequence,
        spliceai=row.SpliceAI,
    ),
    axis=1,
)

In [23]:
# Keep only the variants flagged by keep_variant
master_filtered_df = master_df[master_df['keep_variant']].copy()
print(f"{len(master_filtered_df) = }")

In [24]:
def _cpra_to_fields(row):
    """Split the row label "CHROM-POS-REF-ALT" into its first four fields, with POS as int."""
    parts = row.name.split("-")
    return pd.Series([parts[0], int(parts[1]), parts[2], parts[3]])

# Expand the CPRA row label back into separate coordinate columns
master_filtered_df[['#CHROM', 'POS', 'REF', 'ALT']] = master_filtered_df.apply(_cpra_to_fields, axis=1)

In [25]:
# Order the variants by chromosome label, then by position
# (chromosome labels are compared as plain values, as sort_values does by default)
master_filtered_df = master_filtered_df.sort_values(by=['#CHROM', 'POS'], ascending=True)

# Re-annotate the genes using recessive genes' BED file

In [26]:
# Read columns 0-3 and 5 of the recessive-gene BED file, skipping '#' comment lines
recessive_gene_df = pd.read_table(
    recessive_bed,
    comment='#',
    usecols=[0, 1, 2, 3, 5],
    names=['chr', 'start', 'end', 'gene', 'strand'],
)
# Since BED files are 0-indexed: shift the start to 1-based
recessive_gene_df['start'] = recessive_gene_df['start'] + 1
# Index the table by the 'gene' values (kept as a regular column too), ordered by chr then start
recessive_gene_df = (
    recessive_gene_df
    .set_index(recessive_gene_df['gene'].rename('idx_gene'))
    .sort_values(by=['chr', 'start'], ascending=True)
)

In [28]:
def find_matched_genes(pos_start, ref, chr_gene_df):
    """Return the genes whose [start, end] interval overlaps the variant's reference span.

    The variant covers ``pos_start`` through ``pos_start + len(ref) - 1`` inclusive.
    ``chr_gene_df`` must provide 'start', 'end' and 'gene' columns.
    Returns a list of gene names, or ``np.nan`` when nothing overlaps.
    """
    last_base = pos_start + len(ref) - 1
    overlaps = (chr_gene_df['start'] <= last_base) & (chr_gene_df['end'] >= pos_start)
    hits = chr_gene_df.loc[overlaps, "gene"].tolist()
    return hits if hits else np.nan

# Annotate variants chromosome by chromosome so each lookup only scans that chromosome's genes.
# The loop variable is named `chrom` so the builtin chr() is not shadowed for the rest of the session.
matched_chr_df_ls:list = list()
for chrom in tqdm(master_filtered_df['#CHROM'].unique().tolist()):
    chr_df = master_filtered_df.loc[master_filtered_df['#CHROM']==chrom, :].copy()
    chr_gene_df = recessive_gene_df.loc[recessive_gene_df['chr']==chrom, :].copy()
    # Each cell becomes the list of overlapping genes, or NaN when there is none
    chr_df['recessive_genes'] = chr_df.apply(
        lambda r: find_matched_genes(r.POS, r.REF, chr_gene_df), axis=1
    )
    matched_chr_df_ls.append(chr_df)

In [29]:
# Stack the per-chromosome tables back into a single frame, in the order they were processed
genes_filtered_df = pd.concat(objs=matched_chr_df_ls, axis=0)

### Annotate position relative to the recessive gene

In [41]:
def annotate_recessive_gene(r:pd.Series, recessive_gene_df:pd.DataFrame, flank:int=200):
    """Locate a variant relative to each of its matched recessive genes.

    Parameters
    ----------
    r : pd.Series
        Variant row providing 'POS', 'REF' and 'recessive_genes' (list of gene names, or NaN).
    recessive_gene_df : pd.DataFrame
        Gene table indexed by gene name with 'start', 'end' and 'strand' columns.
        'start' is expected to be 1-based already (the notebook adds 1 to the BED start
        when loading), and both ends are expected to include ``flank`` bases of padding.
    flank : int, default 200
        Padding to strip from each side of the interval to recover the gene itself.

    Returns
    -------
    list or float
        One label per gene, in the order of ``r['recessive_genes']``: 'within',
        'upstream_gene_variant' or 'downstream_gene_variant'. Any strand other than
        "+" is treated as the reverse strand. NaN is returned when the row has no
        matched genes, mirroring what find_matched_genes returns in that case.
    """
    genes = r['recessive_genes']
    if not isinstance(genes, (list, tuple)):
        # No gene matched this variant: nothing to locate
        return np.nan

    # Inclusive 1-based span covered by the reference allele
    var_start:int = int(r['POS'])
    var_end:int = var_start + len(r['REF']) - 1

    located_genes:list = list()
    for g in genes:
        # Inclusive 1-based gene span with the padding removed
        gene_start:int = recessive_gene_df.loc[g, 'start'] + flank
        gene_end:int = recessive_gene_df.loc[g, 'end'] - flank
        strand:str = recessive_gene_df.loc[g, 'strand']

        if var_start <= gene_end and var_end >= gene_start:
            locate = 'within'
        elif var_end < gene_start:
            # Left of the gene on the reference: upstream for "+" genes
            locate = 'upstream_gene_variant' if strand == "+" else "downstream_gene_variant"
        else:
            # Only remaining case: the variant starts after the gene ends
            locate = 'downstream_gene_variant' if strand == "+" else "upstream_gene_variant"
        located_genes.append(locate)

    return located_genes
# One location label per matched gene, computed row by row
genes_filtered_df['variant_location'] = genes_filtered_df.apply(
    annotate_recessive_gene, axis=1, recessive_gene_df=recessive_gene_df
)

# Label the prevalence of each variant

In [42]:
# Count het_n, hom_n, hem_n: how many entries of each variant's GT list equal the given genotype strings
genes_filtered_df['het_n'] = genes_filtered_df['GT'].map(
    lambda gts: sum(gts.count(gt) for gt in ('0/1', '0|1', '1/0', '1|0'))
)
genes_filtered_df['hom_n'] = genes_filtered_df['GT'].map(
    lambda gts: sum(gts.count(gt) for gt in ('1/1', '1|1'))
)
genes_filtered_df['hem_n'] = genes_filtered_df['GT'].map(lambda gts: gts.count('1'))
# 'AN' holds het + 2*hom + hem, i.e. the number of alternate alleles observed
genes_filtered_df['AN'] = (
    genes_filtered_df['het_n'] + genes_filtered_df['hom_n']*2 + genes_filtered_df['hem_n']
)
# NOTE(review): the denominator is the number of participant TSVs, not the number of
# chromosomes sampled - confirm this is the intended definition of AF
genes_filtered_df['AF'] = genes_filtered_df['AN'] / len(tsv_ls)

In [43]:
# Classify the prevalence of each variant by its number of carriers and its AF
def classify_prevelence(participant_id:list, AF:float)->str:
    """Return 'private' for a single carrier, otherwise 'rare' when AF <= 0.01 and 'common' when not."""
    if len(participant_id) == 1:
        return 'private'
    return 'rare' if AF <= 0.01 else 'common'
# Label every variant from its carrier list and allele frequency
genes_filtered_df['prevalence'] = genes_filtered_df.apply(
    lambda row: classify_prevelence(participant_id=row.participant_id, AF=row.AF),
    axis=1,
)

# Classify the consequences

In [46]:
# Get all consequences: every distinct term that survives the filtering
all_consequences:set = {
    term
    for terms in genes_filtered_df['Consequence']
    for term in terms
}

In [47]:
# Classify each consequence: class label -> consequence terms belonging to it
consequence_classes:dict = {
    'coding': [
        'coding_sequence_variant',
        'protein_altering_variant',
        'start_retained_variant',
        'stop_retained_variant',
        'synonymous_variant',
    ],
    'missense': ['missense_variant'],
    'frameshift': [
        'frameshift_variant',
        'start_lost',
        'stop_gained',
        'stop_lost',
    ],
    'inframe': ['inframe_deletion', 'inframe_insertion'],
    'noncoding': [
        'upstream_gene_variant',
        'downstream_gene_variant',
        '3_prime_UTR_variant',
        '5_prime_UTR_variant',
        'non_coding_transcript_exon_variant',
        'non_coding_transcript_variant',
    ],
    'splicing': [
        'splice_donor_5th_base_variant',
        'splice_donor_region_variant',
        'splice_donor_variant',
        'splice_acceptor_variant',
        'splice_polypyrimidine_tract_variant',
        'splice_region_variant',
    ],
    'intronic': ['intron_variant'],
}

# Inverted lookup: consequence term -> class label
consequence_map_d:dict = {
    term: label
    for label, terms in consequence_classes.items()
    for term in terms
}

In [48]:
# Label the consequence_class: translate each term of a variant into its class, keeping the order
genes_filtered_df['consequence_class'] = genes_filtered_df['Consequence'].map(
    lambda terms: list(map(consequence_map_d.__getitem__, terms))
)

# Reformat and export genes_filtered_df

In [49]:
# Work on an independent copy so the reformatting below leaves genes_filtered_df untouched
export_df = genes_filtered_df.copy(deep=True)

In [50]:
# Reformat the columns storing lists into "."-joined strings
listed_cols:list = [
    'participant_id', 'GT', 'AD',
    'recessive_genes', 'variant_location',
    'Consequence', 'consequence_class'
]
for col in listed_cols:
    # Only real lists are joined. Anything else is passed through unchanged:
    #  - NaN, which find_matched_genes returns when no gene matched, would make join() raise
    #  - strings left by a previous run of this cell would get a "." between every character
    export_df[col] = export_df[col].apply(
        lambda x: ".".join(x) if isinstance(x, (list, tuple)) else x
    )

In [51]:
# Re-order the columns; anything not listed here is dropped from the export
# NOTE(review): het_n, hom_n, hem_n, AN, AF and prevalence are not listed - confirm they are meant to be left out
keep_cols:list = [
    '#CHROM', 'POS', 'REF', 'ALT',
    'Gene', 'recessive_genes', 'variant_location',
    'HGVSc', 'HGVSp', 'REVEL', 'SpliceAI',
    'Consequence', 'consequence_class',
    'participant_id', 'GT', 'AD',
]
export_df = export_df[keep_cols].copy()

### Export to TSV

In [53]:
# Write the table tab-separated, with the row labels as a leading 'CPRA' column
export_df.to_csv(
    output_tsv,
    sep='\t',
    index=True,
    index_label='CPRA',
)

### Export to VCF for more annotations

In [54]:
# Replace every missing value with the "." placeholder
export_csv_df = export_df.fillna(value=".")

In [55]:
# Generate the missing fields: NOTE bundles the carriers with their genotypes and allele depths
export_csv_df['NOTE'] = [
    f"{pid}-{gt}-{ad}"
    for pid, gt, ad in zip(
        export_csv_df['participant_id'], export_csv_df['GT'], export_csv_df['AD']
    )
]

In [56]:
# Prepare the VCF columns
vcf_cols:list = [
    "#CHROM", "POS", "ID", "REF", "ALT",
    "QUAL", "FILTER", "INFO", "FORMAT", "FIRST_SAMPLE",
]
export_csv_df['ID'] = export_csv_df.index    # the row label (CPRA) is used as the variant ID
export_csv_df['QUAL'] = "."
export_csv_df['FILTER'] = "."

## INFO columns: "Gene=<...>" followed by one ";key=value" pair per entry of info_cols, in this order
info_cols:list = [
    'recessive_genes', 'variant_location', 'HGVSc', 'HGVSp', 'REVEL', 'SpliceAI', 'Consequence', 'consequence_class', 'NOTE'
]
export_csv_df['INFO'] = [
    ";".join(f"{key}={value}" for key, value in zip(['Gene'] + info_cols, values))
    for values in export_csv_df[['Gene'] + info_cols].itertuples(index=False, name=None)
]

# The single sample column carries the first "."-separated GT and AD entries
export_csv_df['FORMAT'] = "GT:AD"
export_csv_df['FIRST_SAMPLE'] = [
    f"{gt.split('.')[0]}:{ad.split('.')[0]}"
    for gt, ad in zip(export_csv_df['GT'], export_csv_df['AD'])
]

In [57]:
# Keep only the VCF columns, in VCF order
export_csv_df = export_csv_df[vcf_cols].copy()

In [60]:
# Export df as a VCF: the output starts as a copy of the pre-written header file,
# then the tab-separated table is appended to it (order matters: copy first, append second)
shutil.copy(vcf_header, output_vcf)    # Copy header lines (overwrites any existing output_vcf)
export_csv_df.to_csv(output_vcf, index=False, sep='\t', mode='a')    # Append the column-name row and the records