# Consolidate the unfiltered, known and novel variants

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import shutil
from tqdm import tqdm
import re

In [2]:
# Root directories: the recessive secondary-analysis project and the shared references.
projectDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/recessive")
referenceDir = Path("/home/_shared/jscliu/project/2025/Flagship/reference")

# Directory that receives the consolidated summary table.
summaryDir = projectDir / "03.filtering/summary"

# Input tables: known variants and novel PVS1-verystrong variants.
known_var_tsv = projectDir / "01.known_small_variants/summary/recessive.known_variants.tsv"
novel_pvs1_vstrong_tsv = (
    projectDir / "02.novel_small_variants/summary/recessive.novel.allResults.PVS1-verystrong.tsv"
)

# Reference material: the eight fixed VCF columns, a VCF header file and cohort metadata.
vcf_columns: list = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
vcf_header = projectDir / "03.filtering/script/vcf_headers.txt"
sample_info = referenceDir / "sample_info_annot.2024-11-12.csv"
cohort_founders = referenceDir / "cohort_founder_list.2024-11-12.csv"

# Output: the concatenated known + novel variant table.
concat_tsv = summaryDir / "recessive.var.tsv"

In [3]:
# Columns kept for every source table before concatenation; the order given
# here is the column order of the final table. A column missing from a source
# table is created there and filled with NaN.
concat_columns: list = [
    # variant key, gene and origin of the row
    "Chr-Pos-Ref-Alt",
    "recessive_genes",
    "source",
    # clinical annotation fields
    "clinVar_ID",
    "CLNSIG",
    "clinical_relevance",
    "goldstars_n",
    # location, nomenclature and prediction fields
    "variant_location",
    "HGVSc",
    "HGVSp",
    "REVEL",
    "SpliceAI",
    "Consequence",
    "consequence_class",
    # carrier and genotype
    "participant_id",
    "GT",
]

## Pre-process the table of known ClinVar P/LP variants

In [4]:
# Load the tab-separated table of known variants and label every row with its
# origin so it stays identifiable after concatenation.
known_var_df = pd.read_table(filepath_or_buffer=known_var_tsv)
known_var_df["source"] = "clinvar"

In [5]:
# Expose the genotype column under the shared name 'GT' used in concat_columns.
known_var_df = known_var_df.rename(columns={'genotype': 'GT'})

# Build participant_id from sample_id: split the value on ".", drop the last
# three characters of every piece, then join the pieces back with ".".
# NOTE(review): presumably removes a fixed 3-character sample suffix — confirm.
known_var_df['participant_id'] = known_var_df['sample_id'].apply(
    lambda sample_ids: ".".join(sample[:-3] for sample in sample_ids.split("."))
)

In [6]:
# Build known_var_toConcat_df: first make sure every column named in
# concat_columns exists in known_var_df (absent ones are added, filled with
# NaN), then take an independent copy limited to those columns, in that order.
absent_columns = [col for col in concat_columns if col not in known_var_df.columns]
for col in absent_columns:
    known_var_df[col] = np.nan

known_var_toConcat_df = known_var_df[concat_columns].copy()

## Pre-process the table of novel VEP PVS1-verystrong variants

In [8]:
# Load the tab-separated table of novel PVS1-verystrong variants and label
# every row with its origin so it stays identifiable after concatenation.
novel_pvs1vstrong_df = pd.read_table(filepath_or_buffer=novel_pvs1_vstrong_tsv)
novel_pvs1vstrong_df["source"] = "novel"

In [9]:
# Build novel_pvs1vstrong_toConcat_df: first make sure every column named in
# concat_columns exists in novel_pvs1vstrong_df (absent ones are added, filled
# with NaN), then take an independent copy limited to those columns, in order.
absent_columns = [col for col in concat_columns if col not in novel_pvs1vstrong_df.columns]
for col in absent_columns:
    novel_pvs1vstrong_df[col] = np.nan

novel_pvs1vstrong_toConcat_df = novel_pvs1vstrong_df[concat_columns].copy()

# Concatenate

In [12]:
# Stack the known-variant rows on top of the novel-variant rows; both frames
# share the concat_columns layout.
# ignore_index=True gives the combined table a fresh 0..n-1 index. Without it
# each input keeps its own row labels, so the result carries duplicate index
# labels and any later label-based alignment is ambiguous or raises.
recessive_master_df = pd.concat(
    [known_var_toConcat_df, novel_pvs1vstrong_toConcat_df],
    ignore_index=True,
)

# Sort by chr and pos

In [15]:
def _chrom_sort_key(chrom: str) -> tuple:
    """Return a key that orders chromosome names naturally.

    Purely numeric names sort by value (2 before 10), then X, Y and M/MT;
    any other name sorts last, alphabetically.
    """
    if chrom.isdigit():
        return (0, int(chrom), "")
    sex_mito_rank = {"X": 0, "Y": 1, "M": 2, "MT": 2}
    if chrom in sex_mito_rank:
        return (1, sex_mito_rank[chrom], "")
    return (2, 0, chrom)


# Derive two temporary sort columns from the 'Chr-Pos-Ref-Alt' key:
#   chr - first "-"-separated field with every "chr" removed
#   pos - second field as an integer
# The key is split once for the whole column rather than building a pd.Series
# per row, and the values are assigned positionally, so this works whatever
# the index labels are.
_key_parts = recessive_master_df['Chr-Pos-Ref-Alt'].str.split('-', n=2)
_chrom = _key_parts.str[0].str.replace('chr', '', regex=False)

# Keep chr as an ordered categorical whose categories follow natural
# chromosome order; sorting the raw strings would put "10" before "2".
recessive_master_df['chr'] = pd.Categorical(
    _chrom,
    categories=sorted(_chrom.dropna().unique(), key=_chrom_sort_key),
    ordered=True,
)
recessive_master_df['pos'] = _key_parts.str[1].astype('int64')

In [17]:
# Order the rows by the temporary 'chr' and 'pos' columns (both ascending),
# then discard those two columns so they do not reach the output table.
recessive_master_df = (
    recessive_master_df
    .sort_values(by=['chr', 'pos'], ascending=True)
    .drop(columns=['chr', 'pos'])
)

In [19]:
# Write the consolidated table as a tab-separated file, without the index.
# to_csv does not create missing parent directories, so make sure the output
# directory exists first (no-op when it is already there).
concat_tsv.parent.mkdir(parents=True, exist_ok=True)
recessive_master_df.to_csv(concat_tsv, sep='\t', index=False)