# Apply gnomAD filters and append gnomAD frequency info

In [7]:
from pathlib import Path
import pandas as pd
import numpy as np
import shutil
from tqdm import tqdm
import re

In [8]:
# Project directory for this step and the two subfolders used below
projectDir = Path(
    "/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/recessive/03.filtering"
)
dataDir = projectDir.joinpath("data")
summaryDir = projectDir.joinpath("summary")

# Reference: gnomAD annotation table (read later into `gnomad_info`)
gnomad_info_tsv = dataDir.joinpath("recessive.gnomad-annotated.tsv")

# Input: variant table that will receive the gnomAD columns
concat_tsv = summaryDir.joinpath("recessive.var.tsv")

# Output: gnomAD-annotated and gnomAD-filtered variant table
gnomad_annot_tsv = summaryDir.joinpath("recessive.var.gnomad-annotated.tsv")

## Read gnomad_info_tsv as pd.DataFrame

In [9]:
# Load the tab-separated gnomAD reference table; its first column becomes the index.
gnomad_info = pd.read_csv(
    gnomad_info_tsv,
    sep="\t",
    index_col=[0],
)

## Read concat_tsv as pd.DataFrame

In [13]:
# Load the tab-separated variant table; its first column becomes the index.
df = pd.read_csv(
    concat_tsv,
    sep="\t",
    index_col=[0],
)

# Merge

In [17]:
# Columns kept after the merge, in output order.
gnomad_annotated_columns = [
    'recessive_genes',
    'source',
    'rsID',
    'clinVar_ID',
    'CLNSIG',
    'clinical_relevance',
    'goldstars_n',
    'variant_location',
    'HGVSc',
    'HGVSp',
    'REVEL',
    'SpliceAI',
    'Consequence',
    'consequence_class',
    'gnomAD_filter',
    'gnomad_v3_AN',
    'gnomad_v3_AF',
    'gnomad_v3_AN_eas',
    'gnomad_v3_AF_eas',
    'participant_id',
    'GT',
]

# Left-join on the index so every row of `df` is retained; rows with no match
# in `gnomad_info` get NaN in the gnomAD columns.
# NOTE(review): duplicate index values in `gnomad_info` would duplicate rows of
# `df` here - confirm the reference index is unique.
# NOTE(review): a column name shared by `df` and `gnomad_info` would be
# suffixed by merge and the selection below would fail - confirm no overlap.
gnomad_annotated_df = (
    df
    .merge(gnomad_info, how='left', left_index=True, right_index=True)
    [gnomad_annotated_columns]
)

# Discard variants that have a gnomAD_filter value other than "PASS"

In [26]:
# Keep rows whose gnomAD_filter is missing or exactly 'PASS', then drop the
# filter column since it is no longer needed downstream in this notebook.
gnomad_filtered_df = (
    gnomad_annotated_df
    .loc[lambda tbl: tbl['gnomAD_filter'].isna() | tbl['gnomAD_filter'].eq('PASS')]
    .drop(columns=['gnomAD_filter'])
)

In [28]:
# Write the filtered table as TSV, labelling the index column in the header.
gnomad_filtered_df.to_csv(
    gnomad_annot_tsv,
    sep="\t",
    index=True,
    index_label="Chr-Pos-Ref-Alt",
)