# Filtering MAF files
The current filtering of MAF files has four parts: the quorum rule, the PASS filter, the GDC filter, and the tumor-normal pairs filter. The quorum rule can easily be changed by replacing the hard-coded 2 with any other desired quorum. Because the filtering is done with a pandas dataframe, it is generally straightforward, aside from the quorum and tumor-normal pairs filtering. For example, if you want to filter out any variants that do or do not have certain values in specific columns of the MAF file, only two lines of code are needed:  



It is also important to note that two additional columns have been added to the filtered MAF file: a list of the callers that called the variant, and the caller whose MAF file supplied the line for that variant. If you need the values of columns that differ across the callers' MAF files, they can be recovered by using these two extra columns together with the original MAF files.   
Both cells below must be run in order to generate and save these filtered MAF files. The resulting saved files are named `filtered_<cancer>.maf`, one per cancer type.

In [None]:
import os
import pandas as pd 
import numpy as np
import math


# parse comments from MAF files
def parse_comments(fp):
    '''Return the comment lines of the file at path fp.

    A comment line is any line whose first character is '#'. Lines are
    returned in file order, unmodified (the trailing newline is kept).
    '''
    # the context manager closes the file even if reading fails
    with open(fp, 'r') as fh:
        return [line for line in fh if line.startswith('#')]

# parses chromosome, start position, end position, reference allele, and tumor alleles of variants
def parse_maf(fp):
    '''Map each variant in the MAF file at path fp to its data-row position.

    The key is the tuple of strings taken from tab-separated columns
    4, 5, 6, 10, 11 and 12 (0-based): (chrom, s_pos, e_pos, ref,
    tum_allele_1, tum_allele_2). The value is the 0-based position of the
    row among the data rows, i.e. not counting comment lines (starting with
    '#'), the header line (first field 'Hugo_Symbol') or blank lines.

    If the same variant tuple occurs on several rows, only the position of
    the last such row is kept.
    '''
    caller_vars_dict = {}

    index = 0
    # the context manager closes the file even if parsing fails
    with open(fp, 'r') as fh:
        for line in fh:
            stripped = line.strip()
            # blank lines carry no variant; skipping them (without counting
            # them) avoids an IndexError on the column lookups below
            if not stripped:
                continue

            temp = stripped.split('\t')
            if line.startswith('#') or temp[0] == 'Hugo_Symbol':
                continue

            var = (temp[4], temp[5], temp[6], temp[10], temp[11], temp[12])
            caller_vars_dict[var] = index

            index += 1

    return caller_vars_dict


# names of the variant callers; each is matched as a substring of a MAF file name
possible_callers = (
    'muse',
    'mutect',
    'somaticsniper',
    'varscan',
)

# cancer type codes; each is the name of a sub-directory of cwd/data
possible_cancers = (
    'ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD',
    'DLBC', 'ESCA', 'GBM', 'HNSC', 'KICH', 'KIRC',
    'KIRP', 'LAML', 'LGG', 'LIHC', 'LUAD', 'LUSC',
    'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ',
    'SARC', 'SKCM', 'STAD', 'TGCT', 'THCA', 'THYM',
    'UCEC', 'UCS', 'UVM',
)

# directory the notebook is run from; the data directory is resolved against it
cwd = os.getcwd()

# filtered dataframe per cancer type, filled in by the loop below
dataframes = {}

# Quorum rule: a variant is kept only if at least this many callers report it.
quorum = 2

for cancer in possible_cancers:
    # accessing MAF files in path cwd/data/cancer
    cancer_dir = os.path.join(cwd, 'data', cancer)
    mafs = os.listdir(cancer_dir)

    # Map each caller to the MAF file whose name contains the caller name.
    # A caller with no matching file is simply absent from maf_fps; if several
    # files match, the last one listed wins.
    maf_fps = {}
    for caller in possible_callers:
        for maf in mafs:
            if caller in maf:  # str in the filepath
                maf_fps[caller] = os.path.join(cancer_dir, maf)

    if not maf_fps:
        raise FileNotFoundError(
            "no caller MAF files found in {}".format(cancer_dir))

    # For each caller, split the fifth header comment line on commas. The
    # first value has its leading "#key " token removed and is moved to the
    # end of the list. Files with fewer than five comment lines get [].
    all_comments = {}
    for caller, fp in maf_fps.items():
        header = parse_comments(fp)
        if len(header) < 5:
            all_comments[caller] = []
            continue
        fields = header[4].strip().split(',')
        first_value = fields[0].strip().split(' ')[1]
        all_comments[caller] = fields[1:] + [first_value]

    # stores MAF files as dataframes and the variant -> row position mapping
    # from parse_maf for every caller
    all_variants = {}
    all_df = {}
    for caller, fp in maf_fps.items():
        all_variants[caller] = parse_maf(fp)
        all_df[caller] = pd.read_csv(fp, sep='\t', comment="#")

    # For every distinct variant, the row position in each caller's file and
    # the callers reporting it. Both lists follow the caller order of maf_fps,
    # and variants are ordered by first appearance.
    all_v_index = {}
    all_v_caller = {}
    for caller in maf_fps:
        for var, row in all_variants[caller].items():
            all_v_index.setdefault(var, []).append(row)
            all_v_caller.setdefault(var, []).append(caller)

    # list of all variants (before the quorum rule is applied)
    all_v = list(all_v_index)

    # quorum rule
    all_v_index = {var: rows for var, rows in all_v_index.items()
                   if len(rows) >= quorum}
    all_v_caller = {var: all_v_caller[var] for var in all_v_index}

    # Each surviving variant is represented by its row in the first caller
    # that reported it. Record (row position, callers) per source caller.
    picks = {caller: [] for caller in maf_fps}
    for var, callers in all_v_caller.items():
        picks[callers[0]].append((all_v_index[var][0], callers))

    final_dataframes = {}
    callers_column = []
    orig = []
    for caller in maf_fps:
        # Sort by row position so the annotations below line up with the
        # rows: variant first-seen order differs from row order whenever a
        # variant recurs within one file (parse_maf keeps the last row).
        chosen = sorted(picks[caller], key=lambda pick: pick[0])
        final_dataframes[caller] = all_df[caller].iloc[[row for row, _ in chosen]]
        callers_column.extend(callers for _, callers in chosen)
        orig.extend([caller] * len(chosen))

    final = pd.concat(final_dataframes, ignore_index=True)

    final['callers'] = callers_column
    final['original'] = orig

    # PASS filter and reindexing
    # (column 110 is presumably the FILTER column of the MAF layout -- confirm)
    final = final[final.iloc[:, 110] == 'PASS'].reset_index(drop=True)

    # gdc filter and reindexing: drop rows whose ';'-separated flags in
    # column 116 include any of the values below; non-string cells are kept
    # (column 116 is presumably GDC_FILTER -- confirm)
    gdc_filt = {'common_in_exac', 'gdc_pon', 'ndp', 'NonExonic', 'bitgt'}
    gdc_keep = np.array(
        [not (isinstance(value, str) and gdc_filt & set(value.split(';')))
         for value in final.iloc[:, 116]],
        dtype=bool)
    final = final.loc[gdc_keep].reset_index(drop=True)

    # keep only variants whose tumor-normal pair appears in the MAF of every
    # caller that has a file for this cancer type
    tumor_normal = {}
    for caller in maf_fps:
        frame = all_df[caller]
        tumor_normal[caller] = set(zip(frame['Tumor_Sample_Barcode'],
                                       frame['Matched_Norm_Sample_Barcode']))

    tumor_normal_set = set.intersection(*tumor_normal.values())

    # columns 15 and 16 are read by position here; assumed to be the two
    # barcode columns used by name above -- confirm against the MAF layout
    pair_keep = np.array(
        [pair in tumor_normal_set
         for pair in zip(final.iloc[:, 15], final.iloc[:, 16])],
        dtype=bool)
    final = final.loc[pair_keep].reset_index(drop=True)

    # store in dictionary of dataframes
    dataframes[cancer] = final



In [None]:
# Save each cancer type's filtered table as a tab-separated file named
# filtered_<cancer>.maf, relative to the current working directory.
# NOTE: to_csv is called without index=False, so the row index is written
# as an unnamed first column.
for cancer in possible_cancers:
    out_name = "filtered_{}.maf".format(cancer)
    filtered = dataframes[cancer]
    filtered.to_csv(out_name, sep="\t")