In [189]:
import os, sys, argparse
from upsetplot import from_memberships
from upsetplot import plot
%matplotlib inline 
from matplotlib import pyplot
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd

original = os.getcwd()

possible_callers = ('muse', 'mutect', 'somaticsniper', 'varscan')

possible_cancers = ('ACC', 'BLCA','BRCA','CESC','CHOL','COAD','DLBC','ESCA','GBM','HNSC', 
					'KICH','KIRC','KIRP','LAML','LGG','LIHC','LUAD','LUSC','MESO', 'OV','PAAD',
                    'PCPG','PRAD','READ','SARC','SKCM','STAD','TGCT','THCA','THYM','UCEC','UCS','UVM')

from itertools import combinations

# combine these two functions into one (maybe just rename first one keys?)

def my_combs(iterable, r):
    to_return = []
    for i in range(1, r+1):
        c = combinations(iterable, i)
        for comb in c:
            if len(comb) == 1:
                to_return.extend((comb,))
            else:
                to_return.append(list(comb))
    return to_return

def intersections(list_of_sets):
    '''
    Given a list of sets, returns all possible exclusive intersections of the sets.
    '''
    sets = list_of_sets
    combs = []
    inters = []
    ret = []

    for i in range(len(sets)):
        combs.append(my_combs(range(len(sets)), abs(len(sets)-i)))
        inter = []
        for comb in combs[i]:
            intersect = sets[comb[0]]
            for j in range(len(comb)):
                intersect = intersect & sets[comb[j]]
            for j in range(i):
                for comb2, inter2 in zip(combs[j], inters[j]):
                    condition = True
                    for k in range(len(comb)):
                        condition = condition and (comb[k] in comb2)
                    if condition is True:
                        intersect -= inter2
            inter.append(intersect)
        inters.append(inter)
    
    for j in range(len(sets)):
        ret.extend(inters[len(sets)-1-j])
    
    return ret

def n_intersections(sets, nways):
    combs = my_combs(sets, nways)
    inters = []
    for c in range(len(combs)):
        inters.append(intersections(combs[c]))
    return inters


In [2]:
def parse_maf(fp):
    ''' Return a dictionary with vars[patient][variant_set]'''
    caller_vars = {}

    fh = open(fp, 'r')
    for line in fh:
        temp = line.strip().split('\t')
        if line[0] == '#' or temp[0] == 'Hugo_Symbol':
            continue
        ID = temp[16] + '_' + temp[17] # 'tumor'_'normal'
        if ID not in caller_vars:
            caller_vars[ID] = set()
        gene = temp[0]
        chrom =temp[4]
        s_pos = temp[5]
        e_pos = temp[6]
        ref = temp[11]
        tum_allele_1 = temp[12]
        tum_allele_2 = temp[13]
        impact = temp[93]
        filt = temp[110]
        var = (gene,chrom, s_pos, e_pos, ref, tum_allele_1, tum_allele_2,impact, filt)
        caller_vars[ID].add(var)

    return(caller_vars)

In [3]:
keys = my_combs_frozenset(possible_callers, len(possible_callers))

def set_contents(dat, possible_callers):        
    data = dict([(key, []) for key in keys])
    
    patients = set(list(dat[0].keys()))
    for i in range(len(dat)):
        patients = patients & set(list(dat[i].keys()))
    
    
    for patient in patients:
        sets = []
        for j in range(len(dat)):
            sets.append(dat[j][patient])
        
        inters = intersections(sets)
        
        for i in range(len(inters)):
            data[keys[i]].extend(list(inters[i]))
       
    return data


In [4]:
os.chdir(original)
possible_cancers = ('ACC', 'BLCA','BRCA','CESC','CHOL','COAD', 'DLBC','ESCA','GBM','HNSC', 
					'KICH','KIRC','KIRP','LAML','LGG','LIHC','LUAD','LUSC','MESO', 'OV','PAAD',
                    'PCPG','PRAD','READ','SARC','SKCM','STAD','TGCT','THCA','THYM','UCEC','UCS','UVM')

g = np.array(np.genfromtxt('Bailey_et_al_2018_sig_mut_genes.txt',dtype=None,encoding = None, usecols = (0)))
c = np.genfromtxt('Bailey_et_al_2018_sig_mut_genes.txt',dtype=None,encoding = None, usecols = (1))
genes = np.delete(g,4)
cancers = np.delete(c,4)
dictofgenes = dict([(cancer, set()) for cancer in cancers])

for i in range(len(genes)):
    dictofgenes[cancers[i]].add(genes[i])

pancandict = dictofgenes['PANCAN']
coadread = dictofgenes['COADREAD']

del dictofgenes['PANCAN']
del dictofgenes['COADREAD']

impacts = set(('MODERATE', 'HIGH'))
filt = set(('PASS'))

def how_many_tumor(cancer, all_variants, possible_callers, content):
    if cancer == 'COAD' or cancer == 'READ':
        tumorcount = []
        for key in keys:
            count = 0
            for c in content[key]:
                if set(c) & coadread:
                    if set(c) & impacts:
                        if 'PASS' in set(c):
                            count += 1
            tumorcount.append(count)
    else:
        tumorcount = []
        for key in keys:
            count = 0
            for c in content[key]:
                if set(c) & dictofgenes[cancer]:
                    if set(c) & impacts:
                        if 'PASS' in set(c):
                            count += 1
            tumorcount.append(count)
    return tumorcount



In [8]:
def baileydf(possible_cancers, possible_callers):
    dfbailey = pd.DataFrame(np.nan, index=keys, columns=possible_cancers)

    for cancer in possible_cancers:
        os.chdir(original)
        os.chdir(cancer)
        mafs = os.listdir()

        maf_fps = {}
        for caller in possible_callers:
            for maf in mafs:
                if caller in maf: # str in the filepath
                    maf_fps[caller] = maf

        all_variants = {}
    
        for caller in maf_fps:
            all_variants[caller] = parse_maf(maf_fps[caller])
        arg = []
        for i in range(len(possible_callers)):
            arg.append(all_variants[possible_callers[i]])

        content = set_contents(arg, possible_callers)
        dfbailey.loc[:,cancer] = how_many_tumor(cancer, all_variants, possible_callers, content)
    return dfbailey
dfbailey = baileydf(possible_cancers, possible_callers)
print(dfbailey)

                                        ACC  BLCA  BRCA  CESC  CHOL  COAD  \
(muse)                                    0    78    34    24     3    18   
(mutect)                                  4   223   184    53     5    88   
(somaticsniper)                           0    10     5     1     0     4   
(varscan)                                14   178   145    34     2   165   
(mutect, muse)                            3    86    44    27     1    45   
(muse, somaticsniper)                     0     8     2     2     0     3   
(varscan, muse)                           0    51    11    11     2    11   
(mutect, somaticsniper)                   0    37     5     6     2     5   
(mutect, varscan)                        12   314   512    58    17   347   
(varscan, somaticsniper)                  7    34    35    14     0    47   
(mutect, muse, somaticsniper)             7    79    50    20     0    64   
(mutect, varscan, muse)                   0   116    81    32     0    92   

In [212]:
def table(df, possible_callers, additional_callers, cancer):
    #df = baileydf(possible_cancers, possible_callers)
    
    sets = []
    sets.append(list(possible_callers))
    ad = my_combs(additional_callers, len(additional_callers))

    for n in range(len(ad)):
        sets.append(possible_callers+list(ad[n]))

    for i in range(len(sets)):
        sets[i] = tuple(sets[i])

    sums = pd.DataFrame(0, index=sets, columns=('real', 'real %', 'total', 'total %'))

    su = []
    for i in range(len(sets)):
        index = []
        for j in range(len(keys)):
            if len(set(sets[i])&keys[j]) >= 2:
                index.append(j)
        s =[]
        for k in range(len(index)):
            s.append(df[cancer][index[k]])
        su.append(np.sum(s))
    sums.loc[:,'real'] = su
    
    tot = []
    for i in range(len(sets)):
        index = []
        for j in range(len(keys)):
            if len(set(sets[i])&keys[j]) >= 1:
                index.append(j)        
        t =[]
        for k in range(len(index)):
            t.append(df[cancer][index[k]])
        tot.append(np.sum(t))
    sums.loc[:,'total'] = tot
    
    rpercent = [0]
    for i in range(1, len(sets)):
        rpercent.append((sums['real'][i] - sums['real'][0]) / sums['real'][0])
    sums.loc[:,'real %'] = rpercent
    
    tpercent = [0]
    for i in range(1, len(sets)):
        tpercent.append((sums['total'][i] - sums['total'][0]) / sums['total'][0])
    sums.loc[:,'total %'] = tpercent
    sums = sums.sort_values(['real %'], ascending=False)
    return sums
     
possible_callers = ['mutect', 'varscan']
additional_callers = ['muse', 'somaticsniper']
for cancer in possible_cancers:
    print(cancer)
    print(table(dfbailey, possible_callers, additional_callers,cancer))

ACC
                                        real    real %  total  total %
(mutect, varscan, muse, somaticsniper)    49  0.689655     67      0.0
(mutect, varscan, somaticsniper)          46  0.586207     67      0.0
(mutect, varscan, muse)                   42  0.448276     67      0.0
(mutect, varscan)                         29  0.000000     67      0.0
BLCA
                                        real    real %  total   total %
(mutect, varscan, muse, somaticsniper)  1996  0.299479   2485  0.040184
(mutect, varscan, muse)                 1917  0.248047   2475  0.035998
(mutect, varscan, somaticsniper)        1851  0.205078   2407  0.007535
(mutect, varscan)                       1536  0.000000   2389  0.000000
BRCA
                                        real    real %  total   total %
(mutect, varscan, muse, somaticsniper)  1720  0.139828   2088  0.020029
(mutect, varscan, muse)                 1678  0.111995   2083  0.017587
(mutect, varscan, somaticsniper)        1663  0.102054 

(mutect, varscan)                        211  0.000000    303  0.000000
READ
                                        real    real %  total   total %
(mutect, varscan, muse, somaticsniper)   573  0.287640    655  0.012365
(mutect, varscan, muse)                  551  0.238202    654  0.010819
(mutect, varscan, somaticsniper)         544  0.222472    649  0.003091
(mutect, varscan)                        445  0.000000    647  0.000000
SARC
                                        real    real %  total   total %
(mutect, varscan, muse, somaticsniper)   152  0.727273    215  0.009390
(mutect, varscan, somaticsniper)         138  0.568182    214  0.004695
(mutect, varscan, muse)                  134  0.522727    214  0.004695
(mutect, varscan)                         88  0.000000    213  0.000000
SKCM
                                        real    real %  total   total %
(mutect, varscan, muse, somaticsniper)  1320  0.264368   1517  0.020861
(mutect, varscan, muse)                 1260  0.2