In [1]:
import os, sys, argparse
from upsetplot import from_memberships
from upsetplot import plot
%matplotlib inline 
from matplotlib import pyplot
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
from itertools import combinations
from comb import parse_maf, my_combs_frozenset, my_combs_all, my_combs, my_combos
from inter import intersections
from howmany import how_many_tumor_cgc
from contents import set_contents
from df import baileydf, cgcdf, pancandf
from tabulate import tabulate
from table import table

# Directory the notebook was started from. Later cells pass it to helpers
# (cgcdf, individual_analysis, cost); individual_analysis changes into it
# before reading its input files.
original = os.getcwd()


In [2]:
# Variant callers considered throughout the notebook.
possible_callers = ('muse', 'mutect', 'somaticsniper', 'varscan')

# Cancer identifiers handed to cgcdf (presumably TCGA cohort codes, each with
# its own sub-directory of MAF files -- TODO confirm).
possible_cancers = (
    'ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA', 'GBM', 'HNSC',
    'KICH', 'KIRC', 'KIRP', 'LAML', 'LGG', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD',
    'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM',
)

# Caller lists passed to table(); `callers` looks like the baseline pair and
# `additional_callers` the optional extras -- verify against table().
callers = ['mutect', 'varscan']
additional_callers = ['somaticsniper', 'muse']

# Caller-combination keys; indexed positionally elsewhere in the notebook.
keys = my_combs_frozenset(possible_callers, len(possible_callers))

# Filter values forwarded to the project helpers.
impacts = {'MODERATE', 'HIGH'}
filt = {'PASS'}

dfcgc = cgcdf(possible_cancers, possible_callers, keys, original, impacts, filt)

In [14]:


def individual_analysis(original, cancer, possible_callers, tabu):
    """Summarise per-patient caller-combination statistics for one cancer.

    For every patient present in the MAF output of *all* ``possible_callers``
    a per-patient table is built with ``table(..., individual=True)``; the
    mean and (population) standard deviation of its ``'real % diff'``,
    ``'total % diff'`` and ``'% of all real'`` columns are then taken across
    patients. A patient only contributes to a statistic when every value of
    the corresponding column is finite.

    Besides its arguments the function reads the notebook-level names
    ``keys``, ``filt``, ``impacts``, ``callers``, ``additional_callers`` and
    ``dfcgc``; the configuration cell must have been run first.

    Parameters
    ----------
    original : str
        Directory containing ``Cancer_Gene_Census_all_Jun-11-2019.csv``.
    cancer : str
        Directory holding the MAF files, resolved after changing into
        ``original``; also used as the column label of the per-patient counts.
    possible_callers : sequence of str
        Caller names. A file belongs to a caller when the name is a substring
        of the file name; if several files match, the last one listed wins.
    tabu : bool
        When truthy, print ``cancer`` and the summary table.

    Returns
    -------
    pandas.DataFrame
        Indexed like the (re-ordered) per-patient table, with columns
        ``'mean real'``, ``'real std'``, ``'mean total'``, ``'total std'``,
        ``'mean all real'`` and ``'all real std'``.

    Raises
    ------
    FileNotFoundError
        If some caller has no matching MAF file in the cancer directory.
    ValueError
        If no patient is shared by all callers.
    """
    previous_cwd = os.getcwd()
    try:
        os.chdir(original)
        cgc = pd.read_csv('Cancer_Gene_Census_all_Jun-11-2019.csv', usecols=(0, 9))
        os.chdir(cancer)
        mafs = os.listdir()

        # Map each caller to its MAF file (substring match, last match wins).
        maf_fps = {}
        for caller in possible_callers:
            for maf in mafs:
                if caller in maf:
                    maf_fps[caller] = maf

        missing = [caller for caller in possible_callers if caller not in maf_fps]
        if missing:
            raise FileNotFoundError(
                'no MAF file found in %r for caller(s): %s' % (cancer, ', '.join(missing))
            )

        all_variants = {caller: parse_maf(path) for caller, path in maf_fps.items()}
        per_caller = [all_variants[caller] for caller in possible_callers]

        # Only patients reported by every caller can be compared.
        patients = set(per_caller[0].keys())
        for variants in per_caller:
            patients &= set(variants.keys())
        patients = list(patients)
        if not patients:
            raise ValueError('no patient is shared by all callers for %r' % (cancer,))

        # The cohort-level table does not depend on the patient, so build it
        # once instead of once per patient (assumes table() has no side
        # effects that the per-patient calls rely on). Its row order is
        # imposed on every per-patient table.
        totaltable = table(callers, additional_callers, cancer, 2, impacts, filt, keys,
                           bcp=dfcgc, individual=False, tab=False)
        indexorder = list(totaltable.index)

        allrealdiff = []
        alltotaldiff = []
        allreal = []

        for patient in patients:
            df = pd.DataFrame(0, index=keys, columns=[cancer,])
            data = {key: [] for key in keys}
            sets = [variants[patient] for variants in per_caller]

            # intersections() yields one collection per entry of `keys`, in
            # the same order.
            inters = intersections(sets)
            for i, members in enumerate(inters):
                data[keys[i]].extend(members)

            df.loc[:, cancer] = how_many_tumor_cgc(data, cgc, filt, impacts, keys)

            t = table(callers, additional_callers, cancer, 2, impacts, filt, keys,
                      bcp=df, individual=True, tab=False)
            t = t.reindex(indexorder)

            # Skip a patient for a statistic if any value is inf/NaN
            # (e.g. a percentage computed against a zero baseline).
            if all(np.isfinite(t['real % diff'])):
                allrealdiff.append(np.array(t['real % diff']))
            if all(np.isfinite(t['total % diff'])):
                alltotaldiff.append(np.array(t['total % diff']))
            if all(np.isfinite(t['% of all real'])):
                allreal.append(np.array(t['% of all real']))

        real = np.array(allrealdiff)
        total = np.array(alltotaldiff)
        all_real = np.array(allreal)

        # Build the float columns directly rather than overwriting an
        # integer-zero frame column by column.
        averages = pd.DataFrame(
            {
                'mean real': np.mean(real, axis=0),
                'real std': np.std(real, axis=0),
                'mean total': np.mean(total, axis=0),
                'total std': np.std(total, axis=0),
                'mean all real': np.mean(all_real, axis=0),
                'all real std': np.std(all_real, axis=0),
            },
            index=t.index,
        )
    finally:
        # Do not leave the notebook's working directory inside the data folder.
        os.chdir(previous_cwd)

    if tabu:
        print(cancer)
        print(tabulate(averages, headers=['variant callers', 'mean real %', 'real % std', 'mean total %', 'total % std', 'mean all real %', 'all real % std'], tablefmt='psql',floatfmt=(".0f",".3f",".3f", ".3f",".3f",".3f", ".3f")))
    return averages
# Per-patient summary for the 'ACC' directory; tabu=True also prints the table.
ave = individual_analysis(original, 'ACC', possible_callers, tabu = True)

  fractions = 100*(tops/realsum)
  rpercent.append(100*((sums['real'][i] - sums['real'][0]) / sums['real'][0]))
  tpercent.append(100*((sums['total'][i] - sums['total'][0]) / sums['total'][0]))
  rpercent.append(100*((sums['real'][i] - sums['real'][0]) / sums['real'][0]))


ACC
+------------------------------------------------+---------------+--------------+----------------+---------------+-------------------+------------------+
| variant callers                                |   mean real % |   real % std |   mean total % |   total % std |   mean all real % |   all real % std |
|------------------------------------------------+---------------+--------------+----------------+---------------+-------------------+------------------|
| ('mutect', 'varscan', 'somaticsniper', 'muse') |        29.834 |       45.400 |          4.742 |        16.704 |           100.000 |            0.000 |
| ('mutect', 'varscan', 'somaticsniper')         |        18.238 |       34.076 |          4.274 |        16.489 |            92.830 |           14.063 |
| ('mutect', 'varscan', 'muse')                  |        22.129 |       36.300 |          0.979 |         4.543 |            94.407 |           11.276 |
| ('mutect', 'varscan')                          |         0.000 |      

In [13]:
def cost(original, cancer, possible_callers, size, callers, additional_callers):
    """Add a ``'cost'`` column to the per-patient summary of ``cancer``.

    Each caller is given a linear cost model ``coefficient * size + constant``;
    the cost of a caller combination is the sum over its callers, i.e.
    ``sum(coefficients) * size + sum(constants)``. The same ``size`` is used
    for every combination.

    The coefficients and constants are hard-coded and assigned *positionally*
    to ``possible_callers``, which must therefore contain exactly four callers
    in the order the numbers were chosen for (presumably muse, mutect,
    somaticsniper, varscan -- TODO confirm). They look like placeholders.

    Parameters
    ----------
    original, cancer, possible_callers
        Forwarded unchanged to ``individual_analysis``.
    size : number
        Input size plugged into the linear cost model.
    callers : sequence of str
        Checked against ``possible_callers`` (an unknown name raises
        ``KeyError``) but otherwise has no effect on the result.
    additional_callers : sequence of str
        Currently unused.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``individual_analysis`` with an extra float
        column ``'cost'``. The table is also printed.
    """
    cost_model = pd.DataFrame(0, index=possible_callers, columns=['coefficient', 'constant'])
    cost_model['coefficient'] = [5, 3, 9, 1]
    cost_model['constant'] = [2, 10, 4, 2]

    # Reject unknown caller names up front, before the expensive analysis.
    unknown = [caller for caller in callers if caller not in cost_model.index]
    if unknown:
        raise KeyError('unknown caller(s): %s' % ', '.join(map(str, unknown)))

    averages = individual_analysis(original, cancer, possible_callers, tabu = False) # average patient stats

    # Each index entry is an iterable of caller names; sum the model rows of
    # those callers and evaluate the linear cost at `size`.
    costs = np.zeros(len(averages.index))
    for i, combination in enumerate(averages.index):
        totals = cost_model.loc[list(combination)].sum()
        costs[i] = totals['coefficient'] * size + totals['constant']

    averages['cost'] = costs
    print(cancer)
    print(tabulate(averages, headers=['variant callers', 'mean real %', 'real % std', 'mean total %', 'total % std', 'mean all real %', 'all real % std', 'cost'], tablefmt='psql',floatfmt=(".0f",".3f",".3f", ".3f",".3f",".3f", ".3f", '.3f')))
    return averages

# Cost-annotated summary for the 'ACC' directory with size = 100; cost() prints the table.
cost_table = cost(original, 'ACC', possible_callers, 100, ['mutect', 'varscan'], ['somaticsniper', 'muse'])


  fractions = 100*(tops/realsum)
  rpercent.append(100*((sums['real'][i] - sums['real'][0]) / sums['real'][0]))
  tpercent.append(100*((sums['total'][i] - sums['total'][0]) / sums['total'][0]))
  rpercent.append(100*((sums['real'][i] - sums['real'][0]) / sums['real'][0]))


ACC
+------------------------------------------------+---------------+--------------+----------------+---------------+-------------------+------------------+----------+
| variant callers                                |   mean real % |   real % std |   mean total % |   total % std |   mean all real % |   all real % std |     cost |
|------------------------------------------------+---------------+--------------+----------------+---------------+-------------------+------------------+----------|
| ('mutect', 'varscan', 'somaticsniper', 'muse') |        29.834 |       45.400 |          4.742 |        16.704 |           100.000 |            0.000 | 1818.000 |
| ('mutect', 'varscan', 'somaticsniper')         |        18.238 |       34.076 |          4.274 |        16.489 |            92.830 |           14.063 | 1316.000 |
| ('mutect', 'varscan', 'muse')                  |        22.129 |       36.300 |          0.979 |         4.543 |            94.407 |           11.276 |  914.000 |
| ('mu