In [1]:
import os, sys, argparse
from upsetplot import from_memberships
from upsetplot import plot
%matplotlib inline 
from matplotlib import pyplot
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
from itertools import combinations
from comb import parse_maf, my_combs_frozenset, my_combs_all, my_combs, my_combos
from inter import intersections
#from howmany import how_many_tumor
from contents import set_contents
from df import baileydf, cgcdf, pancandf
from tabulate import tabulate

# Working directory at the time this cell runs; later cells chdir back to it
# before entering the per-cancer sub-directories.
original = os.getcwd()

# Variant callers compared throughout the notebook.
possible_callers = ('muse', 'mutect', 'somaticsniper', 'varscan')

# Cancer codes; each one is used both as a sub-directory name under `original`
# and as a column label in the count frames.
possible_cancers = (
    'ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA', 'GBM', 'HNSC',
    'KICH', 'KIRC', 'KIRP', 'LAML', 'LGG', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD',
    'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM',
)


In [2]:
# Caller-combination keys built from all of `possible_callers`; they are used
# positionally as the row order of the count frames and intersected with caller sets.
# NOTE(review): presumably one frozenset per non-empty subset of callers — confirm in comb.py.
keys = my_combs_frozenset(possible_callers, len(possible_callers))


In [3]:
# Filter sets forwarded unchanged to the frame builders.
# NOTE(review): presumably the variant impact levels and FILTER values to keep — confirm in df.py.
impacts = {'MODERATE', 'HIGH'}
filt = {'PASS'}

# Count frame returned by cgcdf; later cells pass it to table() as `bcp`.
dfcgc = cgcdf(possible_cancers, possible_callers, keys, original, impacts, filt)

In [10]:

def table(callers, additional_callers, cancer, cardinality, impacts, filt, bcp):
    """Print and return a per-caller-set summary of the counts for one cancer.

    The caller sets examined are `callers` itself (when non-empty) followed by
    `callers` extended with each combination returned by
    ``my_combs_all(additional_callers, len(additional_callers))``.

    For every caller set the summary holds:

    * ``real``  - sum of the counts of every key in the module-level ``keys``
      that shares at least `cardinality` callers with the set;
    * ``total`` - sum of the counts of every key that shares at least one
      caller with the set;
    * ``% of all real`` - ``real`` as a percentage of the summed counts of all
      rows whose index label has at least `cardinality` members;
    * ``real % diff`` / ``total % diff`` - percentage change relative to the
      first caller set (only when ``len(callers) >= cardinality``; in that
      case rows are also sorted by ``real % diff``, descending).

    Parameters
    ----------
    callers : sequence of str
        Baseline caller set (list or tuple).
    additional_callers : collection of str
        Callers that may be added on top of `callers`.
    cancer : str
        Column of the count frame to summarise.
    cardinality : int
        Minimum number of shared callers for a key to count towards ``real``.
    impacts, filt :
        Forwarded to baileydf / cgcdf / pancandf when `bcp` is a string.
    bcp : pandas.DataFrame or {'BAILEY', 'CGC', 'PANCAN'}
        Either a ready-made count frame whose rows follow the order of
        ``keys`` (an optional 'Total' row is dropped), or the name of the
        frame to build.

    Returns
    -------
    pandas.DataFrame
        The summary, indexed by caller-set tuples.

    Raises
    ------
    ValueError
        If `bcp` is neither a DataFrame nor one of the recognised names.
    """
    if isinstance(bcp, pd.DataFrame):
        df = bcp
    elif bcp == 'BAILEY':
        df = baileydf(possible_cancers, possible_callers, keys, original, impacts, filt)
    elif bcp == 'CGC':
        df = cgcdf(possible_cancers, possible_callers, keys, original, impacts, filt)
    elif bcp == 'PANCAN':
        df = pancandf(possible_cancers, possible_callers, keys, original, impacts, filt)
    else:
        # Previously this fell through and failed later on an unbound `df`.
        raise ValueError(
            "bcp must be a DataFrame or one of 'BAILEY', 'CGC', 'PANCAN'; got %r" % (bcp,))

    # errors='ignore' so frames that never had a 'Total' row are accepted too.
    df = df.drop(['Total'], axis=0, errors='ignore')
    counts = df[cancer]

    # Caller sets to evaluate, as tuples so they can serve as index labels.
    base = list(callers)  # also lets `callers` be a tuple
    sets = []
    if len(base) >= 1:
        sets.append(tuple(base))
    for extra in my_combs_all(additional_callers, len(additional_callers)):
        sets.append(tuple(base + list(extra)))

    show_diffs = len(base) >= cardinality
    if show_diffs:
        columns = ('real', 'real % diff', 'total', 'total % diff', '% of all real')
    else:
        columns = ('real', 'total', '% of all real')
    sums = pd.DataFrame(0, index=sets, columns=columns)

    def _count(caller_set, minimum):
        """Sum the counts of every key sharing at least `minimum` callers with `caller_set`."""
        chosen = set(caller_set)
        # Rows of `df` are matched to `keys` by position, hence .iloc.
        return np.sum([counts.iloc[j] for j, key in enumerate(keys)
                       if len(chosen & key) >= minimum])

    # Whole-column assignment (not .loc[:, col] = ...) so the zero-initialised
    # integer columns can take on whatever dtype the results need.
    sums['real'] = [_count(caller_set, cardinality) for caller_set in sets]
    sums['total'] = [_count(caller_set, 1) for caller_set in sets]

    # Denominator: every row whose label has at least `cardinality` members.
    realsum = sum(counts.iloc[i] for i, label in enumerate(df.index)
                  if len(label) >= cardinality)
    sums['% of all real'] = 100 * (sums['real'].to_numpy() / realsum)

    if show_diffs:
        # Percentage change with respect to the first caller set (`callers` itself).
        for source, target in (('real', 'real % diff'), ('total', 'total % diff')):
            baseline = sums[source].iloc[0]
            sums[target] = 100 * ((sums[source] - baseline) / baseline)
        sums = sums.sort_values(['real % diff'], ascending=False)
        print(tabulate(sums,
                       headers=['variant callers', 'real', 'real % diff', 'total', 'total % diff', '% of all real'],
                       tablefmt='psql',
                       floatfmt=(".0f", ".0f", ".3f", ".0f", ".3f", ".3f")))
    else:
        print(tabulate(sums,
                       headers=['variant callers', 'real', 'total', '% of all real'],
                       tablefmt='psql',
                       floatfmt=(".0f", ".0f", ".0f", ".3f")))
    print()

    return sums


# Baseline caller set and the caller(s) that may be added on top of it.
callers = ['mutect', 'varscan', 'muse']
additional_callers = ['somaticsniper']

# For every cancer, print the summary table at cardinality 2 and then 3.
for cancer in possible_cancers:
    print(cancer)
    for min_shared in (2, 3):
        print('cardinality =', min_shared)
        table(callers, additional_callers, cancer, min_shared, impacts, filt, bcp=dfcgc)



ACC
cardinality = 2
+------------------------------------------------+--------+---------------+---------+----------------+-----------------+
| variant callers                                |   real |   real % diff |   total |   total % diff |   % of all real |
|------------------------------------------------+--------+---------------+---------+----------------+-----------------|
| ('mutect', 'varscan', 'muse', 'somaticsniper') |    487 |        10.682 |     630 |          2.606 |         100.000 |
| ('mutect', 'varscan', 'muse')                  |    440 |         0.000 |     614 |          0.000 |          90.349 |
+------------------------------------------------+--------+---------------+---------+----------------+-----------------+

cardinality = 3
+------------------------------------------------+--------+---------------+---------+----------------+-----------------+
| variant callers                                |   real |   real % diff |   total |   total % diff |   % of all re

In [11]:
# BRCA only: use each combination from my_combos as the baseline and the
# remaining callers as the candidates to add.
# NOTE(review): presumably my_combos yields every 2-caller combination — confirm in comb.py.
combos = my_combos(possible_callers, 2)
print('BRCA')
for baseline in combos:
    remaining = set(possible_callers) - set(baseline)
    table(baseline, remaining, 'BRCA', 2, impacts, filt, bcp=dfcgc)

BRCA
+------------------------------------------------+--------+---------------+---------+----------------+-----------------+
| variant callers                                |   real |   real % diff |   total |   total % diff |   % of all real |
|------------------------------------------------+--------+---------------+---------+----------------+-----------------|
| ('muse', 'mutect', 'somaticsniper', 'varscan') |   5573 |        32.533 |    7239 |          7.595 |         100.000 |
| ('muse', 'mutect', 'varscan')                  |   5478 |        30.273 |    7164 |          6.480 |          98.295 |
| ('muse', 'mutect', 'somaticsniper')            |   4560 |         8.442 |    6873 |          2.155 |          81.823 |
| ('muse', 'mutect')                             |   4205 |         0.000 |    6728 |          0.000 |          75.453 |
+------------------------------------------------+--------+---------------+---------+----------------+-----------------+

+-------------------------

In [6]:
from howmany import how_many_tumor_cgc
def outlier_detector(dat, possible_callers, outliers, cancer, dfcgc):
    """Find the patient with the most variants for one cancer and record its counts.

    Only patients present in every mapping of `dat` are considered. For each
    of them the sizes of all groups returned by ``intersections`` are summed;
    the patient with the largest sum (first one on ties) is the outlier.

    Parameters
    ----------
    dat : list of mapping
        One mapping per caller, keyed by patient; ``dat[j][patient]`` is what
        gets passed to ``intersections``.
        NOTE(review): presumably a set of variants per patient — confirm in comb.parse_maf.
    possible_callers :
        Unused; kept so existing calls keep working.
    outliers : pandas.DataFrame
        Updated in place: row `cancer`, columns 'maximum value' and
        'patient index'.
    cancer : str
        Row label in `outliers` and column label in `dfcgc`.
    dfcgc : pandas.DataFrame
        Updated in place: column `cancer` receives the result of
        ``how_many_tumor_cgc`` for the outlier patient.

    Returns
    -------
    tuple
        ``(outliers, dfcgc)`` - the same two objects that were passed in.

    Raises
    ------
    ValueError
        If no patient is present in every mapping of `dat`.

    Notes
    -----
    Changes the working directory to the module-level ``original`` and reads
    the Cancer Gene Census CSV from there. Also relies on the module-level
    ``keys``, ``filt`` and ``impacts``.
    """
    os.chdir(original)
    cgc = pd.read_csv('Cancer_Gene_Census_all_Jun-11-2019.csv', usecols = (0,9))

    # Patients reported by every caller.
    patients = set(dat[0].keys())
    for per_caller in dat:
        patients &= set(per_caller.keys())
    patients = list(patients)
    if not patients:
        raise ValueError('no patient is present in every caller mapping for %r' % (cancer,))

    # Total number of variants per patient, summed over all intersection groups.
    lengths = []
    for patient in patients:
        inters = intersections([per_caller[patient] for per_caller in dat])
        lengths.append(sum(len(inters[j]) for j in range(len(inters))))

    value = max(lengths)
    ind = lengths.index(value)
    # Single .loc write instead of chained `outliers[col][cancer] = ...`, which
    # may assign to a temporary copy and leave `outliers` untouched.
    # 'patient index' is a position in this call's `patients` list, which is
    # built from a set, so it is only meaningful within this call.
    outliers.loc[cancer, 'maximum value'] = value
    outliers.loc[cancer, 'patient index'] = ind

    # Variants of the outlier patient, grouped under the matching key;
    # intersections() is taken to return its groups in the order of `keys`.
    data = {key: [] for key in keys}
    inters = intersections([per_caller[patients[ind]] for per_caller in dat])
    for i in range(len(inters)):
        data[keys[i]].extend(list(inters[i]))

    dfcgc.loc[:,cancer] = how_many_tumor_cgc(data, cgc, filt, impacts, keys)
    return outliers, dfcgc


# Result frames filled in place by outlier_detector, one cancer at a time.
outliers = pd.DataFrame(0, index=possible_cancers, columns=('maximum value', 'patient index'))
dfcgc = pd.DataFrame(np.nan, index=keys, columns=possible_cancers)

for cancer in possible_cancers:
    # Work inside the cancer's own sub-directory of `original`.
    os.chdir(original)
    os.chdir(cancer)
    mafs = os.listdir()

    # One MAF file per caller, matched by the caller name appearing in the
    # file name; when several files match, the last one listed is used.
    maf_fps = {}
    for caller in possible_callers:
        matches = [fname for fname in mafs if caller in fname]
        if matches:
            maf_fps[caller] = matches[-1]

    all_variants = {caller: parse_maf(fname) for caller, fname in maf_fps.items()}

    # Hand the parsed variants over in the fixed order of `possible_callers`.
    arg = [all_variants[caller] for caller in possible_callers]
    test, data = outlier_detector(arg, possible_callers, outliers, cancer, dfcgc)

print(data)
print(test)

os.chdir(original)

                                        ACC  BLCA  BRCA  CESC  CHOL  COAD  \
(muse)                                    0     2     5    12     4    10   
(mutect)                                  2     7    20    27     5    90   
(somaticsniper)                           1     1     0     1     0     2   
(varscan)                                 3     0     2    11     1    20   
(muse, mutect)                            0    10    24     7     1    46   
(muse, somaticsniper)                     0     0     0     3     0     1   
(muse, varscan)                           1     5    18     4     1    49   
(mutect, somaticsniper)                   0     6     0     0     1     2   
(mutect, varscan)                         5     3    16    27     3    56   
(somaticsniper, varscan)                  5     1     2     8     0     3   
(muse, mutect, somaticsniper)             1     0     0     0     1     1   
(muse, mutect, varscan)                   5     1   104    17     1   150   

In [7]:
# Summary tables for the per-cancer outlier-patient counts held in `data`.
# The threshold is passed as a literal: the only visible assignment of
# `cardinality` is commented out, so referencing that name fails on a fresh
# kernel. The captured output of this cell matches a threshold of 2.
for cancer in possible_cancers:
    # table() does not print the cancer name itself, so label each table here.
    print(cancer)
    table(callers, additional_callers, cancer, 2, impacts, filt, bcp=data)

ACC
+------------------------------------------------+--------+---------------+---------+----------------+-----------------+
| variant callers                                |   real |   real % diff |   total |   total % diff |   % of all real |
|------------------------------------------------+--------+---------------+---------+----------------+-----------------|
| ('mutect', 'varscan', 'muse', 'somaticsniper') |    101 |        23.171 |     107 |          0.943 |         100.000 |
| ('mutect', 'varscan', 'somaticsniper')         |    100 |        21.951 |     107 |          0.943 |          99.010 |
| ('mutect', 'varscan', 'muse')                  |     96 |        17.073 |     106 |          0.000 |          95.050 |
| ('mutect', 'varscan')                          |     82 |         0.000 |     106 |          0.000 |          81.188 |
+------------------------------------------------+--------+---------------+---------+----------------+-----------------+

BLCA
+---------------------

