In [1]:
import os, sys, argparse
from upsetplot import from_memberships
from upsetplot import plot
%matplotlib inline 
from matplotlib import pyplot
import numpy as np

# Caller names: each one is matched as a substring of the file names found in
# a cancer directory to locate that caller's MAF file, and the tuple order
# fixes which caller is passed as set 1..4 to four_set_fractions().
possible_callers = ('mutect', 'muse', 'somaticsniper', 'varscan')
# Cancer-type codes: each one is used as the name of a sub-directory of the
# starting working directory.  NOTE(review): these look like TCGA study
# abbreviations -- confirm.
possible_cancers = ('ACC', 'BLCA','BRCA','CESC','CHOL','COAD','DLBC','ESCA','GBM','HNSC', 
                    'KICH','KIRC','KIRP','LAML','LGG','LIHC','LUAD','LUSC','MESO',
                    'OV','PAAD','PCPG','PRAD','READ','SARC','SKCM','STAD','TGCT','THCA','THYM','UCEC','UCS','UVM')

# functions
def parse_maf(fp):
    """Collect the set of variants listed for each tumor/normal pair in a MAF file.

    Lines starting with ``#``, the ``Hugo_Symbol`` header row and blank lines
    are skipped.  Every other line is split on tabs and read by fixed,
    zero-based column position.

    Args:
        fp: Path of the tab-separated MAF file to read.

    Returns:
        dict mapping ``"<column 16>_<column 17>"`` (tumor and normal
        identifiers joined by an underscore) to a set of 6-tuples
        ``(chrom, start, end, ref, tumor_allele_1, tumor_allele_2)`` taken
        from columns 4, 5, 6, 11, 12 and 13.  All values stay strings.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a data row has fewer than 18 tab-separated fields.
    """
    caller_vars = {}

    # The context manager guarantees the handle is closed, even on error.
    with open(fp, 'r') as fh:
        for line in fh:
            if not line.strip() or line.startswith('#'):
                continue
            # Strip only the line terminator: a full strip() would also eat
            # leading/trailing tabs and shift the column indices of any row
            # whose first or last field is empty.
            fields = line.rstrip('\r\n').split('\t')
            if fields[0] == 'Hugo_Symbol':
                continue

            pair_id = fields[16] + '_' + fields[17]  # 'tumor'_'normal'
            variant = (
                fields[4],   # chromosome
                fields[5],   # start position
                fields[6],   # end position
                fields[11],  # reference allele
                fields[12],  # tumor allele 1
                fields[13],  # tumor allele 2
            )
            caller_vars.setdefault(pair_id, set()).add(variant)

    return caller_vars


def two_set(dat_1, dat_2):
    """Sum the two-way overlap counts over patients present in both inputs.

    Args:
        dat_1: dict mapping patient id -> set of variants (first source).
        dat_2: dict mapping patient id -> set of variants (second source).

    Returns:
        list ``[shared, only_in_1, only_in_2]`` of variant counts, each summed
        over every patient id that is a key of both dictionaries.
    """
    # Only patients known to both sources can be compared.
    shared_patients = set(dat_1.keys()) & set(dat_2.keys())

    totals = [0, 0, 0]
    for pid in shared_patients:
        first, second = dat_1[pid], dat_2[pid]
        totals[0] += len(first & second)
        totals[1] += len(first - second)
        totals[2] += len(second - first)

    return totals


def three_set(dat_1, dat_2, dat_3):
    """Sum the three-way Venn region counts over patients present in all inputs.

    Args:
        dat_1: dict mapping patient id -> set of variants (first source).
        dat_2: dict mapping patient id -> set of variants (second source).
        dat_3: dict mapping patient id -> set of variants (third source).

    Returns:
        list of seven counts, each summed over the patient ids common to all
        three dictionaries, in this order:
        ``[in all three, only 1, only 2, only 3,
        1 and 2 only, 1 and 3 only, 2 and 3 only]``.
    """
    # Only patients known to every source can be compared.
    shared_patients = (
        set(dat_1.keys()) & set(dat_2.keys()) & set(dat_3.keys())
    )

    totals = [0] * 7
    for pid in shared_patients:
        a, b, c = dat_1[pid], dat_2[pid], dat_3[pid]

        core = len(a & b & c)
        # Pairwise overlaps with the three-way core removed.
        ab_only = len(a & b) - core
        ac_only = len(a & c) - core
        bc_only = len(b & c) - core

        per_patient = (
            core,
            len(a) - core - ac_only - ab_only,
            len(b) - core - ab_only - bc_only,
            len(c) - core - ac_only - bc_only,
            ab_only,
            ac_only,
            bc_only,
        )
        totals = [running + n for running, n in zip(totals, per_patient)]

    return totals

def four_set_fractions(dat_1, dat_2, dat_3, dat_4):
    """Return, per shared patient, the fraction of variants in each 4-way Venn region.

    For every patient id that is a key of all four dictionaries, the union of
    that patient's four variant sets is partitioned into the 15 mutually
    exclusive regions of a four-set Venn diagram, and each region's size is
    divided by the size of the union.

    Args:
        dat_1: dict mapping patient id -> set of variants (set "1").
        dat_2: dict mapping patient id -> set of variants (set "2").
        dat_3: dict mapping patient id -> set of variants (set "3").
        dat_4: dict mapping patient id -> set of variants (set "4").

    Returns:
        list with one 15-element list per shared patient (in arbitrary patient
        order).  Columns are, in order:
        ``1234, 123, 124, 134, 234, 12, 13, 14, 23, 24, 34, 1, 2, 3, 4``
        where e.g. ``124`` is the fraction of variants found in sets 1, 2 and
        4 but not in set 3.  Patients whose four sets are all empty are
        skipped, because their fractions are undefined.
    """
    # Zero-based indices of the sets that make up each region, listed in the
    # output column order documented above.
    region_members = (
        (0, 1, 2, 3),
        (0, 1, 2), (0, 1, 3), (0, 2, 3), (1, 2, 3),
        (0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3),
        (0,), (1,), (2,), (3,),
    )

    # Only patients known to every source can be compared.
    patients = (
        set(dat_1.keys()) & set(dat_2.keys())
        & set(dat_3.keys()) & set(dat_4.keys())
    )

    dat = []
    for patient in patients:
        sets = (dat_1[patient], dat_2[patient], dat_3[patient], dat_4[patient])
        total = len(sets[0] | sets[1] | sets[2] | sets[3])
        if total == 0:
            # No variants at all: skip rather than divide by zero.
            continue

        fractions = []
        for members in region_members:
            included = [sets[i] for i in members]
            excluded = [sets[i] for i in range(4) if i not in members]
            # Variants in every included set and in none of the others.
            exclusive = included[0].intersection(*included[1:]).difference(*excluded)
            fractions.append(len(exclusive) / total)
        dat.append(fractions)

    return dat



import matplotlib.pyplot as plt
from itertools import combinations
from simple_venn import venn2, venn3, venn4

# Each cancer type lives in a sub-directory of the starting directory.  Paths
# are joined instead of calling os.chdir(), so the working directory is left
# untouched and this cell can be re-run without `original` drifting.
original = os.getcwd()

# Labels for the 15 columns returned by four_set_fractions(), in its column
# order: the all-caller overlap, then triples, pairs and caller-only regions.
region_labels = ["all"]
for size in (3, 2, 1):
    region_labels.extend(
        "+".join(group) for group in combinations(possible_callers, size)
    )

# Order in which the regions are printed: all, singles, pairs, triples.
report_order = ["all"]
for size in (1, 2, 3):
    report_order.extend(
        "+".join(group) for group in combinations(possible_callers, size)
    )

for cancer in possible_cancers:
    cancer_dir = os.path.join(original, cancer)
    if not os.path.isdir(cancer_dir):
        print("skipping %s: directory not found" % cancer, file=sys.stderr)
        continue

    # Pick the MAF of each caller by substring match on the file name; with
    # several matches the last one in sorted order wins.
    maf_fps = {}
    for maf in sorted(os.listdir(cancer_dir)):
        for caller in possible_callers:
            if caller in maf:  # str in the filepath
                maf_fps[caller] = os.path.join(cancer_dir, maf)

    missing = [caller for caller in possible_callers if caller not in maf_fps]
    if missing:
        print(
            "skipping %s: no MAF file for %s" % (cancer, ", ".join(missing)),
            file=sys.stderr,
        )
        continue

    # parse maf for each caller
    all_variants = {
        caller: parse_maf(path) for caller, path in maf_fps.items()
    }

    # One row of 15 fractions per patient shared by all four callers.
    dat = four_set_fractions(
        *(all_variants[caller] for caller in possible_callers)
    )
    if not dat:
        print(
            "skipping %s: no patient is shared by all callers" % cancer,
            file=sys.stderr,
        )
        continue

    # Transpose the per-patient rows into one sequence of values per region.
    columns = dict(zip(region_labels, zip(*dat)))

    print(cancer)
    for label in report_order:
        values = columns[label]
        print(label + ":", np.mean(values), np.var(values))
    print()


ACC
all: 0.4422030271453525 0.020103125595957727
mutect: 0.15215935419117504 0.009343582328017451
muse: 0.01887008459699171 0.0006684433840377186
somaticsniper: 0.015852156083001053 0.0004429922470596442
varscan: 0.07829564677324788 0.004278567738936814
mutect+muse: 0.04919218662355881 0.0016067311481563244
mutect+somaticsniper: 0.008209234226313214 0.00021602629317369186
mutect+varscan: 0.05623718531720941 0.0016654526273115623
muse+somaticsniper: 0.0037594953013593605 8.722392044098893e-05
muse+varscan: 0.009002336766068481 0.00024294625328533586
somaticsniper+varscan: 0.040904950058671286 0.0018583165727107747
mutect+muse+somaticsniper: 0.028510370366774052 0.0017009043639196423
mutect+muse+varscan: 0.05494860649078926 0.0034394253854629136
mutect+somaticsniper+varscan: 0.012371257859016884 0.0002961233686730361
muse+somaticsniper+varscan: 0.029484108200471027 0.0007985682853055372

BLCA
all: 0.5209545593017414 0.029369680364690787
mutect: 0.1246609241565346 0.01462692675316724
muse

KICH
all: 0.3424918403691233 0.01568270775629253
mutect: 0.2413444696447856 0.013002358093047643
muse: 0.015428125015013608 0.0006673745799174738
somaticsniper: 0.02102467535686557 0.001054167383035753
varscan: 0.11830098879744608 0.002911481064662034
mutect+muse: 0.033352424202612224 0.001012061135088766
mutect+somaticsniper: 0.0066255581805999405 0.0002233808170483084
mutect+varscan: 0.06800726744221193 0.0022475415061655626
muse+somaticsniper: 0.001990216424815795 6.771317327915769e-05
muse+varscan: 0.0108583831281291 0.00031471387806622183
somaticsniper+varscan: 0.02235613120572772 0.0012829179373628854
mutect+muse+somaticsniper: 0.021996646925716144 0.0005926340647601356
mutect+muse+varscan: 0.0582900684800335 0.00275209869866614
mutect+somaticsniper+varscan: 0.013303938983190516 0.0004469441594787396
muse+somaticsniper+varscan: 0.024629265843729025 0.0009629048488466087

KIRC
all: 0.45727077534438787 0.028331995410777917
mutect: 0.12927898243527447 0.00983135433702689
muse: 0.021

PAAD
all: 0.329900498833839 0.05790338973295336
mutect: 0.18876538181224226 0.02573160188168235
muse: 0.024897494946456714 0.0006409697181531047
somaticsniper: 0.004160748546426372 8.890276904025389e-05
varscan: 0.01742227098575209 0.0004806986796283656
mutect+muse: 0.14461606057888174 0.0158207462316288
mutect+somaticsniper: 0.002058742314446996 5.1361393968528914e-05
mutect+varscan: 0.04267371351692333 0.001154642007378518
muse+somaticsniper: 0.0017989971302440528 2.6462807743408498e-05
muse+varscan: 0.009671439159784464 0.00017359724204904145
somaticsniper+varscan: 0.004652471019332026 0.00016113545792722305
mutect+muse+somaticsniper: 0.010258786774311133 0.0018051250389292165
mutect+muse+varscan: 0.20557562147425082 0.025937784781501803
mutect+somaticsniper+varscan: 0.0024077021633630746 4.284699847471303e-05
muse+somaticsniper+varscan: 0.011140070743745881 0.00024535781820258606

PCPG
all: 0.4366905244227243 0.029801915970751096
mutect: 0.2605207572632593 0.021150172615881938
muse

UCEC
all: 0.5065676363629789 0.024084484229858153
mutect: 0.1464561049085078 0.023719893012343837
muse: 0.016980331534449687 0.00023096400992414138
somaticsniper: 0.0068054729417299795 0.00011187173423771872
varscan: 0.06504638715443203 0.0028642762189410075
mutect+muse: 0.04148344766388123 0.0006216282204768121
mutect+somaticsniper: 0.007068189426094405 0.00013598363791805448
mutect+varscan: 0.09431540297440985 0.0047900386048982295
muse+somaticsniper: 0.001295580484247556 1.0162069426870869e-05
muse+varscan: 0.007744589190233908 0.00012134506502797997
somaticsniper+varscan: 0.009095690163190723 0.00011979848973670958
mutect+muse+somaticsniper: 0.009465956495821621 0.000259125140285703
mutect+muse+varscan: 0.04777921476555823 0.002757003684199741
mutect+somaticsniper+varscan: 0.01878133821181332 0.0003166205279707366
muse+somaticsniper+varscan: 0.021114657722650924 0.00044199498424528523

UCS
all: 0.5203082890341792 0.012740188144080131
mutect: 0.10667622638303331 0.002538969744634697