In [2]:
lumpy_trans_file = '../final_calls/lumpy.calls'
genomestrip_trans_file = '../final_calls/genomestrip.calls'
breakdancer_trans_file = '../final_calls/breakdancer.calls'

In [3]:
ped_files = ['../data/160826.ped', '../data/ssc.ped']
# Affection (0=unknown; 1=unaffected; 2=affected)
sample_id_to_affected = dict()
sample_id_to_sex = dict()
sample_id_to_parent_ids = dict()

for ped_file in ped_files:
    with open(ped_file, 'r') as f:
        for line in f:
            pieces = line.strip().split('\t')
            if len(pieces) >= 6:
                fam_id, child_id, f_id, m_id, sex, disease_status = pieces[0:6]
                sample_id_to_affected[child_id] = disease_status
                sample_id_to_sex[child_id] = sex
                sample_id_to_parent_ids[child_id] = (f_id, m_id)

In [4]:
import json

# load iHART deletions
with open('../final_calls/filtered_iHART_deletions.json', 'r') as f:
    ihart_deletions = json.load(f)

In [5]:
print(ihart_deletions[0])

{'family': 'AU0080.AU008001.AU008002', 'chrom': 1, 'start_pos': 49243, 'end_pos': 76610, 'length': 27368, 'opt_start_pos': 47961, 'opt_end_pos': 97952, 'opt_length': 49992, 'trans': ['AU008004'], 'notrans': [], 'is_mat': True, 'is_pat': False, 'mother': 'AU008001', 'father': 'AU008002'}


In [6]:
from collections import defaultdict

my_trans = defaultdict(list)
for d in ihart_deletions:
    if d['is_mat']:
        for c in d['trans']:
            trans = (d['mother'], c)
            my_trans[(trans, str(d['chrom']))].append(d)     
    if d['is_pat']:
        for c in d['trans']:
            trans = (d['father'], c)
            my_trans[(trans, str(d['chrom']))].append(d)   

In [7]:
lumpy_trans = defaultdict(list)
with open(lumpy_trans_file, 'r') as f:
    for line in f:
        pieces = line.strip().split('\t')
        chrom, start, end = pieces[:3]
        trans = pieces[9:]
        for x in trans:
            lumpy_trans[(tuple(x.split('-')), chrom)].append((chrom, int(start), int(end)))

In [8]:
print(lumpy_trans[list(lumpy_trans.keys())[0]])

[('1', 714046, 243265030), ('1', 724869, 224200101), ('1', 726273, 224201675), ('1', 726801, 224202080), ('1', 826022, 5727117), ('1', 829170, 829205), ('1', 869471, 870218), ('1', 1232649, 1233072), ('1', 1339353, 1339636), ('1', 1465927, 1466230), ('1', 1584586, 1647687), ('1', 1650061, 1650602), ('1', 1866399, 1866977), ('1', 1869284, 1869311), ('1', 2053562, 2055573), ('1', 2583931, 2623413), ('1', 2583937, 2584699), ('1', 2583948, 2585344), ('1', 2583969, 2615479), ('1', 2584492, 2623451), ('1', 2585504, 2615431), ('1', 2585678, 2620897), ('1', 2585713, 2628852), ('1', 2586443, 2619277), ('1', 2586472, 2586516), ('1', 2618836, 2626293), ('1', 2619851, 2627668), ('1', 2620905, 2628874), ('1', 2622812, 2629144), ('1', 3216326, 3216959), ('1', 3708278, 3708553), ('1', 4125562, 4126986), ('1', 4155159, 4155460), ('1', 4196016, 4196563), ('1', 4204658, 4204726), ('1', 4204679, 4204726), ('1', 4480354, 4480417), ('1', 4692817, 4693485), ('1', 4871290, 4871319), ('1', 4999561, 4999608), 

In [9]:
genomestrip_trans = defaultdict(list)
with open(genomestrip_trans_file, 'r') as f:
    for line in f:
        pieces = line.strip().split('\t')
        chrom, start, end = pieces[:3]
        trans = pieces[9:]
        for x in trans:
            genomestrip_trans[(tuple(x.split('-')), chrom)].append((chrom, int(start), int(end)))

In [10]:
breakdancer_trans = defaultdict(list)
with open(breakdancer_trans_file, 'r') as f:
    for line in f:
        pieces = line.strip().split('\t')
        chrom, start, end = pieces[:3]
        trans = pieces[9:]
        for x in trans:
            breakdancer_trans[(tuple(x.split('-')), chrom)].append((chrom, int(start), int(end)))

In [11]:
print(len(my_trans), len(lumpy_trans), len(genomestrip_trans), len(breakdancer_trans))

26850 51390 35565 51478


In [12]:
# make sure we're using same set of families
my_sample_ids = set([x[0][0] for x in my_trans.keys()] + [x[0][1] for x in my_trans.keys()])
lumpy_sample_ids = set([x[0][0] for x in lumpy_trans.keys()] + [x[0][1] for x in lumpy_trans.keys()])
genomestrip_sample_ids = set([x[0][0] for x in genomestrip_trans.keys()] + [x[0][1] for x in genomestrip_trans.keys()])
breakdancer_sample_ids = set([x[0][0] for x in breakdancer_trans.keys()] + [x[0][1] for x in breakdancer_trans.keys()])

all_sample_ids = my_sample_ids & lumpy_sample_ids & genomestrip_sample_ids & breakdancer_sample_ids
print(len(my_sample_ids), len(lumpy_sample_ids), len(genomestrip_sample_ids), len(breakdancer_sample_ids), len(all_sample_ids))

3390 2231 2126 2231 1403


In [13]:
my_trans = dict([(k, v) for k, v in my_trans.items() if k[0][0] in all_sample_ids and k[0][1] in all_sample_ids])
lumpy_trans = dict([(k, v) for k, v in lumpy_trans.items() if k[0][0] in all_sample_ids and k[0][1] in all_sample_ids])
genomestrip_trans = dict([(k, v) for k, v in genomestrip_trans.items() if k[0][0] in all_sample_ids and k[0][1] in all_sample_ids])
breakdancer_trans = dict([(k, v) for k, v in breakdancer_trans.items() if k[0][0] in all_sample_ids and k[0][1] in all_sample_ids])

print(len(my_trans), len(lumpy_trans), len(genomestrip_trans), len(breakdancer_trans))

7031 21980 23200 21956


In [14]:
# make sure we have same chroms
my_chroms = set([x[1] for x in my_trans.keys()])
lumpy_chroms = set([x[1] for x in lumpy_trans.keys()])
genomestrip_chroms = set([x[1] for x in genomestrip_trans.keys()])
breakdancer_chroms = set([x[1] for x in breakdancer_trans.keys()])

all_chroms = my_chroms & lumpy_chroms & genomestrip_chroms & breakdancer_chroms
print(len(my_chroms), len(lumpy_chroms), len(genomestrip_chroms), len(breakdancer_chroms), len(all_chroms))

22 24 24 24 22


In [15]:
my_trans = dict([(k, v) for k, v in my_trans.items() if k[1] in all_chroms])
lumpy_trans = dict([(k, v) for k, v in lumpy_trans.items() if k[1] in all_chroms])
genomestrip_trans = dict([(k, v) for k, v in genomestrip_trans.items() if k[1] in all_chroms])
breakdancer_trans = dict([(k, v) for k, v in breakdancer_trans.items() if k[1] in all_chroms])

print(len(my_trans), len(lumpy_trans), len(genomestrip_trans), len(breakdancer_trans))

7031 21786 21846 21793


In [16]:
lumpy_support = set()
for k, other_dels in lumpy_trans.items():
    if k in my_trans:
        my_dels = my_trans[k]
    
        for od in other_dels:
            od_start, od_end = od[1:3]
            for md in my_dels:
                md_start, md_end = md['start_pos'], md['end_pos']
                if od_end < md_start or md_end < od_start:
                    overlap = 0
                elif od_start <= md_start and od_end >= md_end:
                    overlap = md_end-md_start+1
                elif md_start <= od_start and md_end >= od_end:
                    overlap = od_end-od_start+1
                elif md_start < od_end:
                    overlap = od_end - md_start + 1
                elif od_start < md_end:
                    overlap = md_end - od_start + 1

                if overlap/(md_end-md_start+1) >= 0.5 and overlap/(od_end-od_start+1) >= 0.5:
                    lumpy_support.add((k[0][0], k[0][1], k[1], md_start, md_end))




In [17]:
genomestrip_support = set()
for k, other_dels in genomestrip_trans.items():
    if k in my_trans:
        my_dels = my_trans[k]
    
        for od in other_dels:
            od_start, od_end = od[1:3]
            for md in my_dels:
                md_start, md_end = md['start_pos'], md['end_pos']
                if od_end < md_start or md_end < od_start:
                    overlap = 0
                elif od_start <= md_start and od_end >= md_end:
                    overlap = md_end-md_start+1
                elif md_start <= od_start and md_end >= od_end:
                    overlap = od_end-od_start+1
                elif md_start < od_end:
                    overlap = od_end - md_start + 1
                elif od_start < md_end:
                    overlap = md_end - od_start + 1

                if overlap/(md_end-md_start+1) >= 0.5 and overlap/(od_end-od_start+1) >= 0.5:
                    genomestrip_support.add((k[0][0], k[0][1], k[1], md_start, md_end))

In [18]:
breakdancer_support = set()
for k, other_dels in breakdancer_trans.items():
    if k in my_trans:
        my_dels = my_trans[k]
    
        for od in other_dels:
            od_start, od_end = od[1:3]
            for md in my_dels:
                md_start, md_end = md['start_pos'], md['end_pos']
                if od_end < md_start or md_end < od_start:
                    overlap = 0
                elif od_start <= md_start and od_end >= md_end:
                    overlap = md_end-md_start+1
                elif md_start <= od_start and md_end >= od_end:
                    overlap = od_end-od_start+1
                elif md_start < od_end:
                    overlap = od_end - md_start + 1
                elif od_start < md_end:
                    overlap = md_end - od_start + 1

                if overlap/(md_end-md_start+1) >= 0.5 and overlap/(od_end-od_start+1) >= 0.5:
                    breakdancer_support.add((k[0][0], k[0][1], k[1], md_start, md_end))

In [19]:
print('I called', sum([len(v) for v in my_trans.values()]), 'transmissions')
print('Lumpy supported', len(lumpy_support))
print('Genomestrip supported', len(genomestrip_support))
print('Breakdancer supported', len(breakdancer_support))
print('All supported', len(lumpy_support & genomestrip_support & breakdancer_support))
print('At least one supported', len(lumpy_support | genomestrip_support | breakdancer_support))

print()

print('Lumpy called', sum([len(v) for v in lumpy_trans.values()]))
print('Genomestrip called', sum([len(v) for v in genomestrip_trans.values()]))
print('Breakdancer called', sum([len(v) for v in breakdancer_trans.values()]))

I called 9487 transmissions
Lumpy supported 5291
Genomestrip supported 5210
Breakdancer supported 4606
All supported 4055
At least one supported 5775

Lumpy called 5338380
Genomestrip called 1548581
Breakdancer called 1896393


In [29]:
len(all_sample_ids)

1403

In [30]:
print('a', len(lumpy_support - genomestrip_support - breakdancer_support))
print('b', len(genomestrip_support - lumpy_support - breakdancer_support))
print('c', len(breakdancer_support - genomestrip_support - lumpy_support))
print('ab', len((lumpy_support & genomestrip_support) - breakdancer_support))
print('ac', len((lumpy_support & breakdancer_support) - genomestrip_support))
print('bc', len((genomestrip_support & breakdancer_support) - lumpy_support))
print('abc', len(lumpy_support & genomestrip_support & breakdancer_support))

a 156
b 305
c 37
ab 708
ac 372
bc 142
abc 4055
