In [1]:
from collections import defaultdict, Counter
import numpy as np
import math
import matplotlib.pyplot as plt
import matplotlib

In [2]:
chroms = [str(x) for x in range(1, 23)] #+ ['X']
#chroms = ['X']

family_sizes = [3, 4, 5, 6]
phase_dir = '../phased_ihart_miss'
data_dir = '../split_gen_miss'

#family_sizes = [3, 4]
#phase_dir = '../phased_ssc_miss'
#data_dir = '../split_gen_miss_ssc'

In [3]:
ped_files = ['../data/160826.ped', '../data/ssc.ped']
# Affection (0=unknown; 1=unaffected; 2=affected)
child_id_to_affected = dict()
child_id_to_sex = dict()

for ped_file in ped_files:
    with open(ped_file, 'r') as f:
        for line in f:
            pieces = line.strip().split('\t')
            if len(pieces) >= 6:
                fam_id, child_id, f_id, m_id, sex, disease_status = pieces[0:6]
                child_id_to_affected[child_id] = disease_status
                child_id_to_sex[child_id] = sex

# Filter Families

In [5]:
monozygotic_multiple_families = set()
with open('../data/160826.iHART.db.query.csv', 'r') as f:
    next(f)
    for line in f:
        pieces = line.split(',')
        if 'MZ' in pieces[11]:
            monozygotic_multiple_families.add(pieces[3])
print('Mono multiples', len(monozygotic_multiple_families))

Mono multiples 96


In [6]:
family_to_chroms = defaultdict(set)
family_to_individuals = dict()
for chrom in chroms:
    print(chrom, end=' ')

    for j in family_sizes:
        try:
            with open('%s/chr.%s.familysize.%d.families.txt' % (phase_dir, chrom, j), 'r') as f:
                next(f) # skip header
                for line in f:
                    pieces = line.strip().split('\t')
                    family_key = pieces[0]
                    family_to_chroms[family_key].add(chrom)
                    family_to_individuals[family_key] = pieces[1:(1+j)]
        except FileNotFoundError:
            print('File not found', 'chrom', chrom, 'family size', j)
        except StopIteration:
            print('File empty', 'chrom', chrom, 'family size', j)
            
families_with_all_chroms = set([k for k, v in family_to_chroms.items() if len(v)==len(chroms)])
print('Families with all chroms', len(families_with_all_chroms))
print(Counter([len(v) for v in family_to_chroms.values()]))


1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 Families with all chroms 680
Counter({22: 680, 20: 70, 18: 35, 17: 29, 16: 29, 21: 19, 19: 5, 1: 2})


In [7]:
families_with_lcl = set([x for x, inds in family_to_individuals.items() if len([y for y in inds if 'LCL' in y]) > 0])
print('Families with LCL', len(families_with_lcl))

families_to_include = [x for x in families_with_all_chroms if x.split('.')[0] not in monozygotic_multiple_families and x not in families_with_lcl]
print('Final families', len(families_to_include))

Families with LCL 12
Final families 606


In [8]:
from collections import namedtuple
MaskedArea = namedtuple('MaskedArea', ['family', 'chrom', 
                                   'start_pos', 'end_pos', 'length'])

In [19]:
from collections import defaultdict

# load deletions from all chromosomes
masked_areas = []

for chrom in chroms:
    print(chrom, end=' ')
    
    for j in family_sizes:
        try:
            with open('%s/chr.%s.familysize.%d.families.txt' % (phase_dir, chrom, j), 'r')  as f:
                next(f) # skip header
                num_fams_of_size = 0
                for line in f:
                    pieces = line.strip().split('\t')
                    family_key = pieces[0]
                    if family_key in families_to_include:
                        family_to_individuals[family_key] = pieces[1:(1+j)]
                        num_fams_of_size += 1

            # load deletions
            family_to_states = defaultdict(list)
            family_to_pos = defaultdict(list)
            with open('%s/chr.%s.familysize.%d.phased.txt' % (phase_dir, chrom, j), 'r')  as f:
                next(f) # skip header

                for line in f:
                    pieces = line.strip().split('\t')
                    family_key = pieces[0]
                    if family_key in families_to_include:
                        inheritance_state = [int(x) for x in pieces[1:(2+(j*2))]]
                        start_pos, end_pos = [int(x) for x in pieces[(2+(j*2)):(4+(j*2))]]

                        family_to_states[family_key].append(inheritance_state)
                        family_to_pos[family_key].append((start_pos, end_pos))
                 
            # pull deletions from each family
            for family_key, states in family_to_states.items():
                states = np.asarray(states)
                positions = np.asarray(family_to_pos[family_key])
                mother, father = family_to_individuals[family_key][:2]
                children = family_to_individuals[family_key][2:]
                                
                start, end = [], []
                if states[0, -1] == 1:
                    start.append(0)
                if states[-1, -1] == 1:
                    end.append(states.shape[0]-1)
                    
                start = start + (np.where((states[:-1, -1]==0) & (states[1:, -1]==1))[0] + 1).tolist()
                end = np.where((states[:-1, -1]==1) & (states[1:, -1]==0))[0].tolist() + end

                if len(start) < 20:
                    for s, e in zip(start, end):

                        try:
                            start_pos, end_pos = positions[s], positions[e]
                            masked_areas.append(MaskedArea(family_key, chrom, start_pos, end_pos, start_pos-end_pos+1))
                        except IndexError:
                            print(s, e, positions.shape, states.shape)
                else:
                    print(family_key, chrom, len(start))
                
        except FileNotFoundError:
            print('File not found', 'chrom', chrom, 'fammily size', j)
        except StopIteration:
            print('File empty', 'chrom', chrom, 'family size', j)
                


1 AU1495303.AU1495202.AU1495201 1 88
AU0193.AU019301.AU019302 1 23
AU0208.AU020801.AU020802 1 20
AU0268.AU026801.AU026802 1 20
AU0293.AU029301.AU029302 1 24
AU0763.AU076301.AU076302 1 24
AU0822.AU082201.AU082202 1 22
AU0922.AU0922202.AU0922201 1 20
AU0934.AU0934202.AU0934201 1 24
AU0993.AU0993202.AU0993201 1 23
AU0839.AU0839202.AU0839201 1 21
AU1047.AU1047202.AU1047201 1 27
AU1048.AU1048202.AU1048201 1 21
AU1053.AU1053202.AU1053201 1 25
AU1178.AU1178202.AU1178201 1 24
AU1228.AU1228202.AU1228201 1 26
AU1252.AU1252202.AU1252201 1 25
AU1276.AU1276202.AU1276201 1 20
AU1277.AU1277202.AU1277201 1 21
AU1285.AU1285202.AU1285201 1 28
AU1299.AU1299202.AU1299201 1 21
AU1353.AU1353202.AU1353201 1 24
AU1265.AU1265202.AU1265203 1 24
AU1402.AU1402202.AU1402201 1 24
AU1410.AU1410202.AU1410201 1 26
AU1446.AU1446202.AU1446201 1 21
AU1550.AU1550202.AU1550201 1 23
AU1441.AU1441202.AU1441201 1 35
AU1445.AU1445202.AU1445201 1 24
AU1573.AU1573202.AU1573201 1 21
AU1648.AU1648202.AU1648201 1 20
AU1708.AU170820

AU0022.AU002201.AU002202 9 21
AU0063.AU006301.AU006302 9 20
AU0084.AU008401.AU008402 9 23
AU0127.AU012701.AU012702 9 25
AU0660.AU066001.AU066002 9 20
AU1086.AU1086202.AU1086201 9 21
AU1098.AU1098202.AU1098201 9 35
AU1174.AU1174202.AU1174201 9 30
AU1184.AU1184202.AU1184201 9 20
AU1185.AU1185202.AU1185201 9 25
AU1255.AU1255202.AU1255201 9 63
AU1042.AU1042202.AU1042201 9 24
AU1329.AU1329202.AU1329201 9 26
AU1341.AU1341202.AU1341201 9 30
AU1355.AU1355202.AU1355201 9 28
AU1392.AU1392202.AU1392201 9 25
AU1609.AU1609202.AU1609201 9 37
AU1586.AU1586202.AU1586201 9 25
AU1842.AU1842202.AU1842201 9 42
AU1907.AU1907202.AU1907201 9 23
AU1916.AU1916202.AU1916201 9 25
AU1922.AU1922202.AU1922201 9 22
AU1923.AU1923202.AU1923201 9 24
AU1924.AU1924202.AU1924201 9 30
AU2038.AU2038202.AU2038201 9 20
AU2332.AU2332202.AU2332201 9 20
AU2525.AU2525202.AU2525201 9 26
AU2757.AU2757202.AU2757201 9 21
AU2793.AU2793202.AU2793201 9 20
AU2829.AU2829202.AU2829201 9 26
AU3311.AU3311202.AU3311201 9 20
AU3412.AU3412202.A

In [20]:
print(len(masked_areas))

69864


In [21]:
positions = sorted(set(sum([[(x.chrom, x.start_pos), (x.chrom, x.end_pos)] for x in masked_areas], [])), key=lambda x:(int(x[0]), x[1]))
families = sorted(families_to_include)
pos_to_index = dict([(x, i) for i, x in enumerate(positions)])
fam_to_index = dict([(x, i) for i, x in enumerate(individuals)])

dm = np.zeros((len(fam_to_index), len(pos_to_index)), dtype=int)
for d in masked_areas:
    start_index, end_index = pos_to_index[(d.chrom, d.start_pos)], pos_to_index[(d.chrom, d.end_pos)]
    for ind in d.trans:
        dm[ind_to_index[ind], start_index:(end_index+1)] = 1

KeyboardInterrupt: 

In [None]:
plt.figure(figsize=(15, 5))
plt.imshow(dm, aspect='auto')
plt.show()