In [3]:
import gzip
from collections import defaultdict

# Original SSC

In [7]:
# Compressed VCF for the original SSC section; only its header is read
# further down, to recover the sample ids.
vcf_file = '../data/ssc.MT.vcf.bgz'

In [8]:
# Pull the sample ids from the VCF header (nothing is written here).
with gzip.open(vcf_file, 'rt') as f:
    # Skip the '##' meta-information lines; the first line that is not one
    # should be the '#CHROM' column header.
    for line in f:
        if not line.startswith('##'):
            break
    else:
        # Empty file, or a file made only of '##' lines: fail with a clear
        # message instead of a bare StopIteration.
        raise ValueError('no column header line found in %s' % vcf_file)

    # Guard against a data line being mistaken for the header.
    if not line.startswith('#CHROM'):
        raise ValueError('expected a #CHROM header line in %s, got: %r'
                         % (vcf_file, line[:50]))

    # Sample ids are the columns after the nine fixed VCF columns
    # (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT).
    sample_ids = line.strip().split('\t')[9:]
print(len(sample_ids))

2076


In [9]:
# Preview the first five sample ids to check their format.
print(sample_ids[:5])

['14109.mo', '13750.s1', '11420.s1', '11328.p1', '11006.p1']


In [5]:
# Dropping the last three characters of each sample id leaves the family id;
# collect the distinct families in sorted order and report how many there are.
family_ids = sorted({sample_id[:-3] for sample_id in sample_ids})
print(len(family_ids))

519


In [None]:

# write to file
# Group the sample ids by family, using the same split as the family_ids
# computation: the last three characters are dropped to get the family id,
# and the last two characters are taken as the role code.
original_family_to_roles = defaultdict(dict)
for sample_id in sample_ids:
    original_family_to_roles[sample_id[:-3]][sample_id[-2:]] = sample_id

# One PED row per individual: family, individual, father, mother, sex,
# phenotype. The row layout (including '.' for the children's sex) mirrors
# the hg38 PED cell at the end of this notebook.
# NOTE(review): the path was 'data/ssc.ped'; changed to '../data/' to match
# every other path in this notebook -- confirm the intended location.
# NOTE(review): assumes 'p1' marks the affected proband and every other
# non-parent role is an unaffected sibling -- confirm.
with open('../data/ssc.ped', 'w+') as f:
    for family in sorted(original_family_to_roles):
        roles = original_family_to_roles[family]
        mom, dad = roles.get('mo'), roles.get('fa')
        # '0' marks a parent that has no sample in the VCF.
        father = '0' if dad is None else dad
        mother = '0' if mom is None else mom
        if mom is not None:
            f.write('\t'.join([family, mom, '0', '0', '2', '1']) + '\n')
        if dad is not None:
            f.write('\t'.join([family, dad, '0', '0', '1', '1']) + '\n')
        # Sorting puts 'p1' ahead of the 's*' sibling roles.
        for role in sorted(roles):
            if role in ('mo', 'fa'):
                continue
            phenotype = '2' if role == 'p1' else '1'
            f.write('\t'.join([family, roles[role], father, mother,
                               '.', phenotype]) + '\n')

# New SSC

In [56]:
# Name of the section to process; it is substituted into the input VCF path.
section = 'phase2'
vcf_file = '../data/Y.%s.HG38toHG19.vcf.gz' % (section,)

In [57]:
# Pull the sample ids from the VCF header (nothing is written here).
with gzip.open(vcf_file, 'rt') as f:
    # Skip the '##' meta-information lines; the first line that is not one
    # should be the '#CHROM' column header.
    for line in f:
        if not line.startswith('##'):
            break
    else:
        # Empty file, or a file made only of '##' lines: fail with a clear
        # message instead of a bare StopIteration.
        raise ValueError('no column header line found in %s' % vcf_file)

    # Guard against a data line being mistaken for the header.
    if not line.startswith('#CHROM'):
        raise ValueError('expected a #CHROM header line in %s, got: %r'
                         % (vcf_file, line[:50]))

    # Sample ids are the columns after the nine fixed VCF columns
    # (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT).
    sample_ids = line.strip().split('\t')[9:]
print(len(sample_ids))

2368


In [58]:
# Preview the first five sample ids to check their format.
print(sample_ids[:5])

['SS0012980', 'SS0012982', 'SS0012989', 'SS0012997', 'SS0013001']


In [59]:
# Build sample id -> tuple of the dot-separated parts of the first column.
# Each line of the map file is tab-separated: the second column is used as
# the sample id key, the first column is split on '.' to form the value.
sample_id_to_family_role = {}
with open('../data/ssc.id_map.from.ssc-wgs38', 'r') as id_map:
    for record in id_map:
        fields = record.strip().split('\t')
        family_role = tuple(fields[0].split('.'))
        sample_id_to_family_role[fields[1]] = family_role

In [60]:
# Slot that each family role occupies in a family's list of sample ids.
role_to_index = {'mo': 0, 'fa': 1, 'p1': 2, 's1': 3, 's2': 4, 's3': 5}

# family -> six-slot list of sample ids, indexed through role_to_index;
# a slot stays None when no sample with that role is present.
family_to_inds = defaultdict(lambda: [None] * 6)
missing_sample_ids = []
for sample_id in sample_ids:
    family_role = sample_id_to_family_role.get(sample_id)
    if family_role is None:
        # Sample is in the VCF but absent from the id map.
        missing_sample_ids.append(sample_id)
        continue
    family, role = family_role
    family_to_inds[family][role_to_index[role]] = sample_id
print('%d missing sample ids' % len(missing_sample_ids))

0 missing sample ids


In [None]:
# Write one PED row per individual: family, individual, father, mother, sex,
# phenotype. The row layout (including '.' for the children's sex) mirrors
# the hg38 PED cell at the end of this notebook.
# NOTE(review): this cell was left unfinished; slot 2 ('p1') is written as
# affected and slots 3-5 ('s1'..'s3') as unaffected siblings -- confirm.
with open('../data/ssc_%s.ped' % section, 'w+') as f:
    for family, inds in family_to_inds.items():
        mom, dad, proband = inds[0], inds[1], inds[2]
        # '0' marks a parent that has no sample in the VCF.
        father = '0' if dad is None else dad
        mother = '0' if mom is None else mom
        if mom is not None:
            f.write('\t'.join([family, mom, '0', '0', '2', '1']) + '\n')
        if dad is not None:
            f.write('\t'.join([family, dad, '0', '0', '1', '1']) + '\n')
        if proband is not None:
            f.write('\t'.join([family, proband, father, mother,
                               '.', '2']) + '\n')
        for sibling in inds[3:]:
            if sibling is not None:
                f.write('\t'.join([family, sibling, father, mother,
                                   '.', '1']) + '\n')

# SSC HG38

In [8]:
# Slot that each family role occupies in a family's four-slot list.
role_to_index = {'mo': 0, 'fa': 1, 'p1': 2, 's1': 3}

# family -> four-slot list of sample ids, indexed through role_to_index;
# a slot stays None when no sample with that role is listed.
family_to_inds = defaultdict(lambda: [None] * 4)
with open('../data/ssc.id_map.from.repository', 'r') as id_map:
    for record in id_map:
        fields = record.strip().split('\t')
        # The second column is split on '.' into family and role; the first
        # column is the id that gets stored.
        family_key, role = fields[1].split('.')
        slot = role_to_index.get(role)
        if slot is not None:
            # Roles without a slot are skipped.
            family_to_inds[family_key][slot] = fields[0]


In [10]:
# Write the hg38 PED file: one tab-separated row per individual present,
# as family, individual, two parent columns, then two code columns.
with open('../data/ssc.hg38.ped', 'w+') as ped:
    for family, inds in family_to_inds.items():
        mom, dad, proband, sibling = inds[0], inds[1], inds[2], inds[3]

        # Children point at their parents; '0' stands in for an absent one.
        father = dad if dad is not None else '0'
        mother = mom if mom is not None else '0'

        rows = []
        if mom is not None:
            rows.append((mom, '0', '0', '2', '1'))
        if dad is not None:
            rows.append((dad, '0', '0', '1', '1'))
        if proband is not None:
            rows.append((proband, father, mother, '.', '2'))
        if sibling is not None:
            rows.append((sibling, father, mother, '.', '1'))

        for row in rows:
            ped.write('\t'.join((family,) + row) + '\n')
            