In [11]:
from scipy.sparse import save_npz, load_npz, hstack
from os import listdir


In [1]:
# This notebook removes all genotype information for the parents in a dataset.
# This lets us test the phasing algorithm when parental data is missing.

In [3]:
# Directory holding the per-chromosome sample list and genotype .npz files.
data_dir = '../split_gen_spark_exome_pilot_noparents'
# Pedigree file; read below as tab-separated columns.
ped_file = '../data/spark_jae.ped'

# Chromosome labels as they appear in file names: '1'..'22', then 'X' and 'Y'.
chroms = list(map(str, range(1, 23))) + ['X', 'Y']

In [6]:
# Build a lookup from sample ID to its 0-based line number in the sample list.
# The list is taken from the first chromosome's sample file; the indices are
# used later as row indices into the genotype matrices.
sample_file = '%s/chr.%s.gen.samples.txt' % (data_dir, chroms[0])
with open(sample_file, 'r') as f:
    sample_id_to_index = {line.strip(): i for i, line in enumerate(f)}

In [7]:
# Walk the pedigree file and collect, among the samples we actually have:
#   parents  - anyone listed as a father or mother of some individual
#   children - anyone with at least one parent listed (parent ID other than '0')
parents = set()
children = set()
with open(ped_file, 'r') as f:
    for line in f:
        # Columns: family, individual, father, mother, sex, disease status.
        # sex: 1=male, 2=female
        # disease status: 1=unaffected, 2=affected
        fam_id, ind_id, dad_id, mom_id, sex, disease_status = line.strip().split('\t')[:6]

        # '0' marks a parent that is not recorded in the pedigree.
        listed_parents = [p for p in (dad_id, mom_id) if p != '0']
        parents.update(p for p in listed_parents if p in sample_id_to_index)

        if listed_parents and ind_id in sample_id_to_index:
            children.add(ind_id)

# Sanity check: the overlap counts samples that are both a parent and a child.
print('parents', len(parents), 'children', len(children), 'overlap', len(parents.intersection(children)))

parents 914 children 465 overlap 0


In [14]:
# Blank out every parent's genotypes, one chromosome at a time, and save the
# combined matrix as <data_dir>/chr.<chrom>.gen.npz.
#
# Parental rows are cleared by zeroing the values that are already stored and
# then dropping them, so the sparse structure only ever shrinks. Assigning
# `A[parent_indices, :] = 0` directly goes through scipy's "changing the
# sparsity structure ... is expensive" path, can leave explicitly stored zeros
# across the whole row, and fails outright if hstack hands back a COO matrix.


def _zero_rows(matrix, rows):
    """Return `matrix` with every stored entry in the given rows removed.

    Args:
        matrix: a scipy sparse matrix in any format.
        rows: iterable of integer row indices to clear.

    Returns:
        A sparse matrix in the same format as `matrix` whose listed rows have
        no stored entries. Explicitly stored zeros elsewhere are dropped too,
        which does not change any value.
    """
    # In CSR, row r's stored values are the slice data[indptr[r]:indptr[r + 1]].
    csr = matrix.tocsr()
    for r in rows:
        csr.data[csr.indptr[r]:csr.indptr[r + 1]] = 0
    csr.eliminate_zeros()
    # Convert back so the saved file keeps the format the data arrived in.
    return csr.asformat(matrix.format)


parent_indices = [sample_id_to_index[x] for x in parents]

for chrom in chroms:
    print(chrom, end=' ')
    # load genotypes
    gen_files = sorted([f for f in listdir(data_dir) if ('chr.%s.' % chrom) in f and 'gen.npz' in f])

    # save_npz appends '.npz', so this cell's own output lands in data_dir under
    # a name that matches the filter above. When other input files exist, leave
    # a previous output out so re-running the cell does not stack it next to
    # the files it was built from.
    out_name = 'chr.%s.gen.npz' % chrom
    if len(gen_files) > 1 and out_name in gen_files:
        gen_files.remove(out_name)

    # Pull data together
    A = hstack([load_npz('%s/%s' % (data_dir, gen_file)) for gen_file in gen_files])

    # remove parental data
    A = _zero_rows(A, parent_indices)

    save_npz('%s/chr.%s.gen' % (data_dir, chrom), A)
    
    

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y 