In [13]:
import numpy as np
from os import listdir
import json
import scipy.sparse as sparse

In [14]:
orig_data_dir = '../../DATA/ancestry/genotypes'
new_data_dir = '../../DATA/ancestry/genotypes38'
dataset_name = 'ancestry'

#orig_data_dir = '../../DATA/spark/genotypes/wgs1_b02_array'
#new_data_dir = '../../DATA/spark/genotypes/wgs1_b02_array38'
#dataset_name = 'spark_wgs1_b02_array'

#orig_data_dir = '../../DATA/ihart.chip/genotypes'
#new_data_dir = '../../DATA/ihart.chip/genotypes38'
#dataset_name = 'ihart.chip'

chroms = [str(x) for x in range(1, 23)] + ['X', 'Y']

In [15]:
with open('../data/%s37.bed' % dataset_name, 'w+') as f:
    for chrom in chroms:
        coords = np.load('%s/chr.%s.0.gen.coordinates.npy' % (orig_data_dir, chrom), 'r')
        for i in range(coords.shape[0]):
            f.write('chr%s\t%d\t%d\t%d\n' % (chrom, coords[i, 1]-1, coords[i, 1], i))
        

In [12]:
chrom_to_coords = dict([(str(chrom), np.load('%s/chr.%s.0.gen.coordinates.npy' % (orig_data_dir, chrom))) for chrom in chroms])
for chrom, coords in chrom_to_coords.items():
    coords[:, 1] = 0

with open('../data/%s38.bed' % dataset_name, 'r') as f:
    for line in f:
        pieces = line.strip().split('\t')
        chrom = pieces[0][3:]
        if chrom in chrom_to_coords:
            chrom_to_coords[chrom][int(pieces[3]), 1] = int(pieces[2])+1
        
with open('%s/info.json' % orig_data_dir, 'r') as f:
    info = json.load(f)
    info['assembly'] = '38'
    
with open('%s/info.json' % new_data_dir, 'w+') as f:
    json.dump(info, f)
    
with open('%s/samples.json' % orig_data_dir, 'r') as f:
    samples = json.load(f)
    
with open('%s/samples.json' % new_data_dir, 'w+') as f:
    json.dump(samples, f)
    
for chrom in chroms:
    print('saving', chrom, end=' ')
    
    gen = sparse.load_npz('%s/chr.%s.0.gen.npz' % (orig_data_dir, chrom))
    coords = chrom_to_coords[chrom]
    
    # remove positions that don't map
    indices = coords[:, 1]!=0
    gen = gen[:, indices]
    coords = coords[indices, :]
    print('removing %d positions that do not map' % np.sum(~indices), end=' ')
        
    # reorder and remove positions that didn't map
    indices = np.argsort(coords[:, 1])
    coords = coords[indices, :]
    gen = gen[:, indices]
    
    # remove positions that map to the same place
    to_be_removed = np.zeros((coords.shape[0],), dtype=bool)
    to_be_removed[np.where(coords[1:, 1]==coords[:-1, 1])[0]+1] = True
    print('removing %d positions that map to the same place' % np.sum(to_be_removed), end=' ')
    coords = coords[~to_be_removed, :]
    gen = gen[:, ~to_be_removed]
    
    assert np.all(coords[1:, 1] > coords[:-1, 1])
    print(coords.shape, np.min(coords[:, 1]))
    
    # set everything to pass
    coords[:, 3] = 1
    
    sparse.save_npz('%s/chr.%s.0.gen.npz' % (new_data_dir, chrom), gen)
    np.save('%s/chr.%s.0.gen.coordinates.npy' % (new_data_dir, chrom), coords)
    

saving 1 removing 22 positions that do not map removing 723 positions that map to the same place (49113, 4) 634039
saving 2 removing 3 positions that do not map removing 550 positions that map to the same place (41937, 4) 33013
saving 3 removing 8 positions that do not map removing 458 positions that map to the same place (35394, 4) 25221
saving 4 removing 2 positions that do not map removing 399 positions that map to the same place (29196, 4) 60163
saving 5 removing 0 positions that do not map removing 439 positions that map to the same place (29400, 4) 38142
saving 6 removing 4 positions that do not map removing 702 positions that map to the same place (34020, 4) 204073
saving 7 removing 45 positions that do not map removing 369 positions that map to the same place (27522, 4) 44936
saving 8 removing 22 positions that do not map removing 315 positions that map to the same place (24339, 4) 213227
saving 9 removing 10 positions that do not map removing 309 positions that map to the same