In [36]:
import numpy as np
from collections import Counter
from scipy.sparse import csc_matrix, save_npz


In [12]:
# read .ped
individuals = set()
with open('../data/Ancestry Data files/AncestryDNA.ped', 'r') as f:
    next(f) # skip header
    for line in f:
        individuals.add(line.strip().split()[1])
        individuals.add(line.strip().split()[2])
        individuals.add(line.strip().split()[3])
individuals = sorted(individuals)
individual_to_index = dict([(x, i) for i, x in enumerate(individuals)])
print('individuals', len(individuals))

individuals 91


In [20]:
# pull positions
positions = []
with open('../data/Ancestry Data files/%s.txt' % individuals[0], 'r') as f:
    line = next(f)
    while line.startswith('#'):
        line = next(f)
    for line in f:
        pieces = line.strip().split('\t')
        positions.append((int(pieces[1]), int(pieces[2])))
        
# remove positions that occur multiple times (CNV probes)
positions = sorted([k for k, v in Counter(positions).items() if v==1])
print('positions', len(positions))



positions 676594


In [28]:
# pull reference at these positions
chrom_to_int = dict([('chr%d' % d, d) for d in range(1, 23)])
chrom_to_int['chrX'] = 23

chrom = None
pos_to_ref = dict()
with open('../data/hg19.fa', 'r') as f:
    for line in f:
        if line.startswith('>'):
            if chrom is not None:
                for p in pos:
                    assert (chrom_to_int[chrom], p) in pos_to_ref
            
            # pull out data
            chrom = line.strip()[1:]
            pos = [] if chrom not in chrom_to_int else [x[1] for x in positions if x[0] == chrom_to_int[chrom]]
            pos_index = 0
            current_pos = 1
            print(chrom, len(pos))
        else:
            line = line.strip()
            if pos_index == len(pos):
                # we're done with positions for this chromosome
                pass
            elif current_pos + len(line) <= pos[pos_index]:
                # no positions of interest here
                pass
            else:
                while pos_index < len(pos) and pos[pos_index]-current_pos < len(line):
                    pos_to_ref[(chrom_to_int[chrom], pos[pos_index])] = line[pos[pos_index]-current_pos]
                    pos_index += 1
            current_pos += len(line)
                


chr1 50552
chr2 54719
chr3 43186
chr4 36866
chr5 38798
chr6 43325
chr7 34609
chrX 25178
chr8 32969
chr9 29623
chr10 32779
chr11 32503
chr12 31280
chr13 24617
chr14 21134
chr15 21341
chr16 23332
chr17 22629
chr18 18987
chr20 18077
chrY 0
chr19 16969
chr22 10986
chr21 10170
chr6_ssto_hap7 0
chr6_mcf_hap5 0
chr6_cox_hap2 0
chr6_mann_hap4 0
chr6_apd_hap1 0
chr6_qbl_hap6 0
chr6_dbb_hap3 0
chr17_ctg5_hap1 0
chr4_ctg9_hap1 0
chr1_gl000192_random 0
chrUn_gl000225 0
chr4_gl000194_random 0
chr4_gl000193_random 0
chr9_gl000200_random 0
chrUn_gl000222 0
chrUn_gl000212 0
chr7_gl000195_random 0
chrUn_gl000223 0
chrUn_gl000224 0
chrUn_gl000219 0
chr17_gl000205_random 0
chrUn_gl000215 0
chrUn_gl000216 0
chrUn_gl000217 0
chr9_gl000199_random 0
chrUn_gl000211 0
chrUn_gl000213 0
chrUn_gl000220 0
chrUn_gl000218 0
chr19_gl000209_random 0
chrUn_gl000221 0
chrUn_gl000214 0
chrUn_gl000228 0
chrUn_gl000227 0
chr1_gl000191_random 0
chr19_gl000208_random 0
chr9_gl000198_random 0
chr17_gl000204_random 0
chrUn_gl0

In [51]:
ns = {'A', 'C', 'G', 'T'}
def find_genotype(ref, allele1, allele2):
    if allele1 in ns and allele2 in ns and ref in ns:
        return (allele1 == ref) + (allele2 == ref)
    return -1

for chrom, chrom_int in chrom_to_int.items():
    print(chrom)
    poss = [x[1] for x in positions if x[0] == chrom_int]
    pos_to_index = dict([(x, i) for i, x in enumerate(poss)])
    coordinates = np.zeros((len(poss), 4), dtype=int)
    coordinates[:, 0] = chrom_int
    coordinates[:, 1] = poss
    coordinates[:, 2:] = 1
    
    
    gen = -np.ones((len(individuals), len(poss)), dtype=np.int8)
    for i, individual in enumerate(individuals):
        try:
            with open('../data/Ancestry Data files/%s.txt' % individual, 'r') as f:
                line = next(f)
                while line.startswith('#'):
                    line = next(f)
                for line in f:
                    pieces = line.strip().split('\t')
                    pos = int(pieces[2])
                    if int(pieces[1]) == chrom_int and pos in pos_to_index:
                        ref = pos_to_ref[(chrom_int, pos)].upper()
                        gen[i, pos_to_index[pos]] = find_genotype(ref, pieces[3], pieces[4])
        except FileNotFoundError:
            pass

    # throw out individuals where more than 10% of variants are missing
    ind_indices = np.sum(gen==-1, axis=1)/len(poss) < 0.1
    individuals = [ind for ind, include in zip(individuals, ind_indices)]
    gen = gen[ind_indices, :]
    print('missing individuals', np.sum(~ind_indices)/len(individuals))

    # throw out sites where more than 10% of individuals are missing
    coordinates[np.sum(gen==-1, axis=0)/len(individuals) >= 0.1, 3] = 0
    print('missing variants', np.sum(coordinates[:, 3]==0)/len(poss))
    

    save_npz('../split_gen_ancestry/chr.%s.gen' % (str(chrom_int) if chrom_int < 23 else 'X'), csc_matrix(gen))
    np.save('../split_gen_ancestry/chr.%s.gen.coordinates' % (str(chrom_int) if chrom_int < 23 else 'X'), coordinates)
    
    with open('../split_gen_ancestry/chr.%s.gen.samples.txt' % (str(chrom_int) if chrom_int < 23 else 'X'), 'w+') as f:
        for individual, include in zip(individuals, ind_indices):
            if include:
                f.write(individual + '\n')
            
    

chr1
missing individuals 0.8571428571428571
missing variants 0.009831460674157303
chr2
missing individuals 0.8571428571428571
missing variants 0.016612145689796964
chr3
missing individuals 0.8571428571428571
missing variants 0.010697911360163017
chr4
missing individuals 0.8571428571428571
missing variants 0.00463842022459719
chr5
missing individuals 0.8571428571428571
missing variants 0.010541780504149698
chr6
missing individuals 0.8571428571428571
missing variants 0.006970571263704558
chr7
missing individuals 0.8571428571428571
missing variants 0.009390620936750556
chr8
missing individuals 0.8571428571428571
missing variants 0.006551609087324457
chr9
missing individuals 0.8571428571428571
missing variants 0.008135570333862202
chr10
missing individuals 0.8571428571428571
missing variants 0.006528570121114128
chr11
missing individuals 0.8571428571428571
missing variants 0.016521551856751686
chr12
missing individuals 0.8571428571428571
missing variants 0.010805626598465474
chr13
missing 

In [45]:
coordinates[np.sum(gen==-1, axis=0)/len(individuals) >= 0.1, 3] = 0
print('missing variants', np.sum(coordinates[:, 3]==0)/coordinates.shape[0])
    
# throw out individuals where more than 10% of variants are missing
ind_indices = np.sum(gen==-1, axis=1)/len(poss) < 0.1
print('missing individuals', np.sum(coordinates[:, 3]==0)/coordinates.shape[0])


missing variants 1.0
missing individuals 1.0


In [49]:
print(np.sum(gen==-1, axis=0)/len(individuals))

(54719,)


In [50]:
print(np.sum(gen==-1, axis=1)/len(poss))

[0.01794623 0.01712385 0.02236883 0.01719695 0.0169411  0.01805589
 0.01666697 0.0167949  1.         1.         0.01699592 1.
 1.         0.0167949  0.01697765 1.         0.01684972 0.02618834
 0.01734315 0.01688627 0.02657212 0.01688627 0.06933606 1.
 0.01869552 1.         0.01681317 1.         0.01805589 0.01860414
 0.01781831 0.01776348 0.01688627 0.01692282 0.01732488 0.01807416
 1.         0.01761728 0.019591   0.01748935 0.01743453 0.0170873
 0.0167949  0.01695937 0.0170142  0.02001133 0.02140024 0.01743453
 0.01727005 1.         1.         0.016868   0.01697765 0.01772693
 0.01666697 0.01913412 0.01706903 0.01725178 0.01728833 0.01703248
 0.01725178 0.0293134  1.         0.01732488 0.01758073 0.02713865
 0.01666697 0.02083371 0.02352017 0.01792796 0.0173797  0.01732488
 0.01721523 0.01725178 0.01721523 0.01725178 0.02024891 0.01752591
 0.01759901 0.01774521 0.0169411  0.01721523 0.0166487  0.01759901
 1.         0.0174528  0.01681317 0.02103474 0.0190793  0.01747108
 1.        ]