In [1]:
import numpy as np
from collections import Counter
from scipy.sparse import csc_matrix, save_npz


In [2]:
# read .ped
individuals = set()
with open('../data/Ancestry Data files/AncestryDNA.ped', 'r') as f:
    next(f) # skip header
    for line in f:
        individuals.add(line.strip().split()[1])
        individuals.add(line.strip().split()[2])
        individuals.add(line.strip().split()[3])
individuals = sorted(individuals)
individual_to_index = dict([(x, i) for i, x in enumerate(individuals)])
print('individuals', len(individuals))

individuals 153


In [3]:
# pull positions
positions = []
with open('../data/Ancestry Data files/%s.txt' % individuals[0], 'r') as f:
    line = next(f)
    while line.startswith('#'):
        line = next(f)
    for line in f:
        pieces = line.strip().split('\t')
        positions.append((int(pieces[1]), int(pieces[2])))
        
# remove positions that occur multiple times (CNV probes)
positions = sorted([k for k, v in Counter(positions).items() if v==1])
print('positions', len(positions))



positions 676594


In [4]:
# pull reference at these positions
chrom_to_int = dict([('chr%d' % d, d) for d in range(1, 23)])
chrom_to_int['chrX'] = 23

chrom = None
pos_to_ref = dict()
with open('../data/hg19.fa', 'r') as f:
    for line in f:
        if line.startswith('>'):
            if chrom is not None:
                for p in pos:
                    assert (chrom_to_int[chrom], p) in pos_to_ref
            
            # pull out data
            chrom = line.strip()[1:]
            pos = [] if chrom not in chrom_to_int else [x[1] for x in positions if x[0] == chrom_to_int[chrom]]
            pos_index = 0
            current_pos = 1
            print(chrom, len(pos))
        else:
            line = line.strip()
            if pos_index == len(pos):
                # we're done with positions for this chromosome
                pass
            elif current_pos + len(line) <= pos[pos_index]:
                # no positions of interest here
                pass
            else:
                while pos_index < len(pos) and pos[pos_index]-current_pos < len(line):
                    pos_to_ref[(chrom_to_int[chrom], pos[pos_index])] = line[pos[pos_index]-current_pos]
                    pos_index += 1
            current_pos += len(line)
                


chr1 50552
chr2 54719
chr3 43186
chr4 36866
chr5 38798
chr6 43325
chr7 34609
chrX 25178
chr8 32969
chr9 29623
chr10 32779
chr11 32503
chr12 31280
chr13 24617
chr14 21134
chr15 21341
chr16 23332
chr17 22629
chr18 18987
chr20 18077
chrY 0
chr19 16969
chr22 10986
chr21 10170
chr6_ssto_hap7 0
chr6_mcf_hap5 0
chr6_cox_hap2 0
chr6_mann_hap4 0
chr6_apd_hap1 0
chr6_qbl_hap6 0
chr6_dbb_hap3 0
chr17_ctg5_hap1 0
chr4_ctg9_hap1 0
chr1_gl000192_random 0
chrUn_gl000225 0
chr4_gl000194_random 0
chr4_gl000193_random 0
chr9_gl000200_random 0
chrUn_gl000222 0
chrUn_gl000212 0
chr7_gl000195_random 0
chrUn_gl000223 0
chrUn_gl000224 0
chrUn_gl000219 0
chr17_gl000205_random 0
chrUn_gl000215 0
chrUn_gl000216 0
chrUn_gl000217 0
chr9_gl000199_random 0
chrUn_gl000211 0
chrUn_gl000213 0
chrUn_gl000220 0
chrUn_gl000218 0
chr19_gl000209_random 0
chrUn_gl000221 0
chrUn_gl000214 0
chrUn_gl000228 0
chrUn_gl000227 0
chr1_gl000191_random 0
chr19_gl000208_random 0
chr9_gl000198_random 0
chr17_gl000204_random 0
chrUn_gl0

In [5]:
ns = {'A', 'C', 'G', 'T'}
def find_genotype(ref, allele1, allele2):
    if allele1 in ns and allele2 in ns and ref in ns:
        return (allele1 == ref) + (allele2 == ref)
    return -1

for chrom, chrom_int in chrom_to_int.items():
    print(chrom)
    poss = [x[1] for x in positions if x[0] == chrom_int]
    pos_to_index = dict([(x, i) for i, x in enumerate(poss)])
    coordinates = np.zeros((len(poss), 4), dtype=int)
    coordinates[:, 0] = chrom_int
    coordinates[:, 1] = poss
    coordinates[:, 2:] = 1
    
    
    gen = -np.ones((len(individuals), len(poss)), dtype=np.int8)
    for i, individual in enumerate(individuals):
        try:
            with open('../data/Ancestry Data files/%s.txt' % individual, 'r') as f:
                line = next(f)
                while line.startswith('#'):
                    line = next(f)
                for line in f:
                    pieces = line.strip().split('\t')
                    pos = int(pieces[2])
                    if int(pieces[1]) == chrom_int and pos in pos_to_index:
                        ref = pos_to_ref[(chrom_int, pos)].upper()
                        gen[i, pos_to_index[pos]] = find_genotype(ref, pieces[3], pieces[4])
        except FileNotFoundError:
            pass

    # throw out individuals where more than 10% of variants are missing
    ind_indices = np.sum(gen==-1, axis=1)/len(poss) < 0.1
    individuals = [ind for ind, include in zip(individuals, ind_indices)]
    gen = gen[ind_indices, :]
    print('missing individuals', np.sum(~ind_indices)/len(individuals))

    # throw out sites where more than 10% of individuals are missing
    coordinates[np.sum(gen==-1, axis=0)/len(individuals) >= 0.1, 3] = 0
    print('missing variants', np.sum(coordinates[:, 3]==0)/len(poss))
    

    save_npz('../split_gen_ancestry/chr.%s.gen' % (str(chrom_int) if chrom_int < 23 else 'X'), csc_matrix(gen))
    np.save('../split_gen_ancestry/chr.%s.gen.coordinates' % (str(chrom_int) if chrom_int < 23 else 'X'), coordinates)
    
    with open('../split_gen_ancestry/chr.%s.gen.samples.txt' % (str(chrom_int) if chrom_int < 23 else 'X'), 'w+') as f:
        for individual, include in zip(individuals, ind_indices):
            if include:
                f.write(individual + '\n')
            
    

chr1
missing individuals 0.0457516339869281
missing variants 0.010088621617344517
chr2
missing individuals 0.0457516339869281
missing variants 0.016831447943127617
chr3
missing individuals 0.0457516339869281
missing variants 0.010860000926226092
chr4
missing individuals 0.0457516339869281
missing variants 0.005126674985081105
chr5
missing individuals 0.0457516339869281
missing variants 0.010722202175369865
chr6
missing individuals 0.0457516339869281
missing variants 0.007155222158107329
chr7
missing individuals 0.0457516339869281
missing variants 0.009766245774220578
chr8
missing individuals 0.0457516339869281
missing variants 0.0067639297521914525
chr9
missing individuals 0.0457516339869281
missing variants 0.008203085440367281
chr10
missing individuals 0.0457516339869281
missing variants 0.006742121480215992
chr11
missing individuals 0.0457516339869281
missing variants 0.01670615020151986
chr12
missing individuals 0.0457516339869281
missing variants 0.011349104859335038
chr13
missing

In [6]:
coordinates[np.sum(gen==-1, axis=0)/len(individuals) >= 0.1, 3] = 0
print('missing variants', np.sum(coordinates[:, 3]==0)/coordinates.shape[0])
    
# throw out individuals where more than 10% of variants are missing
ind_indices = np.sum(gen==-1, axis=1)/len(poss) < 0.1
print('missing individuals', np.sum(coordinates[:, 3]==0)/coordinates.shape[0])


missing variants 0.029231869092064502
missing individuals 0.029231869092064502


In [7]:
print(np.sum(gen==-1, axis=0)/len(individuals))

[0.        0.        0.        ... 0.        0.        0.0130719]


In [8]:
print(np.sum(gen==-1, axis=1)/len(poss))

[0.03145603 0.02859639 0.03093971 0.02943046 0.02823894 0.0299865
 0.02780205 0.02811979 0.02800064 0.02855668 0.02819922 0.02788148
 0.04217968 0.03248868 0.02804035 0.05639844 0.02804035 0.02843752
 0.02843752 0.02819922 0.03193264 0.0313766  0.02875526 0.0281595
 0.02800064 0.063905   0.03185321 0.02800064 0.06203829 0.03030423
 0.02800064 0.02879498 0.03308444 0.03101914 0.04233855 0.03189292
 0.02764318 0.05111605 0.02994678 0.0606879  0.02895385 0.02895385
 0.02903328 0.02764318 0.02923187 0.02923187 0.02855668 0.03129716
 0.02954961 0.02819922 0.02847724 0.03240925 0.02962904 0.03244896
 0.03006593 0.03193264 0.02863611 0.03526889 0.0276829  0.0279212
 0.029073   0.03935976 0.0299865  0.02855668 0.02796092 0.02943046
 0.03542775 0.02796092 0.0313766  0.02927159 0.02899357 0.0281595
 0.05417428 0.02808007 0.03082056 0.0279212  0.02788148 0.02784177
 0.0279212  0.02855668 0.03928032 0.04531734 0.02835809 0.02800064
 0.02839781 0.03526889 0.03050282 0.02808007 0.02780205 0.02788148