In [1]:
import numpy as np
from collections import Counter
from scipy.sparse import csc_matrix, save_npz


In [2]:
# Read the pedigree (.ped) file and collect every ID it mentions.
ped_path = '../data/Ancestry Data files/AncestryDNA.ped'
individuals = set()
with open(ped_path, 'r') as f:
    next(f)  # the first line is a header, not a record
    for line in f:
        fields = line.strip().split()
        # Columns 1-3 of each whitespace-separated record all hold IDs
        # (presumably individual / father / mother as in the usual PED
        # layout -- TODO confirm against the file).
        individuals.update((fields[1], fields[2], fields[3]))

# Fix a deterministic (sorted) order and remember each ID's row index.
individuals = sorted(individuals)
individual_to_index = {name: idx for idx, name in enumerate(individuals)}
print('individuals', len(individuals))

individuals 182


In [3]:
# Pull the (chromosome, position) coordinates probed by the array, using the
# first individual's genotype file as the template.
template_path = '../data/Ancestry Data files/%s.txt' % individuals[0]
positions = []
with open(template_path, 'r') as f:
    # Skip the leading '#' comment lines. The first non-comment line is
    # consumed here as well and never parsed (presumably the column header).
    line = next(f)
    while line.startswith('#'):
        line = next(f)
    for record in f:
        fields = record.strip().split('\t')
        chrom_code, coordinate = int(fields[1]), int(fields[2])
        positions.append((chrom_code, coordinate))

# Keep only coordinates that occur exactly once; repeated coordinates
# (CNV probes) are dropped entirely.
occurrences = Counter(positions)
positions = sorted(key for key, count in occurrences.items() if count == 1)
print('positions', len(positions))



positions 676594


In [4]:
# Pull the reference base at each array position by streaming through the
# hg19 FASTA once.

# Contig names as they appear in the FASTA headers -> integer chromosome codes
# used in `positions` (1-22 for the autosomes, 23 for chrX). Contigs that are
# not in this map are scanned but contribute no positions.
chrom_to_int = dict([('chr%d' % d, d) for d in range(1, 23)])
chrom_to_int['chrX'] = 23


def _assert_refs_found(chrom, pos):
    """Check that every requested position on `chrom` received a reference base."""
    for p in pos:
        assert (chrom_to_int[chrom], p) in pos_to_ref, \
            'no reference base found for %s:%d' % (chrom, p)


# Scan state. Initialised up front so that a sequence line appearing before
# the first '>' header is ignored instead of raising NameError.
chrom = None
pos = []          # sorted positions of interest on the current contig
pos_index = 0     # index of the next position in `pos` still to be resolved
current_pos = 1   # 1-based coordinate of the first base of the current line
pos_to_ref = dict()
with open('../data/hg19.fa', 'r') as f:
    for line in f:
        if line.startswith('>'):
            # A new contig starts: validate the one we just finished.
            if chrom is not None:
                _assert_refs_found(chrom, pos)

            # pull out data
            chrom = line.strip()[1:]
            pos = [] if chrom not in chrom_to_int else [x[1] for x in positions if x[0] == chrom_to_int[chrom]]
            pos_index = 0
            current_pos = 1
            print(chrom, len(pos))
        else:
            line = line.strip()
            if pos_index == len(pos):
                # we're done with positions for this chromosome
                pass
            elif current_pos + len(line) <= pos[pos_index]:
                # no positions of interest here
                pass
            else:
                # One or more positions fall inside this line; `positions` is
                # sorted, so they can be consumed in order.
                while pos_index < len(pos) and pos[pos_index]-current_pos < len(line):
                    pos_to_ref[(chrom_to_int[chrom], pos[pos_index])] = line[pos[pos_index]-current_pos]
                    pos_index += 1
            current_pos += len(line)

# The in-loop check only fires when the *next* header is seen, so the last
# contig in the file has to be validated explicitly.
if chrom is not None:
    _assert_refs_found(chrom, pos)
                


chr1 50552
chr2 54719
chr3 43186
chr4 36866
chr5 38798
chr6 43325
chr7 34609
chrX 25178
chr8 32969
chr9 29623
chr10 32779
chr11 32503
chr12 31280
chr13 24617
chr14 21134
chr15 21341
chr16 23332
chr17 22629
chr18 18987
chr20 18077
chrY 0
chr19 16969
chr22 10986
chr21 10170
chr6_ssto_hap7 0
chr6_mcf_hap5 0
chr6_cox_hap2 0
chr6_mann_hap4 0
chr6_apd_hap1 0
chr6_qbl_hap6 0
chr6_dbb_hap3 0
chr17_ctg5_hap1 0
chr4_ctg9_hap1 0
chr1_gl000192_random 0
chrUn_gl000225 0
chr4_gl000194_random 0
chr4_gl000193_random 0
chr9_gl000200_random 0
chrUn_gl000222 0
chrUn_gl000212 0
chr7_gl000195_random 0
chrUn_gl000223 0
chrUn_gl000224 0
chrUn_gl000219 0
chr17_gl000205_random 0
chrUn_gl000215 0
chrUn_gl000216 0
chrUn_gl000217 0
chr9_gl000199_random 0
chrUn_gl000211 0
chrUn_gl000213 0
chrUn_gl000220 0
chrUn_gl000218 0
chr19_gl000209_random 0
chrUn_gl000221 0
chrUn_gl000214 0
chrUn_gl000228 0
chrUn_gl000227 0
chr1_gl000191_random 0
chr19_gl000208_random 0
chr9_gl000198_random 0
chr17_gl000204_random 0
chrUn_gl0

In [6]:
# Show the sorted list of individual IDs collected from the .ped file.
print(individuals)

['072_ASD_1', '072_DAD', '072_MOM', '072_NT_1', '1012_ASD_1', '1012_DAD', '1012_MOM', '1012_NT_1', '1030_DAD', '1030_MOM', '1030_NT_1', '1030_NT_2', '1031_ASD_1', '1031_DAD', '1031_MOM', '1031_NT_1', '1031_NT_2', '1035_ASD_1', '1035_DAD', '1035_MOM', '1035_NT_1', '1039_ASD_1', '1039_DAD', '1039_MOM', '1039_NT_1', '1046_DAD', '1046_MOM', '1046_NT_1', '1046_NT_2', '1047_ASD_1', '1047_DAD', '1047_MOM', '1048_ASD_1', '1048_DAD', '1048_MOM', '1048_NT_1', '1049_DAD', '1049_MOM', '1049_NT_1', '1049_NT_2', '1052_ASD_1', '1052_DAD', '1052_MOM', '1052_NT_1', '1053_ASD_1', '1053_DAD', '1053_MOM', '1053_NT_1', '1053_NT_2', '1055_ASD_1', '1055_DAD', '1055_MOM', '1055_NT_1', '1059_ASD_1', '1059_DAD', '1059_MOM', '1059_NT_1', '1060_DAD', '1060_MOM', '1060_NT_1', '1065_ASD_1', '1065_DAD', '1065_MOM', '1065_NT_1', '1065_NT_2', '1068_ASD_1', '1068_DAD', '1068_MOM', '1068_NT_1', '1070_ASD_1', '1070_DAD', '1070_MOM', '1070_NT_1', '1079_ASD_1', '1079_DAD', '1079_MOM', '1079_NT_1', '1080_ASD_1', '1080_DAD',

In [8]:
# The only base calls treated as valid (upper-case nucleotides).
ns = {'A', 'C', 'G', 'T'}
def find_genotype(ref, allele1, allele2):
    """Encode a diploid call relative to the reference base.

    Returns the number of the two alleles that equal `ref` (0, 1 or 2), or
    -1 when `ref` or either allele is not one of 'A', 'C', 'G', 'T'.

    NOTE(review): this counts *reference* alleles, so a homozygous-reference
    call is encoded as 2. Many genotype encodings count alternate alleles
    instead -- confirm which convention the consumers of these matrices expect.
    """
    if allele1 not in ns or allele2 not in ns or ref not in ns:
        return -1
    return sum(1 for allele in (allele1, allele2) if allele == ref)

# Build one genotype matrix (individuals x positions) per chromosome and write
# it, its coordinate table and its sample list to disk.
# NOTE(review): every individual's file is re-read in full once per chromosome;
# reading each file once and dispatching rows by chromosome would be much
# faster, but the structure is left as is here.
for chrom, chrom_int in chrom_to_int.items():
    print(chrom)
    # Label used in output file names: the chromosome number, or 'X' for code 23.
    chrom_label = str(chrom_int) if chrom_int < 23 else 'X'
    poss = [x[1] for x in positions if x[0] == chrom_int]
    pos_to_index = dict([(x, i) for i, x in enumerate(poss)])

    # coordinates columns: 0 = chromosome code, 1 = position, 2 and 3 = flags
    # initialised to 1. Column 3 is cleared below for sites that fail the
    # missingness filter; column 2 is never changed here (meaning presumably
    # defined by the downstream consumer -- TODO confirm).
    coordinates = np.zeros((len(poss), 4), dtype=int)
    coordinates[:, 0] = chrom_int
    coordinates[:, 1] = poss
    coordinates[:, 2:] = 1

    # -1 marks "missing"; rows follow the order of `individuals`.
    gen = -np.ones((len(individuals), len(poss)), dtype=np.int8)
    for i, individual in enumerate(individuals):
        print(individual, end=' ')
        try:
            with open('../data/Ancestry Data files/%s.txt' % individual, 'r') as f:
                # Skip the '#' comment lines; the first non-comment line is
                # consumed here too and never parsed.
                line = next(f)
                while line.startswith('#'):
                    line = next(f)
                for line in f:
                    pieces = line.strip().split('\t')
                    pos = int(pieces[2])
                    if int(pieces[1]) == chrom_int and pos in pos_to_index:
                        ref = pos_to_ref[(chrom_int, pos)].upper()
                        gen[i, pos_to_index[pos]] = find_genotype(ref, pieces[3], pieces[4])
        except FileNotFoundError:
            # Best effort: an individual without a genotype file keeps an
            # all -1 row and is removed by the missingness filter below.
            pass

    # Keep individuals with fewer than 10% missing variants. The kept names go
    # into a separate list so the global `individuals` (which defines the row
    # order for every chromosome) is never modified.
    ind_indices = np.sum(gen==-1, axis=1)/len(poss) < 0.1
    kept_individuals = [ind for ind, include in zip(individuals, ind_indices) if include]
    gen = gen[ind_indices, :]
    print('missing individuals', np.sum(~ind_indices)/len(individuals))

    # Flag sites where 10% or more of the *kept* individuals are missing.
    # `gen` has already been filtered, so the denominator must be its row
    # count, not the size of the full individual list.
    n_kept = gen.shape[0]
    if n_kept > 0:
        coordinates[np.sum(gen==-1, axis=0)/n_kept >= 0.1, 3] = 0
    print('missing variants', np.sum(coordinates[:, 3]==0)/len(poss))

    save_npz('../../DATA/ancestry/genotypes/chr.%s.0.gen' % chrom_label, csc_matrix(gen))
    np.save('../../DATA/ancestry/genotypes/chr.%s.0.gen.coordinates' % chrom_label, coordinates)

    # One kept individual per line, in the same order as the rows of `gen`.
    with open('../../DATA/ancestry/genotypes/chr.%s.gen.samples.txt' % chrom_label, 'w+') as f:
        for individual in kept_individuals:
            f.write(individual + '\n')
            
    

chr1
072_ASD_1 072_DAD 072_MOM 072_NT_1 1012_ASD_1 1012_DAD 1012_MOM 1012_NT_1 1030_DAD 1030_MOM 1030_NT_1 1030_NT_2 1031_ASD_1 1031_DAD 1031_MOM 1031_NT_1 1031_NT_2 1035_ASD_1 1035_DAD 1035_MOM 1035_NT_1 1039_ASD_1 1039_DAD 1039_MOM 1039_NT_1 1046_DAD 1046_MOM 1046_NT_1 1046_NT_2 1047_ASD_1 1047_DAD 1047_MOM 1048_ASD_1 1048_DAD 1048_MOM 1048_NT_1 1049_DAD 1049_MOM 1049_NT_1 1049_NT_2 1052_ASD_1 1052_DAD 1052_MOM 1052_NT_1 1053_ASD_1 1053_DAD 1053_MOM 1053_NT_1 1053_NT_2 1055_ASD_1 1055_DAD 1055_MOM 1055_NT_1 1059_ASD_1 1059_DAD 1059_MOM 1059_NT_1 1060_DAD 1060_MOM 1060_NT_1 1065_ASD_1 1065_DAD 1065_MOM 1065_NT_1 1065_NT_2 1068_ASD_1 1068_DAD 1068_MOM 1068_NT_1 1070_ASD_1 1070_DAD 1070_MOM 1070_NT_1 1079_ASD_1 1079_DAD 1079_MOM 1079_NT_1 1080_ASD_1 1080_DAD 1080_MOM 1080_NT_1 1081_ASD_1 1081_DAD 1081_MOM 1082_ASD_1 1082_DAD 1082_MOM 1082_NT_1 1093_ASD_1 1093_DAD 1093_MOM 1093_NT_1 1094_ASD_1 1094_DAD 1094_MOM 1094_NT_1 1108_ASD_1 1108_DAD 1108_MOM 1108_NT_1 1111_ASD_1 1111_DAD 1111_MOM

072_ASD_1 072_DAD 072_MOM 072_NT_1 1012_ASD_1 1012_DAD 1012_MOM 1012_NT_1 1030_DAD 1030_MOM 1030_NT_1 1030_NT_2 1031_ASD_1 1031_DAD 1031_MOM 1031_NT_1 1031_NT_2 1035_ASD_1 1035_DAD 1035_MOM 1035_NT_1 1039_ASD_1 1039_DAD 1039_MOM 1039_NT_1 1046_DAD 1046_MOM 1046_NT_1 1046_NT_2 1047_ASD_1 1047_DAD 1047_MOM 1048_ASD_1 1048_DAD 1048_MOM 1048_NT_1 1049_DAD 1049_MOM 1049_NT_1 1049_NT_2 1052_ASD_1 1052_DAD 1052_MOM 1052_NT_1 1053_ASD_1 1053_DAD 1053_MOM 1053_NT_1 1053_NT_2 1055_ASD_1 1055_DAD 1055_MOM 1055_NT_1 1059_ASD_1 1059_DAD 1059_MOM 1059_NT_1 1060_DAD 1060_MOM 1060_NT_1 1065_ASD_1 1065_DAD 1065_MOM 1065_NT_1 1065_NT_2 1068_ASD_1 1068_DAD 1068_MOM 1068_NT_1 1070_ASD_1 1070_DAD 1070_MOM 1070_NT_1 1079_ASD_1 1079_DAD 1079_MOM 1079_NT_1 1080_ASD_1 1080_DAD 1080_MOM 1080_NT_1 1081_ASD_1 1081_DAD 1081_MOM 1082_ASD_1 1082_DAD 1082_MOM 1082_NT_1 1093_ASD_1 1093_DAD 1093_MOM 1093_NT_1 1094_ASD_1 1094_DAD 1094_MOM 1094_NT_1 1108_ASD_1 1108_DAD 1108_MOM 1108_NT_1 1111_ASD_1 1111_DAD 1111_MOM 1111

072_ASD_1 072_DAD 072_MOM 072_NT_1 1012_ASD_1 1012_DAD 1012_MOM 1012_NT_1 1030_DAD 1030_MOM 1030_NT_1 1030_NT_2 1031_ASD_1 1031_DAD 1031_MOM 1031_NT_1 1031_NT_2 1035_ASD_1 1035_DAD 1035_MOM 1035_NT_1 1039_ASD_1 1039_DAD 1039_MOM 1039_NT_1 1046_DAD 1046_MOM 1046_NT_1 1046_NT_2 1047_ASD_1 1047_DAD 1047_MOM 1048_ASD_1 1048_DAD 1048_MOM 1048_NT_1 1049_DAD 1049_MOM 1049_NT_1 1049_NT_2 1052_ASD_1 1052_DAD 1052_MOM 1052_NT_1 1053_ASD_1 1053_DAD 1053_MOM 1053_NT_1 1053_NT_2 1055_ASD_1 1055_DAD 1055_MOM 1055_NT_1 1059_ASD_1 1059_DAD 1059_MOM 1059_NT_1 1060_DAD 1060_MOM 1060_NT_1 1065_ASD_1 1065_DAD 1065_MOM 1065_NT_1 1065_NT_2 1068_ASD_1 1068_DAD 1068_MOM 1068_NT_1 1070_ASD_1 1070_DAD 1070_MOM 1070_NT_1 1079_ASD_1 1079_DAD 1079_MOM 1079_NT_1 1080_ASD_1 1080_DAD 1080_MOM 1080_NT_1 1081_ASD_1 1081_DAD 1081_MOM 1082_ASD_1 1082_DAD 1082_MOM 1082_NT_1 1093_ASD_1 1093_DAD 1093_MOM 1093_NT_1 1094_ASD_1 1094_DAD 1094_MOM 1094_NT_1 1108_ASD_1 1108_DAD 1108_MOM 1108_NT_1 1111_ASD_1 1111_DAD 1111_MOM 1111

072_ASD_1 072_DAD 072_MOM 072_NT_1 1012_ASD_1 1012_DAD 1012_MOM 1012_NT_1 1030_DAD 1030_MOM 1030_NT_1 1030_NT_2 1031_ASD_1 1031_DAD 1031_MOM 1031_NT_1 1031_NT_2 1035_ASD_1 1035_DAD 1035_MOM 1035_NT_1 1039_ASD_1 1039_DAD 1039_MOM 1039_NT_1 1046_DAD 1046_MOM 1046_NT_1 1046_NT_2 1047_ASD_1 1047_DAD 1047_MOM 1048_ASD_1 1048_DAD 1048_MOM 1048_NT_1 1049_DAD 1049_MOM 1049_NT_1 1049_NT_2 1052_ASD_1 1052_DAD 1052_MOM 1052_NT_1 1053_ASD_1 1053_DAD 1053_MOM 1053_NT_1 1053_NT_2 1055_ASD_1 1055_DAD 1055_MOM 1055_NT_1 1059_ASD_1 1059_DAD 1059_MOM 1059_NT_1 1060_DAD 1060_MOM 1060_NT_1 1065_ASD_1 1065_DAD 1065_MOM 1065_NT_1 1065_NT_2 1068_ASD_1 1068_DAD 1068_MOM 1068_NT_1 1070_ASD_1 1070_DAD 1070_MOM 1070_NT_1 1079_ASD_1 1079_DAD 1079_MOM 1079_NT_1 1080_ASD_1 1080_DAD 1080_MOM 1080_NT_1 1081_ASD_1 1081_DAD 1081_MOM 1082_ASD_1 1082_DAD 1082_MOM 1082_NT_1 1093_ASD_1 1093_DAD 1093_MOM 1093_NT_1 1094_ASD_1 1094_DAD 1094_MOM 1094_NT_1 1108_ASD_1 1108_DAD 1108_MOM 1108_NT_1 1111_ASD_1 1111_DAD 1111_MOM 1111

072_ASD_1 072_DAD 072_MOM 072_NT_1 1012_ASD_1 1012_DAD 1012_MOM 1012_NT_1 1030_DAD 1030_MOM 1030_NT_1 1030_NT_2 1031_ASD_1 1031_DAD 1031_MOM 1031_NT_1 1031_NT_2 1035_ASD_1 1035_DAD 1035_MOM 1035_NT_1 1039_ASD_1 1039_DAD 1039_MOM 1039_NT_1 1046_DAD 1046_MOM 1046_NT_1 1046_NT_2 1047_ASD_1 1047_DAD 1047_MOM 1048_ASD_1 1048_DAD 1048_MOM 1048_NT_1 1049_DAD 1049_MOM 1049_NT_1 1049_NT_2 1052_ASD_1 1052_DAD 1052_MOM 1052_NT_1 1053_ASD_1 1053_DAD 1053_MOM 1053_NT_1 1053_NT_2 1055_ASD_1 1055_DAD 1055_MOM 1055_NT_1 1059_ASD_1 1059_DAD 1059_MOM 1059_NT_1 1060_DAD 1060_MOM 1060_NT_1 1065_ASD_1 1065_DAD 1065_MOM 1065_NT_1 1065_NT_2 1068_ASD_1 1068_DAD 1068_MOM 1068_NT_1 1070_ASD_1 1070_DAD 1070_MOM 1070_NT_1 1079_ASD_1 1079_DAD 1079_MOM 1079_NT_1 1080_ASD_1 1080_DAD 1080_MOM 1080_NT_1 1081_ASD_1 1081_DAD 1081_MOM 1082_ASD_1 1082_DAD 1082_MOM 1082_NT_1 1093_ASD_1 1093_DAD 1093_MOM 1093_NT_1 1094_ASD_1 1094_DAD 1094_MOM 1094_NT_1 1108_ASD_1 1108_DAD 1108_MOM 1108_NT_1 1111_ASD_1 1111_DAD 1111_MOM 1111

072_ASD_1 072_DAD 072_MOM 072_NT_1 1012_ASD_1 1012_DAD 1012_MOM 1012_NT_1 1030_DAD 1030_MOM 1030_NT_1 1030_NT_2 1031_ASD_1 1031_DAD 1031_MOM 1031_NT_1 1031_NT_2 1035_ASD_1 1035_DAD 1035_MOM 1035_NT_1 1039_ASD_1 1039_DAD 1039_MOM 1039_NT_1 1046_DAD 1046_MOM 1046_NT_1 1046_NT_2 1047_ASD_1 1047_DAD 1047_MOM 1048_ASD_1 1048_DAD 1048_MOM 1048_NT_1 1049_DAD 1049_MOM 1049_NT_1 1049_NT_2 1052_ASD_1 1052_DAD 1052_MOM 1052_NT_1 1053_ASD_1 1053_DAD 1053_MOM 1053_NT_1 1053_NT_2 1055_ASD_1 1055_DAD 1055_MOM 1055_NT_1 1059_ASD_1 1059_DAD 1059_MOM 1059_NT_1 1060_DAD 1060_MOM 1060_NT_1 1065_ASD_1 1065_DAD 1065_MOM 1065_NT_1 1065_NT_2 1068_ASD_1 1068_DAD 1068_MOM 1068_NT_1 1070_ASD_1 1070_DAD 1070_MOM 1070_NT_1 1079_ASD_1 1079_DAD 1079_MOM 1079_NT_1 1080_ASD_1 1080_DAD 1080_MOM 1080_NT_1 1081_ASD_1 1081_DAD 1081_MOM 1082_ASD_1 1082_DAD 1082_MOM 1082_NT_1 1093_ASD_1 1093_DAD 1093_MOM 1093_NT_1 1094_ASD_1 1094_DAD 1094_MOM 1094_NT_1 1108_ASD_1 1108_DAD 1108_MOM 1108_NT_1 1111_ASD_1 1111_DAD 1111_MOM 1111