In [2]:
from collections import defaultdict
import numpy as np
from scipy.sparse import csc_matrix, save_npz, load_npz
import time
import gzip
from itertools import product


In [3]:
# Pull arguments
reference = 'hg19'
vcf_files = ['../data/GenotypeFiles/High_Confidence_Calls_HG19.vcf.gz', 
             '../data/GenotypeFiles/manuscript_supplement_filtered_ped_consistent.vcf.gz',
             '../data/GenotypeFiles/manuscript_supplement_hq_fails.vcf.gz']
out_directory = '../split_gen_platinum'
maxsize = 500000000

In [21]:
for chrom in ['chr%d' % c for c in range(1, 23)] + ['chrX', 'chrY']:
    chrom_int = 23 if chrom == 'chrX' else 24 if chrom == 'chrY' else 25 if chrom == 'chrMT' else int(chrom[3:])

    t0 = time.time()

    # pull sample_ids and write to file
    with gzip.open(vcf_files[0], 'rt') as f:
        # Skip header
        line = next(f)
        while line.startswith('##'):
            line = next(f)

    with open('%s/chr.%s.gen.samples.txt' % (out_directory, chrom[3:]), 'w+') as sample_f:
        sample_ids = line.strip().split('\t')[9:]
        sample_f.write('\n'.join(sample_ids))

    sample_id_to_index = dict([(x, i) for i, x in enumerate(sample_ids)])
    m = len(sample_ids)
    print('Num individuals with genomic data', len(sample_ids))

    # Pull data from vcf
    variant_info = []
    chrom_coord = []

    # Pull genotypes from vcf
    m = len(sample_ids)
    data, indices, indptr = np.zeros((maxsize,), dtype=np.int8), np.zeros((maxsize,), dtype=int), [0]
    gen_mapping = {'./.': -1, '0/0': 0, '0|0': 0, '0/1': 1, '0|1': 1, '1/0': 1, '1|0': 1, '1/1': 2, '1|1': 2}
    index, num_lines = 0, 0
    
    refs = []
    alts = []

    for vcf_file in vcf_files:
        print(vcf_file)
        with gzip.open(vcf_file, 'rt') as f:

            # Skip header
            line = next(f)
            while line.startswith('##'):
                line = next(f)

            sample_ids = line.strip().split('\t')[9:]
            col_indices = [sample_id_to_index[x] for x in sample_ids]

            line = next(f)
            for line in f:
                pieces = line.split('\t', maxsplit=1)

                if pieces[0] == chrom:
                    pieces = line.strip().split('\t')
                    _, pos, _, ref, alt = pieces[:5]
                    is_biallelic_snp = len(ref) == 1 and len(alt) == 1 and ref != '.' and alt != '.'
                    
                    if is_biallelic_snp and 'bwa_gatk' in pieces[7]:
                        refs.append(ref)
                        alts.append(alt)
                        fmt = pieces[8].strip().split(':')

                        # Write variant to file
                        variant_info.append('\t'.join(pieces[:9]) + '\n')
                        chrom_coord.append((chrom_int, int(pos), is_biallelic_snp, True))

                        # Pull out genotypes
                        gen_index = fmt.index('GT')
                        for i, piece in zip(col_indices, pieces[9:]):
                            segment = piece.split(':', maxsplit=gen_index+1)
                            gt = gen_mapping.get(segment[gen_index], -1) # For now we mark multi-base loci as unknown

                            if gt != 0:
                                indices[index] = i
                                data[index] = gt
                                index += 1
                        indptr.append(index)
                        num_lines += 1

    # reorder since data is coming from multiple files
    chrom_coord = np.asarray(chrom_coord, dtype=int)
    ordered_indices = np.argsort(chrom_coord[:, 1])

    chrom_coord = chrom_coord[ordered_indices, :]
    gen = csc_matrix((data[:index], indices[:index], indptr), shape=(m, num_lines), dtype=np.int8)[:, ordered_indices]
    refs = [refs[i] for i in ordered_indices]
    alts = [alts[i] for i in ordered_indices]
    
    # remove sites with same alt and genotypes
    keep_site = np.ones((chrom_coord.shape[0],), dtype=bool)
    for i in np.where(chrom_coord[:-1, 1] == chrom_coord[1:, 1])[0]:
        # positions are the same
        if alts[i] == alts[i+1] and np.all(gen[:, i].A.flatten() == gen[:, i+1].A.flatten()):
            keep_site[i+1] = False
        elif alts[i] != alts[i+1]:
            print(chrom_coord[i, 1], 'Alt mismatch', alts[i], alts[i+1], refs[i], refs[i+1])
        else:
            print(chrom_coord[i, 1], 'Gens mismatch', gen[:, i].A.flatten(), gen[:, i+1].A.flatten())
    print(np.sum(chrom_coord[:-1, 1] == chrom_coord[1:, 1]), chrom_coord.shape[0])
    
    with gzip.open('%s/chr.%s.gen.variants.txt.gz' % (out_directory, chrom[3:]), 'wt') as variant_f:
        for i in ordered_indices:
            variant_f.write(variant_info[i])

    # Save to file
    print(gen.shape)
    save_npz('%s/chr.%s.gen' % (out_directory, chrom[3:]), gen)
    np.save('%s/chr.%s.gen.coordinates' % (out_directory, chrom[3:]), chrom_coord)

    print('Completed in ', time.time()-t0, 'sec')


Num individuals with genomic data 13
../data/GenotypeFiles/High_Confidence_Calls_HG19.vcf.gz
../data/GenotypeFiles/manuscript_supplement_filtered_ped_consistent.vcf.gz
../data/GenotypeFiles/manuscript_supplement_hq_fails.vcf.gz
194964483 Gens mismatch [1 0 1 1 1 1 1 1 1 1 1 1 1] [2 0 1 1 1 1 1 1 1 1 1 1 1]
212227749 Gens mismatch [0 1 0 1 1 1 0 1 1 0 0 0 1] [0 1 0 1 1 0 0 1 1 0 0 0 1]
2 382263
(13, 382263)
Completed in  19.57804226875305 sec
Num individuals with genomic data 13
../data/GenotypeFiles/High_Confidence_Calls_HG19.vcf.gz
../data/GenotypeFiles/manuscript_supplement_filtered_ped_consistent.vcf.gz
../data/GenotypeFiles/manuscript_supplement_hq_fails.vcf.gz
55748731 Gens mismatch [0 1 1 0 0 1 1 0 1 1 0 0 1] [0 1 1 0 0 1 1 0 1 1 0 1 1]
1 396795
(13, 396795)
Completed in  19.523746013641357 sec
Num individuals with genomic data 13
../data/GenotypeFiles/High_Confidence_Calls_HG19.vcf.gz
../data/GenotypeFiles/manuscript_supplement_filtered_ped_consistent.vcf.gz
../data/GenotypeFile

IndexError: too many indices for array

In [17]:
chrom_coord = np.load('../split_gen_platinum/chr.1.gen.coordinates.npy')[:, 1]
gen_data = load_npz('../split_gen_platinum/chr.1.gen.npz')


In [18]:
print(chrom_coord)

[   101235    102951    108198 ... 249206148 249209140 249216384]


In [19]:
for i in range(1, chrom_coord.shape[0]):
    if chrom_coord[i] == chrom_coord[i-1]:
        print(chrom_coord[i-1], chrom_coord[i])
        print(gen_data[:, i-1:i+1].A)

194964483 194964483
[[1 2]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]]
212227749 212227749
[[0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]]


In [41]:
match_next = chrom_coord[1:] == chrom_coord[:-1]
gen_match_next = 