In [1]:
import gzip 
import numpy as np
import sys

def extractGT(FORMAT_STR):
    """Split one VCF sample column into its diploid genotype and remaining fields.

    Example: ``'0|0:0:1,0,0:0,222,178:80:79,1'`` returns
    ``([0, 0], ['0', '1,0,0', '0,222,178', '80', '79,1'])``.

    Parameters
    ----------
    FORMAT_STR : str
        Colon-separated sample column whose first field is the genotype (GT).

    Returns
    -------
    tuple
        ``([allele1, allele2], rest)`` where the alleles are ints and ``rest``
        is the list of the remaining colon-separated fields (as strings).

    Raises
    ------
    ValueError
        If the genotype is not made of exactly two integer alleles
        (for example a missing call ``./.`` or a haploid call).
    """
    fields = FORMAT_STR.split(':')
    gt = fields[0]
    # Split on the separator ('|' phased, '/' unphased) rather than indexing
    # characters 0 and 2, so multi-digit allele indices such as '10|2' parse.
    alleles = gt.replace('|', '/').split('/')
    if len(alleles) != 2:
        raise ValueError(f'expected a diploid genotype, got {gt!r}')
    return [int(alleles[0]), int(alleles[1])], fields[1:]

def phase(Mother, Father, Son):
    """Phase the son's genotype from the parents' genotypes.

    Parameters
    ----------
    Mother, Father, Son : list of int
        Two-element allele lists, e.g. ``[0, 1]``.

    Returns
    -------
    list of int or None
        ``[maternal_allele, paternal_allele]`` when the parental origin of the
        son's alleles can be determined. ``Son`` itself is returned unchanged
        when the son is homozygous (nothing to phase) or when both parental
        assignments are consistent (e.g. all three are heterozygous for the
        same alleles), i.e. the site cannot be phased from the trio alone.
        ``None`` is returned, after printing a diagnostic, when neither
        assignment is consistent with the parents (Mendelian error).
    """
    son_gt1, son_gt2 = Son

    # Compare the alleles directly instead of summing allele indices: a sum
    # only identifies zygosity for biallelic 0/1 calls ([0, 2] also sums to 2).
    if son_gt1 == son_gt2:
        return Son

    # The two possible origins of the son's alleles.
    first_is_maternal = son_gt1 in Mother and son_gt2 in Father
    second_is_maternal = son_gt2 in Mother and son_gt1 in Father

    if first_is_maternal and second_is_maternal:
        # Both assignments fit the parents: ambiguous, leave as given.
        return Son
    if first_is_maternal:
        return [son_gt1, son_gt2]
    if second_is_maternal:
        return [son_gt2, son_gt1]

    # Neither assignment is possible given the parents' genotypes.
    print('mendelian error!')
    print(f'{Mother}, {Father}, {Son}')
    return None



In [None]:
# Phase the son's genotype at every site of the trio VCF and write the result.
# NOTE(review): the cell reading 'AF.ch3.1kg.vcf.gz' repeats this logic verbatim;
# consider factoring both into one function parameterized by the file paths.
with gzip.open('AF.trio.ch3.vcf.gz', 'rt') as f:
    with open('AF.trio.ch3.phased.vcf', 'w') as out:
        for line in f:
            if line.startswith('#'):
                # Header lines are copied through unchanged.
                out.write(line)
            else:
                # Column names deliberately avoid `id`, `filter` and `format`:
                # binding those at notebook top level would shadow the Python
                # builtins for every cell executed afterwards.
                (chrom, pos, variant_id, ref, alt, qual, filter_field, info,
                 format_field, mother, father, son1) = line.strip().split('\t')
                # Only the parents' genotypes are needed; their columns are
                # written back untouched.
                mother_gt = extractGT(mother)[0]
                father_gt = extractGT(father)[0]
                son1_gt, son1_format = extractGT(son1)
                # don't write tri-het sites to the output file
                if sum(mother_gt) == sum(father_gt) == sum(son1_gt) == 1:
                    print('skipping tri het sites')
                    continue
                son1_phased_gt = phase(mother_gt, father_gt, son1_gt)
                # phase() returns None on a Mendelian error; such sites are
                # dropped from the output.
                if son1_phased_gt is not None:
                    son1_phased = '|'.join(map(str, son1_phased_gt))
                    # Rebuild the son's column with the phased GT first.
                    son1 = ':'.join([son1_phased] + son1_format)
                    new_line = "\t".join([chrom, pos, variant_id, ref, alt, qual,
                                          filter_field, info, format_field,
                                          mother, father, son1])
                    out.write(f'{new_line}\n')

In [3]:
# Phase the son's genotype at every site of the 1kg-subset VCF and write the result.
# NOTE(review): the cell reading 'AF.trio.ch3.vcf.gz' repeats this logic verbatim;
# consider factoring both into one function parameterized by the file paths.
with gzip.open('AF.ch3.1kg.vcf.gz', 'rt') as f:
    with open('AF.ch3.trioPhased.1kg.vcf', 'w') as out:
        for line in f:
            if line.startswith('#'):
                # Header lines are copied through unchanged.
                out.write(line)
            else:
                # Column names deliberately avoid `id`, `filter` and `format`:
                # binding those at notebook top level would shadow the Python
                # builtins for every cell executed afterwards.
                (chrom, pos, variant_id, ref, alt, qual, filter_field, info,
                 format_field, mother, father, son1) = line.strip().split('\t')
                # Only the parents' genotypes are needed; their columns are
                # written back untouched.
                mother_gt = extractGT(mother)[0]
                father_gt = extractGT(father)[0]
                son1_gt, son1_format = extractGT(son1)
                # don't write tri-het sites to the output file
                if sum(mother_gt) == sum(father_gt) == sum(son1_gt) == 1:
                    print('skipping tri het sites')
                    continue
                son1_phased_gt = phase(mother_gt, father_gt, son1_gt)
                # phase() returns None on a Mendelian error; such sites are
                # dropped from the output.
                if son1_phased_gt is not None:
                    son1_phased = '|'.join(map(str, son1_phased_gt))
                    # Rebuild the son's column with the phased GT first.
                    son1 = ':'.join([son1_phased] + son1_format)
                    new_line = "\t".join([chrom, pos, variant_id, ref, alt, qual,
                                          filter_field, info, format_field,
                                          mother, father, son1])
                    out.write(f'{new_line}\n')

skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
mendelian error!
[0, 0], [0, 0], [0, 1]
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
mendelian error!
[0, 0], [0, 0], [1, 0]
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
mendelian error!
[0, 0], [0, 0], [1, 0]
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skipping tri het sites
skippi