In [1]:
import csv
import gzip
import os

def get_genotype(format_string, data_string):
    """Return the genotype (GT) value of a sample, wrapped in square brackets.

    format_string is the colon-separated list of subfield names from the
    format column (e.g. 'GT:GQX:DP:DPF'); data_string is the colon-separated
    list of values from the sample column (e.g. '0/0:21:8:0'). The two are
    paired up position by position, so the example returns '[0/0]'.

    Raises KeyError when no 'GT' name ends up paired with a value.
    """
    names = format_string.split(":")
    values = data_string.split(":")
    #subfield name --> subfield value; zip stops at the shorter of the two lists
    fields = dict(zip(names, values))
    return '[%s]' % fields['GT']

In [None]:
#The code in this cell writes out one file per chromosome of one person's gVCF file.
#Each file contains the PASS[0/0] count for that chromosome in the first row. The
#rest of the rows have position in one column and category in another column (excluding PASS[0/0]).

def _write_chrom_file(chrom, pass_00_count, pos_category):
    """Write the summary file for one chromosome.

    The file is named '<chrom> position and category.gvcf' and is tab
    separated. Its first row holds only the PASS[0/0] count; every following
    row is one [position, category] pair from pos_category.

    The same layout is used for every chromosome, including the last one, so
    that all output files can be parsed the same way.
    """
    #NOTE: 'wb' is the Python 2 mode for csv.writer. Under Python 3 the csv module
    #needs a text-mode file instead: open(name, 'w', newline='').
    with open('%s position and category.gvcf'%chrom, 'wb') as output_file:
        output_writer = csv.writer(output_file, delimiter = '\t')
        output_writer.writerow([pass_00_count]) #print the pass00 count
        output_writer.writerows(pos_category) #print each line's position and category


#NOTE: gzip.open defaults to binary mode, which is what the Python 2 csv.reader expects.
#Under Python 3 the file would have to be opened in text mode (mode 'rt').
with gzip.open('/40TB_3/InovaGenomes/Illumina/LP6005636-DNA_F04/ASM/LP6005636-DNA_F04.genome.vcf.gz') as gvcfFile:
    pos_category = []
    #None means "no data row seen yet"; the first data row sets the chromosome, so no
    #file is written for a chromosome that never appeared in the input.
    chrom = None
    pass_00_count = 0
    #Index of the last row (header lines included) that gets read; reading stops after it.
    #Set to None to read the whole file.
    max_row_index = 7976
    gvcfReader = csv.reader(gvcfFile, delimiter = '\t')
    col_labels = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'Sample']
    #column label --> column index, so that rows can be addressed by label
    col_dict = {label: index for index, label in enumerate(col_labels)}

    for i, row in enumerate(gvcfReader):
        #Skip blank lines (csv.reader yields an empty list for them) and '#' header lines.
        if row and not row[0].startswith("#"):
            current_chrom = row[col_dict['CHROM']]
            pos = row[col_dict['POS']]
            category = row[col_dict['FILTER']]
            if category == 'PASS':
                #PASS rows are further split by genotype, e.g. 'PASS[0/0]' or 'PASS[0/1]'
                formatD = row[col_dict['FORMAT']]
                data = row[col_dict['Sample']]
                category += get_genotype(formatD, data)
            if current_chrom != chrom: #if we have reached a new chromosome, create a file for the old
                if chrom is not None:
                    _write_chrom_file(chrom, pass_00_count, pos_category)
                #reset variables for the new chromosome
                pass_00_count = 0
                pos_category = []
                chrom = current_chrom

            #Regardless of whether we switched chromosomes or not, we still need to increment count and
            #add a new value to the list. That's why the block below isn't under an else clause.
            if category == 'PASS[0/0]': #keep track of PASS[0/0] but don't add to list
                pass_00_count += 1
            else: #add to list that will be written later
                pos_category.append([pos, category]) #how about duplicate positions?
        if max_row_index is not None and i >= max_row_index:
            break
    #When we have reached the end of the input, the last chromosome's data has not been written yet,
    #because a file is only written when a new chromosome appears. Write it now (unless the
    #input contained no data rows at all).
    if chrom is not None:
        _write_chrom_file(chrom, pass_00_count, pos_category)
     