In [28]:
import vcf as PyVCF
from vcf.parser import _Contig, _Format, _Info, _Filter, _Call
from vcf.model import make_calldata_tuple
import collections

In [35]:
# Open the output VCF in text-write mode; the handle is passed to PyVCF.Writer
# in a later cell.
# NOTE(review): nothing closes `f` directly — presumably vcf.close() releases
# the underlying stream; confirm.
f = open('./output.vcf', 'w')

In [36]:
class PyVcfTemplate(object):
    """Plain attribute holder used as the header template for ``PyVCF.Writer``.

    Exposes the header sections (``infos``, ``metadata``, ``formats``,
    ``filters``, ``alts``, ``contigs``), the fixed ``_column_headers`` list
    and the ``samples`` list as instance attributes.
    """

    def __init__(self, infos=None, metadata=None, formats=None, filters=None, alts=None, contigs=None, samples=None):
        """Store the header sections, wrapping raw specs in PyVCF header tuples.

        Args:
            infos: mapping of INFO id -> (number, type, description).
            metadata: mapping stored unchanged.
            formats: mapping of FORMAT id -> positional arguments passed to
                ``_Format`` after the id.
            filters: mapping of FILTER id -> value passed to ``_Filter``
                after the id.
            alts: mapping stored unchanged.
            contigs: iterable of contig ids; iteration order is preserved.
            samples: list of sample names, stored unchanged.

        Any argument left as ``None`` is replaced by a fresh empty container.
        """
        # Fresh containers per instance: avoids sharing mutable defaults.
        infos = {} if infos is None else infos
        metadata = {} if metadata is None else metadata
        formats = {} if formats is None else formats
        filters = {} if filters is None else filters
        alts = {} if alts is None else alts
        contigs = {} if contigs is None else contigs
        samples = [] if samples is None else samples

        info_records = {}
        for info_id, spec in infos.items():
            info_records[info_id] = self._get_info(info_id, *spec)
        self.infos = info_records

        self.metadata = metadata

        format_records = {}
        for format_id, spec in formats.items():
            format_records[format_id] = _Format(format_id, *spec)
        self.formats = format_records

        filter_records = {}
        for filter_id, description in filters.items():
            filter_records[filter_id] = _Filter(filter_id, description)
        self.filters = filter_records

        self.alts = alts

        # Ordered so contigs keep the order in which they were supplied.
        contig_records = collections.OrderedDict()
        for contig_id in contigs:
            contig_records[contig_id] = _Contig(contig_id, None)
        self.contigs = contig_records

        self._column_headers = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
        self.samples = samples

    def _get_info(self, name, number, type, description):
        """Build an ``_Info`` tuple for one INFO field.

        The two trailing positional fields are filled with the placeholder
        ``'_'``.
        NOTE(review): presumably these are ``source`` and ``version`` —
        confirm against the installed PyVCF version.
        """
        placeholder = '_'
        return _Info(name, number, type, description, placeholder, placeholder)
        
# def format_array_to_dict(number, type, description):
#     return {
#         'Number': number,
#         'Type': type,
#         'Description': description
#     }
# Human contig names without a "chr" prefix: autosomes 1-22 followed by the
# sex chromosomes. range() excludes its stop value, so a stop of 23 ends the
# numeric names at "22" (a stop of 24 would add a spurious contig "23").
contigs = [str(num) for num in range(1, 23)] + ['X', 'Y']

# Header description for the VCF about to be written: file-format version,
# sample columns, contig list and the FORMAT / INFO / FILTER declarations.
template = PyVcfTemplate(
    metadata=dict(fileformat='VCFv4.2'),
    samples=['SAMPLE1', 'SAMPLE2', 'SAMPLE3'],
    contigs=contigs,
    # Each FORMAT / INFO spec is [number, type, description].
    formats=dict(
        GT=[1, 'String', 'Genotype'],
        AD=['.', 'Integer', 'Allelic depths for the ref and alt alleles in the order listed'],
    ),
    infos=dict(
        END=[1, 'Integer', 'Stop position of the interval'],
    ),
    filters=dict(
        PASS='All filters passed',
    ),
)

# Writer bound to the already-open output handle `f`, using the template above.
vcf = PyVCF.Writer(f, template)

In [37]:
# Write one synthetic record per contig, each with the same three sample calls.

# Sample name -> column index, plus the inverse lookup used to name each call.
sample_indexes = {'SAMPLE1': 0, 'SAMPLE2': 1, 'SAMPLE3': 2}
index_to_sample = {index: name for name, index in sample_indexes.items()}

# FORMAT string shared by every record, and the per-sample values listed in
# sample-index order (0, 1, 2).
record_format = 'GT:AD'
call_values = [
    {'GT': '0/1', 'AD': '20,23'},
    {'GT': '.', 'AD': '.,.'},
    {'GT': '.', 'AD': '.'},
]

# Loop-invariant: the call-data tuple type depends only on the FORMAT keys.
calldata_tuple = make_calldata_tuple(record_format.split(':'))

try:
    for chromosome in contigs:
        # Positional arguments follow the VCF column order
        # (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT),
        # then the sample-name -> index mapping.
        record = PyVCF.model._Record(
            chromosome,
            2020,  # POS
            '.',
            'A',
            [PyVCF.model._Substitution('ACGTAGATTAC')],
            100,
            'PASS',
            {'END': 2031},
            record_format,
            dict(sample_indexes),  # fresh dict per record, as before
        )
        # Materialise the calls as a list: on Python 3 `map` returns a
        # one-shot iterator, which would leave `record.samples` exhausted
        # after the first pass and impossible to index.
        record.samples = [
            _Call(record, index_to_sample[index], calldata_tuple(**values))
            for index, values in enumerate(call_values)
        ]
        vcf.write_record(record)
finally:
    # Close the writer even if building or writing a record fails.
    vcf.close()

In [None]:
# f = open('./sample.vcf', 'r')

In [None]:
# reader = PyVCF.Reader(f)
# for record in reader:
#     print(record.samples)
#     break

In [None]:
# Declare the VCF file-format version on the header.
# NOTE(review): `header` is never assigned in any visible cell (the only
# assignment, further down, is commented out) — presumably a
# pysam.VariantHeader; confirm and define it before this cell runs.
# NOTE(review): the PyVCF template earlier in the notebook declares VCFv4.2,
# this cell declares VCFv4.1 — confirm which version is intended.
header.add_meta('fileformat', value='VCFv4.1')

In [None]:
# Declare the per-sample FORMAT fields as (id, number, type, description).
# GT holds a single genotype string per sample, so its Number is 1 — this
# also matches the GT declaration in the PyVCF template earlier in the notebook.
header.formats.add('GT', 1, 'String', 'Genotype')
header.formats.add('AD', '.', 'Integer', 'Allelic depths for the ref and alt alleles in the order listed')

In [None]:
# Register 4000 sample columns on the header, named SAMPLE0 .. SAMPLE3999.
for sample in range(4000):
    header.add_sample('SAMPLE{}'.format(sample))

In [None]:
# Contig names with a uniform "chr" prefix: chr1..chr22, chrX, chrY.
# The prefix is applied to the sex chromosomes too, so the list does not mix
# prefixed ("chr1") and bare ("X") naming conventions.
chromosomes = ['chr' + name for name in [str(num) for num in range(1, 23)] + ['X', 'Y']]
# Register every contig name on the header, in list order.
for chromosome in chromosomes:
    header.contigs.add(chromosome)

In [None]:
# Declare the END INFO field (Number=1, Type=Integer), then print what
# iterating header.contigs yields and the string form of header.info.header
# as a quick check of the header built so far.
header.info.add('END', 1, 'Integer', 'Stop position of the interval')
print(list(header.contigs))
print(str(header.info.header))

In [None]:
# Open ./output.vcf for writing with the header assembled in the cells above.
# NOTE(review): `pysam` is not imported in any visible cell — confirm an
# import exists before this cell runs.
# NOTE(review): this rebinds `vcf` (earlier a PyVCF.Writer) and targets the
# same ./output.vcf path the PyVCF cells write to — confirm that is intended.
# vcf_in = pysam.VariantFile('./test.vcf', 'r')
# header = vcf_in.header
vcf = pysam.VariantFile('./output.vcf', 'w', header=header)

In [None]:
# Print the string form of the INFO header. Written as a call so the cell
# parses on Python 3 as well as Python 2, like the other print cells here.
print(str(header.info.header))

In [None]:
# Write one synthetic record per contig. `record` is pre-bound so the name
# exists afterwards even when `chromosomes` is empty.
record = None
for chromosome in chromosomes:
    record = header.new_record(
        contig=chromosome,
        alleles=('A', 'AC'),
        filter='.',
        id='.',
        start=100,
        qual=100,
        # 4000 entries, matching the 4000 samples registered on the header:
        # a called genotype alternating with an empty call. Floor division
        # keeps the repeat count an int on Python 3 (a float cannot multiply
        # a list).
        samples=[{'GT': (0, 1), 'AD': (20, 23)}, {}] * (4000 // 2)
    )
    vcf.write(record)

In [None]:
# Show the contig of the most recently created record as the cell output.
record.contig

In [None]:
# Close the output file opened with pysam.VariantFile.
vcf.close()
