In [6]:
import pysam

In [15]:
import os
import pandas as pd

In [30]:


def parse_vcf(vcf_file, chrom, start, end):
    vcf_reader = pysam.VariantFile(vcf_file)
    
    records = []
    for record in vcf_reader.fetch(chrom, start, end):
        if 'ANN' not in record.info:
            continue
        for sample in record.samples:
            if record.alts is None:
                alts = None
            else:
                alts = '/'.join(str(a) for a in record.alts)
                
            if record.info['TYPE'] is None:
                rec_type = None
            else:
                rec_type = '/'.join(str(t) for t in record.info['TYPE'])

            info = record.info['ANN'][0].split('|')
            records.append({
                'File': vcf_file,
                'Location': f"{record.chrom}:{record.pos}",
                'Reference': record.ref,
                'Alternate': alts,
                'Quality': record.qual,
                'Type': rec_type,
                'Allele frequency': sample['AF'] if 'AF' in sample else None,
                'Gene': info[3],
                'Impact': info[2],
                'Consequence': info[1],
                'RSID': record.id
            })
    
    return pd.DataFrame(records)

# Now, call the function:
df = parse_vcf("/Volumes/T7/genomicdata/vcf_amp_ad/29664-DLPFC.recalibrated.haplotypeCalls.annotated.vcf.gz", "chr19", 7926718, 7943667)

# And print the first few rows to check it:
print(df.head())


Empty DataFrame
Columns: []
Index: []


In [29]:
parse_vcf("/Volumes/T7/genomicdata/vcf_amp_ad/29664-DLPFC.recalibrated.haplotypeCalls.annotated.vcf.gz", chrom1, start1, end1)

[]

In [9]:
vcf_file = pysam.VariantFile("/Volumes/T7/genomicdata/vcf_amp_ad/29664-DLPFC.recalibrated.haplotypeCalls.annotated.vcf.gz")

# Print the info field of the first 5 records
for i, rec in enumerate(vcf_file.fetch()):
    print(rec.info)
    if i >= 4:
        break


<pysam.libcbcf.VariantRecordInfo object at 0x105830d30>
<pysam.libcbcf.VariantRecordInfo object at 0x105830f10>
<pysam.libcbcf.VariantRecordInfo object at 0x105830f70>
<pysam.libcbcf.VariantRecordInfo object at 0x105830f10>
<pysam.libcbcf.VariantRecordInfo object at 0x105830f70>


In [27]:
# Specify the region of interest (TIMM44 gene location)
chrom1 = "chr19"   # chromosome
start1 = 7926718  # start position
end1 = 7943667  # end position

# Fetch records from the specified region
for rec in vcf_file.fetch(chrom1, start1, end1):
    print(rec)


chr19	7927087	.	C	T	761.77	PASS	AC=1;AF=0.5;AN=2;BaseQRankSum=0.07;ClippingRankSum=-1.156;DP=49;ExcessHet=3.0103;FS=0;MLEAC=1;MLEAF=0.5;MQ=60;MQ0=0;MQRankSum=-1.598;POSITIVE_TRAIN_SITE;QD=15.55;ReadPosRankSum=-0.472;SOR=0.735;VQSLOD=15.84;culprit=MQ;CSQ=T|downstream_gene_variant|MODIFIER|SNAPC2|ENSG00000104976|Transcript|ENST00000221573|protein_coding||||||||||rs59433043|3837|1||SNV|HGNC|HGNC:11135|1|P1|C|C|||CTXN1|||||||||||||8.298|0.620600||||||||||||||||-4.11800003051758|0|||||rs59433043|1.45129e-02|30938||||||||||||||||||||||||,T|3_prime_UTR_variant|MODIFIER|TIMM44|ENSG00000104980|Transcript|ENST00000270538|protein_coding|13/13||ENST00000270538.7:c.*100G>A||1728|||||rs59433043||-1||SNV|HGNC|HGNC:17316|1|P1|C|C|||CTXN1|||||||||||||8.298|0.620600||||||||||||||||-4.11800003051758|0|||||rs59433043|1.45129e-02|30938||||||||||||||||||||||||,T|upstream_gene_variant|MODIFIER|CTXN1|ENSG00000178531|Transcript|ENST00000318978|protein_coding||||||||||rs59433043|921|-1||SNV|HGNC|HGNC:31108|1|P1

In [13]:
with open('output.txt', 'w') as f:
    for rec in vcf_file.fetch(chrom, start, end):
        f.write(f'{rec.chrom}\t{rec.pos}\t{rec.id}\t{rec.ref}\t{rec.alts}\t{rec.qual}\t{rec.filter}\t{rec.info}\n')


In [35]:

vcf_file = pysam.VariantFile("/Volumes/T7/genomicdata/vcf_amp_ad/29664-DLPFC.recalibrated.haplotypeCalls.annotated.vcf.gz")  # specify the VCF file path

# Specify the region of interest (TIMM44 gene location)
chrom1 = "chr19"   # chromosome
start1 = 7926718  # start position
end1 = 7943667  # end position

# Define an empty DataFrame
df = pd.DataFrame(columns=['chrom', 'pos', 'ref', 'alt', 'qual', 'filter', 'info'])

# Fetch records from the specified region
for rec in vcf_file.fetch(chrom1, start1, end1):
    # Fetch the required details from each record
    chrom = rec.chrom
    pos = rec.pos
    ref = rec.ref
    alt = str(rec.alts[0])  # Let's consider only the first alternate allele
    qual = rec.qual
    filter_pass = rec.filter.keys()[0]  # Consider the first filter
    info = dict(rec.info)  # Convert the info field into a dictionary

    # Append these details into our DataFrame
    df = df.append({'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt, 'qual': qual, 'filter': filter_pass, 'info': info}, ignore_index=True)

# Print the DataFrame
print(df)

   chrom      pos ref alt         qual filter  \
0  chr19  7927087   C   T   761.770020   PASS   
1  chr19  7928849   G   A   336.769989   PASS   
2  chr19  7928923  CA   C    30.730000   PASS   
3  chr19  7930131   T   G   969.770020   PASS   
4  chr19  7931929   G   A   738.770020   PASS   
5  chr19  7935167  CT   C    50.730000   PASS   
6  chr19  7938665   C   G  1124.770020   PASS   
7  chr19  7939010  GT   G  1418.729980   PASS   
8  chr19  7939037  AG   A  1204.729980   PASS   
9  chr19  7941174   T   C   619.770020   PASS   

                                                info  
0  {'AC': (1,), 'AF': (0.5,), 'AN': 2, 'BaseQRank...  
1  {'AC': (1,), 'AF': (0.5,), 'AN': 2, 'BaseQRank...  
2  {'AC': (1,), 'AF': (0.5,), 'AN': 2, 'BaseQRank...  
3  {'AC': (2,), 'AF': (1.0,), 'AN': 2, 'DP': 30, ...  
4  {'AC': (1,), 'AF': (0.5,), 'AN': 2, 'BaseQRank...  
5  {'AC': (1,), 'AF': (0.5,), 'AN': 2, 'BaseQRank...  
6  {'AC': (2,), 'AF': (1.0,), 'AN': 2, 'DP': 35, ...  
7  {'AC': (2,), 'AF'

  df = df.append({'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt, 'qual': qual, 'filter': filter_pass, 'info': info}, ignore_index=True)
  df = df.append({'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt, 'qual': qual, 'filter': filter_pass, 'info': info}, ignore_index=True)
  df = df.append({'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt, 'qual': qual, 'filter': filter_pass, 'info': info}, ignore_index=True)
  df = df.append({'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt, 'qual': qual, 'filter': filter_pass, 'info': info}, ignore_index=True)
  df = df.append({'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt, 'qual': qual, 'filter': filter_pass, 'info': info}, ignore_index=True)
  df = df.append({'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt, 'qual': qual, 'filter': filter_pass, 'info': info}, ignore_index=True)
  df = df.append({'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt, 'qual': qual, 'filter': filter_pass, 'info': info}, ignore_index=True)
  df = df.app

In [None]:
/Volumes/T7/genomicdata/vcf_amp_ad/213939-D.recalibrated.haplotypeCalls.annotated.vcf.gz