### Usage: Confirm validity of reference sequences vs vcf files
### Author: Temi
### Date: Tues Jan 24 2023

In [3]:
import os
import numpy as np
import cyvcf2
import kipoiseq
import pandas as pd
import multiprocessing

In [5]:
working_dir = '/grand/covid-ct/imlab/users/temi/projects/TFXcan/scripts/'
os.chdir(working_dir)

In [19]:
reference_genome = '/grand/covid-ct/imlab/data/hg_sequences/hg38/Homo_sapiens_assembly38.fasta'
vcf_file = '../genotypes/prj6_genotypes/merged_phased_SNPs.vcf.gz'

Use kipoiseq to read fasta file

In [6]:
class FastaStringExtractor:
    def __init__(self, fasta_file):
        import pyfaidx

        self.fasta = pyfaidx.Fasta(fasta_file)
        self._chromosome_sizes = {k: len(v) for k, v in self.fasta.items()}

    def extract(self, interval, **kwargs) -> str:
        # Truncate interval if it extends beyond the chromosome lengths.

        import kipoiseq
        chromosome_length = self._chromosome_sizes[interval.chrom]
        trimmed_interval = kipoiseq.Interval(interval.chrom,
                                    max(interval.start, 0),
                                    min(interval.end, chromosome_length),
                                    )
        # pyfaidx wants a 1-based interval
        sequence = str(self.fasta.get_seq(trimmed_interval.chrom,
                                            trimmed_interval.start + 1,
                                            trimmed_interval.stop).seq).upper()
        # Fill truncated values with N's.
        pad_upstream = 'N' * max(-interval.start, 0)
        pad_downstream = 'N' * max(interval.end - chromosome_length, 0)
        return pad_upstream + sequence + pad_downstream

    def close(self):
        return self.fasta.close()

A short snippet

In [30]:
region_chr = 'chr1'
region_start = 100001
region_end = 100009
SEQUENCE_LENGTH=len(range(region_start, region_end))

In [31]:
reg_interval = kipoiseq.Interval(region_chr, region_start, region_end).resize(SEQUENCE_LENGTH)
reference_region = fasta_extractor.extract(interval=reg_interval, anchor=[])
reference_region

'CTAAGCAC'

## The experiment

Use cyvcf to extract vcf

There are currently about 11 million variants in this vcf file

For this purpose of this notebook, I will use the top 1 million variants (for memory sake)

In [59]:
stop_at = 11000000

In [60]:
all_variants = []

for i, variant in enumerate(cyvcf2.cyvcf2.VCF(vcf_file)):
    if i > stop_at:
        break
    else:
        out = []
        out.extend([variant.CHROM, variant.start, variant.end, variant.REF, variant.ALT])
        all_variants.append(out)

[W::hts_idx_load3] The index file is older than the data file: ../genotypes/prj6_genotypes/merged_phased_SNPs.vcf.gz.tbi


Here is what the output looks like ( I think it went over by one but it does not matter)

In [61]:
all_variants[0:3] ; len(all_variants)

11000001

Extract the same regions from the reference genome

In [62]:
reference_variants_positions = [[v[0], v[1], v[2]] for v in all_variants]
reference_variants_positions[1:5]

[['chr1', 10247, 10248],
 ['chr1', 10462, 10463],
 ['chr1', 10491, 10492],
 ['chr1', 13272, 13273]]

In [63]:
len(reference_variants_positions)

11000001

In [64]:
reference_variants = []
for vpos in reference_variants_positions:
    slength = len(range(vpos[1], vpos[2]))
    reg_interval = kipoiseq.Interval(vpos[0], vpos[1], vpos[2]).resize(slength)
    reference_region = fasta_extractor.extract(interval=reg_interval, anchor=[])
    reference_variants.append(reference_region)

Convert to a string

In [65]:
reference_variants = ''.join(reference_variants)

Convert the vcf variants to a string too

In [66]:
vcf_ref_variants = ''.join([v[3] for v in all_variants])

Compare the reference variants with the vcf variants

In [67]:
reference_variants == vcf_ref_variants

True

In [69]:
len(reference_variants), len(vcf_ref_variants)

(11000001, 11000001)

They are the same

## Here, I extend the above to the GEUVADIS dataset which is split across different chromosomes :(

In [7]:
reference_genome = '/grand/covid-ct/imlab/data/hg_sequences/hg38/Homo_sapiens_assembly38.fasta'
vcf_files_dir = '/grand/covid-ct/imlab/data/GEUVADIS/vcf_snps_only'

In [8]:
chromosomes = [f'chr{i}' for i in list(range(1, 23))]
chromosomes.extend(['chrX', 'chrY', 'chrM'])
chromosomes[0:5]

['chr1', 'chr2', 'chr3', 'chr4', 'chr5']

In [9]:
fasta_extractor = FastaStringExtractor(reference_genome)

In [10]:
variants_status = {}

for chr in chromosomes:
    print(chr)
    vcf_f = f'{vcf_files_dir}/ALL.{chr}.shapeit2_integrated_SNPs_v2a_27022019.GRCh38.phased.vcf.gz'
    if os.path.isfile(vcf_f):
        chr_variants = []
        for i, variant in enumerate(cyvcf2.cyvcf2.VCF(vcf_f)):
            if i > 5000000:
                break
            else:
                out = []
                out.extend([variant.CHROM, variant.start, variant.end, variant.REF, variant.ALT])
                chr_variants.append(out)
    
    chr_reference_variants_positions = [[v[0], v[1], v[2]] for v in chr_variants]
    print(chr_reference_variants_positions[1:3])

    chr_reference_variants = []
    for vpos in chr_reference_variants_positions:
        slength = len(range(vpos[1], vpos[2]))
        reg_interval = kipoiseq.Interval(vpos[0], vpos[1], vpos[2]).resize(slength)
        reference_region = fasta_extractor.extract(interval=reg_interval, anchor=[])
        chr_reference_variants.append(reference_region)

    chr_reference_variants = ''.join(chr_reference_variants)
    chr_vcf_ref_variants = ''.join([v[3] for v in chr_variants])

    status = chr_reference_variants == chr_vcf_ref_variants
    print(f'    {status}')

    variants_status[chr] = status
    

chr1
[['chr1', 51478, 51479], ['chr1', 51897, 51898]]
    True
chr2
[['chr2', 10331, 10332], ['chr2', 10373, 10374]]
    True
chr3
[['chr3', 11804, 11805], ['chr3', 11805, 11806]]
    True
chr4
[['chr4', 10430, 10431], ['chr4', 10641, 10642]]
    True
chr5
[['chr5', 11894, 11895], ['chr5', 11950, 11951]]
    True
chr6
[['chr6', 93326, 93327], ['chr6', 118438, 118439]]
    True
chr7
[['chr7', 31442, 31443], ['chr7', 37444, 37445]]
    True
chr8
[['chr8', 71178, 71179], ['chr8', 73416, 73417]]
    True
chr9
[['chr9', 10688, 10689], ['chr9', 10689, 10690]]
    True
chr10
[['chr10', 36096, 36097], ['chr10', 45899, 45900]]
    True
chr11
[['chr11', 128424, 128425], ['chr11', 128428, 128429]]
    True
chr12
[['chr12', 10717, 10718], ['chr12', 11820, 11821]]
    True
chr13
[['chr13', 18171452, 18171453], ['chr13', 18171493, 18171494]]
    True
chr14
[['chr14', 16055979, 16055980], ['chr14', 16055980, 16055981]]
    True
chr15


KeyboardInterrupt: 