In [1]:
import urllib.request
import gzip
import shutil
from subprocess import PIPE, Popen
import os

## Download TEST data

In [2]:
# make ExampleFile directory if it does not exist
if not os.path.exists("ExampleFiles"):
    os.makedirs("ExampleFiles")

In [3]:
# Download vcf
urllib.request.urlretrieve("https://zenodo.org/record/3584238/files/patho.vcf.gz?download=1", 'ExampleFiles/patho.vcf.gz')
# Download vcf tabix
urllib.request.urlretrieve("https://zenodo.org/record/3584238/files/patho.vcf.gz.tbi?download=1", 'ExampleFiles/patho.vcf.gz.tbi')
# Download GTF
urllib.request.urlretrieve("https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/genes/hg19.ensGene.gtf.gz", 'ExampleFiles/hg19.gtf.gz')

('ExampleFiles/hg19.gtf.gz', <http.client.HTTPMessage at 0x2aed653574e0>)

In [4]:
# Download gzipped hg19 fasta (warning: 900mb)
urllib.request.urlretrieve("https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz", 'ExampleFiles/hg19.fa.gz')

('ExampleFiles/hg19.fa.gz', <http.client.HTTPMessage at 0x2aed65357898>)

In [5]:
# unzip fasta, make sure that you have gunzip installed
p = Popen("gunzip ExampleFiles/hg19.fa.gz", shell=True, stdout=PIPE, stderr=PIPE)
p.communicate()

(b'', b'')

In [6]:
# Path of the vcf file
vcf_path = "ExampleFiles/patho.vcf.gz"

# Path of the fasta file
fasta_path = "ExampleFiles/hg19.fa"

# Path of the gtf file
gtf_path = "ExampleFiles/hg19.gtf.gz"

# How are the 5'UTR called in your gtf
feature_type = "5UTR"

# Output to stora the table
output_path = "ExampleFiles/output.framepool.tsv"

## Filter GTF file for speed performance (Recommended)

In [7]:
import pyranges as pr
from cyvcf2 import VCF

In [8]:
# Import gtf with pyranges
gr = pr.read_gtf(gtf_path)

In [9]:
id_set = set()
for var in VCF(vcf_path):
    chrom = var.CHROM
    pos = var.POS
    id_set = set(gr[chrom, pos-1:pos].df.gene_id) | id_set # vcf is 1-based, pyranges is not

In [10]:
gr_subset = gr[gr.gene_id.isin(id_set)]
gr_subset = gr[gr.gene_id.isin(id_set)]
gtf_path = "ExampleFiles/reduced.gtf" # new reduced gtf file containing only the regions where there is a variant
gr_subset.to_gtf(gtf_path)

## Run the model

In [11]:
from VariantEffect.dataloader import SingleVariantFramepoolDataloader
import kipoi
import pyranges as pr
from cyvcf2 import VCF
from tqdm import tqdm
import pandas as pd

In [None]:
# Source model directly from directory
model = kipoi.get_model("../Framepool", source="dir")

In [13]:
# initialize Loader
svfp = SingleVariantFramepoolDataloader(gtf_path, fasta_path, vcf_path, feature_type)

In [14]:
predictions = []
for variant_obj in tqdm(svfp):
    pr = model.predict_on_batch(variant_obj["inputs"]) # make prediction
    
    
    var = variant_obj["metadata"]["variant"]
    
    scores = var["chrom"], var["pos"], \
            var["ref"], var["alt"], \
            pr["mrl_fold_change"][0], pr["shift_1"][0], pr["shift_2"][0], \
            variant_obj["metadata"]["transcript_id"]
    
    
    predictions.append(scores)

22it [00:01, 13.98it/s]


In [15]:
predictions_df = pd.DataFrame(predictions, columns=["chrom", "start", "ref", "alt", "mrl_fold_change", "shift_1", "shift_2", "transcript_id"])
predictions_df

Unnamed: 0,chrom,start,ref,alt,mrl_fold_change,shift_1,shift_2,transcript_id
0,chr1,93297626,C,A,-0.799757,-0.670084,0.025305,ENST00000370321
1,chr1,93297626,C,A,-0.107161,-0.189486,-0.054052,ENST00000470843
2,chr1,209975361,T,A,-1.068,-0.861996,0.055964,ENST00000367021
3,chr1,209975361,T,A,-0.675417,-0.53175,-0.228016,ENST00000456314
4,chr11,5248280,C,T,-0.828528,0.00658,-0.919977,ENST00000335295
5,chr17,66508599,G,A,-1.02586,0.015113,-1.124092,ENST00000392711
6,chr17,66508599,G,A,-1.088306,-0.048745,-1.154857,ENST00000585427
7,chr17,66508599,G,A,-1.141302,-0.975095,0.025098,ENST00000585608
8,chr17,66508599,G,A,-1.129289,-0.975691,0.034666,ENST00000589228
9,chr17,66508599,G,A,-0.142159,-0.277853,-0.063634,ENST00000536854


In [16]:
# save scores as a tab table
predictions_df.to_csv(output_path, sep="\t", index=False)