In [None]:
# download files
!mkdir -p downloads
!cd downloads
!wget https://github.com/FunctionLab/ExPecto/blob/584c48c85705ec1690a0139155ac62abb7333bd4/example/example.vcf.shift_0.diff.h5?raw=true -O test.h5
!ftp://ftp.ensembl.org/pub/grch37/current/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.1.fa.gz
!gunzip Homo_sapiens.GRCh37.dna.chromosome.1.fa.gz
!cd ..

The first 10 entries in the h5 file are predictions for the forward direction, and the last 10 are for the reverse complement.

In [209]:
# get sequences
import numpy as np
import kipoiseq
from kipoiseq.extractors import VariantSeqExtractor
from cyvcf2 import VCF
import pybedtools
from kipoiseq.transforms.functional import one_hot


def _onehot_with_revcomp(seqs):
    """One-hot encode `seqs` and append a flipped copy of every sequence.

    Each sequence is encoded with the alphabet order A, G, C, T, transposed,
    and given a trailing singleton axis; the encoded sequences are stacked
    along a new leading axis. A copy reversed along axes 1 and 2 is then
    concatenated after the originals, so the output holds `len(seqs)`
    forward entries followed by `len(seqs)` flipped entries.

    NOTE(review): this presumably yields the reverse complement -- assuming
    axis 1 is the alphabet axis and axis 2 the sequence axis after the
    transpose, reversing A,G,C,T swaps A<->T and G<->C. Confirm against
    `one_hot`'s output layout.
    """
    onehot = np.stack([
        one_hot(s, alphabet=['A', 'G', 'C', 'T'], neutral_value=0).T[..., np.newaxis]
        for s in seqs
    ])
    return np.concatenate([onehot, onehot[:, ::-1, ::-1]])


ex = VariantSeqExtractor('downloads/Homo_sapiens.GRCh37.dna.chromosome.1.fa')
vcf = VCF('test/example.vcf')
refs, alts = [], []
try:
    for v in vcf:
        # Window spanning 1000 positions on either side of the variant position.
        interval = pybedtools.create_interval_from_list([v.CHROM, v.POS - 1000, v.POS + 1000])
        alt = ex.extract(interval, [v], anchor=0)
        # Overwrite ALT with REF so the same extraction call returns the
        # unmodified (reference) sequence for this window.
        v.ALT = v.REF
        ref = ex.extract(interval, [v], anchor=0)

        refs.append(ref)
        alts.append(alt)
finally:
    # Release the VCF and FASTA handles even if extraction fails part-way.
    vcf.close()
    ex.close()

refs_onehot = _onehot_with_revcomp(refs)
alts_onehot = _onehot_with_revcomp(alts)

np.save('test/refs_onehot.npy', refs_onehot.astype(np.float32))
np.save('test/alts_onehot.npy', alts_onehot.astype(np.float32))

In [212]:
# validate predictions
import h5py
import kipoi
import numpy as np

# expect
# Open read-only and close the handle as soon as the dataset is in memory.
# The dataset is indexed directly on the File object (h5py.File has no `.f`).
with h5py.File('downloads/test.h5', 'r') as f:
    preds_expect = f['pred'][:]

# seqs
refs = np.load("test/refs_onehot.npy")
alts = np.load("test/alts_onehot.npy")

# preds
m = kipoi.get_model("DeepSEA/beluga")
pred_refs = m.predict_on_batch(refs)
pred_alts = m.predict_on_batch(alts)

# test
# Compare the predicted alt-minus-ref difference against the expected values.
# The max absolute error is computed first so it is reported when the
# assertion fails, and it is displayed as the cell output when it passes.
pred_diff = pred_alts - pred_refs
max_abs_err = np.abs(pred_diff - preds_expect).max()
assert np.allclose(pred_diff, preds_expect, atol=1e-6), max_abs_err
max_abs_err