# Anchored sequence dataloader (`AnchoredGTFDl`)

In [1]:
import urllib.request
import gzip
import shutil
import pyranges as pr
from kipoiseq.dataloaders.sequence import AnchoredGTFDl
from kipoiseq.transforms.functional import one_hot_dna

## Get model and data

In [None]:
import kipoi

# Load the Xpresso human_median model straight from the kipoi/models GitHub
# repository, using kipoi's "github-permalink" source.
model = kipoi.get_model(
    "https://github.com/kipoi/models/tree/master/Xpresso/human_median",
    source="github-permalink",
)

In [2]:
# Fetch the chr22 GENCODE v24 annotation (GTF) from Zenodo into ./chrom22.gtf.
urllib.request.urlretrieve(
    "https://zenodo.org/record/1466102/files/example_files-gencode.v24.annotation_chr22.gtf?download=1",
    "chrom22.gtf",
)
# Fetch the hg38 chr22 sequence (FASTA) into ./chrom22.fa. This call is kept as
# the cell's last bare expression, so its return value is what gets displayed.
urllib.request.urlretrieve(
    "https://zenodo.org/record/1466102/files/example_files-hg38_chr22.fa?download=1",
    "chrom22.fa",
)

('chrom22.fa', <http.client.HTTPMessage at 0x2af6f25d0fd0>)

## Build a TSS dataloader from the generic anchored GTF dataloader

In [3]:
class TSSDl(AnchoredGTFDl):
    """Sequence dataloader anchored at gene start positions.

    Specialises ``AnchoredGTFDl`` by supplying three callbacks: a filter that
    keeps protein-coding gene records, an anchor extractor that picks a
    strand-dependent anchor coordinate, and a one-hot DNA transform.
    """

    def __init__(self, gtf_file, fasta_file,
                 num_upstream, num_downstream,
                 interval_attrs=["gene_id", "Strand"],
                 use_strand=True):
        """Set up the dataloader.

        Args:
            gtf_file: annotation file, forwarded to ``AnchoredGTFDl``.
            fasta_file: sequence file, forwarded to ``AnchoredGTFDl``.
            num_upstream: forwarded to ``AnchoredGTFDl`` (presumably the
                number of bases taken upstream of the anchor -- confirm
                against the base class).
            num_downstream: forwarded to ``AnchoredGTFDl`` (presumably the
                number of bases taken downstream of the anchor).
            interval_attrs: annotation columns to forward as
                ``interval_attrs``; ``None`` falls back to the default
                ``["gene_id", "Strand"]``.
            use_strand: forwarded to ``AnchoredGTFDl`` as ``use_strand``.
        """
        if interval_attrs is None:
            interval_attrs = ["gene_id", "Strand"]
        super().__init__(
            gtf_file, fasta_file,
            num_upstream, num_downstream,
            gtf_filter=TSSDl._gtf_filter,
            anchor_extractor=TSSDl._anchor_extractor,
            transform=TSSDl._transform,
            # Forward the caller's choices instead of hard-coded values.
            # The list is copied so the shared default object (or the
            # caller's list) can never be mutated by the base class.
            interval_attrs=list(interval_attrs),
            use_strand=use_strand,
        )

    @staticmethod
    def _gtf_filter(gtf):
        """Return only the rows describing protein-coding genes."""
        return gtf.query('Feature == "gene" & gene_type == "protein_coding"')

    @staticmethod
    def _anchor_extractor(gtf):
        """Add an ``anchor_pos`` column to ``gtf`` (in place) and return it.

        The anchor is ``Start`` for "+" strand rows and ``End`` for "-"
        strand rows. Rows with any other strand value get 0, because both
        boolean masks are False for them.
        """
        on_plus = gtf.Strand == "+"
        on_minus = gtf.Strand == "-"
        # Multiplying by the boolean masks selects exactly one coordinate
        # per row without branching.
        # NOTE(review): the original author was unsure whether the "-"
        # strand anchor should be gtf.End - 1 (End may be exclusive);
        # behaviour is left unchanged here -- confirm against the
        # coordinate convention of the GTF reader.
        gtf["anchor_pos"] = (gtf.Start * on_plus) + (gtf.End * on_minus)
        return gtf

    @staticmethod
    def _transform(x):
        """One-hot encode the extracted DNA sequence."""
        return one_hot_dna(x)

In [4]:
# Build the TSS dataloader on the downloaded chr22 files, with 7000 bases
# requested upstream and 3500 downstream.
tss = TSSDl(
    gtf_file="chrom22.gtf",
    fasta_file="chrom22.fa",
    num_upstream=7000,
    num_downstream=3500,
)

## Test it

In [5]:
# Fetch the first example: a dict holding the one-hot "inputs" array and a
# "metadata" entry (gene_id, Strand, ranges).
tss[0]

{'inputs': array([[1.  , 0.  , 0.  , 0.  ],
        [0.  , 0.  , 1.  , 0.  ],
        [1.  , 0.  , 0.  , 0.  ],
        ...,
        [0.25, 0.25, 0.25, 0.25],
        [0.25, 0.25, 0.25, 0.25],
        [0.25, 0.25, 0.25, 0.25]]),
 'metadata': {'gene_id': 'ENSG00000279973.1',
  'Strand': '+',
  'ranges': GenomicRanges(chr='chr22', start=11059500, end=11070000, id='0', strand='*')}}

In [6]:
# Iterate the dataset unshuffled, one example per batch, keeping the final
# batch and requesting no extra workers (num_workers=0).
it = tss.batch_iter(
    batch_size=1,
    shuffle=False,
    num_workers=0,
    drop_last=False,
)

In [7]:
# Run the model on every batch's "inputs" and collect the results in order.
preds = [
    model.predict_on_batch(batch["inputs"])
    for batch in it
]

### This yields the same predictions as the previous dataloader (see the kipoi_example file in kipoi/models/Xpresso)

In [8]:
# Display the collected per-batch predictions.
preds

[{'expression_pred': array([-0.09770225], dtype=float32)},
 {'expression_pred': array([-1.0523579], dtype=float32)},
 {'expression_pred': array([-0.80665106], dtype=float32)},
 {'expression_pred': array([0.87220347], dtype=float32)},
 {'expression_pred': array([-0.90328634], dtype=float32)},
 {'expression_pred': array([-0.7358816], dtype=float32)},
 {'expression_pred': array([0.7559425], dtype=float32)},
 {'expression_pred': array([0.6989577], dtype=float32)},
 {'expression_pred': array([0.21766458], dtype=float32)},
 {'expression_pred': array([0.45246226], dtype=float32)},
 {'expression_pred': array([0.34520018], dtype=float32)},
 {'expression_pred': array([0.7672451], dtype=float32)},
 {'expression_pred': array([-1.060151], dtype=float32)},
 {'expression_pred': array([-1.0968332], dtype=float32)},
 {'expression_pred': array([0.9087679], dtype=float32)},
 {'expression_pred': array([0.97289217], dtype=float32)},
 {'expression_pred': array([0.51203346], dtype=float32)},
 {'expression_pr