# Anchored string sequence dataloader (`AnchoredGTFDl`)

In [1]:
from importlib import reload
import urllib.request
import gzip
import shutil
import pyranges as pr
from kipoiseq.dataloaders.sequence import AnchoredGTFDl
from kipoiseq.transforms.functional import one_hot_dna

## Get model and data

In [2]:
import kipoi

# Load the Xpresso human_median model via its GitHub permalink in the kipoi/models repository
model = kipoi.get_model(
    "https://github.com/kipoi/models/tree/master/Xpresso/human_median",
    source="github-permalink",
)

Using downloaded and verified file: /data/ouga04b/ag_gagneur/home/karollus/.kipoi/github-permalink/kipoi/models/master/Xpresso/downloaded/model_files/human_median/weights/9d00a3bc614da81655328b6e278569e2


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.




In [3]:
# Example annotation (GTF, chr22 only) -> chrom22.gtf
urllib.request.urlretrieve(
    "https://zenodo.org/record/1466102/files/example_files-gencode.v24.annotation_chr22.gtf?download=1",
    "chrom22.gtf",
)
# Example reference sequence (FASTA, chr22 only) -> chrom22.fa
urllib.request.urlretrieve(
    "https://zenodo.org/record/1466102/files/example_files-hg38_chr22.fa?download=1",
    "chrom22.fa",
)

('chrom22.fa', <http.client.HTTPMessage at 0x150e4d683320>)

## Build a TSS dataloader from the generic anchored GTF dataloader

In [3]:
class TSSDl(AnchoredGTFDl):
    """Dataloader returning one-hot encoded sequences anchored at the TSS.

    Thin specialisation of ``AnchoredGTFDl`` that fixes three of its options:
    the GTF filter (``gene_type == "protein_coding"``), the anchor (``"tss"``)
    and the transform (one-hot encoding via ``one_hot_dna``).

    Args:
        gtf_file: path to the GTF annotation file.
        fasta_file: path to the FASTA file the sequences are read from.
        num_upstream: forwarded unchanged to ``AnchoredGTFDl``.
        num_downstream: forwarded unchanged to ``AnchoredGTFDl``.
        interval_attrs: forwarded to ``AnchoredGTFDl``; with the default,
            ``gene_id`` and ``Strand`` appear in each sample's metadata.
        use_strand: forwarded to ``AnchoredGTFDl``.
    """

    def __init__(self, gtf_file, fasta_file,
                 num_upstream, num_downstream,
                 interval_attrs=["gene_id", "Strand"],
                 use_strand=True):
        super().__init__(
            gtf_file, fasta_file,
            num_upstream, num_downstream,
            gtf_filter='gene_type == "protein_coding"',
            anchor="tss",
            transform=TSSDl._transform,
            # Honour the caller's arguments instead of silently replacing them
            # with hard-coded values. The list is copied so the shared mutable
            # default can never be modified through the parent class.
            interval_attrs=None if interval_attrs is None else list(interval_attrs),
            use_strand=use_strand
        )

    @staticmethod
    def _transform(x):
        """Convert the extracted sequence ``x`` with ``one_hot_dna``."""
        return one_hot_dna(x)

In [4]:
# Window of 7000 positions upstream and 3500 downstream of each anchor
tss = TSSDl(
    gtf_file="chrom22.gtf",
    fasta_file="chrom22.fa",
    num_upstream=7000,
    num_downstream=3500,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  + (gtf.End * (gtf.Strand == "-")))


## Test it

In [5]:
# Inspect the first sample: the transformed sequence under "inputs" plus its metadata
tss[0]

{'inputs': array([[1.  , 0.  , 0.  , 0.  ],
        [0.  , 0.  , 1.  , 0.  ],
        [1.  , 0.  , 0.  , 0.  ],
        ...,
        [0.25, 0.25, 0.25, 0.25],
        [0.25, 0.25, 0.25, 0.25],
        [0.25, 0.25, 0.25, 0.25]]),
 'metadata': {'gene_id': 'ENSG00000279973.1',
  'Strand': '+',
  'ranges': GenomicRanges(chr='chr22', start=11059500, end=11070000, id='0', strand='*')}}

In [6]:
# Iterate over the dataset one sample per batch, without shuffling
it = tss.batch_iter(
    batch_size=1,
    shuffle=False,
    num_workers=0,
    drop_last=False,
)

In [7]:
# Run the model on every batch and collect the per-batch prediction dicts
preds = [
    model.predict_on_batch(batch["inputs"])
    for batch in it
]

In [8]:
# Display the collected predictions (one entry per batch)
preds

[{'expression_pred': array([-0.09770225], dtype=float32)},
 {'expression_pred': array([-1.0523579], dtype=float32)},
 {'expression_pred': array([-0.80665106], dtype=float32)},
 {'expression_pred': array([0.87220347], dtype=float32)},
 {'expression_pred': array([-0.90328634], dtype=float32)},
 {'expression_pred': array([-0.7358816], dtype=float32)},
 {'expression_pred': array([0.7559425], dtype=float32)},
 {'expression_pred': array([0.6989577], dtype=float32)},
 {'expression_pred': array([0.21766458], dtype=float32)},
 {'expression_pred': array([0.45246226], dtype=float32)},
 {'expression_pred': array([0.34520018], dtype=float32)},
 {'expression_pred': array([0.7672451], dtype=float32)},
 {'expression_pred': array([-1.060151], dtype=float32)},
 {'expression_pred': array([-1.0968332], dtype=float32)},
 {'expression_pred': array([0.9087679], dtype=float32)},
 {'expression_pred': array([0.97289217], dtype=float32)},
 {'expression_pred': array([0.51203346], dtype=float32)},
 {'expression_pr

Same results as previously

# Test start-codon-based extractor

In [9]:
class StartCodonDl(AnchoredGTFDl):
    """Dataloader returning untransformed sequences anchored at the start codon.

    Thin specialisation of ``AnchoredGTFDl`` that fixes three of its options:
    the GTF filter (``gene_type == "protein_coding"``), the anchor
    (``"start_codon"``) and the transform (``None``, i.e. no transform is
    applied to the extracted sequence).

    Args:
        gtf_file: path to the GTF annotation file.
        fasta_file: path to the FASTA file the sequences are read from.
        num_upstream: forwarded unchanged to ``AnchoredGTFDl``.
        num_downstream: forwarded unchanged to ``AnchoredGTFDl``.
        interval_attrs: forwarded to ``AnchoredGTFDl``; with the default,
            ``gene_id`` and ``Strand`` appear in each sample's metadata.
        use_strand: forwarded to ``AnchoredGTFDl``.
    """

    def __init__(self, gtf_file, fasta_file,
                 num_upstream, num_downstream,
                 interval_attrs=["gene_id", "Strand"],
                 use_strand=True):
        super().__init__(
            gtf_file, fasta_file,
            num_upstream, num_downstream,
            gtf_filter='gene_type == "protein_coding"',
            anchor="start_codon",
            transform=None,
            # Honour the caller's arguments instead of silently replacing them
            # with hard-coded values. The list is copied so the shared mutable
            # default can never be modified through the parent class.
            interval_attrs=None if interval_attrs is None else list(interval_attrs),
            use_strand=use_strand
        )

In [10]:
# Window of 50 positions upstream and 50 downstream of each start-codon anchor
start = StartCodonDl(
    gtf_file="chrom22.gtf",
    fasta_file="chrom22.fa",
    num_upstream=50,
    num_downstream=50,
)

In [11]:
# Inspect the first sample; no transform is configured, so "inputs" holds the sequence as text
start[0]

{'inputs': array('GAGTGTTAGGAGGGTGGCCTGAGCAGTAGGATTGGGGCTGGAGCAGTAAGATGGCAGCCGGAGCGGTAAGAGTGCAGCCTGAGCGGTAGGAGGGTGGCTG',
       dtype='<U100'),
 'metadata': {'gene_id': 'ENSG00000279973.1',
  'Strand': '+',
  'ranges': GenomicRanges(chr='chr22', start=11066450, end=11066550, id='0', strand='*')}}

In [12]:
# First 53 characters = the 50 upstream positions plus the next 3;
# if the anchor is placed correctly these last 3 should read ATG.
str(start[0]["inputs"])[:53]

'GAGTGTTAGGAGGGTGGCCTGAGCAGTAGGATTGGGGCTGGAGCAGTAAGATG'

In [13]:
# Inspect sample 1000 (a minus-strand entry in the recorded run) to check strand handling
start[1000]

{'inputs': array('GGAGCACATGGAACGCTTTGGGGTCCCTTTTTAGCCGGGGATTCCAGTGAATGAAAACGGTAGCAGGGGCTCTTTTGAGCTTGGTCATGGGGCAGCCCTC',
       dtype='<U100'),
 'metadata': {'gene_id': 'ENSG00000184470.20',
  'Strand': '-',
  'ranges': GenomicRanges(chr='chr22', start=19881039, end=19881139, id='1000', strand='*')}}

In [14]:
# Same check on sample 1000: the 50 upstream positions plus the next 3,
# which should read ATG if the sequence is oriented along the gene's strand.
str(start[1000]["inputs"])[:53]

'GGAGCACATGGAACGCTTTGGGGTCCCTTTTTAGCCGGGGATTCCAGTGAATG'

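Works for both strands as expected: for the plus-strand example (index 0) and the minus-strand example (index 1000), the start codon `ATG` appears immediately after the 50 upstream bases.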