In [1]:
# Imports
import kipoi
import os
import numpy as np
import pandas as pd

### Source Model

In [2]:
# Load the model from a local model directory (source="dir")
model_dir = "../Xpresso_kipoi/human_median"
model = kipoi.get_model(model_dir, source="dir")

Using downloaded and verified file: /home/vagar/Xpresso_kipoi/downloaded/model_files/human_median/weights/9d00a3bc614da81655328b6e278569e2


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.




### Download and prepare example files (optional)

In [3]:
import urllib.request
import gzip
import shutil
import pyranges as pr

In [4]:
# Create the ExampleFiles directory; exist_ok makes the cell safe to re-run
# and avoids the race between checking for the directory and creating it
os.makedirs("ExampleFiles", exist_ok=True)

In [5]:
# Example inputs from Zenodo record 1466102: (local path, URL) for the GTF annotation and the fasta
example_downloads = [
    ('ExampleFiles/chrom22.gtf', "https://zenodo.org/record/1466102/files/example_files-gencode.v24.annotation_chr22.gtf?download=1"),
    ('ExampleFiles/chrom22.fa', "https://zenodo.org/record/1466102/files/example_files-hg38_chr22.fa?download=1"),
]
for local_path, url in example_downloads:
    # Skip files that are already present so re-running the notebook does not download them again
    if os.path.exists(local_path):
        continue
    # Download under a temporary name and rename afterwards, so an interrupted
    # transfer never leaves a truncated file at the final path
    partial_path = local_path + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, local_path)

('ExampleFiles/chrom22.fa', <http.client.HTTPMessage at 0x7fa9c8221a58>)

In [6]:
# Derive a promoter window around the implied TSS of every protein-coding gene
# Read the annotation with pyranges and build the dataframe view once
gr = pr.read_gtf('ExampleFiles/chrom22.gtf')
gtf_df = gr.df
# Keep gene-level records of protein-coding genes; .copy() yields an independent
# frame so the column assignments below do not write to a slice of gtf_df
# (which would raise SettingWithCopyWarning)
is_protein_coding_gene = (gtf_df.Feature == 'gene') & (gtf_df.gene_type == 'protein_coding')
prot_genes = gtf_df[is_protein_coding_gene].copy()
# Strand masks; multiplying by a boolean mask selects the per-strand value
on_plus = prot_genes.Strand == "+"
on_minus = prot_genes.Strand == "-"
# Implied TSS: gene Start on the + strand, gene End on the - strand
prot_genes['TSS'] = prot_genes.Start * on_plus + prot_genes.End * on_minus
# Region around the TSS: 7000 bp before and 3500 bp after it on the + strand,
# mirrored (3500 bp before, 7000 bp after in genome coordinates) on the - strand
prot_genes['region_start'] = prot_genes.TSS - 7000 * on_plus - 3500 * on_minus
prot_genes['region_end'] = prot_genes.TSS + 3500 * on_plus + 7000 * on_minus
# Placeholder score column so the exported table has six BED columns
prot_genes["score"] = "."

In [7]:
# Export the regions as a tab-separated file without header or index
bed_columns = ['Chromosome', 'region_start', 'region_end', 'gene_id', 'score', 'Strand']
bed = prot_genes.loc[:, bed_columns]
bed.to_csv("ExampleFiles/chrom22.bed", index=False, header=False, sep='\t')

### Provide the Parameters

In [8]:
# Input: bed file listing the promoter regions to score
bed_path = "ExampleFiles/chrom22.bed"

# Input: fasta file holding the genome sequence
fasta_path = "ExampleFiles/chrom22.fa"
# Keep False when the fasta chromosome names carry a "chr" prefix; set True otherwise
num_chr = False

# Output: file the predictions are written to
output_file_path = "predictions.tsv"

### Run Prediction

In [9]:
# Keyword arguments describing the inputs: regions, sequence and chromosome naming
dataloader_kwargs = {
    "intervals_file": bed_path,
    "fasta_file": fasta_path,
    "num_chr_fasta": num_chr,
}
# Run the model pipeline with a batch size of 64 and write the predictions to output_file_path
model.pipeline.predict_to_file(output_file_path, dataloader_kwargs, batch_size=64)

100%|██████████| 7/7 [00:06<00:00,  1.20it/s]


### Load results

In [10]:
# Read the tab-separated prediction file back into a dataframe and display it
df = pd.read_csv(output_file_path, delimiter="\t")
df

Unnamed: 0,metadata/ranges/chr,metadata/ranges/end,metadata/ranges/id,metadata/ranges/start,metadata/ranges/strand,preds/expression_pred
0,chr22,11070000,0,11059500,+,-0.097702
1,chr22,15531657,1,15521157,+,-1.052358
2,chr22,15693525,2,15683025,+,-0.806651
3,chr22,17088453,3,17077953,+,0.872203
4,chr22,17363448,4,17352948,+,-0.903286
5,chr22,17566938,5,17556438,+,-0.735882
6,chr22,17632354,6,17621854,+,0.755943
7,chr22,18081419,7,18070919,+,0.698958
8,chr22,18113830,8,18103330,+,0.217665
9,chr22,18153398,9,18142898,+,0.452462


In [11]:
# Rename the prediction metadata columns so the region columns match those of prot_genes
column_map = {
    "metadata/ranges/chr": "Chromosome",
    "metadata/ranges/start": "region_start",
    "metadata/ranges/end": "region_end",
    "metadata/ranges/strand": "strand",
}
df = df.rename(columns=column_map)
# Join on the region coordinates to attach the gene annotation (incl. gene_id) to each prediction
merged = pd.merge(prot_genes, df, on=["Chromosome", "region_start", "region_end"])

In [None]:
# Display the gene annotation joined with the predictions
merged