<a href="https://colab.research.google.com/github/kundajelab/revcomp_experiments/blob/master/RegressionExample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from __future__ import division, print_function

Set up the Colab environment: download the hg38 genome and install bedtools and samtools. (This step is not needed on the cluster.)

In [None]:
# Download the human genome (hg38) as a 2bit file and convert it to FASTA.
# On the cluster, this file is stored in /mnt/data/annotations/...
# HTTPS is used because twoBitToFa is an executable that is run right below.
# Each download/convert step is skipped when its output file already exists.
![[ -f hg38.2bit ]] || wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit -O hg38.2bit
![[ -f twoBitToFa ]] || wget https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/twoBitToFa -O twoBitToFa
!chmod a+x twoBitToFa
![[ -f hg38.genome.fa ]] || ./twoBitToFa hg38.2bit hg38.genome.fa

# Download the hg38 chromsizes file.
# This would also be in /mnt/data/annotations
![[ -f hg38.chrom.sizes ]] || wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes -O hg38.chrom.sizes

# Install bedtools (-y so apt-get never waits for an interactive confirmation)
!apt-get install -y bedtools

# Install samtools 1.9 from source.
# The extract/configure/make steps only run when samtools is not already on
# PATH, so re-running this cell does not rebuild it. The build happens in a
# subshell, so the notebook's working directory stays /content afterwards.
%cd /content
![[ -f samtools-1.9.tar.bz2 ]] || wget https://github.com/samtools/samtools/releases/download/1.9/samtools-1.9.tar.bz2
!command -v samtools > /dev/null || (tar -xjf samtools-1.9.tar.bz2 && cd samtools-1.9 && ./configure && make && make install)

# Build the FASTA index for the human genome.
# These are already built for the files on the cluster
![[ -e hg38.genome.fa.fai ]] || samtools faidx hg38.genome.fa

Reading package lists... Done
Building dependency tree       
Reading state information... Done
bedtools is already the newest version (2.26.0+dfsg-5).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 4 not upgraded.
/content
/content/samtools-1.9
checking for gcc... gcc
checking whether the C compiler works... yes
checking for C compiler default output file name... a.out
checking for suffix of executables... 
checking whether we are cross compiling... no
checking for suffix of object files... o
checking whether we are using the GNU C compiler... yes
checking whether gcc accepts -g... yes
checking for gcc option to accept ISO C89... none needed
checking for special C compiler options needed for large files... no
checking for _FILE_OFFSET_BITS value needed for large files... no
checking location of HTSlib source tree... htslib-1.9
checking for NcursesW wid

In [None]:
# Download the ENCODE peak file once, then expose it under a generic name.
![[ -f ENCFF794GVQ.bed.gz ]] || wget https://www.encodeproject.org/files/ENCFF794GVQ/@@download/ENCFF794GVQ.bed.gz
# -f replaces an existing link so re-running the cell does not fail with
# "File exists"
!ln -sf ENCFF794GVQ.bed.gz peaks_with_signal.bed.gz

ln: failed to create symbolic link 'peaks_with_signal.bed.gz': File exists


In [None]:
# Build a BED-like file with one zero-width interval per peak summit. Columns:
#   chrom, summit, summit, "+", signal
# where summit = $F[1] + $F[9] and signal = $F[6].
# (Assumes narrowPeak-style input, i.e. $F[1] is the peak start, $F[9] the
#  summit offset from that start and $F[6] the signal value - TODO confirm.)
# No +/- window is added here; presumably the sequence window around the summit
# is chosen later via center_size_to_use when the inputs are loaded.
! zcat peaks_with_signal.bed.gz | perl -lane 'print $F[0]."\t".($F[1]+$F[9])."\t".($F[1]+$F[9])."\t+\t".($F[6])' | gzip -c > summits_with_signal.bed.gz

# Split into test/validation/training sets by chromosome.
# grep -E is the non-deprecated spelling of egrep; -w matches whole words only,
# so chr1 does not also select chr10-chr19.
!zcat summits_with_signal.bed.gz | grep -E -w 'chr1|chr8|chr21' | gzip -c > test_summits_with_signal.bed.gz
!zcat summits_with_signal.bed.gz | grep -E -w 'chr22' | gzip -c > valid_summits_with_signal.bed.gz
!zcat summits_with_signal.bed.gz | grep -E -w -v 'chr1|chr8|chr21|chr22' | gzip -c > train_summits_with_signal.bed.gz

In [None]:
# Install seqdataloader from a fresh clone of its GitHub repository.
# Remove any previous checkout first so git clone does not fail.
![[ -e seqdataloader ]] && rm -rf seqdataloader
!git clone https://github.com/kundajelab/seqdataloader.git
%cd seqdataloader
# -y answers the "Proceed (y/n)?" prompt so the cell can run unattended
!pip uninstall -y seqdataloader
!pip install .
%cd ..

Cloning into 'seqdataloader'...
remote: Enumerating objects: 116, done.[K
remote: Counting objects: 100% (116/116), done.[K
remote: Compressing objects: 100% (79/79), done.[K
remote: Total 624 (delta 72), reused 77 (delta 37), pack-reused 508[K
Receiving objects: 100% (624/624), 3.80 MiB | 43.24 MiB/s, done.
Resolving deltas: 100% (398/398), done.
/content/seqdataloader
Uninstalling seqdataloader-0.126:
  Would remove:
    /usr/local/bin/genomewide_labels
    /usr/local/lib/python3.6/dist-packages/seqdataloader-0.126.dist-info/*
    /usr/local/lib/python3.6/dist-packages/seqdataloader/*
Proceed (y/n)? y
  Successfully uninstalled seqdataloader-0.126
Processing /content/seqdataloader
Building wheels for collected packages: seqdataloader
  Building wheel for seqdataloader (setup.py) ... [?25l[?25hdone
  Stored in directory: /tmp/pip-ephem-wheel-cache-nkg7shj3/wheels/c2/db/13/112d41662f69fb8c7986c218293570cc1550fc21eed966e31b
Successfully built seqdataloader
Installing collected pac

In [None]:
# Preview the first ten rows of the summit file
!gzip -cd summits_with_signal.bed.gz | head -n 10

chr5	40756289	40756289	3.19598
chr9	129488708	129488708	3.25621
chr12	67269154	67269154	3.33539
chr9	2281574	2281574	3.57597
chr10	118594719	118594719	3.60814
chr6	37354217	37354217	3.66291
chr12	121800504	121800504	3.78161
chr20	62362217	62362217	3.86367
chr1	42963108	42963108	4.00591
chr10	35127150	35127150	4.05166


In [None]:
from seqdataloader.batchproducers import coordbased
import gzip
import numpy as np

class ColsInBedFile(
    coordbased.coordstovals.core.AbstractSingleNdarrayCoordsToVals):
    """Look up per-region target values stored in a gzipped BED-like file.

    Every row is keyed by the string "chrom:start-end" built from its first
    three tab-separated columns. All columns from the fifth onward are parsed
    as floats and stored as that region's value vector; the fourth column is
    skipped (in the files written earlier in this notebook it holds "+").

    If the same "chrom:start-end" key occurs on several rows, the last row
    wins.

    Args:
        gzipped_bed_file: path to a gzip-compressed, tab-separated file.
        **kwargs: forwarded unchanged to the parent class constructor.
    """
    def __init__(self, gzipped_bed_file, **kwargs):
        super(ColsInBedFile, self).__init__(**kwargs)
        self.gzipped_bed_file = gzipped_bed_file
        coords_to_vals = {}
        # Context manager guarantees the file handle is closed, even when a
        # row fails to parse.
        with gzip.open(gzipped_bed_file, 'rb') as bed_fh:
            for row in bed_fh:
                row = row.decode("utf-8").rstrip()
                if not row:
                    # Tolerate blank lines instead of failing with IndexError
                    continue
                split_row = row.split("\t")
                chrom_start_end = (split_row[0]+":"
                                   +split_row[1]+"-"+split_row[2])
                # Column index 3 is deliberately skipped; values start at 4
                vals = np.array([float(x) for x in split_row[4:]])
                coords_to_vals[chrom_start_end] = vals
        self.coords_to_vals = coords_to_vals

    def _get_ndarray(self, coors):
        """Return the stored values for each coordinate, in input order.

        Presumably invoked by the parent class to produce a batch of targets
        - TODO confirm against seqdataloader.

        Args:
            coors: iterable of objects exposing `chrom`, `start` and `end`.

        Returns:
            np.ndarray built from one stored value vector per coordinate.

        Raises:
            KeyError: if a coordinate has no matching row in the bed file.
        """
        to_return = []
        for coor in coors:
            chrom_start_end = (coor.chrom+":"
                               +str(coor.start)+"-"+str(coor.end))
            try:
                to_return.append(self.coords_to_vals[chrom_start_end])
            except KeyError:
                # Same exception type as before, but say which region and
                # which file were involved.
                raise KeyError("No values for "+chrom_start_end
                               +" in "+str(self.gzipped_bed_file))
        return np.array(to_return)
    
    
# Model inputs: sequence pulled from the hg38 FASTA for each coordinate.
# center_size_to_use=1000 presumably selects a 1000 bp window centred on the
# coordinate - TODO confirm against seqdataloader.
inputs_coordstovals = coordbased.coordstovals.fasta.PyfaidxCoordsToVals(
    genome_fasta_path="hg38.genome.fa",
    center_size_to_use=1000,
)

# Model targets: signal values looked up by coordinate. The file with all
# summits is used, so any training coordinate can be resolved.
targets_coordstovals = ColsInBedFile(
    gzipped_bed_file="summits_with_signal.bed.gz",
)

# Produces batches of 64 training coordinates, reshuffled before every epoch
# with a fixed seed.
train_coordsbatch_producer = (
    coordbased.coordbatchproducers.SimpleCoordsBatchProducer(
        bed_file="train_summits_with_signal.bed.gz",
        #coord_batch_transformer=coordbased.coordbatchtransformers.ReverseComplementAugmenter(),
        batch_size=64,
        shuffle_before_epoch=True,
        seed=1234,
    )
)

# Ties the coordinate batches to their inputs and targets for Keras.
keras_train_batch_generator = coordbased.core.KerasBatchGenerator(
    coordsbatch_producer=train_coordsbatch_producer,
    inputs_coordstovals=inputs_coordstovals,
    targets_coordstovals=targets_coordstovals,
)

In [None]:
#define your model
# NOTE(review): this cell is a placeholder. `(..)` is not valid Python and
# `model` is not defined anywhere in the visible notebook, so the cell cannot
# run as written. Presumably a Keras model should be built here and trained
# with keras_train_batch_generator - confirm the intended call.
model.fit_generator(..)

(array([[[0, 1, 0, 0],
         [0, 1, 0, 0],
         [0, 1, 0, 0],
         ...,
         [0, 0, 1, 0],
         [0, 0, 1, 0],
         [1, 0, 0, 0]],
 
        [[0, 0, 1, 0],
         [0, 0, 0, 1],
         [0, 0, 0, 1],
         ...,
         [0, 0, 0, 1],
         [1, 0, 0, 0],
         [0, 0, 1, 0]],
 
        [[0, 1, 0, 0],
         [0, 0, 1, 0],
         [0, 1, 0, 0],
         ...,
         [1, 0, 0, 0],
         [1, 0, 0, 0],
         [1, 0, 0, 0]],
 
        ...,
 
        [[0, 0, 1, 0],
         [1, 0, 0, 0],
         [0, 0, 0, 1],
         ...,
         [0, 0, 1, 0],
         [0, 0, 0, 1],
         [0, 0, 1, 0]],
 
        [[0, 0, 1, 0],
         [0, 0, 1, 0],
         [1, 0, 0, 0],
         ...,
         [0, 1, 0, 0],
         [0, 1, 0, 0],
         [0, 1, 0, 0]],
 
        [[1, 0, 0, 0],
         [0, 0, 0, 1],
         [0, 0, 1, 0],
         ...,
         [0, 0, 0, 1],
         [0, 0, 0, 1],
         [0, 0, 0, 1]]]), array([[ 20.37783],
        [ 48.88749],
        [ 16.1