In [None]:
from __future__ import print_function
import keras
from keras.models import Sequential, Model, load_model
from keras import backend as K
import tensorflow as tf
import isolearn.keras as iso
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from keras.utils import plot_model
import time

from aparent.predictor import *
##################################################
#import bioPython for working with FASTA files
from Bio import SeqIO
##################################################


def iter_chunk_bounds(total_len, chunk_len):
    """Yield half-open ``(start, stop)`` index pairs that tile ``range(total_len)``.

    Every pair except possibly the last spans exactly ``chunk_len`` positions;
    the last pair is shortened so that ``stop`` never exceeds ``total_len``.
    Nothing is yielded when ``total_len`` is 0.

    Args:
        total_len: Number of positions to cover (e.g. the contig length).
        chunk_len: Maximum number of positions per chunk; must be positive.

    Yields:
        Tuples ``(start, stop)`` suitable for slicing as ``seq[start:stop]``.

    Raises:
        ValueError: If ``chunk_len`` is not positive.
    """
    if chunk_len <= 0:
        raise ValueError("chunk_len must be positive")
    for chunk_start in range(0, total_len, chunk_len):
        yield chunk_start, min(chunk_start + chunk_len, total_len)


# Guarded so that the helper above can be imported without triggering the
# (multi-hour) prediction run. In a notebook cell or when run as a script,
# __name__ is "__main__", so everything below still executes and the names
# remain module-level globals.
if __name__ == "__main__":
    # Load the trained model, dump a diagram of its architecture, and build
    # the sequence encoder that is passed to the predictor below.
    aparent_model = load_model('./saved_models/aparent_large_lessdropout_all_libs_no_sampleweights.h5')
    plot_model(aparent_model, show_shapes = True, to_file='APARENTmodel.png')
    aparent_encoder = get_aparent_encoder(lib_bias=4)

    # Input/output locations and run parameters; each entry in fastaNames is
    # read from "<fastaDestination><name>.fasta".
    fastaDestination = "./fastas/"
    fastaNames = ["chrY"]
    predDestination = "./PredictionBinaries/"
    #strideSizes = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,20,25,30,35,40,45,50]
    strideSizes = [10]
    increaseSize = 100000  # number of bases handed to the predictor per call

    for name in fastaNames:
        contigSeq = SeqIO.read(fastaDestination + name + ".fasta", "fasta")
        seq = contigSeq.seq #actual genomic sequence from the file
        print ("PREDICTING ", contigSeq.id, " with length ", len(seq))

        # Both values depend only on the contig name, so compute them once.
        repPeriod = name.replace(".", "_")
        outDir = predDestination + name + "Predictions/"
        # Create the output directory up front so np.save cannot fail on a
        # missing path after a chunk has already been predicted.
        if not os.path.isdir(outDir):
            os.makedirs(outDir)

        for stride in strideSizes:
            print ("Stride length is: ", stride)
            # One iteration per increaseSize-sized chunk, including the
            # shorter final chunk. The number of chunks depends on the chunk
            # size, not on the stride.
            # NOTE(review): chunks are predicted independently of each other;
            # confirm whether find_polya_peaks_memoryFriendly needs overlap
            # between neighbouring chunks, or a minimum chunk length.
            for start, stop in iter_chunk_bounds(len(seq), increaseSize):
                end = stop - 1  # inclusive 0-based end, used for the log line
                startTime = time.time()
                # Only the second returned value is persisted.
                x,y = find_polya_peaks_memoryFriendly(
                    aparent_model,
                    aparent_encoder,
                    seq[start:stop],
                    sequence_stride=stride,
                    conv_smoothing=False,
                    peak_min_height=0.01,
                    peak_min_distance=50,
                    peak_prominence=(0.01, None),
                )
                # File names carry 1-based inclusive coordinates:
                # Start = start + 1, End = stop.
                np.save(outDir + repPeriod + "_StrideLen" + str(stride) + "Start" + str(start + 1) + "End" + str(stop), y )
                secondsDiffs = time.time()-startTime
                print ("Time for ",start, "to", end, ":", str(int(secondsDiffs/60)) + " mins " + str(secondsDiffs%60.0))
        print ("FINISHED")




PREDICTING  CM000686.2  with length  57227415
Stride length is:  10
Time for  0 to 99999 : 0 mins 32.43858289718628
Time for  100000 to 199999 : 0 mins 32.58081316947937
Time for  200000 to 299999 : 0 mins 32.528916358947754
Time for  300000 to 399999 : 0 mins 32.76347517967224
Time for  400000 to 499999 : 0 mins 32.60949373245239
Time for  500000 to 599999 : 0 mins 32.55373954772949
Time for  600000 to 699999 : 0 mins 33.49790811538696
Time for  700000 to 799999 : 0 mins 33.230608224868774
Time for  800000 to 899999 : 0 mins 33.22089958190918
Time for  900000 to 999999 : 0 mins 35.267927169799805
Time for  1000000 to 1099999 : 0 mins 34.548213958740234
Time for  1100000 to 1199999 : 0 mins 34.63970470428467
Time for  1200000 to 1299999 : 0 mins 34.64574718475342
Time for  1300000 to 1399999 : 0 mins 33.84622502326965
Time for  1400000 to 1499999 : 0 mins 34.78647565841675
Time for  1500000 to 1599999 : 0 mins 39.16990828514099
Time for  1600000 to 1699999 : 0 mins 34.669469594955444
T