Binary heterochromatin classification


In [14]:
import numpy as np
import pandas as pd
from keras.models import Sequential, Model
from keras.layers import Input ,Dense, Dropout, Activation, LSTM
from keras.layers import Lambda, Convolution1D, MaxPooling1D, Flatten, Reshape
from keras.layers.wrappers import TimeDistributed
from keras.layers.pooling import GlobalAveragePooling1D
from keras.optimizers import SGD, Adam
from keras.utils import np_utils
from keras.metrics import categorical_crossentropy, binary_crossentropy
#For data saving
import pickle
import random
#other imports
import gzip
import glob
import os
import keras.backend as K
import os
#cwd = os.path.dirname(os.path.realpath("SURF_001_TwoClass13.ipynb"))

Loading algorithims and labels

In [15]:
genome = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY']

In [16]:
def oneHot_DNA(chrom):
    one_hot_full = np.zeros((len(chrom), len(chrom[0]), 4), dtype=np.int8)
    for i, seq in enumerate(chrom):
        seq_onehot = np.zeros((len(seq), 4))
        for j, nuc in enumerate(seq):
            if nuc == 'a':
                seq_onehot[j, :] = np.array([1, 0, 0, 0], dtype=np.int8)
            elif nuc == 't':
                seq_onehot[j, :] = np.array([0, 1, 0, 0], dtype=np.int8)
            elif nuc == 'c':
                seq_onehot[j, :] = np.array([0, 0, 1, 0], dtype=np.int8)
            elif nuc == 'g':
                seq_onehot[j, :] = np.array([0, 0, 0, 1], dtype=np.int8)
            one_hot_full[i,:,:] = seq_onehot
    return one_hot_full

In [17]:
# Do 2D onhot arrays and feed each full sequence into the nn one by one

In [18]:
def oneHot_DNA_full_PaddingZeros(chrom, max):
    largest_size = max
    #for seq in chrom:
    #  size = len(seq)
    #  if size > largest_size:
    #    largest_size = size
    one_hot_full = np.zeros((len(chrom), largest_size, 4), dtype=np.int8)
    for i, seq in enumerate(chrom):
        seq_onehot = np.zeros((largest_size, 4))
        for j, nuc in enumerate(seq):
            if nuc == 'a':
                seq_onehot[j, :] = np.array([1, 0, 0, 0], dtype=np.int8)
            elif nuc == 't':
                seq_onehot[j, :] = np.array([0, 1, 0, 0], dtype=np.int8)
            elif nuc == 'c':
                seq_onehot[j, :] = np.array([0, 0, 1, 0], dtype=np.int8)
            elif nuc == 'g':
                seq_onehot[j, :] = np.array([0, 0, 0, 1], dtype=np.int8)
            else:
                print('issue')
            one_hot_full[i,:,:] = seq_onehot
    return one_hot_full

In [19]:
def oneHot_DNA_full_PaddingZeros_1batch(chrom, max):
    largest_size = max
  #for seq in chrom:
  #  size = len(seq)
  #  if size > largest_size:
  #    largest_size = size
    one_hot_full = np.zeros((1, largest_size, 4), dtype=np.int8)
    for i, seq in enumerate(chrom):
        seq_onehot = np.zeros((largest_size, 4))
        for j, nuc in enumerate(seq):
            if nuc == 'a':
                seq_onehot[j, :] = np.array([1, 0, 0, 0], dtype=np.int8)
            elif nuc == 't':
                seq_onehot[j, :] = np.array([0, 1, 0, 0], dtype=np.int8)
            elif nuc == 'c':
                seq_onehot[j, :] = np.array([0, 0, 1, 0], dtype=np.int8)
            elif nuc == 'g':
                seq_onehot[j, :] = np.array([0, 0, 0, 1], dtype=np.int8)
            else:
                print('issue')
            one_hot_full[i,:,:] = seq_onehot
    return one_hot_full

In [20]:
chrom = [['a', 'a', 'a'], ['t', 't'], ['c', 'c', 'c','c'], ['g','g','g']]

In [21]:
def oneHot_labels2_1batch(chrom):
    seq_onehot = np.zeros((1, 2), dtype=np.int8)
    if chrom == 1:
        seq_onehot[0, :] = np.array([1, 0], dtype=np.int8)
    elif chrom == 0:
        seq_onehot[0, :] = np.array([0, 1], dtype=np.int8)
    else:
        print("issue - with the labels")
    return seq_onehot

Onehot encoding the samples and labels:

In [22]:
genome = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX']

max_num = 0
for x, indexes in enumerate(genome):
    with open('../DataSet_13_notonehot/DataSet_13_test_samples_1-Version_' + genome[x] + '.dat', 'rb') as f2:
        test_samples = pickle.load(f2)
    with open('../DataSet_13_notonehot/DataSet_13_train_samples_1-Version_' + genome[x] + '.dat', 'rb') as f2:
        train_samples = pickle.load(f2)
    with open('../DataSet_13_notonehot/DataSet_13_test_labels_1-Version_' + genome[x] + '.dat', 'rb') as f2:
        test_labels = pickle.load(f2)
    with open('../DataSet_13_notonehot/DataSet_13_train_labels_1-Version_' + genome[x] + '.dat', 'rb') as f2:
        train_labels = pickle.load(f2)
    for i, label in enumerate(train_labels):
        if label == 0:
            if max_num < len(train_samples[i]):
                max_num = len(train_samples[i])
    for i, label in enumerate(test_labels):
        if label == 0:
            if max_num < len(test_samples[i]):
                 max_num = len(test_samples[i])
    print(max_num)
    print(max_num)

223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223836
223836
223836
223836
223836
223836
223836
223836
223836
223836
223836
223836
223836
223836


In [23]:
largest_seq = max_num

In [11]:
count_a = 0
count_a_place = 0
count_b = 0
count_b_place = 0
count_c = 0
count_c_place = 0
count = 0
partition_a = {}
train_a = []
labels_a = {}
partition_b = {}
train_b = []
labels_b = []

for x, indexes in enumerate(genome):
    train_samples = []
    train_labels = []
    with open('../DataSet_13_notonehot/DataSet_13_train_samples_1-Version_' + genome[x] + '.dat', 'rb') as f1:
        train_samples = pickle.load(f1)
        #print(len(train_samples))
    with open('../DataSet_13_notonehot/DataSet_13_train_labels_1-Version_' + genome[x] + '.dat', 'rb') as f2:
        train_labels = pickle.load(f2)
        #print(len(train_labels))
    for y, index in enumerate(train_samples):
        len_seq = len(index)
        new_train_samples = []
        new_train_labels = []
        index_array = []
        index_array.append(index)
        if count_a_place > 10000:
            count_a_place -= 10000
        if count_b_place > 10000:
            count_b_place -= 10000
        if count_c_place > 10000:
            count_c_place -= 10000
        if len_seq <= 1000:
          #  new_train_samples = oneHot_DNA_full_PaddingZeros(index_array, 1000)
            new_train_labels = oneHot_labels2_1batch(train_labels[y])
            count_a += 1
            count_a_place += 1
            #  print("count_a")
            #with open('../DataSet_14_Part_4/' + str(int(count_a / 20000))+'/DataSet_14_train_labels_onehot_batches1_Part_' + str(count_a) + '.dat', 'wb') as f:
            #    pickle.dump(new_train_labels, f)
          #  np.save('../DataSet_14_Part_4/' + str(int(count_a / 20000))+'/DataSet_14_train_labels_onehot_batches1_Part_' + str(count_a) + '.npy', new_train_labels, allow_pickle=True, fix_imports=True)
            #with open('../DataSet_14_Part_4/' + str(int(count_a / 20000))+'/DataSet_14_train_samples_onehot_batches1_Part_' + str(count_a) + '.dat', 'wb') as f:
            #    pickle.dump(new_train_samples, f)
         #   np.save('../DataSet_14_Part_4/' + str(int(count_a / 20000))+'/DataSet_14_train_samples_onehot_batches1_Part_' + str(count_a) + '.npy', new_train_samples, allow_pickle=True, fix_imports=True)
            labels_a[str('DataSet_14_train_samples_onehot_batches1_Part_' + str(count_a))] = new_train_labels
            train_a.append(str('DataSet_14_train_samples_onehot_batches1_Part_' + str(count_a)))
        elif len_seq > 1000 and len_seq <= 10000:
          #  new_train_samples = oneHot_DNA_full_PaddingZeros_1batch(index_array, 10000)
          #  new_train_labels = oneHot_labels2_1batch(train_labels[y])
            count_b += 1
            count_b_place += 1
            #  print('count_b:' + str(count_b))
            #  print('count_b_place:' + str(count_b_place))
            #  print("count_b")
         #   np.save('../DataSet_14_Part_5/' + str(int(count_b / 20000))+'/DataSet_14_train_labels_onehot_batches1_Part_' + str(count_b) + '.npy', new_train_labels, allow_pickle=True, fix_imports=True)
         #   np.save('../DataSet_14_Part_5/' + str(int(count_b / 20000))+'/DataSet_14_train_samples_onehot_batches1_Part_' + str(count_b) + '.npy', new_train_samples, allow_pickle=True, fix_imports=True)
            #with open('../DataSet_14_Part_5/' + str(int(count_b / 20000))+'/DataSet_14_train_labels_onehot_batches1_Part_' + str(count_b) + '.dat', 'wb') as f:
            #    pickle.dump(new_train_labels, f)
            #with open('../DataSet_14_Part_5/' + str(int(count_b / 20000))+'/DataSet_14_train_samples_onehot_batches1_Part_' + str(count_b) + '.dat', 'wb') as f:
            #    pickle.dump(new_train_samples, f)
         #   labels_b.append(str(str(int(count_b / 20000))+'/DataSet_14_train_labels_onehot_batches1_Part_' + str(count_b)))
         #   train_b.append(str(str(int(count_b / 20000))+'/DataSet_14_train_samples_onehot_batches1_Part_' + str(count_b)))
        elif len_seq > 10000:
            count_c += 1
            count_c_place += 1
            #print("count_c")
            #new_train_samples = oneHot_DNA_full_PaddingZeros_1batch(index_array, max_num)
            #new_train_labels = oneHot_labels2_1batch(train_labels[y])
            #with open('../DataSet_14_Part_3/' + str(int(count_c / 10000))+'/' + str(int(count_c_place / 100))+'/DataSet_14_train_labels_onehot_batches1_Part_' + str(count_c) + '.dat', 'wb') as f:
            #    pickle.dump(new_train_labels, f)
            #with open('../DataSet_14_Part_3/' + str(int(count_c / 10000))+'/' + str(int(count_c_place / 100))+'/DataSet_14_train_samples_onehot_batches1_Part_' + str(count_c) + '.dat', 'wb') as f:
            #    pickle.dump(new_train_samples, f)
        else:
            print("error")
        if count % 100 == 0:
            print("Count:" + str(count))
        count += 1
partition_a['train'] = train_a
partition_a['labels'] = labels_a
with open('../partition_a.dat', 'wb') as f:
    pickle.dump(partition_a, f)
print(count_a)
print(count_b)
print(count_c)

Count:0
Count:100
Count:200
Count:300
Count:400
Count:500
Count:600
Count:700
Count:800
Count:900
Count:1000
Count:1100
Count:1200
Count:1300
Count:1400
Count:1500
Count:1600
Count:1700
Count:1800
Count:1900
Count:2000
Count:2100
Count:2200
Count:2300
Count:2400
Count:2500
Count:2600
Count:2700
Count:2800
Count:2900
Count:3000
Count:3100
Count:3200
Count:3300
Count:3400
Count:3500
Count:3600
Count:3700
Count:3800
Count:3900
Count:4000
Count:4100
Count:4200
Count:4300
Count:4400
Count:4500
Count:4600
Count:4700
Count:4800
Count:4900
Count:5000
Count:5100
Count:5200
Count:5300
Count:5400
Count:5500
Count:5600
Count:5700
Count:5800
Count:5900
Count:6000
Count:6100
Count:6200
Count:6300
Count:6400
Count:6500
Count:6600
Count:6700
Count:6800
Count:6900
Count:7000
Count:7100
Count:7200
Count:7300
Count:7400
Count:7500
Count:7600
Count:7700
Count:7800
Count:7900
Count:8000
Count:8100
Count:8200
Count:8300
Count:8400
Count:8500
Count:8600
Count:8700
Count:8800
Count:8900
Count:9000
Count:9100


In [30]:
count_a = 0
count_a_place = 0
count_b = 0
count_b_place = 0
count_c = 0
count_c_place = 0
count = 0
partition_a = {}
test_a = []
labels_a = {}
partition_b = {}
test_b = []
labels_b = []
just_labels = []
for x, indexes in enumerate(genome):
    test_samples = []
    test_labels = []
    with open('../DataSet_13_notonehot/DataSet_13_test_samples_1-Version_' + genome[x] + '.dat', 'rb') as f1:
        test_samples = pickle.load(f1)
        #print(len(test_samples))
    with open('../DataSet_13_notonehot/DataSet_13_test_labels_1-Version_' + genome[x] + '.dat', 'rb') as f2:
        test_labels = pickle.load(f2)
        #print(len(train_labels))
    for y, index in enumerate(test_samples):
        len_seq = len(index)
        new_test_samples = []
        new_test_labels = []
        index_array = []
        index_array.append(index)
        if count_a_place > 10000:
            count_a_place -= 10000
        if count_b_place > 10000:
            count_b_place -= 10000
        if count_c_place > 10000:
            count_c_place -= 10000
        if len_seq <= 1000:
            new_test_samples = oneHot_DNA_full_PaddingZeros(index_array, 1000)
            new_test_labels = oneHot_labels2_1batch(test_labels[y])
            count_a += 1
            count_a_place += 1
            #  print("count_a")
            #with open('../DataSet_14_Part_4/' + str(int(count_a / 20000))+'/DataSet_14_train_labels_onehot_batches1_Part_' + str(count_a) + '.dat', 'wb') as f:
            #    pickle.dump(new_train_labels, f)
            np.save('../DataSet_14_Test_2/DataSet_14_test_labels_onehot_batches1_Part_' + str(count_a) + '.npy', new_test_labels, allow_pickle=True, fix_imports=True)
            #with open('../DataSet_14_Part_4/' + str(int(count_a / 20000))+'/DataSet_14_train_samples_onehot_batches1_Part_' + str(count_a) + '.dat', 'wb') as f:
            #    pickle.dump(new_train_samples, f)
            np.save('../DataSet_14_Test_2/DataSet_14_test_samples_onehot_batches1_Part_' + str(count_a) + '.npy', new_test_samples, allow_pickle=True, fix_imports=True)
            labels_a[str('DataSet_14_test_onehot_batches1_Part_' + str(count_a))] = new_test_labels
            just_labels.append(new_test_labels)
            test_a.append(str('DataSet_14_test_samples_onehot_batches1_Part_' + str(count_a)))
        elif len_seq > 1000 and len_seq <= 10000:
          #  new_train_samples = oneHot_DNA_full_PaddingZeros_1batch(index_array, 10000)
          #  new_train_labels = oneHot_labels2_1batch(train_labels[y])
            count_b += 1
            count_b_place += 1
            #  print('count_b:' + str(count_b))
            #  print('count_b_place:' + str(count_b_place))
            #  print("count_b")
         #   np.save('../DataSet_14_Part_5/' + str(int(count_b / 20000))+'/DataSet_14_train_labels_onehot_batches1_Part_' + str(count_b) + '.npy', new_train_labels, allow_pickle=True, fix_imports=True)
         #   np.save('../DataSet_14_Part_5/' + str(int(count_b / 20000))+'/DataSet_14_train_samples_onehot_batches1_Part_' + str(count_b) + '.npy', new_train_samples, allow_pickle=True, fix_imports=True)
            #with open('../DataSet_14_Part_5/' + str(int(count_b / 20000))+'/DataSet_14_train_labels_onehot_batches1_Part_' + str(count_b) + '.dat', 'wb') as f:
            #    pickle.dump(new_train_labels, f)
            #with open('../DataSet_14_Part_5/' + str(int(count_b / 20000))+'/DataSet_14_train_samples_onehot_batches1_Part_' + str(count_b) + '.dat', 'wb') as f:
            #    pickle.dump(new_train_samples, f)
         #   labels_b.append(str(str(int(count_b / 20000))+'/DataSet_14_train_labels_onehot_batches1_Part_' + str(count_b)))
         #   train_b.append(str(str(int(count_b / 20000))+'/DataSet_14_train_samples_onehot_batches1_Part_' + str(count_b)))
        elif len_seq > 10000:
            count_c += 1
            count_c_place += 1
            #print("count_c")
            #new_train_samples = oneHot_DNA_full_PaddingZeros_1batch(index_array, max_num)
            #new_train_labels = oneHot_labels2_1batch(train_labels[y])
            #with open('../DataSet_14_Part_3/' + str(int(count_c / 10000))+'/' + str(int(count_c_place / 100))+'/DataSet_14_train_labels_onehot_batches1_Part_' + str(count_c) + '.dat', 'wb') as f:
            #    pickle.dump(new_train_labels, f)
            #with open('../DataSet_14_Part_3/' + str(int(count_c / 10000))+'/' + str(int(count_c_place / 100))+'/DataSet_14_train_samples_onehot_batches1_Part_' + str(count_c) + '.dat', 'wb') as f:
            #    pickle.dump(new_train_samples, f)
        else:
            print("error")
        if count % 100 == 0:
            print("Count:" + str(count))
        count += 1
partition_a['test'] = test_a
partition_a['labels'] = labels_a
with open('../test_labels.dat', 'wb') as f:
    pickle.dump(just_labels, f)
with open('../partition_test_a.dat', 'wb') as f:
    pickle.dump(partition_a, f)
print(count_a)
print(count_b)
print(count_c)

Count:0
Count:100
Count:200
Count:300
Count:400
Count:500
Count:600
Count:700
Count:800
Count:900
Count:1000
Count:1100
Count:1200
Count:1300
Count:1400
Count:1500
Count:1600
Count:1700
Count:1800
Count:1900
Count:2000
Count:2100
Count:2200
Count:2300
Count:2400
Count:2500
Count:2600
Count:2700
Count:2800
Count:2900
Count:3000
Count:3100
Count:3200
Count:3300
Count:3400
Count:3500
Count:3600
Count:3700
Count:3800
Count:3900
Count:4000
Count:4100
Count:4200
Count:4300
Count:4400
Count:4500
Count:4600
Count:4700
Count:4800
Count:4900
Count:5000
Count:5100
Count:5200
Count:5300
Count:5400
Count:5500
Count:5600
Count:5700
Count:5800
Count:5900
Count:6000
Count:6100
Count:6200
Count:6300
Count:6400
Count:6500
Count:6600
Count:6700
Count:6800
Count:6900
Count:7000
Count:7100
Count:7200
Count:7300
Count:7400
Count:7500
Count:7600
Count:7700
Count:7800
Count:7900
Count:8000
Count:8100
Count:8200
Count:8300
Count:8400
Count:8500
Count:8600
Count:8700
Count:8800
Count:8900
Count:9000
Count:9100


In [27]:
print(labels_a)

{'DataSet_14_test_onehot_batches1_Part_1': array([[1, 0]], dtype=int8), 'DataSet_14_test_onehot_batches1_Part_2': array([[1, 0]], dtype=int8), 'DataSet_14_test_onehot_batches1_Part_3': array([[1, 0]], dtype=int8), 'DataSet_14_test_onehot_batches1_Part_4': array([[1, 0]], dtype=int8), 'DataSet_14_test_onehot_batches1_Part_5': array([[1, 0]], dtype=int8), 'DataSet_14_test_onehot_batches1_Part_6': array([[1, 0]], dtype=int8), 'DataSet_14_test_onehot_batches1_Part_7': array([[1, 0]], dtype=int8), 'DataSet_14_test_onehot_batches1_Part_8': array([[1, 0]], dtype=int8), 'DataSet_14_test_onehot_batches1_Part_9': array([[1, 0]], dtype=int8), 'DataSet_14_test_onehot_batches1_Part_10': array([[1, 0]], dtype=int8), 'DataSet_14_test_onehot_batches1_Part_11': array([[1, 0]], dtype=int8), 'DataSet_14_test_onehot_batches1_Part_12': array([[1, 0]], dtype=int8), 'DataSet_14_test_onehot_batches1_Part_13': array([[1, 0]], dtype=int8), 'DataSet_14_test_onehot_batches1_Part_14': array([[1, 0]], dtype=int8), 

In [13]:
a = partition_a['labels']['DataSet_14_train_samples_onehot_batches1_Part_59221']
print(a)

[[0 1]]


In [14]:
import random
rand_list = []
# 20%
while(len(rand_list) != 13520):
    random_num = random.randint(0, 67602)
    if random_num not in rand_list:
        rand_list.append(random_num)
print(len(rand_list))

13520


In [15]:
# Partition with train and val split 80, 20:
new_partition_a = {}
training = partition_a['train']
print(len(training))
new_partition_a['labels'] = partition_a['labels']
new_train = []
new_validation = []
for i, seq in enumerate(training):
    if i in rand_list:
        new_validation.append(seq)
    else:
        new_train.append(seq)
new_partition_a['train'] = new_train
print(len(new_train))
new_partition_a['validation'] = new_validation
print(len(new_validation))

with open('../partition_train_val_a.dat', 'wb') as f:
    pickle.dump(new_partition_a, f)

67602
54082
13520


In [19]:
a = new_partition_a['labels']['DataSet_14_train_samples_onehot_batches1_Part_59221']
print(a)
a = new_partition_a['train'][0]
print(a)
print(new_partition_a['labels'][a])

[[0 1]]
DataSet_14_train_samples_onehot_batches1_Part_1
[[1 0]]


In [None]:
'''
genome = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX']
test_samples = []
test_labels = []
part = 0 

for x, indexes in enumerate(genome):
  test_samples = []
  test_labels = []
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_test_samples_1-Version_' + genome[x] + '.dat', 'rb') as f1:
    test_samples = pickle.load(f1)
      #print(len(train_samples))
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_test_labels_1-Version_' + genome[x] + '.dat', 'rb') as f2:
    test_labels = pickle.load(f2)
    print(len(test_samples))
    print(len(test_labels))
  
  test_samples = oneHot_DNA_full_PaddingZeros(test_samples, largest_seq)
  print(test_samples.shape)

  test_labels = oneHot_labels2(test_labels)
  print(test_labels.shape)

  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/test5/testing_data/DataSet_13_test_labels_onehot_batches5_Chr:{}.dat'.format(indexes), 'wb') as f:
    pickle.dump(test_labels, f)
    print(test_labels.shape)
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/test5/testing_data/DataSet_13_test_samples_onehot_batches5_Chr:{}.dat'.format(indexes), 'wb') as f:
    pickle.dump(test_samples, f)
    print(test_samples.shape)
'''
  

In [None]:
with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/test5/testing_data/DataSet_13_test_labels_onehot_batches5_Chr:chr1.dat', 'rb') as f:
  test_labels = pickle.load(f)
with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/test5/testing_data/DataSet_13_test_samples_onehot_batches5_Chr:chr1.dat', 'rb') as f:
  test_samples = pickle.load(f)

In [None]:
'''
genome = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX']
train_samples = []
train_labels = []
part = 0 
size_batches = 5
num_place = 0
for x, indexes in enumerate(genome):
  if x != 7:
    print(indexes)
    train_samples = []
    train_labels = []
    with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_train_samples_1-Version_' + genome[x] + '.dat', 'rb') as f1:
      train_samples = pickle.load(f1)
      #print(len(train_samples))
    with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_train_labels_1-Version_' + genome[x] + '.dat', 'rb') as f2:
      train_labels = pickle.load(f2)
      #print(len(train_labels))
    size = len(train_samples)
    if size > size_batches:
      loop = 0
      while size > 0:
        if num_place > 20000:
          num_place -= 20000
        if size - size_batches > 0:
          new_train_samples = oneHot_DNA_full_PaddingZeros(train_samples[loop:(loop + size_batches)], largest_seq)
          #print(new_train_samples.shape)
          new_train_labels = oneHot_labels2(train_labels[loop:loop + size_batches])
          #print(new_train_labels.shape)
          loop += size_batches
          size -= size_batches
          part += 1
          num_place += 1
        else:
          new_train_samples = oneHot_DNA_full_PaddingZeros(train_samples[loop:len(train_samples)], largest_seq)
          #print(new_train_samples.shape)
          new_train_labels = oneHot_labels2(train_labels[loop:len(train_samples)])
          #print(new_train_labels.shape)
          part += 1
          loop += size_batches
          size -= size_batches
          num_place += 1
        if part % 1000 == 0:
          print('part:' + str(part))
        with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/test5/' + str(int(part / 20000))+'/' + str(int(num_place / 100))+'/DataSet_13_train_labels_onehot_batches5_Part:' + str(part) + '.dat', 'wb') as f:
          pickle.dump(new_train_labels, f)
        with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/test5/' + str(int(part / 20000))+'/' + str(int(num_place / 100))+'/DataSet_13_train_samples_onehot_batches5_Part:' + str(part) + '.dat', 'wb') as f:
          pickle.dump(new_train_samples, f)

'''

**Batches:**

**Batches 1**:

    with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_train_labels_onehot_batches1_Part:' + str(part) + '.dat', 'rb') as f:
      train_labels = pickle.load(f)
    with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_train_samples_onehot_batches1_Part:' + str(part) + '.dat', 'rb') as f:
      train_samples = pickle.load(f)

**Batches 200**:

     with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_train_labels_onehot_batches200_Part:' + str(part) + '.dat', 'rb') as f:
      train_labels = pickle.load(f)
    with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_train_samples_onehot_batches200_Part:' + str(part) + '.dat', 'rb') as f:
      train_samples = pickle.load(f)

      

part:73000 last run


Splitting into groups of 200 bp and then one hot encoding:


In [None]:
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_train_samples_1-Version_' + genome[0] + '.dat', 'rb') as f1:
    train_samples = pickle.load(f1)
    #print(len(train_samples))
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_train_labels_1-Version_' + genome[0] + '.dat', 'rb') as f2:
    train_labels = pickle.load(f2)

In [None]:
genome = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX']
train_samples = []
train_labels = []
for x, indexes in enumerate(genome):
  train_samples = []
  train_labels = []
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_train_samples_1-Version_' + genome[x] + '.dat', 'rb') as f1:
    train_samples = pickle.load(f1)
    #print(len(train_samples))
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_train_labels_1-Version_' + genome[x] + '.dat', 'rb') as f2:
    train_labels = pickle.load(f2)

  ## Loop for spliting sequences and labels to 200 bp:
  count = 0
  new_train_seq = []
  new_train_labels = []
  count_train = 0
  for seq in train_samples:
    for i in range(0, len(seq), 200):
      new_train_seq.append(seq[i : i+200])
      new_train_labels.append(train_labels[count])
    count += 1
    count_train += 1
  print(count_train)
  print(len(new_train_seq))
  print(len(new_train_labels))

  ## Making sure the size is right no less than 200 bp
  #import math
  sample_new = []
  label_new = []
  for j, seq in enumerate(new_train_seq):
    if len(seq) >= 200:
      sample_new.append(seq)
      label_new.append(new_train_labels[j])
  train_samples = sample_new
  train_labels = label_new
  print(len(train_samples))
  print(len(train_labels))
  
  train_samples = oneHot_DNA(train_samples)
  print(train_samples.shape)

  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_200bp_onehot/DataSet_13_train_labels_onehot_200bp' + genome[x] + '.dat', 'wb') as f:
    pickle.dump(train_samples, f)
  #with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_train_samples_onehot_200bp' + genome[x] + '.dat', 'rb') as f:
  #  train_samples = pickle.load(f)
  #  print(train_samples.shape)

  train_labels = oneHot_labels2(train_labels)
  print(train_labels.shape)

  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_200bp_onehot/DataSet_13_train_labels_onehot_200bp' + genome[x] + '.dat', 'wb') as f:
    pickle.dump(train_labels, f)
  #with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_train_labels_onehot_200bp' + genome[x] + '.dat', 'rb') as f:
  #  train_labels = pickle.load(f)
  #  print(train_labels.shape) 
  

In [None]:
genome = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX']
test_samples = []
test_labels = []
for x, indexes in enumerate(genome):
  test_samples = []
  test_labels = []
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_test_samples_1-Version_' + genome[x] + '.dat', 'rb') as f1:
    test_samples = pickle.load(f1)
    #print(len(train_samples))
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_test_labels_1-Version_' + genome[x] + '.dat', 'rb') as f2:
    test_labels = pickle.load(f2)
  count_test = 0
  for q in test_labels:
    if q != 1:
      count_test += 1
  print(count_test)

  ## Loop for spliting sequences and labels to 200 bp:
  count = 0
  new_test_seq = []
  new_test_labels = []
  count_test = 0
  for seq in test_samples:
    for i in range(0, len(seq), 200):
      new_test_seq.append(seq[i : i+200])
      new_test_labels.append(test_labels[count])
    count += 1
    count_test += 1
  # print(count_test)
  # print(len(new_test_seq))
  # print(len(new_test_labels))
  #    print(count_test)
#    print(len(new_test_seq))
#    print(len(new_test_labels))

  count_test = 0
  for q in new_test_labels:
    if q != 1:
      count_test += 1
  print(count_test)
  print(test_labels[0])


  ## Making sure the size is right no less than 200 bp
  #import math
  sample_new = []
  label_new = []
  for i, seq in enumerate(new_test_seq):
    if len(seq) >= 200:
      sample_new.append(seq)
      label_new.append(new_test_labels[i])
  test_samples = sample_new
  test_labels = label_new
  # print(len(test_samples))
  # print(len(test_labels))
  
  count_test = 0
  for q in test_labels:
    if q != 1:
      count_test += 1
  print(count_test)
  print(test_labels[0])

  test_samples = oneHot_DNA(test_samples)
  print(test_samples.shape)

  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_200bp_onehot/DataSet_13_test_samples_onehot_200bp' + genome[x] + '.dat', 'wb') as f:
    pickle.dump(test_samples, f)
  #with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_test_samples_onehot_200bp' + genome[i] + '.dat', 'rb') as f:
  #  test_samples = pickle.load(f)
  #  print(test_samples.shape)

  test_labels = oneHot_labels2(test_labels)
  print(test_labels.shape)

  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_200bp_onehot/DataSet_13_test_labels_onehot_200bp' + genome[x] + '.dat', 'wb') as f:
    pickle.dump(test_labels, f)
  #with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_test_labels_onehot_200bp' + genome[i] + '.dat', 'rb') as f:
  #  test_labels = pickle.load(f)
  #  print(test_labels.shape) 

In [None]:
with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_test_labels_' + genome[0] + '.dat', 'rb') as f2:
  test_labels = pickle.load(f2)
  print(len(test_labels))
  count_test = 0
  for q in test_labels:
    if q != 1:
      count_test += 1
  print(count_test)