Binary heterochromatin classification


In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential, Model
from keras.layers import Input ,Dense, Dropout, Activation, LSTM
from keras.layers import Lambda, Convolution1D, MaxPooling1D, Flatten, Reshape
from keras.layers.wrappers import TimeDistributed
from keras.layers.pooling import GlobalAveragePooling1D
from keras.optimizers import SGD, Adam
from keras.utils import np_utils
from keras.metrics import categorical_crossentropy, binary_crossentropy
#For data saving
import pickle
import random
#other imports
import gzip
import glob
import os
import keras.backend as K
import os
#cwd = os.path.dirname(os.path.realpath("SURF_001_TwoClass13.ipynb"))

Loading algorithims and labels

In [2]:
genome = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY']

In [3]:
def oneHot_DNA(chrom):
    one_hot_full = np.zeros((len(chrom), len(chrom[0]), 4), dtype=np.int8)
    for i, seq in enumerate(chrom):
        seq_onehot = np.zeros((len(seq), 4))
        for j, nuc in enumerate(seq):
            if nuc == 'a':
                seq_onehot[j, :] = np.array([1, 0, 0, 0], dtype=np.int8)
            elif nuc == 't':
                seq_onehot[j, :] = np.array([0, 1, 0, 0], dtype=np.int8)
            elif nuc == 'c':
                seq_onehot[j, :] = np.array([0, 0, 1, 0], dtype=np.int8)
            elif nuc == 'g':
                seq_onehot[j, :] = np.array([0, 0, 0, 1], dtype=np.int8)
            one_hot_full[i,:,:] = seq_onehot
    return one_hot_full

In [4]:
# Do 2D onhot arrays and feed each full sequence into the nn one by one

In [5]:
def oneHot_DNA_full_PaddingZeros(chrom, max):
    largest_size = max
    #for seq in chrom:
    #  size = len(seq)
    #  if size > largest_size:
    #    largest_size = size
    one_hot_full = np.zeros((len(chrom), largest_size, 4), dtype=np.int8)
    for i, seq in enumerate(chrom):
        seq_onehot = np.zeros((largest_size, 4))
        for j, nuc in enumerate(seq):
            if nuc == 'a':
                seq_onehot[j, :] = np.array([1, 0, 0, 0], dtype=np.int8)
            elif nuc == 't':
                seq_onehot[j, :] = np.array([0, 1, 0, 0], dtype=np.int8)
            elif nuc == 'c':
                seq_onehot[j, :] = np.array([0, 0, 1, 0], dtype=np.int8)
            elif nuc == 'g':
                seq_onehot[j, :] = np.array([0, 0, 0, 1], dtype=np.int8)
            else:
                print('issue')
            one_hot_full[i,:,:] = seq_onehot
    return one_hot_full

In [6]:
def oneHot_DNA_full_PaddingZeros_1batch(chrom, max):
    largest_size = max
  #for seq in chrom:
  #  size = len(seq)
  #  if size > largest_size:
  #    largest_size = size
    one_hot_full = np.zeros((1, largest_size, 4), dtype=np.int8)
    for i, seq in enumerate(chrom):
        seq_onehot = np.zeros((largest_size, 4))
        for j, nuc in enumerate(seq):
            if nuc == 'a':
                seq_onehot[j, :] = np.array([1, 0, 0, 0], dtype=np.int8)
            elif nuc == 't':
                seq_onehot[j, :] = np.array([0, 1, 0, 0], dtype=np.int8)
            elif nuc == 'c':
                seq_onehot[j, :] = np.array([0, 0, 1, 0], dtype=np.int8)
            elif nuc == 'g':
                seq_onehot[j, :] = np.array([0, 0, 0, 1], dtype=np.int8)
            else:
                print('issue')
            one_hot_full[i,:,:] = seq_onehot
    return one_hot_full

In [7]:
chrom = [['a', 'a', 'a'], ['t', 't'], ['c', 'c', 'c','c'], ['g','g','g']]

In [8]:
def oneHot_labels2_1batch(chrom):
    seq_onehot = np.zeros((1, 2), dtype=np.int8)
    if chrom == 1:
        seq_onehot[0, :] = np.array([1, 0], dtype=np.int8)
    elif chrom == 0:
        seq_onehot[0, :] = np.array([0, 1], dtype=np.int8)
    else:
        print("issue - with the labels")
    return seq_onehot

Onehot encoding the samples and labels:

In [9]:
genome = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX']

max_num = 0
for x, indexes in enumerate(genome):
    with open('../DataSet_13_notonehot/DataSet_13_test_samples_1-Version_' + genome[x] + '.dat', 'rb') as f2:
        test_samples = pickle.load(f2)
    with open('../DataSet_13_notonehot/DataSet_13_train_samples_1-Version_' + genome[x] + '.dat', 'rb') as f2:
        train_samples = pickle.load(f2)
    with open('../DataSet_13_notonehot/DataSet_13_test_labels_1-Version_' + genome[x] + '.dat', 'rb') as f2:
        test_labels = pickle.load(f2)
    with open('../DataSet_13_notonehot/DataSet_13_train_labels_1-Version_' + genome[x] + '.dat', 'rb') as f2:
        train_labels = pickle.load(f2)
    for i, label in enumerate(train_labels):
        if label == 0:
            if max_num < len(train_samples[i]):
                max_num = len(train_samples[i])
    for i, label in enumerate(test_labels):
        if label == 0:
            if max_num < len(test_samples[i]):
                 max_num = len(test_samples[i])
    print(max_num)
    print(max_num)

223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223800
223836
223836
223836
223836
223836
223836
223836
223836
223836
223836
223836
223836
223836
223836


In [10]:
largest_seq = max_num

In [22]:
count_a = 0
count_a_place = 0
count_b = 0
count_b_place = 0
count_c = 0
count_c_place = 0
count = 0

for x, indexes in enumerate(genome):
    train_samples = []
    train_labels = []
    with open('../DataSet_13_notonehot/DataSet_13_train_samples_1-Version_' + genome[x] + '.dat', 'rb') as f1:
        train_samples = pickle.load(f1)
        #print(len(train_samples))
    with open('../DataSet_13_notonehot/DataSet_13_train_labels_1-Version_' + genome[x] + '.dat', 'rb') as f2:
        train_labels = pickle.load(f2)
        #print(len(train_labels))
    for y, index in enumerate(train_samples):
        len_seq = len(index)
        new_train_samples = []
        new_train_labels = []
        index_array = []
        index_array.append(index)
        if count_a_place > 10000:
            count_a_place -= 10000
        if count_b_place > 10000:
            count_b_place -= 10000
        if count_c_place > 10000:
            count_c_place -= 10000
        if len_seq <= 1000:
            if count > 9990 and count < 10000:
                new_train_samples = oneHot_DNA_full_PaddingZeros(index_array, 1000)
                new_train_labels = oneHot_labels2_1batch(train_labels[y])
            if count > 9990 and count < 10000:
                print(new_train_samples.shape)
                print(train_labels[y])
                print(new_train_labels)
            count += 1
'''

            count_a += 1
            count_a_place += 1
            #  print("count_a")
            with open('../DataSet_14_Part_1/' + str(int(count_a / 10000))+'/' + str(int(count_a_place / 100))+'/DataSet_14_train_labels_onehot_batches1_Part_' + str(count_a) + '.dat', 'wb') as f:
                pickle.dump(new_train_labels, f)
            with open('../DataSet_14_Part_1/' + str(int(count_a / 10000))+'/' + str(int(count_a_place / 100))+'/DataSet_14_train_samples_onehot_batches1_Part_' + str(count_a) + '.dat', 'wb') as f:
                pickle.dump(new_train_samples, f)
        elif len_seq > 1000 and len_seq <= 10000:
            new_train_samples = oneHot_DNA_full_PaddingZeros_1batch(index_array, 10000)
            new_train_labels = oneHot_labels2_1batch(train_labels[y])
            count_b += 1
            count_b_place += 1
            #  print('count_b:' + str(count_b))
            #  print('count_b_place:' + str(count_b_place))
            #  print("count_b")
            with open('../DataSet_14_Part_2/' + str(int(count_b / 10000))+'/' + str(int(count_b_place / 100))+'/DataSet_14_train_labels_onehot_batches1_Part_' + str(count_b) + '.dat', 'wb') as f:
                pickle.dump(new_train_labels, f)
            with open('../DataSet_14_Part_2/' + str(int(count_b / 10000))+'/' + str(int(count_b_place / 100))+'/DataSet_14_train_samples_onehot_batches1_Part_' + str(count_b) + '.dat', 'wb') as f:
                pickle.dump(new_train_samples, f)
        elif len_seq > 10000:
            count_c += 1
            count_c_place += 1
            #print("count_c")
            new_train_samples = oneHot_DNA_full_PaddingZeros_1batch(index_array, max_num)
            new_train_labels = oneHot_labels2_1batch(train_labels[y])
            with open('../DataSet_14_Part_3/' + str(int(count_c / 10000))+'/' + str(int(count_c_place / 100))+'/DataSet_14_train_labels_onehot_batches1_Part_' + str(count_c) + '.dat', 'wb') as f:
                pickle.dump(new_train_labels, f)
            with open('../DataSet_14_Part_3/' + str(int(count_c / 10000))+'/' + str(int(count_c_place / 100))+'/DataSet_14_train_samples_onehot_batches1_Part_' + str(count_c) + '.dat', 'wb') as f:
                pickle.dump(new_train_samples, f)
        else:
            print(error)
        if count % 100 == 0:
            print("Count:" + str(count))

        count += 1
    '''

(1, 1000, 4)
0
[[0 1]]
(1, 1000, 4)
0
[[0 1]]
(1, 1000, 4)
0
[[0 1]]
(1, 1000, 4)
0
[[0 1]]
(1, 1000, 4)
0
[[0 1]]
(1, 1000, 4)
0
[[0 1]]
(1, 1000, 4)
0
[[0 1]]
(1, 1000, 4)
0
[[0 1]]
(1, 1000, 4)
0
[[0 1]]


'\n\n            count_a += 1\n            count_a_place += 1\n            #  print("count_a")\n            with open(\'../DataSet_14_Part_1/\' + str(int(count_a / 10000))+\'/\' + str(int(count_a_place / 100))+\'/DataSet_14_train_labels_onehot_batches1_Part_\' + str(count_a) + \'.dat\', \'wb\') as f:\n                pickle.dump(new_train_labels, f)\n            with open(\'../DataSet_14_Part_1/\' + str(int(count_a / 10000))+\'/\' + str(int(count_a_place / 100))+\'/DataSet_14_train_samples_onehot_batches1_Part_\' + str(count_a) + \'.dat\', \'wb\') as f:\n                pickle.dump(new_train_samples, f)\n        elif len_seq > 1000 and len_seq <= 10000:\n            new_train_samples = oneHot_DNA_full_PaddingZeros_1batch(index_array, 10000)\n            new_train_labels = oneHot_labels2_1batch(train_labels[y])\n            count_b += 1\n            count_b_place += 1\n            #  print(\'count_b:\' + str(count_b))\n            #  print(\'count_b_place:\' + str(count_b_place))\n     

In [None]:
'''
genome = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX']
test_samples = []
test_labels = []
part = 0 

for x, indexes in enumerate(genome):
  test_samples = []
  test_labels = []
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_test_samples_1-Version_' + genome[x] + '.dat', 'rb') as f1:
    test_samples = pickle.load(f1)
      #print(len(train_samples))
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_test_labels_1-Version_' + genome[x] + '.dat', 'rb') as f2:
    test_labels = pickle.load(f2)
    print(len(test_samples))
    print(len(test_labels))
  
  test_samples = oneHot_DNA_full_PaddingZeros(test_samples, largest_seq)
  print(test_samples.shape)

  test_labels = oneHot_labels2(test_labels)
  print(test_labels.shape)

  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/test5/testing_data/DataSet_13_test_labels_onehot_batches5_Chr:{}.dat'.format(indexes), 'wb') as f:
    pickle.dump(test_labels, f)
    print(test_labels.shape)
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/test5/testing_data/DataSet_13_test_samples_onehot_batches5_Chr:{}.dat'.format(indexes), 'wb') as f:
    pickle.dump(test_samples, f)
    print(test_samples.shape)
'''
  

In [None]:
with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/test5/testing_data/DataSet_13_test_labels_onehot_batches5_Chr:chr1.dat', 'rb') as f:
  test_labels = pickle.load(f)
with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/test5/testing_data/DataSet_13_test_samples_onehot_batches5_Chr:chr1.dat', 'rb') as f:
  test_samples = pickle.load(f)

In [None]:
'''
genome = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX']
train_samples = []
train_labels = []
part = 0 
size_batches = 5
num_place = 0
for x, indexes in enumerate(genome):
  if x != 7:
    print(indexes)
    train_samples = []
    train_labels = []
    with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_train_samples_1-Version_' + genome[x] + '.dat', 'rb') as f1:
      train_samples = pickle.load(f1)
      #print(len(train_samples))
    with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_train_labels_1-Version_' + genome[x] + '.dat', 'rb') as f2:
      train_labels = pickle.load(f2)
      #print(len(train_labels))
    size = len(train_samples)
    if size > size_batches:
      loop = 0
      while size > 0:
        if num_place > 20000:
          num_place -= 20000
        if size - size_batches > 0:
          new_train_samples = oneHot_DNA_full_PaddingZeros(train_samples[loop:(loop + size_batches)], largest_seq)
          #print(new_train_samples.shape)
          new_train_labels = oneHot_labels2(train_labels[loop:loop + size_batches])
          #print(new_train_labels.shape)
          loop += size_batches
          size -= size_batches
          part += 1
          num_place += 1
        else:
          new_train_samples = oneHot_DNA_full_PaddingZeros(train_samples[loop:len(train_samples)], largest_seq)
          #print(new_train_samples.shape)
          new_train_labels = oneHot_labels2(train_labels[loop:len(train_samples)])
          #print(new_train_labels.shape)
          part += 1
          loop += size_batches
          size -= size_batches
          num_place += 1
        if part % 1000 == 0:
          print('part:' + str(part))
        with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/test5/' + str(int(part / 20000))+'/' + str(int(num_place / 100))+'/DataSet_13_train_labels_onehot_batches5_Part:' + str(part) + '.dat', 'wb') as f:
          pickle.dump(new_train_labels, f)
        with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/test5/' + str(int(part / 20000))+'/' + str(int(num_place / 100))+'/DataSet_13_train_samples_onehot_batches5_Part:' + str(part) + '.dat', 'wb') as f:
          pickle.dump(new_train_samples, f)

'''

chr1
part:1000
chr2
part:2000
part:3000
chr3
part:4000
part:5000
chr4
part:6000
chr5
part:7000
chr6
part:8000
part:9000
chr7
part:10000
chr9
part:11000
chr10
part:12000
chr11
part:13000
chr12
part:14000
part:15000
chr13
chr14
part:16000
chr15
part:17000
chr16
chr17
part:18000
chr18
part:19000
chr19
part:20000
chr20
chr21
part:21000
chr22
chrX
part:22000


**Batches:**

**Batches 1**:

    with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_train_labels_onehot_batches1_Part:' + str(part) + '.dat', 'rb') as f:
      train_labels = pickle.load(f)
    with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_train_samples_onehot_batches1_Part:' + str(part) + '.dat', 'rb') as f:
      train_samples = pickle.load(f)

**Batches 200**:

     with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_train_labels_onehot_batches200_Part:' + str(part) + '.dat', 'rb') as f:
      train_labels = pickle.load(f)
    with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_train_samples_onehot_batches200_Part:' + str(part) + '.dat', 'rb') as f:
      train_samples = pickle.load(f)

      

part:73000 last run


Splitting into groups of 200 bp and then one hot encoding:


In [None]:
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_train_samples_1-Version_' + genome[0] + '.dat', 'rb') as f1:
    train_samples = pickle.load(f1)
    #print(len(train_samples))
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_train_labels_1-Version_' + genome[0] + '.dat', 'rb') as f2:
    train_labels = pickle.load(f2)

In [None]:
genome = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX']
train_samples = []
train_labels = []
for x, indexes in enumerate(genome):
  train_samples = []
  train_labels = []
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_train_samples_1-Version_' + genome[x] + '.dat', 'rb') as f1:
    train_samples = pickle.load(f1)
    #print(len(train_samples))
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_train_labels_1-Version_' + genome[x] + '.dat', 'rb') as f2:
    train_labels = pickle.load(f2)

  ## Loop for spliting sequences and labels to 200 bp:
  count = 0
  new_train_seq = []
  new_train_labels = []
  count_train = 0
  for seq in train_samples:
    for i in range(0, len(seq), 200):
      new_train_seq.append(seq[i : i+200])
      new_train_labels.append(train_labels[count])
    count += 1
    count_train += 1
  print(count_train)
  print(len(new_train_seq))
  print(len(new_train_labels))

  ## Making sure the size is right no less than 200 bp
  #import math
  sample_new = []
  label_new = []
  for j, seq in enumerate(new_train_seq):
    if len(seq) >= 200:
      sample_new.append(seq)
      label_new.append(new_train_labels[j])
  train_samples = sample_new
  train_labels = label_new
  print(len(train_samples))
  print(len(train_labels))
  
  train_samples = oneHot_DNA(train_samples)
  print(train_samples.shape)

  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_200bp_onehot/DataSet_13_train_labels_onehot_200bp' + genome[x] + '.dat', 'wb') as f:
    pickle.dump(train_samples, f)
  #with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_train_samples_onehot_200bp' + genome[x] + '.dat', 'rb') as f:
  #  train_samples = pickle.load(f)
  #  print(train_samples.shape)

  train_labels = oneHot_labels2(train_labels)
  print(train_labels.shape)

  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_200bp_onehot/DataSet_13_train_labels_onehot_200bp' + genome[x] + '.dat', 'wb') as f:
    pickle.dump(train_labels, f)
  #with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_train_labels_onehot_200bp' + genome[x] + '.dat', 'rb') as f:
  #  train_labels = pickle.load(f)
  #  print(train_labels.shape) 
  

0
0
0
0
0


IndexError: ignored

In [None]:
genome = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX']
test_samples = []
test_labels = []
for x, indexes in enumerate(genome):
  test_samples = []
  test_labels = []
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_test_samples_1-Version_' + genome[x] + '.dat', 'rb') as f1:
    test_samples = pickle.load(f1)
    #print(len(train_samples))
  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_13_notonehot/DataSet_13_test_labels_1-Version_' + genome[x] + '.dat', 'rb') as f2:
    test_labels = pickle.load(f2)
  count_test = 0
  for q in test_labels:
    if q != 1:
      count_test += 1
  print(count_test)

  ## Loop for spliting sequences and labels to 200 bp:
  count = 0
  new_test_seq = []
  new_test_labels = []
  count_test = 0
  for seq in test_samples:
    for i in range(0, len(seq), 200):
      new_test_seq.append(seq[i : i+200])
      new_test_labels.append(test_labels[count])
    count += 1
    count_test += 1
  # print(count_test)
  # print(len(new_test_seq))
  # print(len(new_test_labels))
  #    print(count_test)
#    print(len(new_test_seq))
#    print(len(new_test_labels))

  count_test = 0
  for q in new_test_labels:
    if q != 1:
      count_test += 1
  print(count_test)
  print(test_labels[0])


  ## Making sure the size is right no less than 200 bp
  #import math
  sample_new = []
  label_new = []
  for i, seq in enumerate(new_test_seq):
    if len(seq) >= 200:
      sample_new.append(seq)
      label_new.append(new_test_labels[i])
  test_samples = sample_new
  test_labels = label_new
  # print(len(test_samples))
  # print(len(test_labels))
  
  count_test = 0
  for q in test_labels:
    if q != 1:
      count_test += 1
  print(count_test)
  print(test_labels[0])

  test_samples = oneHot_DNA(test_samples)
  print(test_samples.shape)

  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_200bp_onehot/DataSet_13_test_samples_onehot_200bp' + genome[x] + '.dat', 'wb') as f:
    pickle.dump(test_samples, f)
  #with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_test_samples_onehot_200bp' + genome[i] + '.dat', 'rb') as f:
  #  test_samples = pickle.load(f)
  #  print(test_samples.shape)

  test_labels = oneHot_labels2(test_labels)
  print(test_labels.shape)

  with open('gdrive/My Drive/SURF_2020_Weiss/Data_Sets/DataSet13/DataSet_200bp_onehot/DataSet_13_test_labels_onehot_200bp' + genome[x] + '.dat', 'wb') as f:
    pickle.dump(test_labels, f)
  #with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_test_labels_onehot_200bp' + genome[i] + '.dat', 'rb') as f:
  #  test_labels = pickle.load(f)
  #  print(test_labels.shape) 

1282
23174
1
23152
1
(33572, 200, 4)
(33572, 2)
1046
20874
1
20862
1
(28674, 200, 4)
(28674, 2)
898
16692
1
16690
1
(23386, 200, 4)
(23386, 2)
734
16066
1
16064
1
(24034, 200, 4)
(24034, 2)
724
19312
1
19310
1
(25486, 200, 4)
(25486, 2)
806
19240
1
19236
1
(25698, 200, 4)
(25698, 2)
858
15162
1
15156
1
(31468, 200, 4)
(31468, 2)
646
14822
1
14818
1
(23394, 200, 4)
(23394, 2)
680
12174
1
12134
1
(15648, 200, 4)
(15648, 2)
670
12136
1
12122
1
(20088, 200, 4)
(20088, 2)
770
14906
1
14900
1
(26872, 200, 4)
(26872, 2)
764
12766
1
12766
1
(21218, 200, 4)
(21218, 2)
474
7382
1
7382
1
(10540, 200, 4)
(10540, 2)
466
7840
1
7840
1
(14790, 200, 4)
(14790, 2)
498
10560
1
10556
1
(16076, 200, 4)
(16076, 2)
486
6324
1
6324
1
(12346, 200, 4)
(12346, 2)
612
6852
1
6840
1
(9706, 200, 4)
(9706, 2)
302
7772
1
7770
1
(10640, 200, 4)
(10640, 2)
578
5074
1
5064
1
(19232, 200, 4)
(19232, 2)
390
5964
1
5962
1
(8872, 200, 4)
(8872, 2)
178
4904
1
4900
1
(8562, 200, 4)
(8562, 2)
264
2550
1
2536
1
(3320, 200, 4)


In [None]:
with open('gdrive/My Drive/SURF_2020_Weiss/E003_hg38_25_imputed12marks/DataSet_13_test_labels_' + genome[0] + '.dat', 'rb') as f2:
  test_labels = pickle.load(f2)
  print(len(test_labels))
  count_test = 0
  for q in test_labels:
    if q != 1:
      count_test += 1
  print(count_test)

2360
1334
