# Preparing datasets
While this notebook doesn't cover the entire data preparation process (which was long and complicated), it shows what the data look like for the attX classification task (Task 1) and how they were one-hot encoded and shuffled before being passed to the models.


Written By: Matt Durrant

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import pandas as pd

# TensorFlow and tf.keras
import tensorflow as tf
import random
from tensorflow import keras
from keras.utils import to_categorical

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

# https://www.tensorflow.org/tutorials/keras/classification
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

# Colab-only setup: mount Google Drive at /gdrive and move into the shared
# datasets folder so the relative paths used by the loading cell resolve.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/Shared\ drives/CS230/datasets
import matplotlib.pyplot as plt

# NOTE: this seeds Python's built-in `random` module only. NumPy's global
# generator (np.random) is a separate RNG and is not affected by this call.
random.seed(42)

Using TensorFlow backend.


1.15.0
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive
/gdrive/Shared drives/CS230/datasets


In [0]:
# Loading the attX classifier data (Task 1)
# Every file is a tab-separated table; paths are relative to the datasets folder.

# Set 1
train1 = pd.read_csv("set1/attx_classifier/train_attx_classifier.tsv", sep='\t')
dev1 = pd.read_csv("set1/attx_classifier/dev_attx_classifier.tsv", sep='\t')
test1 = pd.read_csv("set1/attx_classifier/test_attx_classifier.tsv", sep='\t')

# Set 2
train2 = pd.read_csv("set2/attx_classifier/train_attx_classifier.tsv", sep='\t')
dev2 = pd.read_csv("set2/attx_classifier/dev_attx_classifier.tsv", sep='\t')
test2 = pd.read_csv("set2/attx_classifier/test_attx_classifier.tsv", sep='\t')

In [4]:
# Preview the first rows of the Set 1 training table.
train1.head()

Unnamed: 0,set,type,att,y1,y2
0,train,attb,TTCTAATTACCTCTAATAATGCATACATTGTCGTTGTCTTCCCAGA...,1,1
1,train,attb,ATATTGAAGAAAGGCTATCTAAATGGACTGGGAAAGCAAATATATT...,1,1
2,train,attb,CTTTAGAACTAATGATCAGGGTTGTTATATTTTGTACCTCACTCTT...,1,1
3,train,attb,CAATCGGTATGCCATGAGACAAGGTAATAGTAGTGGCGAAAGTATG...,1,1
4,train,attb,TTTATATTTAATTATTAAATTAACAAATTTTAATTGGCGGATGAGG...,1,1


In [7]:
# Preview the last rows of the Set 1 training table.
train1.tail()

Unnamed: 0,set,type,att,y1,y2
116329,train,control,CCGGCTGTTCGGTGGATTTGTGCAGAGATTGAACTGTGATGGTTGT...,0,0
116330,train,control,CCTTGATTCTGGTTATTTGACAACCTCAATCTGTAAAGGTTTATCG...,0,0
116331,train,control,CATGCGGTTACTCACTATCGTGTTGCGGAGCATTTCCGTGAACATA...,0,0
116332,train,control,CAGGTGCAAGAGGCAAGACGCAAGGCGACAGAAAGGATATAGCGCC...,0,0
116333,train,control,CAACTGGCGGGGAACCAGCGGGTCATCCTCACGCCCACTGTAACGA...,0,0


In [9]:
# Report (rows, columns) for every split of both sets, in a fixed order.
dimension_report = [
    ("SET 1 - TRAINING SET DIMENSIONS:", train1),
    ("SET 1 - DEV SET DIMENSIONS:", dev1),
    ("SET 1 - TEST SET DIMENSIONS:", test1),
    ("SET 2 - TRAINING SET DIMENSIONS:", train2),
    ("SET 2 - DEV SET DIMENSIONS:", dev2),
    ("SET 2 - TEST SET DIMENSIONS:", test2),
]
for heading, frame in dimension_report:
    print(heading, frame.shape)

SET 1 - TRAINING SET DIMENSIONS: (116334, 5)
SET 1 - DEV SET DIMENSIONS: (20796, 5)
SET 1 - TEST SET DIMENSIONS: (40296, 5)
SET 2 - TRAINING SET DIMENSIONS: (60501, 5)
SET 2 - DEV SET DIMENSIONS: (9915, 5)
SET 2 - TEST SET DIMENSIONS: (19662, 5)


In [0]:
# Code used to one-hot encode the data
def seq_to_one_hot_fill_in_array(zeros_array, sequence, one_hot_axis):
    """Write the one-hot encoding of ``sequence`` into ``zeros_array`` in place.

    Args:
        zeros_array: 2-D array to fill. It is mutated; nothing is returned.
            The axis that is *not* ``one_hot_axis`` must have exactly
            ``len(sequence)`` entries.
        sequence: Iterable of single characters. A, C, G and T (any case) map
            to channels 0, 1, 2 and 3. "N" (any case) leaves its position
            untouched.
        one_hot_axis: 0 if channels run along rows (array indexed as
            ``[channel, position]``), 1 if they run along columns
            (``[position, channel]``).

    Raises:
        AssertionError: if ``one_hot_axis`` is not 0 or 1, or the array length
            along the position axis does not match the sequence length.
        RuntimeError: on any character other than A, C, G, T or N. Positions
            before the offending character have already been written.
    """
    assert one_hot_axis in (0, 1)
    # Positions live on whichever axis is not the one-hot axis.
    position_axis = 1 if one_hot_axis == 0 else 0
    assert zeros_array.shape[position_axis] == len(sequence)

    channel_of = {"A": 0, "C": 1, "G": 2, "T": 3}
    for pos, char in enumerate(sequence):
        base = char.upper()
        if base == "N":
            # Unknown base: keep this position as all zeros.
            continue
        if base not in channel_of:
            raise RuntimeError("Unsupported character: "+str(char))
        channel = channel_of[base]
        if one_hot_axis == 0:
            zeros_array[channel, pos] = 1
        else:
            zeros_array[pos, channel] = 1
            
def one_hot_encode_along_channel_axis(sequence, seq_len=150):
    """One-hot encode a DNA sequence into a fixed-size ``(seq_len, 4)`` array.

    The sequence is centred in a window of ``seq_len`` positions by padding
    both ends with "N"; when the padding is odd, the extra position goes on
    the right. Padded positions (and any "N" in the input) stay all-zero rows.
    The channel (column) order is defined by ``seq_to_one_hot_fill_in_array``.

    Args:
        sequence: DNA string, or an iterable of string fragments that is
            joined into a single string.
        seq_len: Number of positions in the output window. Defaults to 150,
            the fixed length used for every dataset in this notebook.

    Returns:
        ``np.ndarray`` of shape ``(seq_len, 4)`` and dtype ``int8``.

    Raises:
        ValueError: if the sequence is longer than ``seq_len``.
        RuntimeError: propagated from ``seq_to_one_hot_fill_in_array`` when
            the sequence contains an unsupported character.
    """
    sequence = ''.join(sequence)
    padlength = seq_len - len(sequence)
    if padlength < 0:
        # Fail here with a clear message instead of tripping a bare assert
        # inside the fill-in helper.
        raise ValueError(
            "Sequence of length %d does not fit in a window of %d positions"
            % (len(sequence), seq_len))
    padleft = padlength // 2
    padright = padlength - padleft
    padded = "N" * padleft + sequence + "N" * padright

    to_return = np.zeros((seq_len, 4), dtype=np.int8)
    seq_to_one_hot_fill_in_array(zeros_array=to_return,
                                 sequence=padded, one_hot_axis=1)
    return to_return

def one_hot_encode_y(y, dim):
    """One-hot encode the class label ``y`` over ``dim`` classes.

    Args:
        y: Index of the class to mark with 1.
        dim: Length of the one-hot vector.

    Returns:
        A one-element list holding a float array of length ``dim`` with a 1
        at index ``y`` and 0 elsewhere. Stacking these results with
        ``np.array`` therefore gives shape ``(n, 1, dim)``.
    """
    encoded = np.zeros((1, dim))
    encoded[0, y] = 1
    # Split the (1, dim) array into a list of its rows (a single row).
    return [row for row in encoded]

In [0]:
# Getting one-hot encodings of each dataset 1 (raw)
# For every split:
#   <split>1_att - stacked one-hot DNA arrays from one_hot_encode_along_channel_axis
#   <split>1_y1  - the raw y1 label column
#   <split>1_y2  - the y2 labels one-hot encoded over 3 classes
# one_hot_encode_y wraps each encoded row in a list, so the stacked labels come
# out as (n, 1, 3); the reshape drops that singleton middle axis.

# --- train ---
train1_y1 = np.array(train1.y1)
train1_att = np.array([one_hot_encode_along_channel_axis(dna) for dna in np.array(train1.att)])
train1_y2 = np.array([one_hot_encode_y(label, dim=3) for label in np.array(train1.y2)])
train1_y2 = train1_y2.reshape((len(train1_y2), train1_y2.shape[-1]))

# --- dev ---
dev1_y1 = np.array(dev1.y1)
dev1_att = np.array([one_hot_encode_along_channel_axis(dna) for dna in np.array(dev1.att)])
dev1_y2 = np.array([one_hot_encode_y(label, dim=3) for label in np.array(dev1.y2)])
dev1_y2 = dev1_y2.reshape((len(dev1_y2), dev1_y2.shape[-1]))

# --- test ---
test1_y1 = np.array(test1.y1)
test1_att = np.array([one_hot_encode_along_channel_axis(dna) for dna in np.array(test1.att)])
test1_y2 = np.array([one_hot_encode_y(label, dim=3) for label in np.array(test1.y2)])
test1_y2 = test1_y2.reshape((len(test1_y2), test1_y2.shape[-1]))

In [0]:
# Getting one-hot encodings for dataset 2 (clean)
# For every split:
#   <split>2_att - stacked one-hot DNA arrays from one_hot_encode_along_channel_axis
#   <split>2_y1  - the raw y1 label column
#   <split>2_y2  - the y2 labels one-hot encoded over 3 classes
# one_hot_encode_y wraps each encoded row in a list, so the stacked labels come
# out as (n, 1, 3); the reshape drops that singleton middle axis.

# --- train ---
train2_y1 = np.array(train2.y1)
train2_att = np.array([one_hot_encode_along_channel_axis(dna) for dna in np.array(train2.att)])
train2_y2 = np.array([one_hot_encode_y(label, dim=3) for label in np.array(train2.y2)])
train2_y2 = train2_y2.reshape((len(train2_y2), train2_y2.shape[-1]))

# --- dev ---
dev2_y1 = np.array(dev2.y1)
dev2_att = np.array([one_hot_encode_along_channel_axis(dna) for dna in np.array(dev2.att)])
dev2_y2 = np.array([one_hot_encode_y(label, dim=3) for label in np.array(dev2.y2)])
dev2_y2 = dev2_y2.reshape((len(dev2_y2), dev2_y2.shape[-1]))

# --- test ---
test2_y1 = np.array(test2.y1)
test2_att = np.array([one_hot_encode_along_channel_axis(dna) for dna in np.array(test2.att)])
test2_y2 = np.array([one_hot_encode_y(label, dim=3) for label in np.array(test2.y2)])
test2_y2 = test2_y2.reshape((len(test2_y2), test2_y2.shape[-1]))

In [16]:
# Here is an example of one one-hot encoded DNA sequence
# Rows are sequence positions and columns are the A, C, G, T channels (in that
# order). All-zero rows are "N" positions, which includes the padding added on
# both sides of sequences shorter than the 150-position window.
train1_att[0]

array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       [0, 0,

In [0]:
# Collect the encoded inputs ('x') and one-hot y2 labels ('y') for every split,
# keyed as final_prepped[<set>][<split>]['x' | 'y'].
final_prepped = {
    'set1': dict(
        train=dict(x=train1_att, y=train1_y2),
        dev=dict(x=dev1_att, y=dev1_y2),
        test=dict(x=test1_att, y=test1_y2),
    ),
    'set2': dict(
        train=dict(x=train2_att, y=train2_y2),
        dev=dict(x=dev2_att, y=dev2_y2),
        test=dict(x=test2_att, y=test2_y2),
    ),
}

In [0]:
# Function to shuffle both the x and y datasets in unison
# This is so that batches don't contain only one class
def shuffle_in_unison(a, b):
    """Shuffle ``a`` and ``b`` in place along axis 0 with the same permutation.

    The global NumPy RNG state is captured before shuffling ``a`` and restored
    before shuffling ``b``, so both shuffles draw the same random sequence and
    row ``i`` of ``a`` stays paired with row ``i`` of ``b``. The result is only
    reproducible if the caller seeds ``np.random`` beforehand.

    Args:
        a: First array (or mutable sequence); shuffled in place.
        b: Second array (or mutable sequence) with the same length as ``a``;
            shuffled in place.

    Raises:
        ValueError: if ``a`` and ``b`` have different lengths. Replaying the
            RNG state only yields a matching permutation for equal lengths,
            so a mismatch would silently break the pairing.
    """
    if len(a) != len(b):
        raise ValueError(
            "Cannot shuffle in unison: lengths differ (%d vs %d)"
            % (len(a), len(b)))
    rng_state = np.random.get_state()
    np.random.shuffle(a)
    # Rewind the RNG so the second shuffle replays the same permutation.
    np.random.set_state(rng_state)
    np.random.shuffle(b)

In [0]:
# Now shuffling x and y in unison
# shuffle_in_unison draws from NumPy's global RNG, so np.random is the
# generator that must be seeded for the shuffle to be reproducible.
# random.seed only affects Python's built-in `random` module; it is kept for
# any code that relies on it.
random.seed(42)
np.random.seed(42)

# Same order as before: set1 train/dev/test, then set2 train/dev/test.
# The arrays are shuffled in place, so re-running this cell reshuffles them.
for dataset_name in ('set1', 'set2'):
    for split_name in ('train', 'dev', 'test'):
        split = final_prepped[dataset_name][split_name]
        shuffle_in_unison(split['x'], split['y'])

# That's it!