# TensorFlow on RNA sequences

In [3]:
import numpy as np
import tensorflow as tf

In [4]:
# open an interactive TensorFlow session; an InteractiveSession installs itself as the
# default session, which is what lets train_model call eval_var.eval(...) and
# train_step.run(...) without passing the session explicitly
sess = tf.InteractiveSession()

In [5]:
# create dataset object that holds features and labels and cycles through the data in batches
class Dataset(object):
    """Hold paired features/labels and serve them as cyclic mini-batches.

    Attributes (set in __init__):
        features: np.ndarray built from the `features` argument.
        labels:   np.ndarray built from the `labels` argument.
        index:    position of the first sample of the next batch.
        size:     number of samples (len(labels)).
    """

    def __init__(self, features, labels):
        """Store features and labels as arrays; both must have the same length."""
        assert (len(features) == len(labels)), "features and labels must have the same length"
        self.features = np.array(features)
        self.labels = np.array(labels)
        self.index = 0
        self.size = len(labels)

    def next_batch(self, batch_size):
        """Return the next `batch_size` samples as (features, labels).

        Samples are taken in order starting at `self.index`; when the end of
        the data is reached the batch continues from the beginning, wrapping
        as many times as needed so exactly `batch_size` rows are returned.

        Raises:
            ValueError: if `batch_size` is negative or the dataset is empty.
        """
        if batch_size < 0:
            raise ValueError("batch_size must be non-negative")
        if self.size == 0:
            raise ValueError("cannot draw a batch from an empty dataset")

        old_index = self.index
        new_index = old_index + batch_size
        self.index = new_index % self.size

        # common case: the batch fits before the end of the data, plain slices suffice
        if new_index <= self.size:
            return (self.features[old_index: new_index], self.labels[old_index: new_index])

        # wrap-around: index modulo size so that batches longer than the
        # remaining data (or longer than the whole dataset) are still full-sized
        wrapped = np.arange(old_index, new_index) % self.size
        return (self.features[wrapped], self.labels[wrapped])

    def reset_index(self):
        """Restart batching from the first sample."""
        self.index = 0
    

In [6]:
# # create an object that implements a basic neural network for classification
# class Classify(object):
    
#     def __init__(self, sess, feat_length, label_length):
#         self.sess = sess
#         self.feat_length = feat_length
#         self.label_length = label_length

#         # create variables for the features, labels, and weights
#         self.x = tf.placeholder(tf.float32, shape=[None, feat_length])
#         self.y_ = tf.placeholder(tf.float32, shape=[None, label_length])
#         self.W = tf.Variable(tf.zeros([feat_length,label_length]))
#         self.b = tf.Variable(tf.zeros([label_length]))
        
#     def make_model(self, func, loss_func, optim_func):
#         self.y = func(self.x, self.W, self.b)
#         self.cross_entropy = loss_func(self.y_, self.y)
#         self.train_step = optim_func(self.cross_entropy)
        
#         self.correct_prediction = tf.equal(tf.argmax(self.y,1), tf.argmax(self.y_,1))
#         self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32))
    
#     def train(self, num_epoch, batch_size, train_data):
#         self.sess.run(tf.initialize_all_variables())
#         for i in range(num_epoch):
#             batch = train_data.next_batch(batch_size)
#             self.train_step.run(feed_dict={self.x: batch[0], self.y_: batch[1]})
        
#     def test(self, test_data):
#         print(self.accuracy.eval(feed_dict={self.x: test_data.features, self.y_: test_data.labels}))

### Import MNIST data for testing

In [7]:
# import sample data for digit recognition
# one_hot=True requests one-hot label vectors, matching the argmax-over-labels
# accuracy computation and the 10 output nodes used by the MNIST model below
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [8]:
# wrap the MNIST arrays in the Dataset helper so train_model can call
# next_batch() on the training split and read .features/.labels on the test split
mnist_train = Dataset(mnist.train.images, mnist.train.labels)
mnist_test = Dataset(mnist.test.images, mnist.test.labels)

## Convolutional Neural Network

In [58]:
def weight_variable(shape):
    """Return a tf.Variable of the given shape initialised from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Return a tf.Variable of the given shape with every entry initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

def conv2d(x, W):
    """2-D convolution of x with filter W using stride 1 in every dimension and 'SAME' padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

def max_pool_2x2(x):
    """Max-pool x over 2x2 windows with stride 2 and 'SAME' padding."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')

def train_model(sess, train_step, eval_var, num_epoch, batch_size, report_int, train, test):
    """Train with mini-batches and periodically print the evaluation metric.

    NOTE: this function reads the module-level placeholders `x`, `y_` and
    `keep_prob` at call time, so it must be called after the cell that builds
    the model whose `train_step` / `eval_var` are passed in.

    Parameters:
        sess:       TensorFlow session used to run the variable initializer.
        train_step: optimizer op; run once per iteration with keep_prob 0.5.
        eval_var:   scalar tensor to report (evaluated with keep_prob 1.0).
        num_epoch:  number of training iterations (one mini-batch each).
        batch_size: number of samples drawn from `train` per iteration.
        report_int: report every `report_int` iterations.
        train:      object with next_batch(batch_size) -> (features, labels).
        test:       object with .features and .labels, evaluated in full.
    """

    # initialize variables
    sess.run(tf.global_variables_initializer())

    # the full test set, evaluated with dropout disabled
    test_feed = {x: test.features, y_: test.labels, keep_prob: 1.0}

    # train epochs
    for i in range(num_epoch):
        # honour the requested batch size (this used to be hard-coded to 50)
        batch = train.next_batch(batch_size)
        if i%report_int == 0:
            train_accuracy = eval_var.eval(feed_dict={x: batch[0],
                                                      y_: batch[1],
                                                      keep_prob: 1.0})

            print("step %d, training accuracy %g"%(i, train_accuracy),
                  "test accuracy %g"%eval_var.eval(feed_dict=test_feed))
        train_step.run(feed_dict={x: batch[0],
                                  y_: batch[1],
                                  keep_prob: 0.5})

    print("test accuracy %g"%eval_var.eval(feed_dict=test_feed))
    

In [10]:
# class Convolutional_2D_NN(object):
    
#     def __init__(self, sess, input_size, output_size):
#         self.x = tf.placeholder(tf.float32, shape=[None, input_size*input_size])
#         self.x_image = tf.reshape(x, [-1,input_size,input_size,1])
#         self.y_ = tf.placeholder(tf.float32, shape=[None, output_size])

#         self.current_layer = self.x_image
    
#     def add_layer(self, layer_func):
#         self.current_layer = layer_func(self.current_layer)
        

### First implement MNIST with a convolutional NN using tensorflow tutorial

In [11]:
# Build the MNIST CNN graph. The names x, y_, keep_prob and y_conv defined here are
# module-level and are the ones train_model and the loss cell pick up.
init_size = 28
final_size = 7   # spatial size after the two 2x2 poolings below: 28 -> 14 -> 7
num_output1 = 32
num_output2 = 64
fully_connected_nodes = 1024
out_nodes = 10

# create placeholders for data
x = tf.placeholder(tf.float32, shape=[None, init_size*init_size])
x_image = tf.reshape(x, [-1,init_size,init_size,1])
y_ = tf.placeholder(tf.float32, shape=[None, out_nodes])

# first convolution: 4x4 filters applied with stride 1 (conv2d), so windows overlap
W_conv1 = weight_variable([4, 4, 1, num_output1])
b_conv1 = bias_variable([num_output1])

h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

# add a second convolution that looks at every 2x2 square, with overlaps
W_conv2 = weight_variable([2, 2, num_output1, num_output2])
b_conv2 = bias_variable([num_output2])

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

# add a fully connected layer
W_fc1 = weight_variable([final_size * final_size * num_output2, fully_connected_nodes])
b_fc1 = bias_variable([fully_connected_nodes])

h_pool2_flat = tf.reshape(h_pool2, [-1, final_size * final_size * num_output2])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

# add dropout to reduce overfitting
# keep_prob is fed as 0.5 for training steps and 1.0 for evaluation by train_model
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

W_fc2 = weight_variable([fully_connected_nodes, out_nodes])
b_fc2 = bias_variable([out_nodes])

# unnormalised class scores (logits); softmax is applied inside the loss
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2


In [17]:
# use a cross entropy loss function and optimize with Adam
# labels/logits are passed by keyword: the positional order is not stable across
# TensorFlow versions (newer releases reject positional arguments for this op)
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

# fraction of samples whose highest-scoring class matches the one-hot label
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

train_model(sess, train_step, accuracy, 1000, 50, 100, mnist_train, mnist_test)


step 0, training accuracy 0.08 test accuracy 0.0731
step 100, training accuracy 0.66 test accuracy 0.7809
step 200, training accuracy 0.9 test accuracy 0.8655
step 300, training accuracy 0.92 test accuracy 0.9016
step 400, training accuracy 0.9 test accuracy 0.915
step 500, training accuracy 0.94 test accuracy 0.9169
step 600, training accuracy 0.94 test accuracy 0.9289
step 700, training accuracy 1 test accuracy 0.9345
step 800, training accuracy 0.92 test accuracy 0.9444
step 900, training accuracy 0.98 test accuracy 0.9393
test accuracy 0.9464


## Classify RNA sequences based on complementarity

In [18]:
def generate_random_sequence(length):
    """Return a random RNA string of `length` nucleotides drawn uniformly from A, U, C, G.

    Uses NumPy's global random state, so results are reproducible after np.random.seed.
    """
    alphabet = ['A', 'U', 'C', 'G']
    return ''.join(np.random.choice(alphabet, size=length))

def get_complementary(seq):
    """Return the base-wise complement of an RNA string (A<->U, C<->G), in the same order.

    Characters other than A, U, C, G are passed through unchanged.
    """
    pairing = {'A': 'U', 'U': 'A', 'C': 'G', 'G': 'C'}
    return seq.translate(str.maketrans(pairing))

def generate_match_pair(length, random_seed=None):
    """Return (seq, complement): a random RNA string and its position-wise complement.

    If `random_seed` is given, NumPy's global RNG is reseeded with it first.
    """
    if random_seed is not None:
        np.random.seed(random_seed)

    strand = generate_random_sequence(length)
    return strand, get_complementary(strand)

def generate_seed_match_pair(length1, length2, random_seed=None):
    """Return (seq1, seq2) where seq2 carries the complement of seq1[1:7].

    seq1 is a random sequence of `length1` nucleotides. seq2 is assembled as one
    random nucleotide, then the complement of seq1[1:7], then `length2 - 7`
    random nucleotides. If `random_seed` is given, NumPy's global RNG is
    reseeded with it first.
    """
    if random_seed is not None:
        np.random.seed(random_seed)

    # the three random draws happen in this order: seq1, leading nt, trailing nts
    guide = generate_random_sequence(length1)
    leading_nt = generate_random_sequence(1)
    trailing_nts = generate_random_sequence(length2-7)
    paired_core = get_complementary(guide[1:7])

    return guide, ''.join([leading_nt, paired_core, trailing_nts])

def generate_random_pair(length1, length2, random_seed=None):
    """Return two random sequences where the second is not the exact complement of the first.

    The second sequence is redrawn until it differs from the complement of the
    first. If `random_seed` is given, NumPy's global RNG is reseeded with it first.
    """
    if random_seed is not None:
        np.random.seed(random_seed)

    first = generate_random_sequence(length1)
    excluded = get_complementary(first)

    second = generate_random_sequence(length2)
    while second == excluded:
        second = generate_random_sequence(length2)

    return first, second

def one_hot_encode(seq, nt_order):
    """Convert an RNA sequence to a flat one-hot encoding.

    Parameters:
        seq:      string of nucleotides.
        nt_order: the alphabet, in column order; may be a NumPy array, a list,
                  a tuple or a plain string such as 'AUCG'.

    Returns:
        1-D integer array of length len(seq) * len(nt_order). Entry
        i * len(nt_order) + j is 1 when seq[i] == nt_order[j], else 0.
        Characters not present in `nt_order` encode as all zeros.
    """
    # explicit str dtype keeps the comparison well-defined for empty inputs
    order = np.array(list(nt_order), dtype=str)
    letters = np.array(list(seq), dtype=str)

    # broadcast (len(seq), 1) against (1, len(order)) instead of looping in Python
    return (letters[:, None] == order[None, :]).astype(int).ravel()

def make_square(seq1, seq2):
    """Return the flattened outer product of the one-hot encodings of two sequences.

    seq1 is encoded in A, U, C, G order and seq2 in U, A, G, C order, so a
    complementary pair of nucleotides sets a diagonal entry of its 4x4 block.
    """
    row_encoding = one_hot_encode(seq1, np.array(['A','U','C','G']))
    col_encoding = one_hot_encode(seq2, np.array(['U','A','G','C']))

    return np.outer(row_encoding, col_encoding).flatten()

In [19]:
# quick visual check of the generators and of the pairing matrix
print(generate_seed_match_pair(20,20))
print(generate_random_pair(20,20))
# generate_match_pair takes a single length; a second positional argument is
# interpreted as random_seed and would reseed NumPy's global RNG
print(generate_match_pair(20))
print(make_square('AAA','AAA').reshape(12,12))
print(make_square('UAG','AUC').reshape(12,12))

('GCGUGCGCCCGAAGUAUGCA', 'GGCACGCAACGUGGCUCGCC')
('CACAAAGGUUGACUUUGCGU', 'UUAGAAGAGAGUAAUGACCC')
('GCGGACUAGCGCACAUCCGG', 'CGCCUGAUCGCGUGUAGGCC')
[[0 1 0 0 0 1 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 1 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 1 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]]
[[0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 1 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 1 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 1 0 0 0 0 0 0 1]]


In [20]:
# add another convolution function that looks at 4x4 blocks without overlapping
def conv2d_4step(x, W):
    """2-D convolution of x with filter W using stride 4 along both spatial axes and 'SAME' padding."""
    block_strides = [1, 4, 4, 1]
    return tf.nn.conv2d(x, W, strides=block_strides, padding='SAME')


### Learn to distinguish complementary sequences from random sequences

In [21]:
# build 1000 labelled 4-mer pairs: roughly half perfectly complementary
# (label [1, 0]), the rest random non-complementary pairs (label [0, 1])
features = np.zeros((1000, 256))
labels = np.zeros((1000, 2))

for i in range(1000):
    is_match = np.random.random() < 0.5
    if is_match:
        seq1, seq2 = generate_match_pair(4)
    else:
        seq1, seq2 = generate_random_pair(4, 4)

    labels[i] = (1, 0) if is_match else (0, 1)
    features[i] = make_square(seq1, seq2)

# first 900 samples for training, last 100 held out for testing
match_train = Dataset(features[:900], labels[:900])
match_test = Dataset(features[900:], labels[900:])

In [22]:
# Build the CNN for 4-mer pairs. This rebinds the module-level names x, y_, keep_prob
# and y_conv, which train_model and the loss cell pick up.
init_size = 16   # 4 nucleotides x 4 one-hot entries per side of the make_square matrix
final_size = 1   # spatial size after the layers below: 16 -> 4 -> 2 -> 2 -> 1
num_output1 = 4
num_output2 = 8
fully_connected_nodes = 512
out_nodes = 2

# create placeholders for data
x = tf.placeholder(tf.float32, shape=[None, init_size*init_size])
x_image = tf.reshape(x, [-1,init_size,init_size,1])
y_ = tf.placeholder(tf.float32, shape=[None, out_nodes])

# add convolution that traverses every 4x4 square, without overlaps
# (stride 4 via conv2d_4step: each window covers the block of one nucleotide pair)
W_conv1 = weight_variable([4, 4, 1, num_output1])
b_conv1 = bias_variable([num_output1])

h_conv1 = tf.nn.relu(conv2d_4step(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

# add a second convolution that looks at every 2x2 square, with overlaps
W_conv2 = weight_variable([2, 2, num_output1, num_output2])
b_conv2 = bias_variable([num_output2])

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

# add a fully connected layer
W_fc1 = weight_variable([final_size * final_size * num_output2, fully_connected_nodes])
b_fc1 = bias_variable([fully_connected_nodes])

h_pool2_flat = tf.reshape(h_pool2, [-1, final_size * final_size * num_output2])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

# add dropout to reduce overfitting
# keep_prob is fed as 0.5 for training steps and 1.0 for evaluation by train_model
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

W_fc2 = weight_variable([fully_connected_nodes, out_nodes])
b_fc2 = bias_variable([out_nodes])

# unnormalised class scores (logits); softmax is applied inside the loss
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

In [24]:
# use a cross entropy loss function and optimize with Adam
# labels/logits are passed by keyword: the positional order is not stable across
# TensorFlow versions (newer releases reject positional arguments for this op)
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

# fraction of samples whose highest-scoring class matches the one-hot label
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

train_model(sess, train_step, accuracy, 20000, 50, 1000, match_train, match_test)


step 0, training accuracy 0.44 test accuracy 0.43
step 1000, training accuracy 0.82 test accuracy 0.84
step 2000, training accuracy 0.96 test accuracy 0.92
step 3000, training accuracy 0.96 test accuracy 0.98
step 4000, training accuracy 0.98 test accuracy 0.99
step 5000, training accuracy 0.94 test accuracy 1
step 6000, training accuracy 1 test accuracy 1
step 7000, training accuracy 0.94 test accuracy 1
step 8000, training accuracy 0.98 test accuracy 1
step 9000, training accuracy 1 test accuracy 1
step 10000, training accuracy 1 test accuracy 1
step 11000, training accuracy 1 test accuracy 1
step 12000, training accuracy 1 test accuracy 1
step 13000, training accuracy 1 test accuracy 1
step 14000, training accuracy 0.98 test accuracy 1
step 15000, training accuracy 1 test accuracy 1
step 16000, training accuracy 0.94 test accuracy 1
step 17000, training accuracy 0.98 test accuracy 1
step 18000, training accuracy 1 test accuracy 1
step 19000, training accuracy 1 test accuracy 1
test 

### Learn to distinguish seed-matched sequences from random sequences

In [25]:
# build 1000 labelled (mirna, utr) pairs of length 20: roughly half contain a
# seed match (label [1, 0]), the rest are random pairs (label [0, 1])
features = np.zeros((1000, 6400))
labels = np.zeros((1000, 2))

for i in range(1000):
    has_seed_match = np.random.random() < 0.5
    if has_seed_match:
        seq1, seq2 = generate_seed_match_pair(20, 20)
    else:
        seq1, seq2 = generate_random_pair(20, 20)

    labels[i] = (1, 0) if has_seed_match else (0, 1)
    features[i] = make_square(seq1, seq2)

# first 900 samples for training, last 100 held out for testing
seed_match_train = Dataset(features[:900], labels[:900])
seed_match_test = Dataset(features[900:], labels[900:])

In [31]:
# Build the CNN for 20-nt pairs. This rebinds the module-level names x, y_, keep_prob
# and y_conv, which train_model and the loss cell pick up.
init_size = 80   # 20 nucleotides x 4 one-hot entries per side of the make_square matrix
final_size = 5   # spatial size after the layers below: 80 -> 20 -> 10 -> 10 -> 5
num_output1 = 8
num_output2 = 16
fully_connected_nodes = 512
out_nodes = 2

# create placeholders for data
x = tf.placeholder(tf.float32, shape=[None, init_size*init_size])
x_image = tf.reshape(x, [-1,init_size,init_size,1])
y_ = tf.placeholder(tf.float32, shape=[None, out_nodes])

# add convolution that traverses every 4x4 square, without overlaps
# (stride 4 via conv2d_4step: each window covers the block of one nucleotide pair)
W_conv1 = weight_variable([4, 4, 1, num_output1])
b_conv1 = bias_variable([num_output1])

h_conv1 = tf.nn.relu(conv2d_4step(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

# add a second convolution that looks at every 2x2 square, with overlaps
W_conv2 = weight_variable([2, 2, num_output1, num_output2])
b_conv2 = bias_variable([num_output2])

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

# add a fully connected layer
W_fc1 = weight_variable([final_size * final_size * num_output2, fully_connected_nodes])
b_fc1 = bias_variable([fully_connected_nodes])

h_pool2_flat = tf.reshape(h_pool2, [-1, final_size * final_size * num_output2])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

# add dropout to reduce overfitting
# keep_prob is fed as 0.5 for training steps and 1.0 for evaluation by train_model
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

W_fc2 = weight_variable([fully_connected_nodes, out_nodes])
b_fc2 = bias_variable([out_nodes])

# unnormalised class scores (logits); softmax is applied inside the loss
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

In [69]:
# use a cross entropy loss function and optimize with Adam
# labels/logits are passed by keyword: the positional order is not stable across
# TensorFlow versions (newer releases reject positional arguments for this op)
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

# fraction of samples whose highest-scoring class matches the one-hot label
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

train_model(sess, train_step, accuracy, 10000, 50, 1000, seed_match_train, seed_match_test)

### Learn to regress on seed pairing stability

In [34]:
def calculate_sps(seq):
    """
    Score a sequence with a nearest-neighbour (dinucleotide) table.

    The score starts at an initiation constant, adds the table value of every
    overlapping dinucleotide, and adds a penalty for each end of the sequence
    (first and last character) that is an A or a U.

    Parameters:
    ==========
    seq: string, consist of A, U, C, G (must be non-empty)

    Returns:
    =======
    float: seed pairing stability
    """
    initiation = 4.09
    terminal_au_penalty = 0.45
    dinucleotide_energy = {
        'AA': -0.93, 'AU': -1.10, 'AC': -2.24, 'AG': -2.08,
        'UA': -1.33, 'UU': -0.93, 'UC': -2.35, 'UG': -2.11,
        'CA': -2.11, 'CU': -2.08, 'CC': -3.26, 'CG': -2.36,
        'GA': -2.35, 'GU': -2.24, 'GC': -3.42, 'GG': -3.26,
    }

    # accumulate in sequence order, one overlapping dinucleotide at a time
    total = initiation
    for left, right in zip(seq, seq[1:]):
        total += dinucleotide_energy[left + right]

    # count how many of the two end characters are A or U
    ends = seq[0] + seq[-1]
    au_ends = sum(ends.count(nt) for nt in 'AU')
    total += terminal_au_penalty * au_ends

    return total


In [35]:
# build a regression set of 1000 length-20 pairs: seed-matched pairs are labelled
# with the stability score of seq1[1:7] plus noise, random pairs with noise only
features = np.zeros((1000, 6400))
labels = np.zeros((1000, 1))
seq1s = []
seq2s = []

for i in range(1000):
    has_seed_match = np.random.random() < 0.5
    if has_seed_match:
        seq1, seq2 = generate_seed_match_pair(20, 20)
    else:
        seq1, seq2 = generate_random_pair(20, 20)
    seq1s.append(seq1)
    seq2s.append(seq2)

    # uniform noise in [-0.5, 0.5), drawn after the pair in both cases
    noise = np.random.random() - 0.5
    if has_seed_match:
        labels[i, :] = calculate_sps(seq1[1:7]) + noise
    else:
        labels[i, :] = noise

    features[i, :] = make_square(seq1, seq2)

# first 900 samples for training, last 100 held out for testing
seed_match_sps_train = Dataset(features[:900], labels[:900])
seed_match_sps_test = Dataset(features[900:], labels[900:])

In [60]:
# Build the regression CNN for 20-nt pairs. This rebinds the module-level names x, y_,
# keep_prob and y_conv, which train_model and the loss cell pick up.
init_size = 80   # 20 nucleotides x 4 one-hot entries per side of the make_square matrix
final_size = 5   # spatial size after the layers below: 80 -> 20 -> 10 -> 10 -> 5
num_output1 = 16
num_output2 = 32
fully_connected_nodes = 512
out_nodes = 1    # single real-valued output (regression)

# create placeholders for data
x = tf.placeholder(tf.float32, shape=[None, init_size*init_size])
x_image = tf.reshape(x, [-1,init_size,init_size,1])
y_ = tf.placeholder(tf.float32, shape=[None, out_nodes])

# add convolution that traverses every 4x4 square, without overlaps
# (stride 4 via conv2d_4step: each window covers the block of one nucleotide pair)
W_conv1 = weight_variable([4, 4, 1, num_output1])
b_conv1 = bias_variable([num_output1])

h_conv1 = tf.nn.relu(conv2d_4step(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

# add a second convolution that looks at every 2x2 square, with overlaps
W_conv2 = weight_variable([2, 2, num_output1, num_output2])
b_conv2 = bias_variable([num_output2])

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

# add a fully connected layer
W_fc1 = weight_variable([final_size * final_size * num_output2, fully_connected_nodes])
b_fc1 = bias_variable([fully_connected_nodes])

h_pool2_flat = tf.reshape(h_pool2, [-1, final_size * final_size * num_output2])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

# add dropout to reduce overfitting
# keep_prob is fed as 0.5 for training steps and 1.0 for evaluation by train_model
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

W_fc2 = weight_variable([fully_connected_nodes, out_nodes])
b_fc2 = bias_variable([out_nodes])

# linear output layer: the prediction compared against y_ in the loss cell
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

In [68]:
# use an L2 loss and Adam optimizer
# arithmetic operators are used instead of tf.sub / tf.div, which are not available
# in every TensorFlow release; on float tensors they build the same computation
SS_err = tf.reduce_sum(tf.square(y_conv - y_))
SS_tot = tf.reduce_sum(tf.square(y_ - tf.reduce_mean(y_)))

# coefficient of determination of the fed batch: 1 - SS_err / SS_tot
R_2 = 1.0 - SS_err / SS_tot

train_step = tf.train.AdamOptimizer(1e-4).minimize(SS_err)

# train_model prints this metric under the label "accuracy"; here the value is R^2
train_model(sess, train_step, R_2, 20000, 50, 1000, seed_match_sps_train, seed_match_sps_test)


# Regression on real data

In [62]:
import pandas as pd

# input tables, given relative to the notebook's working directory
# the two LOGFC files are read below with pandas' default separator; GENE_FILE,
# SEED_FILE and SEQ_FILE are read as tab-separated despite the .csv extension on two of them
LOGFC_FILE1 = '../data/Supplementary1.csv'
LOGFC_FILE2 = '../data/Supplementary2.csv'
GENE_FILE = '../data/Gene_info.txt'
SEED_FILE = '../data/seed_dict.csv'
SEQ_FILE = '../data/seq_dict.csv'

def rev_comp(seq):
    """Return the reverse complement of an RNA string (A<->U, C<->G, order reversed).

    Characters other than A, U, C, G are kept as they are.
    """
    pairing = str.maketrans({'A': 'U', 'U': 'A', 'C': 'G', 'G': 'C'})
    reversed_seq = seq[::-1]

    return reversed_seq.translate(pairing)
    
rev_comp('AUUACCGGC')

'GCCGGUAAU'

In [63]:
# Gene table: collapse to one row per gene symbol, keeping every column as a tuple
# of the per-transcript values. `axis=1` is spelled out because a positional axis
# argument to drop() is not accepted by newer pandas releases.
GENE_INFO = pd.read_csv(GENE_FILE,sep='\t').drop(['Gene description','Species ID'], axis=1)
GENE_INFO = GENE_INFO.groupby('Gene symbol').agg(lambda x:tuple(x))

# share of the largest '3P-seq tags + 5' value in the gene's total (NaNs ignored in the sum)
GENE_INFO.loc[:,'Isoform ratio'] = [float(max(x))/np.nansum(x) for x in GENE_INFO['3P-seq tags + 5']]

# keep the transcript flagged 1 in 'Representative transcript?'
# (raises ValueError for a gene with no flagged transcript)
GENE_INFO.loc[:,'Transcript ID'] = [x[y.index(1)] for (x,y) in zip(GENE_INFO['Transcript ID'],GENE_INFO['Representative transcript?'])]
GENE_INFO = GENE_INFO[['Transcript ID','Isoform ratio']]

# map each 'col' identifier to its seed sequence
SEED_INFO = pd.read_csv(SEED_FILE,sep='\t')
SEED_DICT = dict(zip(SEED_INFO['col'], SEED_INFO['seed']))

# materialised as a list so it can be iterated repeatedly and is not a live dict view
SEEDS = list(SEED_DICT.values())

# miRNA sequences, indexed by seed
SEQ_INFO = pd.read_csv(SEQ_FILE, sep='\t')
SEQ_INFO['seed'] = [SEED_DICT[x] for x in SEQ_INFO['col']]
SEQ_INFO = SEQ_INFO.set_index('seed')
SEQ_INFO.head()

Unnamed: 0_level_0,col,mirna_seq
seed,Unnamed: 1_level_1,Unnamed: 2_level_1
UCGUAGG,1595297366,UUCGUAGGUCAAAAUACAC
UCAUCUC,1595297383,UUCAUCUCCAAUUCGUAGG
UGCUCUU,1595297389,AUGCUCUUUCCUCCUGUGC
UUUGGAA,1595297394,UUUUGGAACAGUCUUUCCG
UUGGAAC,1595297399,UUUGGAACAGUCUUUCCGA


In [64]:
# load the UTR sequences, indexed by Ensembl ID
UTRS = pd.read_csv(
    '../../05_TargetPrediction/targetscan_files/UTR_Sequences_Ensembl_Human.txt',
    sep='\t',
    usecols=['Ensembl ID', 'UTR sequence'],
)
UTRS = UTRS.set_index('Ensembl ID')

# strip '-' characters, upper-case, and convert the DNA alphabet to RNA (T -> U)
UTRS['UTR sequence'] = [
    utr_seq.replace('-', '').upper().replace('T', 'U')
    for utr_seq in UTRS['UTR sequence']
]
# UTRS['UTR length'] = [len(x) for x in UTRS['UTR sequence']]
UTRS.head()

Unnamed: 0_level_0,UTR sequence
Ensembl ID,Unnamed: 1_level_1
CDR1as,GUUUCCGAUGGCACCUGUGUCAAGGUCUUCCAACAACUCCGGGUCU...
ENST00000000233.5,CCAGCCAGGGGCAGGCCCCUGAUGCCCGGAAGCUCCUGCGUGCAUC...
ENST00000000412.3,AUUGCACUUUAUAUGUCCAGCCUCUUCCUCAGUCCCCCAAACCAAA...
ENST00000001008.4,CCCCUCUCCACCAGCCCUACUCCUGCGGCUGCCUGCCCCCCAGUCU...
ENST00000001146.2,CCCAAGACCCACCCGCCUCAGCCCAGCCCAGGCAGCGGGGUGGUGG...


In [65]:
# First log fold-change table: keep rows used in training, attach the representative
# transcript of each gene, and index by transcript.
logFCs1 = pd.read_csv(LOGFC_FILE1)
# .copy() so the column assignment below writes to an independent frame, not a slice
logFCs1 = logFCs1[logFCs1['Used in training'] == 'yes'].copy()
# reindex() yields NaN for gene symbols absent from GENE_INFO (dropped just below);
# .loc with missing labels raises KeyError on newer pandas releases
logFCs1['Transcript ID'] = list(GENE_INFO['Transcript ID'].reindex(logFCs1['Gene symbol']))
logFCs1 = logFCs1.drop(['Used in training','RefSeq ID','Gene symbol'], axis=1).dropna(subset=['Transcript ID'])
logFCs1 = logFCs1.drop_duplicates(subset=['Transcript ID']).set_index('Transcript ID')
# rename experiment columns to their seed where a mapping exists
logFCs1.columns = [SEED_DICT[x] if x in SEED_DICT else x for x in logFCs1.columns]

# Second table: same treatment, but 'Gene symbol' is kept for the isoform filter below.
logFCs2 = pd.read_csv(LOGFC_FILE2)
logFCs2['Transcript ID'] = list(GENE_INFO['Transcript ID'].reindex(logFCs2['Gene symbol']))
logFCs2 = logFCs2.drop(['RefSeq ID'], axis=1).dropna(subset=['Transcript ID'])
logFCs2 = logFCs2.drop_duplicates(subset=['Transcript ID']).set_index('Transcript ID')
logFCs2.columns = [SEED_DICT[x] if x in SEED_DICT else x for x in logFCs2.columns]

# Outer-join the two tables on transcript; rows without a gene symbol (present only in
# the first table) are dropped.
logFCs = pd.concat([logFCs1,logFCs2],axis=1,join='outer').dropna(subset=['Gene symbol'])
logFCs['Isoform ratio'] = list(GENE_INFO['Isoform ratio'].reindex(logFCs['Gene symbol']))
print(len(logFCs))
# keep genes whose isoform ratio exceeds 0.9
logFCs = logFCs[logFCs['Isoform ratio'] > 0.9]
print(len(logFCs))
logFCs.head()

7638
5854


Unnamed: 0,UCGUAGG,UCAUCUC,UGCUCUU,UUUGGAA,UUGGAAC,CAAACAC,AAUACAC,UUUCCUC,AGCUUCC,AGUCAGA,...,AAGGCAC,Gene symbol,AGCAGCA,AAAGUGC,AACACUG,AAUACUG,UGACCUA,GAGGUAG,GCAGCAU,Isoform ratio
ENST00000000412.3,,,,,,,,,,,...,0.0,M6PR,-0.098,-0.514,-0.056,0.031,-0.003,0.039,0.078,1.0
ENST00000001008.4,-0.016,-0.557,0.11,0.241,-0.231,0.054,-0.031,0.16,-0.021,0.099,...,0.056,FKBP4,0.016,0.133,0.007,-0.114,-0.166,0.002,0.038,1.0
ENST00000001146.2,,,,,,,,,,,...,,CYP26B1,-0.08,-0.053,-0.211,0.15,0.021,0.275,-0.336,1.0
ENST00000002165.6,-0.058,0.092,0.036,0.038,-0.191,0.067,0.057,0.046,-0.055,-0.075,...,,FUCA2,0.063,0.024,-0.018,0.178,0.015,0.031,-0.176,0.999025
ENST00000002596.5,,,,,,,,,,,...,,HS3ST1,0.178,-0.277,0.291,-0.42,-0.197,0.282,0.144,1.0


In [67]:
def _utr_window(utr, loc, upstream=15, downstream=9):
    """Return utr[loc-upstream:loc+downstream] reversed, or None if that window is not fully inside utr."""
    if loc - upstream < 0 or loc + downstream >= len(utr):
        return None
    return utr[loc - upstream:loc + downstream][::-1]


# (seed, target site, miRNA sequence) for every seed; the target site is the reverse
# complement of the seed without its last nucleotide
zipped = list(zip(SEEDS, [rev_comp(seed[:-1]) for seed in SEEDS], [SEQ_INFO.loc[seed]['mirna_seq'] for seed in SEEDS]))

# only miRNAs with at least 20 nt can supply the 20-nt side of the pairing matrix;
# filtered once here instead of on every transcript
usable_mirnas = [entry for entry in zipped if len(entry[2]) >= 20]

features, labels, matched, sequences = [], [], [], []

for transcript_id, row in logFCs.iterrows():
    utr = UTRS.loc[transcript_id]['UTR sequence']
    for seed, rev, mirna in usable_mirnas:
        val = row[seed]
        if np.isnan(val):
            continue

        n_sites = utr.count(rev)
        if n_sites == 1:
            # exactly one site: centre the window on it
            loc, is_matched = utr.find(rev), 1
        elif n_sites == 0 and utr and np.random.random() < 0.3:
            # no site: keep about 30% of these as negatives, at a random position
            # (`utr` is checked first because randint(0, 0) would raise on an empty UTR)
            loc, is_matched = np.random.randint(0,len(utr)), 0
        else:
            # multiple sites, or an unsampled negative
            continue

        window = _utr_window(utr, loc)
        if window is None:
            # window would run off either end of the UTR
            continue

        seq1, seq2 = mirna[:20], window
        sequences.append((seq1, seq2))
        features.append(make_square(seq1, seq2))
        labels.append(val)
        matched.append(is_matched)

features = np.array(features)
labels = np.array(labels).reshape(len(labels), 1)

# first tenth of the samples is held out for testing, the rest is used for training
test_size = len(features) // 10
logfc_train = Dataset(features[test_size:], labels[test_size:])
logfc_test = Dataset(features[:test_size], labels[:test_size])


In [None]:
# Build the regression CNN for the real data. This rebinds the module-level names x, y_,
# keep_prob and y_conv, which train_model and the loss cell pick up.
# NOTE: init_size is not referenced in this cell; the input here is rectangular.
init_size = 80
num_output1 = 8
num_output2 = 16
fully_connected_nodes = 512
out_nodes = 1    # single real-valued output (regression)

# create placeholders for data
# each sample is a flattened (20*4) x (24*4) matrix, i.e. 20*24*16 values
x = tf.placeholder(tf.float32, shape=[None, 20*24*16])
x_image = tf.reshape(x, [-1,20*4,24*4,1])
y_ = tf.placeholder(tf.float32, shape=[None, out_nodes])

# add convolution that traverses every 4x4 square, without overlaps
# (stride 4 via conv2d_4step: each window covers the block of one nucleotide pair)
W_conv1 = weight_variable([4, 4, 1, num_output1])
b_conv1 = bias_variable([num_output1])

h_conv1 = tf.nn.relu(conv2d_4step(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

# add a second convolution that looks at every 2x2 square, with overlaps
W_conv2 = weight_variable([2, 2, num_output1, num_output2])
b_conv2 = bias_variable([num_output2])

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

# add a fully connected layer
# spatial size after the layers above: 80x96 -> 20x24 -> 10x12 -> 10x12 -> 5x6
W_fc1 = weight_variable([5 * 6 * num_output2, fully_connected_nodes])
b_fc1 = bias_variable([fully_connected_nodes])

h_pool2_flat = tf.reshape(h_pool2, [-1, 5 * 6 * num_output2])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

# add dropout to reduce overfitting
# keep_prob is fed as 0.5 for training steps and 1.0 for evaluation by train_model
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

W_fc2 = weight_variable([fully_connected_nodes, out_nodes])
b_fc2 = bias_variable([out_nodes])

# linear output layer: the prediction compared against y_ in the loss cell
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

In [None]:
# use an L2 loss and Adam optimizer
# arithmetic operators are used instead of tf.sub / tf.div, which are not available
# in every TensorFlow release; on float tensors they build the same computation
SS_err = tf.reduce_sum(tf.square(y_conv - y_))
SS_tot = tf.reduce_sum(tf.square(y_ - tf.reduce_mean(y_)))

# coefficient of determination of the fed batch: 1 - SS_err / SS_tot
R_2 = 1.0 - SS_err / SS_tot

train_step = tf.train.AdamOptimizer(1e-4).minimize(SS_err)

# train_model takes (..., num_epoch, batch_size, report_int, train, test); the reporting
# interval was missing from this call, which made it fail with a TypeError.
# 1000 matches the interval used for the other 20000-step runs in this notebook.
train_model(sess, train_step, R_2, 20000, 50, 1000, logfc_train, logfc_test)