# TensorFlow on RNA sequences

In [1]:
import numpy as np
import tensorflow as tf

In [2]:
# Open a TensorFlow session. An InteractiveSession installs itself as the
# default session, which is what the bare `.eval()` / `.run()` calls in
# later cells rely on.
sess = tf.InteractiveSession()

In [3]:
# create dataset object that holds features and labels and cycles through the data in batches
class Dataset(object):
    """Paired feature/label arrays served as consecutive, wrapping mini-batches.

    Parameters
    ----------
    features : array-like
        One entry per example along the first axis.
    labels : array-like
        Same length as ``features``.

    Attributes
    ----------
    features, labels : numpy arrays holding copies of the inputs.
    index : position of the first example of the next batch.
    size : number of examples.
    """

    def __init__(self, features, labels):
        assert (len(features) == len(labels))
        self.features = np.array(features)
        self.labels = np.array(labels)
        self.index = 0
        self.size = len(labels)

    def next_batch(self, batch_size):
        """Return the next ``batch_size`` examples as ``(features, labels)``.

        Examples are taken in order starting at ``self.index``; when the end
        of the data is reached the batch continues from the beginning, as many
        times as needed, so a batch always has exactly ``batch_size`` rows
        even when ``batch_size`` is larger than the dataset.  The returned
        arrays are copies of the stored data.

        Raises
        ------
        ValueError
            If the dataset is empty.
        """
        if self.size == 0:
            raise ValueError("cannot draw a batch from an empty dataset")

        # Cyclic positions old_index, old_index + 1, ... taken modulo the size.
        positions = (self.index + np.arange(batch_size)) % self.size
        self.index = (self.index + batch_size) % self.size
        return (self.features[positions], self.labels[positions])

    def reset_index(self):
        """Restart batching from the first example."""
        self.index = 0
    

### Import MNIST data for testing

In [7]:
# Load the MNIST digit-recognition sample data from the 'MNIST_data'
# directory, with labels one-hot encoded (one_hot=True).
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [8]:
# Wrap the MNIST train and test splits in Dataset objects so that they can
# be served in batches through Dataset.next_batch.
mnist_train = Dataset(mnist.train.images, mnist.train.labels)
mnist_test = Dataset(mnist.test.images, mnist.test.labels)

## Convolutional Neural Network

In [4]:
def weight_variable(shape):
    """Create a TensorFlow variable of the given shape for use as weights.

    The initial values are drawn from a truncated normal distribution with
    a standard deviation of 0.1.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Create a TensorFlow variable of the given shape for use as biases.

    Every element starts at the constant 0.1.
    """
    return tf.Variable(tf.constant(0.1, shape=shape))

def conv2d(x, W):
    """Convolve input ``x`` with filter ``W`` using stride 1 and 'SAME' padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

def max_pool_2x2(x):
    """Max-pool ``x`` over 2x2 windows moved in steps of 2, with 'SAME' padding."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')

def train_model(sess, train_step, eval_var, num_epoch, batch_size, report_int, keep_prob_train, train, test):
    """Train a graph on mini-batches and print progress.

    NOTE: this function reads the notebook-level placeholders ``x``, ``y_``
    and ``keep_prob``; they must exist in the global namespace and belong to
    the graph that ``train_step`` and ``eval_var`` were built from.

    Parameters
    ----------
    sess : session used to initialize variables and run the graph.
    train_step : operation that performs one optimization step.
    eval_var : scalar tensor reported as "accuracy".
    num_epoch : number of training steps (one mini-batch per step).
    batch_size : number of examples drawn from ``train`` per step.
    report_int : report train/test values every ``report_int`` steps.
    keep_prob_train : value fed to ``keep_prob`` during optimization steps;
        evaluation always feeds 1.0.
    train, test : objects exposing ``next_batch`` (train only), ``features``
        and ``labels``.
    """

    def evaluate(features, labels):
        # Evaluation always feeds keep_prob = 1.0.
        return sess.run(eval_var, feed_dict={x: features,
                                             y_: labels,
                                             keep_prob: 1.0})

    # initialize variables
    sess.run(tf.global_variables_initializer())

    # training steps
    for i in range(num_epoch):
        # honour the requested batch size (it used to be hard-coded to 50)
        batch = train.next_batch(batch_size)
        if i%report_int == 0:
            train_accuracy = evaluate(batch[0], batch[1])

            print("step %d, training accuracy %g"%(i, train_accuracy),
                  "test accuracy %g"%evaluate(test.features, test.labels))
        sess.run(train_step, feed_dict={x: batch[0],
                                        y_: batch[1],
                                        keep_prob: keep_prob_train})

    print("test accuracy %g"%evaluate(test.features, test.labels))
    

### First, implement MNIST classification with a convolutional NN, following the TensorFlow tutorial

In [11]:
# Network dimensions. final_size is the image side length after the two
# 2x2 max-pools below (28 -> 14 -> 7); the stride-1 'SAME' convolutions do
# not change the side length.
init_size = 28
final_size = 7
num_output1 = 32
num_output2 = 64
fully_connected_nodes = 1024
out_nodes = 10

# create placeholders for data
# (x holds flattened images, x_image the same data reshaped to
# init_size x init_size with one channel, y_ the one-hot labels)
x = tf.placeholder(tf.float32, shape=[None, init_size*init_size])
x_image = tf.reshape(x, [-1,init_size,init_size,1])
y_ = tf.placeholder(tf.float32, shape=[None, out_nodes])

# add a convolution with 4x4 filters; conv2d moves the filter in steps of 1,
# so neighbouring windows overlap
W_conv1 = weight_variable([4, 4, 1, num_output1])
b_conv1 = bias_variable([num_output1])

h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

# add a second convolution that looks at every 2x2 square, with overlaps
W_conv2 = weight_variable([2, 2, num_output1, num_output2])
b_conv2 = bias_variable([num_output2])

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

# add a fully connected layer
W_fc1 = weight_variable([final_size * final_size * num_output2, fully_connected_nodes])
b_fc1 = bias_variable([fully_connected_nodes])

h_pool2_flat = tf.reshape(h_pool2, [-1, final_size * final_size * num_output2])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

# add dropout to reduce overfitting
# (keep_prob is fed at run time: keep_prob_train while training, 1.0 for evaluation)
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# linear read-out layer; y_conv holds the unnormalized class scores (logits)
W_fc2 = weight_variable([fully_connected_nodes, out_nodes])
b_fc2 = bias_variable([out_nodes])

y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2


In [17]:
# use a cross entropy loss function and optimize with Adam
# (logits/labels are passed by keyword because their positional order is not
# the same across TensorFlow releases)
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y_conv, labels=y_))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# arguments: session, train op, metric, training steps, batch size,
# report interval, dropout keep probability while training, train set, test set
# (the keep probability was missing from this call, which made it fail against
# the nine-parameter train_model defined above)
train_model(sess, train_step, accuracy, 1000, 50, 100, 0.5, mnist_train, mnist_test)


step 0, training accuracy 0.08 test accuracy 0.0731
step 100, training accuracy 0.66 test accuracy 0.7809
step 200, training accuracy 0.9 test accuracy 0.8655
step 300, training accuracy 0.92 test accuracy 0.9016
step 400, training accuracy 0.9 test accuracy 0.915
step 500, training accuracy 0.94 test accuracy 0.9169
step 600, training accuracy 0.94 test accuracy 0.9289
step 700, training accuracy 1 test accuracy 0.9345
step 800, training accuracy 0.92 test accuracy 0.9444
step 900, training accuracy 0.98 test accuracy 0.9393
test accuracy 0.9464


## Classify RNA sequences based on complementarity

In [5]:
def generate_random_sequence(length):
    """Return a random RNA string of ``length`` nucleotides.

    Each position is drawn independently (with replacement) from A, U, C, G
    using numpy's global random state.
    """
    alphabet = list('AUCG')
    draws = np.random.choice(alphabet, size=length, replace=True)
    return ''.join(draws)

def get_complementary(seq):
    """Return ``seq`` with every base swapped for its partner (A<->U, C<->G).

    The order of the sequence is kept (no reversal); characters other than
    A, U, C, G pass through unchanged.
    """
    partner = str.maketrans({'A': 'U', 'U': 'A', 'C': 'G', 'G': 'C'})
    return seq.translate(partner)

def generate_match_pair(length, random_seed=None):
    """Return a random sequence together with its position-wise complement.

    Parameters
    ----------
    length : number of nucleotides in each sequence.
    random_seed : if given, numpy's global random state is reseeded with it
        before the sequence is drawn.

    Returns
    -------
    (seq1, seq2) where seq2[i] is the base that pairs with seq1[i].
    """
    reseed = random_seed is not None
    if reseed:
        np.random.seed(random_seed)

    strand = generate_random_sequence(length)
    return strand, get_complementary(strand)

def generate_seed_match_pair(length1, length2, random_seed=None):
    """Return two sequences that are complementary over a six-base window.

    The first sequence is random with ``length1`` bases.  The second is built
    from one random base, the complement of ``seq1[1:7]`` (0-based indices
    1..6 of the first sequence, in the same order), and ``length2 - 7`` further
    random bases.

    Parameters
    ----------
    length1 : length of the first sequence.
    length2 : target length of the second sequence.
    random_seed : if given, numpy's global random state is reseeded with it
        before anything is drawn.
    """
    if random_seed is not None:
        np.random.seed(random_seed)

    # Random draws happen in this order: seq1, leading base, trailing bases.
    seq1 = generate_random_sequence(length1)
    pieces = [generate_random_sequence(1)]
    trailing = generate_random_sequence(length2-7)
    pieces.append(get_complementary(seq1[1:7]))
    pieces.append(trailing)

    return seq1, ''.join(pieces)

def generate_random_pair(length1, length2, random_seed=None):
    """Return two random sequences where the second is not the full complement of the first.

    The second sequence is redrawn until it differs from the position-wise
    complement of the first; shorter stretches of complementarity can still
    occur by chance.

    Parameters
    ----------
    length1, length2 : lengths of the first and second sequence.
    random_seed : if given, numpy's global random state is reseeded with it
        before anything is drawn.
    """
    if random_seed is not None:
        np.random.seed(random_seed)

    first = generate_random_sequence(length1)
    excluded = get_complementary(first)

    second = generate_random_sequence(length2)
    while second == excluded:
        second = generate_random_sequence(length2)

    return first, second

def one_hot_encode(seq, nt_order):
    """Convert an RNA sequence to a flat one-hot encoding.

    Parameters
    ----------
    seq : string (or sequence of single characters) to encode.
    nt_order : list or numpy array of nucleotides; position k of each
        per-nucleotide group is 1 when the nucleotide equals ``nt_order[k]``.

    Returns
    -------
    1-D integer numpy array of length ``len(seq) * len(nt_order)``.  A
    character that is not in ``nt_order`` is encoded as all zeros.
    """
    # Accept plain lists as well as arrays: comparing a list with a string
    # yields a single bool instead of an element-wise result.
    alphabet = np.asarray(nt_order)

    if len(seq) == 0:
        return np.zeros(0, dtype=int)

    symbols = np.array(list(seq))
    # Rows are positions in seq, columns are entries of nt_order.
    matches = symbols[:, None] == alphabet[None, :]
    return matches.astype(int).ravel()

def make_square(seq1, seq2):
    """Return the outer product of the one-hot encodings of two sequences.

    ``seq1`` is encoded in the order A, U, C, G (rows) and ``seq2`` in the
    complementary order U, A, G, C (columns).  The result has shape
    ``(4 * len(seq1), 4 * len(seq2))``; in the 4x4 block for positions
    (i, j), a base pair between ``seq1[i]`` and ``seq2[j]`` therefore puts its
    single 1 on the block's diagonal.
    """
    row_alphabet = np.array(['A','U','C','G'])
    col_alphabet = np.array(['U','A','G','C'])

    rows = one_hot_encode(seq1, row_alphabet)
    cols = one_hot_encode(seq2, col_alphabet)
    return np.outer(rows, cols)

In [6]:
print(generate_seed_match_pair(20,20))
print(generate_random_pair(20,20))
# generate_match_pair takes (length, random_seed): a second positional 20
# would be used as the seed and silently reseed numpy's global RNG
print(generate_match_pair(20))
print(make_square('AAA','AAA'))
print(make_square('UAG','AUC'))

('UUACAAAGACCAACGACUUC', 'AAUGUUUGUAUUUCCAUAAC')
('CCUAACAGGAGCUCGACUAU', 'AAUUGCUAAACCCUCCGCCC')
('GCGGACUAGCGCACAUCCGG', 'CGCCUGAUCGCGUGUAGGCC')
[[0 1 0 0 0 1 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 1 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 1 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]]
[[0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 1 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 1 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 1 0 0 0 0 0 0 1]]


In [93]:
# make 2D neural network object
class NeuralNet2D(object):
    """Sequentially assembled TensorFlow network over a flattened 2-D input.

    Layers are appended with the ``add_*`` methods; each new layer consumes
    the most recently added one (``self.layers[self.layer_index]``).  Once the
    layers are in place, call ``make_train_step`` and then ``train_model``.

    Parameters
    ----------
    sess : TensorFlow session used to initialize variables and run the graph.
    dim1, dim2 : height and width that each flat input row is reshaped to.
    label_size : number of columns in each label row.
    """

    def __init__(self, sess, dim1, dim2, label_size):
        self.sess = sess
        self.x = tf.placeholder(tf.float32, shape=[None, dim1*dim2])
        self.layers = [tf.reshape(self.x, [-1,dim1,dim2,1])]
        self.layer_index = 0
        self.y_ = tf.placeholder(tf.float32, shape=[None, label_size])

        # Created later: keep_prob by add_dropout, the other two by make_train_step.
        self.keep_prob = None
        self.train_step = None
        self.accuracy = None

    def add_convolution(self, dim1, dim2, stride1, stride2, output_channels, padding='SAME'):
        """Append a ReLU convolution with a ``dim1`` x ``dim2`` filter.

        ``stride1``/``stride2`` are the steps along the two spatial axes and
        ``output_channels`` the number of filters.  The number of input
        channels is read from the current layer.
        """
        input_channels = self.layers[self.layer_index].get_shape().as_list()[-1]

        self.layers.append(tf.nn.relu(tf.nn.conv2d(self.layers[self.layer_index],
                                                   weight_variable([dim1, dim2, input_channels, output_channels]),
                                                   strides=[1, stride1, stride2, 1],
                                                   padding=padding) + bias_variable([output_channels])))
        self.layer_index += 1

    def add_max_pool(self, dim1, dim2, stride1, stride2, padding='SAME'):
        """Append a max-pool over ``dim1`` x ``dim2`` windows with the given strides."""
        self.layers.append(tf.nn.max_pool(self.layers[self.layer_index],
                                          ksize=[1, dim1, dim2, 1],
                                          strides=[1, stride1, stride2, 1], padding=padding))

        self.layer_index += 1

    def add_fully_connected(self, num_nodes):
        """Append a ReLU fully connected layer with ``num_nodes`` outputs.

        The current layer is flattened first, so it may be the 4-D output of a
        convolution/pool or the 2-D output of another fully connected layer.
        """
        shape = self.layers[self.layer_index].get_shape().as_list()
        # Everything except the leading batch axis is folded into one axis.
        flat_size = int(np.prod(shape[1:]))
        self.layers.append(tf.nn.relu(tf.matmul(tf.reshape(self.layers[self.layer_index], [-1, flat_size]),
                                                weight_variable([flat_size, num_nodes])) + bias_variable([num_nodes])))
        self.layer_index += 1

    def add_dropout(self, num_nodes):
        """Append dropout followed by a linear layer with ``num_nodes`` outputs.

        No activation is applied, so this is normally the output layer.  The
        keep probability is a placeholder (``self.keep_prob``) fed at run
        time; it is created once and shared if this method is called again.
        """
        dim = self.layers[self.layer_index].get_shape().as_list()
        if self.keep_prob is None:
            self.keep_prob = tf.placeholder(tf.float32)

        self.layers.append(tf.matmul(tf.nn.dropout(self.layers[self.layer_index], self.keep_prob),
                                     weight_variable([dim[-1], num_nodes])) + bias_variable([num_nodes]))
        self.layer_index += 1

    def make_train_step(self, problem_type):
        """Define the loss, the Adam training step and the reported metric.

        Parameters
        ----------
        problem_type : 'classification' or 'regression'.
            'classification' minimizes softmax cross entropy and reports the
            fraction of rows whose arg-max matches the label's arg-max.
            'regression' minimizes the sum of squared errors and reports
            R^2 = 1 - SS_err / SS_tot.

        Raises
        ------
        ValueError
            For any other ``problem_type``.
        """
        current_layer = self.layers[self.layer_index]
        if problem_type == 'classification':
            # Keyword arguments: the positional order of logits/labels is not
            # the same across TensorFlow releases.
            cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=current_layer,
                                                                                   labels=self.y_))
            self.train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

            correct_prediction = tf.equal(tf.argmax(current_layer,1), tf.argmax(self.y_,1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        elif problem_type == 'regression':
            # Arithmetic operators instead of tf.sub / tf.div, which are not
            # available under those names in every TensorFlow release.
            SS_err = tf.reduce_sum(tf.square(current_layer - self.y_))
            SS_tot = tf.reduce_sum(tf.square(self.y_ - tf.reduce_mean(self.y_)))
            R_2 = 1.0 - SS_err / SS_tot

            self.train_step = tf.train.AdamOptimizer(1e-4).minimize(SS_err)
            self.accuracy = R_2

        else:
            raise ValueError('problem_type must be \'classification\' or \'regression\'')

    def _feed(self, features, labels, keep_prob):
        """Build a feed dict; keep_prob is included only if a dropout layer exists."""
        feed = {self.x: features, self.y_: labels}
        if self.keep_prob is not None:
            feed[self.keep_prob] = keep_prob
        return feed

    def train_model(self, train, test, num_epoch=20000, batch_size=50,
                    report_int=1000, keep_prob_train=0.5):
        """Train the network on mini-batches and print progress.

        Parameters
        ----------
        train, test : objects exposing ``next_batch`` (train only),
            ``features`` and ``labels``.
        num_epoch : number of training steps (one mini-batch per step).
        batch_size : number of examples per step.
        report_int : print the metric on the current batch and on the whole
            test set every ``report_int`` steps.
        keep_prob_train : dropout keep probability used for training steps;
            evaluation always uses 1.0.

        Note that all variables in the graph are (re-)initialized first.  The
        printed "accuracy" is whatever ``make_train_step`` stored in
        ``self.accuracy`` (R^2 for regression).

        Raises
        ------
        RuntimeError
            If ``make_train_step`` has not been called successfully.
        """
        if self.train_step is None or self.accuracy is None:
            raise RuntimeError('call make_train_step() before train_model()')

        # initialize variables
        self.sess.run(tf.global_variables_initializer())

        # training steps
        for i in range(num_epoch):
            batch = train.next_batch(batch_size)
            if i%report_int == 0:
                train_accuracy = self.sess.run(self.accuracy,
                                               feed_dict=self._feed(batch[0], batch[1], 1.0))

                print("step %d, training accuracy %g"%(i, train_accuracy),
                      "test accuracy %g"%self.sess.run(self.accuracy,
                                                       feed_dict=self._feed(test.features, test.labels, 1.0)))
            self.sess.run(self.train_step,
                          feed_dict=self._feed(batch[0], batch[1], keep_prob_train))

        print("test accuracy %g"%self.sess.run(self.accuracy,
                                               feed_dict=self._feed(test.features, test.labels, 1.0)))


### Learn to distinguish complementary sequences from random sequences

In [53]:
# generate complementary and random 4mers
features = np.zeros((1000, 256))
labels = np.zeros((1000, 2))

for i in range(1000):
    if np.random.random() < 0.5:
        seq1, seq2 = generate_match_pair(4)
        labels[i,:] = [1, 0]
    else:
        seq1, seq2 = generate_random_pair(4,4)
        labels[i,:] = [0, 1]

    features[i,:] = make_square(seq1, seq2).flatten()

match_train = Dataset(features[:900, :], labels[:900, :])
match_test = Dataset(features[900:, :], labels[900:, :])

In [95]:
# Network for the 16 x 16 squares built from 4-mer pairs.
dim1, dim2 = 16, 16
num_output1 = 4
fully_connected_nodes = 1024
out_nodes = 2

NN = NeuralNet2D(sess, dim1, dim2, out_nodes)
# 4x4 filter moved in steps of 4: each window covers the 4x4 block that
# belongs to one pair of positions (one from each sequence)
NN.add_convolution(4, 4, 4, 4, num_output1)
NN.add_fully_connected(fully_connected_nodes)
# dropout followed by the linear output layer (two class scores)
NN.add_dropout(out_nodes)
NN.make_train_step('classification')
NN.train_model(match_train, match_test, num_epoch=5000,
               batch_size=50, report_int=1000, keep_prob_train=0.5)

step 0, training accuracy 0.58 test accuracy 0.51
step 1000, training accuracy 1 test accuracy 0.98
step 2000, training accuracy 1 test accuracy 0.99
step 3000, training accuracy 1 test accuracy 0.99
step 4000, training accuracy 1 test accuracy 0.99
test accuracy 0.99


### Learn to distinguish seed-matched sequences from random sequences

In [96]:
# generate mirna,utr pairs of length 20 with a seed match
features = np.zeros((1000, 6400))
labels = np.zeros((1000,2))

for i in range(1000):
    if np.random.random() < 0.5:
        seq1, seq2 = generate_seed_match_pair(20,20)
        labels[i,:] = [1, 0]
    else:
        seq1, seq2 = generate_random_pair(20,20)
        labels[i,:] = [0, 1]
    
    features[i,:] = make_square(seq1, seq2).flatten()

seed_match_train = Dataset(features[:900, :], labels[:900, :])
seed_match_test = Dataset(features[900:, :], labels[900:, :])

In [97]:
# Network for the 80 x 80 squares built from 20-mer pairs; same layout as
# the 4-mer classifier, only the input size differs.
dim1, dim2 = 80, 80
num_output1 = 4
fully_connected_nodes = 1024
out_nodes = 2

NN = NeuralNet2D(sess, dim1, dim2, out_nodes)
# 4x4 filter moved in steps of 4: one window per pair of positions
NN.add_convolution(4, 4, 4, 4, num_output1)
NN.add_fully_connected(fully_connected_nodes)
# dropout followed by the linear output layer (two class scores)
NN.add_dropout(out_nodes)
NN.make_train_step('classification')
NN.train_model(seed_match_train, seed_match_test, num_epoch=5000,
               batch_size=50, report_int=1000, keep_prob_train=0.5)

step 0, training accuracy 0.5 test accuracy 0.52
step 1000, training accuracy 1 test accuracy 0.95
step 2000, training accuracy 1 test accuracy 0.95
step 3000, training accuracy 1 test accuracy 0.95
step 4000, training accuracy 1 test accuracy 0.96
test accuracy 0.95


### Learn to regress on seed pairing stability

In [98]:
def calculate_sps(seq):
    """
    Score a sequence as an initiation term, plus one term per adjacent
    dinucleotide, plus a penalty for every A or U at either end.

    Parameters:
    ==========
    seq: string, consist of A, U, C, G (at least one character)

    Returns:
    =======
    float: seed pairing stability

    Raises:
    ======
    KeyError: if a dinucleotide contains a character other than A, U, C, G
    IndexError: if seq is empty
    """
    initiation = 4.09
    terminal_au_penalty = 0.45

    # contribution of each dinucleotide, keyed by the two letters in order
    stacking = {
        'AA': -0.93, 'AU': -1.10, 'AC': -2.24, 'AG': -2.08,
        'UA': -1.33, 'UU': -0.93, 'UC': -2.35, 'UG': -2.11,
        'CA': -2.11, 'CU': -2.08, 'CC': -3.26, 'CG': -2.36,
        'GA': -2.35, 'GU': -2.24, 'GC': -3.42, 'GG': -3.26,
    }

    # start from the initiation term and add the dinucleotides left to right
    total = initiation
    for left, right in zip(seq, seq[1:]):
        total += stacking[left + right]

    # count A/U among the first and last character (a one-letter sequence is
    # counted at both ends)
    ends = seq[0] + seq[-1]
    n_terminal_au = sum(1 for nt in ends if nt in 'AU')
    total += terminal_au_penalty * n_terminal_au

    return total


In [106]:
# generate random sequences
features = np.zeros((1000, 6400))
labels = np.zeros((1000,1))
seq1s, seq2s = [], []

for i in range(1000):
    if np.random.random() < 0.5:
        seq1, seq2 = generate_seed_match_pair(20,20)
        seq1s.append(seq1)
        seq2s.append(seq2)
        labels[i,:] = calculate_sps(seq1[1:7]) + (np.random.random()-0.5)
    else:
        seq1, seq2 = generate_random_pair(20,20)
        seq1s.append(seq1)
        seq2s.append(seq2)
        labels[i,:] = np.random.random()-0.5
    
    features[i,:] = make_square(seq1, seq2).flatten()

seed_match_sps_train = Dataset(features[:900, :], labels[:900, :])
seed_match_sps_test = Dataset(features[900:, :], labels[900:, :])

In [110]:
# Regression network for the 80 x 80 squares: two convolutions, a small
# fully connected layer and a single linear output.
dim1, dim2 = 80, 80
num_output1 = 16
num_output2 = 16
fully_connected_nodes = 32
out_nodes = 1

NN = NeuralNet2D(sess, dim1, dim2, out_nodes)
# 4x4 filter moved in steps of 4: one window per pair of positions
NN.add_convolution(4, 4, 4, 4, num_output1)
# 2x2 filter moved in steps of 1 over the output of the first convolution
NN.add_convolution(2, 2, 1, 1, num_output2)
NN.add_fully_connected(fully_connected_nodes)
NN.add_dropout(out_nodes)
# for 'regression' the value printed as "accuracy" during training is R^2
NN.make_train_step('regression')
NN.train_model(seed_match_sps_train, seed_match_sps_test, num_epoch=10000,
               batch_size=100, report_int=1000, keep_prob_train=0.5)

step 0, training accuracy -1.04116 test accuracy -0.901628
step 1000, training accuracy 0.540802 test accuracy 0.367427
step 2000, training accuracy 0.942358 test accuracy 0.861274
step 3000, training accuracy 0.981681 test accuracy 0.913068
step 4000, training accuracy 0.988818 test accuracy 0.926939
step 5000, training accuracy 0.987021 test accuracy 0.925711
step 6000, training accuracy 0.985119 test accuracy 0.928617
step 7000, training accuracy 0.99206 test accuracy 0.936477
step 8000, training accuracy 0.996499 test accuracy 0.941244
step 9000, training accuracy 0.994621 test accuracy 0.942679
test accuracy 0.944907


# Regression on real data

In [111]:
import pandas as pd

# Input files, given relative to the notebook's working directory.
# The two log fold-change files are read as comma-separated; GENE_FILE,
# SEED_FILE and SEQ_FILE are read as tab-separated by the cells below
# (despite the .csv extension of the latter two).
LOGFC_FILE1 = '../data/Supplementary1.csv'
LOGFC_FILE2 = '../data/Supplementary2.csv'
GENE_FILE = '../data/Gene_info.txt'
SEED_FILE = '../data/seed_dict.csv'
SEQ_FILE = '../data/seq_dict.csv'

def rev_comp(seq):
    """Return the reverse complement of an RNA sequence.

    Bases are swapped A<->U and C<->G and the order is reversed; characters
    other than A, U, C, G are kept as they are.
    """
    partner = str.maketrans({'A': 'U', 'U': 'A', 'C': 'G', 'G': 'C'})
    reversed_seq = seq[::-1]
    return reversed_seq.translate(partner)
    
# quick check of rev_comp on a short sequence
rev_comp('AUUACCGGC')

'GCCGGUAAU'

In [112]:
# Gene annotation table, collapsed to one row per gene symbol: after the
# groupby every cell holds a tuple with the values of all rows of that gene.
# (axis is passed by keyword; a positional axis is rejected by pandas 2)
GENE_INFO = pd.read_csv(GENE_FILE,sep='\t').drop(['Gene description','Species ID'],axis=1)
GENE_INFO = GENE_INFO.groupby('Gene symbol').agg(lambda x:tuple(x))
# share of the largest tag count among all (non-NaN) tag counts of the gene
GENE_INFO.loc[:,'Isoform ratio'] = [float(max(x))/np.nansum(x) for x in GENE_INFO['3P-seq tags + 5']]
# keep the transcript flagged as representative (flag value 1); y.index(1)
# raises ValueError for a gene without such a flag
GENE_INFO.loc[:,'Transcript ID'] = [x[y.index(1)] for (x,y) in zip(GENE_INFO['Transcript ID'],GENE_INFO['Representative transcript?'])]
GENE_INFO = GENE_INFO[['Transcript ID','Isoform ratio']]
# same table, indexed by transcript ID instead of gene symbol
GENE_INFO_TRANSCRIPT = GENE_INFO.reset_index().set_index('Transcript ID')

# map each 'col' value of the seed file to its seed sequence
# (a later row with the same 'col' overrides an earlier one)
SEED_INFO = pd.read_csv(SEED_FILE,sep='\t')
SEED_DICT = dict(zip(SEED_INFO['col'], SEED_INFO['seed']))

SEEDS = sorted(list(SEED_DICT.values()))

# miRNA sequences indexed by seed; every 'col' value must be in SEED_DICT
SEQ_INFO = pd.read_csv(SEQ_FILE, sep='\t')
SEQ_INFO['seed'] = [SEED_DICT[x] for x in SEQ_INFO['col']]
SEQ_INFO = SEQ_INFO.set_index('seed')
# append seven A's and cut to 24 characters, so sequences of 17+ nt all end
# up exactly 24 long
SEQ_INFO['mirna_seq'] = [(x + 'AAAAAAA')[:24] for x in SEQ_INFO['mirna_seq']]
SEQ_INFO.head()

Unnamed: 0_level_0,col,mirna_seq
seed,Unnamed: 1_level_1,Unnamed: 2_level_1
UCGUAGG,1595297366,UUCGUAGGUCAAAAUACACAAAAA
UCAUCUC,1595297383,UUCAUCUCCAAUUCGUAGGAAAAA
UGCUCUU,1595297389,AUGCUCUUUCCUCCUGUGCAAAAA
UUUGGAA,1595297394,UUUUGGAACAGUCUUUCCGAAAAA
UUGGAAC,1595297399,UUUGGAACAGUCUUUCCGAAAAAA


In [113]:
# Load UTR sequences indexed by Ensembl ID (only the two columns needed).
UTRS = pd.read_csv('../../05_TargetPrediction/targetscan_files/UTR_Sequences_Ensembl_Human.txt',
                   sep='\t', usecols=['Ensembl ID','UTR sequence']).set_index('Ensembl ID')
# Normalize to an RNA alphabet: remove '-' characters (presumably alignment
# gaps -- TODO confirm), upper-case, and replace T with U.
UTRS['UTR sequence'] = [x.replace('-','').upper().replace('T','U') for x in UTRS['UTR sequence']]
# length of the cleaned sequence
UTRS['UTR length'] = [len(x) for x in UTRS['UTR sequence']]
UTRS.head()

Unnamed: 0_level_0,UTR sequence,UTR length
Ensembl ID,Unnamed: 1_level_1,Unnamed: 2_level_1
CDR1as,GUUUCCGAUGGCACCUGUGUCAAGGUCUUCCAACAACUCCGGGUCU...,1485
ENST00000000233.5,CCAGCCAGGGGCAGGCCCCUGAUGCCCGGAAGCUCCUGCGUGCAUC...,422
ENST00000000412.3,AUUGCACUUUAUAUGUCCAGCCUCUUCCUCAGUCCCCCAAACCAAA...,1457
ENST00000001008.4,CCCCUCUCCACCAGCCCUACUCCUGCGGCUGCCUGCCCCCCAGUCU...,2163
ENST00000001146.2,CCCAAGACCCACCCGCCUCAGCCCAGCCCAGGCAGCGGGGUGGUGG...,3001


In [114]:
# First log fold-change table: keep rows marked as used in training.
# .copy() so that the column assignment below acts on an independent frame
# rather than on a slice of the raw table.
logFCs1 = pd.read_csv(LOGFC_FILE1)
logFCs1 = logFCs1[logFCs1['Used in training'] == 'yes'].copy()
# Look up the transcript of each gene symbol. reindex yields NaN for symbols
# that are not in GENE_INFO (those rows are dropped on the next line), whereas
# .loc raises KeyError for missing labels in current pandas.
logFCs1['Transcript ID'] = list(GENE_INFO['Transcript ID'].reindex(logFCs1['Gene symbol']))
logFCs1 = logFCs1.drop(['Used in training','RefSeq ID','Gene symbol'],axis=1).dropna(subset=['Transcript ID'])
logFCs1 = logFCs1.drop_duplicates(subset=['Transcript ID']).set_index('Transcript ID')
# rename the columns that have an entry in SEED_DICT to their seed sequence
logFCs1.columns = [SEED_DICT[x] if x in SEED_DICT else x for x in logFCs1.columns]

# Second log fold-change table, processed the same way (no training filter).
logFCs2 = pd.read_csv(LOGFC_FILE2)
logFCs2['Transcript ID'] = list(GENE_INFO['Transcript ID'].reindex(logFCs2['Gene symbol']))
logFCs2 = logFCs2.drop(['RefSeq ID','Gene symbol'],axis=1).dropna(subset=['Transcript ID'])
logFCs2 = logFCs2.drop_duplicates(subset=['Transcript ID']).set_index('Transcript ID')
logFCs2.columns = [SEED_DICT[x] if x in SEED_DICT else x for x in logFCs2.columns]

# union of transcripts from both tables, then attach gene symbol and isoform
# ratio for the transcripts present in GENE_INFO_TRANSCRIPT
logFCs = pd.concat([logFCs1,logFCs2],axis=1,join='outer')
logFCs = pd.concat([logFCs, GENE_INFO_TRANSCRIPT], axis=1, join='inner')

# keep genes whose isoform ratio exceeds 0.9
logFCs = logFCs[logFCs['Isoform ratio'] > 0.9]
print(len(logFCs))
logFCs.head()

6426


Unnamed: 0,UCGUAGG,UCAUCUC,UGCUCUU,UUUGGAA,UUGGAAC,CAAACAC,AAUACAC,UUUCCUC,AGCUUCC,AGUCAGA,...,AAGGCAC,AGCAGCA,AAAGUGC,AACACUG,AAUACUG,UGACCUA,GAGGUAG,GCAGCAU,Gene symbol,Isoform ratio
ENST00000318602.7,0.128,-0.075,-0.113,0.066,-0.053,0.15,-0.007,-0.024,-0.035,0.051,...,0.239,,,,,,,,A2M,1.0
ENST00000401850.1,,,,,,,,,,,...,-0.175,-0.286,0.052,0.195,-0.133,0.093,0.036,0.159,A4GALT,1.0
ENST00000236709.3,,,,,,,,,,,...,,-0.461,0.08,0.011,-0.037,0.167,0.05,-0.13,A4GNT,1.0
ENST00000209873.4,0.017,-0.081,-0.043,-0.023,0.153,-0.06,-0.026,0.061,-0.042,0.028,...,-0.063,-0.144,-0.014,0.033,-0.059,0.046,-0.012,0.093,AAAS,1.0
ENST00000337664.4,,,,,,,,,,,...,,-0.136,-0.143,0.003,0.118,-0.317,0.38,0.023,AADAT,1.0
