In [None]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import pandas as pd
import numpy as np
import datetime
import time

import load_data

In [None]:
def fc_layer(A_prev, size_in, size_out, name="fully-connected"):
    """Create an affine (fully connected) layer inside its own name scope.

    Args:
        A_prev: input tensor; it is right-multiplied by the weight matrix, so
            its last dimension presumably equals ``size_in``.
        size_in: number of rows of the weight matrix.
        size_out: number of columns of the weight matrix and length of the
            bias vector.
        name: name scope under which the layer's ops and summaries are created.

    Returns:
        A tuple ``(pre_activation, weights, biases)``. No non-linearity is
        applied here; callers add their own.
    """
    with tf.name_scope(name):
        # Weights start from a truncated normal (stddev 0.1), biases at 0.1.
        weights = tf.Variable(
            tf.truncated_normal([size_in, size_out], stddev=0.1))
        biases = tf.Variable(tf.constant(0.1, shape=[size_out]))
        pre_activation = tf.matmul(A_prev, weights) + biases

        # One histogram summary per tensor of the layer.
        for tag, tensor in (("weights", weights),
                            ("biases", biases),
                            ("activations", pre_activation)):
            tf.summary.histogram(tag, tensor)
    return pre_activation, weights, biases
    

def prep_batch_MT(X, y, labels, num_epoch=0, batch_size=0, data_type="train"):
    """Build a multi-task ``tf.data`` input pipeline and return its tensors.

    Args:
        X: feature array, sliced along its first axis. ``X.shape[0]`` is used
            as the batch size when ``data_type == "test"``.
        y: mapping from label name to that label's target array.
        labels: iterable of label names to include; each must be a key of ``y``.
        num_epoch: passed straight to ``Dataset.repeat`` for training data.
            Only used when ``data_type == "train"``.
        batch_size: passed straight to ``Dataset.batch`` for training data.
            Only used when ``data_type == "train"``.
        data_type: ``"train"`` (finite repeats, mini-batches) or ``"test"``
            (endless repeats, one batch holding the whole set).

    Returns:
        A tuple ``(x, y_true, iterator)`` where ``x`` is the feature batch
        tensor, ``y_true`` maps each label name to its target batch tensor and
        ``iterator`` is an initializable iterator. The caller has to run
        ``iterator.initializer`` before fetching batches.

    Raises:
        ValueError: if ``data_type`` is neither ``"train"`` nor ``"test"``.
    """
    # Fail fast, before any ops are added to the graph. The original code did
    # `raise("...")`, which raises a TypeError in Python 3 because a str is
    # not an exception.
    if data_type not in ("train", "test"):
        raise ValueError("data type \"{0}\" not supported".format(data_type))

    # Targets are stored under "y_<label>" so they cannot clash with "X".
    data_dict = {"y_" + label: y[label] for label in labels}
    data_dict["X"] = X
    dataset = tf.data.Dataset.from_tensor_slices(data_dict)

    if data_type == "train":
        dataset = dataset.repeat(num_epoch)
        dataset = dataset.batch(batch_size)
    else:
        # Test data: repeat forever and serve the whole set as one batch.
        dataset = dataset.repeat()
        dataset = dataset.batch(X.shape[0])

    dataset = dataset.prefetch(2)
    iterator = dataset.make_initializable_iterator()
    next_batch = iterator.get_next()

    y_true = {label: next_batch["y_" + label] for label in labels}
    x = next_batch["X"]
    return x, y_true, iterator


def build_model(x, N_IN, N_HIDDEN, N_OUT):
    """Build a one-hidden-layer network with one output head per label.

    Args:
        x: input tensor fed to the hidden layer.
        N_IN: number of input features.
        N_HIDDEN: number of hidden units.
        N_OUT: mapping from label name to the size of that label's output.

    Returns:
        A tuple ``(y_pred, parameters)``. ``y_pred`` maps each label name to
        the un-normalised output of its head (no softmax is applied here).
        ``parameters`` holds the hidden layer under ``"a1"``, ``"w1"`` and
        ``"b1"`` and each head's weights and biases under ``"w_<label>"`` and
        ``"b_<label>"``.
    """
    pre_hidden, hidden_w, hidden_b = fc_layer(x, N_IN, N_HIDDEN, name="fc")
    parameters = {"a1": pre_hidden, "w1": hidden_w, "b1": hidden_b}
    hidden = tf.nn.relu(pre_hidden, name="hidden")

    # Every label gets its own affine head on top of the shared hidden layer.
    y_pred = {}
    for label, n_units in N_OUT.items():
        logits, head_w, head_b = fc_layer(
            hidden, N_HIDDEN, n_units, name="softmax_" + label)
        y_pred[label] = logits
        parameters["w_" + label] = head_w
        parameters["b_" + label] = head_b
    return y_pred, parameters


def feed_forward(x, labels, parameters):
    """Run a forward pass through already-created model parameters.

    No new variables are created: the hidden layer and every output head
    reuse the tensors stored in ``parameters``.

    Args:
        x: input tensor.
        labels: iterable of label names whose heads should be evaluated.
        parameters: mapping holding ``"w1"``/``"b1"`` for the hidden layer and
            ``"w_<label>"``/``"b_<label>"`` for each requested label.

    Returns:
        A dict mapping each label name to the un-normalised output of its head.
    """
    def affine(inputs, weight_key, bias_key):
        # inputs @ W + b using the stored parameters.
        weights = parameters[weight_key]
        biases = parameters[bias_key]
        return tf.matmul(inputs, weights) + biases

    hidden = tf.nn.relu(affine(x, "w1", "b1"))
    return {
        label: affine(hidden, "w_" + label, "b_" + label)
        for label in labels
    }
    
    
def back_prop(y_true, y_pred, learning_rate):
    """Create the training op for the summed multi-task cross-entropy loss.

    Args:
        y_true: mapping from label name to its target tensor.
        y_pred: mapping from label name to its logits tensor; must contain
            every key of ``y_true``.
        learning_rate: learning rate handed to the Adam optimiser.

    Returns:
        The op produced by ``AdamOptimizer.minimize`` for the batch-mean loss.

    Raises:
        ValueError: if ``y_true`` is empty, since there is nothing to optimise.
    """
    if not y_true:
        raise ValueError("y_true must contain at least one label")

    # softmax_cross_entropy_with_logits yields one loss value per example;
    # the tasks are combined by adding their per-example losses.
    per_example_loss = 0
    for label in y_true.keys():
        per_example_loss += tf.nn.softmax_cross_entropy_with_logits(
            labels=y_true[label], logits=y_pred[label], name="loss_" + label)

    # Reduce to a scalar: tf.summary.scalar rejects non-scalar values, and a
    # batch mean keeps the loss magnitude independent of the batch size.
    mean_loss = tf.reduce_mean(per_example_loss)
    tf.summary.scalar("cross_entropy_loss", mean_loss)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(mean_loss)
    return train_step


def accuracy(y_pred, y_true, name=""):
    """Build per-label accuracy tensors and their scalar summaries.

    For each label the predicted class (argmax over axis 1 of the prediction)
    is compared with the true class (argmax over axis 1 of the target), and
    the fraction of matches is taken.

    Args:
        y_pred: mapping from label name to its prediction tensor.
        y_true: mapping from label name to its target tensor; must contain
            every key of ``y_pred``.
        name: prefix put in front of the label name to form the summary tag.

    Returns:
        A tuple ``(accuracy, summary)`` of dicts keyed by label name, holding
        the accuracy tensor and its ``tf.summary.scalar`` op respectively.
    """
    per_label_accuracy = {}
    per_label_summary = {}
    for label, predicted in y_pred.items():
        predicted_class = tf.argmax(predicted, 1)
        true_class = tf.argmax(y_true[label], 1)
        # Cast the boolean matches to float so their mean is the hit rate.
        hits = tf.cast(tf.equal(predicted_class, true_class), tf.float32)
        score = tf.reduce_mean(hits)
        per_label_accuracy[label] = score
        per_label_summary[label] = tf.summary.scalar(name + label, score)
    return per_label_accuracy, per_label_summary


def train_model(data, labels=["tissue", "gender", "tumor"],
                batch_size=1000,
                num_epoch=1000,
                learning_rate=1e-3, extra=""):
    """Train the multi-task classifier and log accuracies for TensorBoard.

    The default graph is reset, a fresh model is built, and training runs
    until the training iterator is exhausted. After each epoch boundary the
    train and test accuracies are written as summaries; every 100 epochs they
    are also printed.

    Args:
        data: object exposing ``train.X``, ``train.y``, ``test.X`` and
            ``test.y``. ``y[label].shape[1]`` is read as the output size of
            each label (presumably one-hot targets -- confirm against
            ``load_data``).
        labels: label names to train on. The default list is never mutated.
        batch_size: training mini-batch size.
        num_epoch: number of passes over the training data.
        learning_rate: learning rate for the Adam optimiser.
        extra: suffix of the run directory, ``<LOGDIR>pca_<extra>``.

    Returns:
        None. Results are reported through prints and summary files.
    """
    # set up parameters
    tf.reset_default_graph()
    LOGDIR = "/tmp/tcga_{0}/".format(str(datetime.datetime.today().date()))
    N_IN = data.train.X.shape[1]
    N_OUT = {label_name: data.train.y[label_name].shape[1]
             for label_name in labels}
    # np.mean is applied to a single number here, so this evaluates to
    # (N_IN + total number of outputs) / 4, truncated to an int.
    N_HIDDEN = int(np.mean(N_IN + sum(N_OUT.values()))/4)
    # Upper bound on training steps; normally the loop ends earlier, when the
    # training iterator raises OutOfRangeError.
    MAX_STEPS = 100000000

    # set up train step and training accuracy
    x, y_true, train_iterator = prep_batch_MT(
        data.train.X, data.train.y, labels,
        num_epoch=num_epoch, batch_size=batch_size)

    y_pred, parameters = build_model(x, N_IN, N_HIDDEN, N_OUT)
    train_step = back_prop(y_true, y_pred, learning_rate)
    # accuracy() takes (y_pred, y_true); the original call had them swapped.
    train_accuracy, train_summ = accuracy(
        y_pred, y_true, name="train_accuracy_")

    # set up test accuracy
    x_test, y_test_true, test_iterator = prep_batch_MT(
        data.test.X, data.test.y, labels, data_type="test")
    y_test_pred = feed_forward(x_test, labels, parameters)
    test_accuracy, test_summ = accuracy(
        y_test_pred, y_test_true, name="test_accuracy_")

    # The session and the writer are released even if training fails; this
    # function is called repeatedly and would otherwise leak one of each per
    # call and could leave buffered events unwritten.
    with tf.Session() as sess:
        writer = tf.summary.FileWriter(LOGDIR + "pca_{0}".format(extra))
        try:
            writer.add_graph(sess.graph)

            # initializing
            t0 = time.time()
            sess.run(tf.global_variables_initializer())
            sess.run([train_iterator.initializer, test_iterator.initializer])

            epoch = 0
            train_size = data.train.X.shape[0]

            # training
            for i in range(MAX_STEPS):
                try:
                    sess.run(train_step)
                    if (i*batch_size) / train_size > epoch:
                        epoch += 1
                        # NOTE(review): this run presumably pulls a fresh
                        # batch from the training iterator, so the reported
                        # training accuracy is for a batch that is not
                        # trained on -- confirm this is intended.
                        [train_accu, train_s, test_accu, test_s] = sess.run(
                            [train_accuracy, train_summ,
                             test_accuracy, test_summ])
                        for label_name in labels:
                            writer.add_summary(train_s[label_name], epoch)
                            writer.add_summary(test_s[label_name], epoch)
                        if epoch % 100 == 0:
                            print("epoch", epoch)
                            print("training accuracy:\n", train_accu,
                                  "\ntest_accuracy\n", test_accu)
                except tf.errors.OutOfRangeError:
                    # The training dataset has been consumed num_epoch times.
                    break
            print("training time:", time.time() - t0)
        finally:
            writer.close()

In [None]:
# Train one model per PCA-reduced dataset. The variance value selects the
# input CSV and is also passed as `extra`, which train_model uses as the
# suffix of its run directory ("pca_<extra>").
# Note: `tcga` and `PCA_variance` remain bound to the last iteration's values
# after the loop finishes.
for PCA_variance in [0.6, 0.7, 0.8, 0.9]:
    print(PCA_variance)
    tcga = load_data.read_data_sets("./data/mRNA_PCA_{0}_variance_MinMaxScaled.csv".format(PCA_variance))
    train_model(tcga, extra=PCA_variance)

In [None]:
# Load the dataset from the non-PCA CSV (judging by the file name, the
# log-normalised, min-max-scaled expression data). This rebinds `tcga`,
# replacing whatever an earlier cell stored under that name.
tcga = load_data.read_data_sets("./data/mRNA_lognorm_MinMaxScaled.csv")

In [None]:
# Train on the dataset currently bound to `tcga` with the default
# hyper-parameters; extra="all" makes the run directory end in "pca_all".
train_model(tcga, extra="all")