In [4]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import pandas as pd
import numpy as np
import datetime
import time

import load_data

In [13]:
def fc_layer(A_prev, size_in, size_out, name="fully-connected"):
    """Create one affine (fully connected) layer inside its own name scope.

    Args:
        A_prev: input tensor multiplied on the left of the weight matrix.
        size_in: number of rows of the weight matrix.
        size_out: number of columns of the weight matrix and length of the bias.
        name: name scope under which the variables and summaries are created.

    Returns:
        Tuple ``(act, w, b)`` where ``act`` is ``A_prev @ w + b`` (no
        non-linearity is applied here), ``w`` the weight variable and ``b``
        the bias variable.
    """
    with tf.name_scope(name):
        # Weights start from a truncated normal, biases from a constant 0.1.
        weights = tf.Variable(tf.truncated_normal([size_in, size_out], stddev=0.1))
        biases = tf.Variable(tf.constant(0.1, shape=[size_out]))
        pre_activation = tf.matmul(A_prev, weights) + biases
        # One histogram summary per tensor, registered in the original order.
        tracked = (("weights", weights),
                   ("biases", biases),
                   ("activations", pre_activation))
        for tag, tensor in tracked:
            tf.summary.histogram(tag, tensor)
    return pre_activation, weights, biases


def build_model(x, N_IN, N_HIDDEN, N_OUT):
    """Assemble a shared hidden layer followed by one output layer per label.

    Args:
        x: input tensor fed to the first fully connected layer.
        N_IN: input width of the first layer.
        N_HIDDEN: width of the shared hidden layer.
        N_OUT: mapping from label name to the output width of that label's
            layer.

    Returns:
        Tuple ``(y_pred, parameters)``. ``y_pred`` maps each label to the
        un-normalised output of its layer. ``parameters`` holds the first
        layer's pre-activation, weights and bias under ``"a1"``, ``"w1"``,
        ``"b1"`` and each output layer's weights/bias under ``"w_<label>"`` /
        ``"b_<label>"``.
    """
    first_act, first_w, first_b = fc_layer(x, N_IN, N_HIDDEN, name="fc")
    parameters = {"a1": first_act, "w1": first_w, "b1": first_b}
    hidden = tf.nn.relu(first_act, name="hidden")

    y_pred = {}
    for label, n_out in N_OUT.items():
        # Each label gets its own output layer on top of the shared hidden layer.
        head_out, head_w, head_b = fc_layer(
            hidden, N_HIDDEN, n_out, name="softmax_"+label)
        y_pred[label] = head_out
        parameters["w_" + label] = head_w
        parameters["b_" + label] = head_b
    return y_pred, parameters


def feed_forward(x, labels, parameters):
    """Run a forward pass through already-created weights.

    Args:
        x: input tensor.
        labels: iterable of label names for which outputs are produced.
        parameters: mapping containing ``"w1"``/``"b1"`` for the hidden layer
            and ``"w_<label>"``/``"b_<label>"`` for every requested label.

    Returns:
        Dict mapping each label to ``relu(x @ w1 + b1) @ w_<label> + b_<label>``.
    """
    hidden_input = tf.matmul(x, parameters["w1"]) + parameters["b1"]
    hidden = tf.nn.relu(hidden_input)
    return {
        label: tf.matmul(hidden, parameters["w_"+label]) + parameters["b_"+label]
        for label in labels
    }
    
    
def back_prop(y_true, y_pred, learning_rate):
    """Build the training op that minimises the summed cross-entropy of all labels.

    The per-example softmax cross-entropy of every label is added together and
    then averaged into a single scalar. The scalar is what gets logged and
    minimised: ``tf.summary.scalar`` only accepts a scalar tensor, whereas
    ``softmax_cross_entropy_with_logits`` returns one value per example.

    Args:
        y_true: mapping from label name to the target tensor for that label.
        y_pred: mapping from label name to the logits tensor for that label;
            must contain every key of ``y_true``.
        learning_rate: learning rate passed to ``tf.train.AdamOptimizer``.

    Returns:
        The op returned by ``AdamOptimizer(learning_rate).minimize(...)``.

    Raises:
        ValueError: if ``y_true`` is empty, since there would be nothing to
            minimise.
    """
    if not y_true:
        raise ValueError("y_true must contain at least one label")

    per_example_loss = 0
    for label in y_true.keys():
        per_example_loss += tf.nn.softmax_cross_entropy_with_logits(
            labels=y_true[label], logits=y_pred[label], name="loss_"+label)

    # Collapse the per-example vector to a scalar before logging/minimising.
    mean_cross_entropy = tf.reduce_mean(per_example_loss)
    tf.summary.scalar("cross_entropy_loss", mean_cross_entropy)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(mean_cross_entropy)
    return train_step


def accuracy(y_pred, y_true, name=""):
    """Create per-label accuracy tensors and their scalar summaries.

    For every label in ``y_pred`` the arg-max along axis 1 of the prediction
    is compared with the arg-max along axis 1 of the target, and the fraction
    of matches is taken as the accuracy.

    Args:
        y_pred: mapping from label name to prediction tensor.
        y_true: mapping from label name to target tensor; must contain every
            key of ``y_pred``.
        name: prefix prepended to the label name to form the summary tag.

    Returns:
        Tuple ``(accuracy, summary)`` of dicts keyed by label: the accuracy
        tensor and the ``tf.summary.scalar`` created for it.
    """
    per_label_accuracy = {}
    per_label_summary = {}
    for label, predicted in y_pred.items():
        predicted_class = tf.argmax(predicted, 1)
        true_class = tf.argmax(y_true[label], 1)
        hits = tf.cast(tf.equal(predicted_class, true_class), tf.float32)
        per_label_accuracy[label] = tf.reduce_mean(hits)
        per_label_summary[label] = tf.summary.scalar(
            name+label, per_label_accuracy[label])
    return per_label_accuracy, per_label_summary


def train_model(data, labels=["tissue", "tumor", "gender"],
                batch_size=128, num_epoch=1000, learning_rate=1e-3, extra=""):
    """Build, train and evaluate the multi-label network on ``data``.

    The default graph is reset, the model is built on the mini-batch iterator,
    and after every epoch the accuracy on the full training and validation
    tensors is evaluated and written as TensorBoard summaries under
    ``/tmp/tcga_<today>/pca_<extra>``. Accuracies are printed every 100 epochs
    and the total training time is printed at the end.

    Args:
        data: dataset object. The code reads ``data.train.num_features``,
            ``data.train.label_classes`` (a mapping, label -> number of
            classes) and calls ``data.prep_train_batch(batch_size=...)``,
            which must return six items: mini-batch tensors and iterator,
            validation tensors and iterator, full-train tensors and iterator.
            Each tensor dict holds the inputs under ``"X"`` and one entry per
            label.
        labels: label names whose accuracies are evaluated and logged. The
            list is only read, never mutated.
            NOTE(review): presumably these must match the label keys provided
            by ``data`` -- confirm against ``load_data``.
        batch_size: mini-batch size forwarded to ``prep_train_batch``.
        num_epoch: number of passes over the mini-batch iterator.
        learning_rate: learning rate forwarded to ``back_prop``.
        extra: value formatted into the summary directory name.

    Returns:
        None.
    """
    # set up parameters
    tf.reset_default_graph()
    LOGDIR = "/tmp/tcga_{0}/".format(str(datetime.datetime.today().date()))
    N_IN = data.train.num_features
    N_OUT = data.train.label_classes
    # np.mean of a scalar is the scalar itself, so this evaluates to
    # (N_IN + total number of classes) / 4, truncated to an int.
    # NOTE(review): confirm whether an average of the two sizes was intended.
    N_HIDDEN = int(np.mean(N_IN + sum(N_OUT.values()))/4)

    # set up train step on the mini-batch tensors
    (train_batch, train_iter, val_all, val_iter,
        train_all, train_iter_all) = data.prep_train_batch(batch_size=batch_size)

    x = train_batch["X"]
    y_true = {key: value for key, value in train_batch.items() if key != "X"}
    y_pred, parameters = build_model(x, N_IN, N_HIDDEN, N_OUT)
    train_step = back_prop(y_true, y_pred, learning_rate)

    # set up train and validation accuracy; both reuse the trained weights
    x_train_all = train_all["X"]
    y_train_all_true = {key: value for key, value in train_all.items() if key != "X"}
    y_train_all_pred = feed_forward(x_train_all, labels, parameters)
    # Predictions first, targets second: matches accuracy()'s signature and
    # the validation call below.
    train_accuracy, train_summ = accuracy(y_train_all_pred,
                                          y_train_all_true, name="train_accuracy_")

    x_val = val_all["X"]
    y_val_true = {key: value for key, value in val_all.items() if key != "X"}
    y_val_pred = feed_forward(x_val, labels, parameters)
    val_accuracy, val_summ = accuracy(y_val_true, y_val_pred, name="val_accuracy_") \
        if False else accuracy(y_val_pred, y_val_true, name="val_accuracy_")

    # prepare summary writer and session; both are released even on failure
    writer = tf.summary.FileWriter(LOGDIR + "pca_{0}".format(extra))
    try:
        with tf.Session() as sess:
            writer.add_graph(sess.graph)

            # initializing
            t0 = time.time()
            sess.run(tf.global_variables_initializer())

            # training
            for epoch in range(num_epoch):
                sess.run([train_iter.initializer])
                try:
                    while True:
                        sess.run(train_step)
                except tf.errors.OutOfRangeError:
                    # The mini-batch iterator is exhausted: the epoch is over.
                    pass

                # Evaluate on the full training and validation tensors.
                sess.run([train_iter_all.initializer, val_iter.initializer])
                [train_accu, train_s, val_accu, val_s] = sess.run(
                    [train_accuracy, train_summ, val_accuracy, val_summ])
                for label_name in labels:
                    writer.add_summary(train_s[label_name], epoch)
                    writer.add_summary(val_s[label_name], epoch)
                if epoch % 100 == 0:
                    print("epoch", epoch)
                    print("training accuracy:\n", train_accu,
                          "\nvalidation accuracy\n", val_accu)
            print("training time:", time.time() - t0)
    finally:
        # Closing flushes pending events to disk.
        writer.close()

In [None]:
# Train one model per PCA-reduced dataset; the variance value selects the CSV
# file and tags the summary directory of the run.
for PCA_variance in (0.6, 0.7, 0.8, 0.9):
    print(PCA_variance)
    pca_csv_path = "./data/mRNA_PCA_{0}_variance_MinMaxScaled.csv".format(PCA_variance)
    tcga = load_data.read_data_sets(pca_csv_path)
    train_model(tcga, extra=PCA_variance)

In [None]:
# Load the dataset stored in mRNA_lognorm_MinMaxScaled.csv, replacing the
# `tcga` left over from the PCA loop.
# NOTE(review): judging by the file name this is the non-PCA feature set -- confirm.
tcga = load_data.read_data_sets("./data/mRNA_lognorm_MinMaxScaled.csv")

In [None]:
# Train on the currently loaded `tcga`; summaries go to the "pca_all" run directory.
train_model(tcga, extra="all")