In [11]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder, LabelBinarizer
import tensorflow as tf
import pandas as pd
import numpy as np
import time
import datetime
from tensorflow.python.client import timeline

import load_data

### One hidden layer

In [77]:
def weight_variable(shape):
    initial=tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)


def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)


def fc_layer(input, size_in, size_out, name="fc"):
    with tf.name_scope(name):
        w = weight_variable([size_in, size_out])
        b = bias_variable([size_out])
        act = tf.matmul(input, w) + b
        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)
        tf.summary.histogram("activations", act)
        return act


def prep_batch(X, y, num_epoch=0, batch_size=0, data_type="train"):
    dataset = tf.data.Dataset.from_tensor_slices({"X": X, "y": y})
    if data_type == "train":
        dataset=dataset.repeat(num_epoch)
        dataset=dataset.batch(batch_size)
        dataset = dataset.prefetch(2)
    elif data_type == "test":
        dataset=dataset.repeat()
        dataset=dataset.batch(X.shape[0])
    else:
        raise("data type \"{0}\" not supported".format(data_type))
    iterator = dataset.make_initializable_iterator()
    next_batch = iterator.get_next()
    x, y_true = next_batch["X"], next_batch["y"]
    return x, y_true, iterator


def feed_forward(x, N_IN, N_HIDDEN, N_OUT):
    hidden = tf.nn.relu(fc_layer(x, N_IN, N_HIDDEN), name="hidden")
    y_pred = fc_layer(hidden, N_HIDDEN, N_OUT, name="softmax")
    return y_pred


def back_prop(y_true, y_pred, learning_rate):
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        labels=y_true, logits=y_pred), name="cross_entropy_loss")
    tf.summary.scalar("cross_entropy_loss", cross_entropy)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy)
    return train_step


def accuracy(y_pred, y_true, name=""):
    correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.argmax(y_true, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    summary = tf.summary.scalar(name, accuracy)
    return accuracy, summary


def train_model(data, label="tissue", batch_size=1000, 
                num_epoch=1000, learning_rate=1e-3, extra=""):

    # set up parameters
    tf.reset_default_graph()
    LOGDIR = "/tmp/tcga_{0}/".format(str(datetime.datetime.today().date()))
    N_IN = data.train.X.shape[1]
    N_OUT = data.train.y[label].shape[1]
    N_HIDDEN = int(np.mean(N_IN + N_OUT)/2)

    # set up train step and training accuracy
    x, y_true, train_iterator = prep_batch(data.train.X, 
                                     data.train.y[label],
                                     num_epoch=1000, batch_size=1000)
    y_pred = feed_forward(x, N_IN, N_HIDDEN, N_OUT)
    train_step = back_prop(y_true, y_pred, learning_rate)
    train_accuracy, train_summ = accuracy(y_true, y_pred, name="train_accuracy")

    # set up test accuracy
    x_test, y_test_true, test_iterator = prep_batch(
        data.test.X, data.test.y[label], data_type="test")
    y_test_pred = feed_forward(x_test, N_IN, N_HIDDEN, N_OUT)
    test_accuracy, test_summ = accuracy(y_test_true, y_test_pred, name="test_accuracy")

    # prepare session and summary writer
    sess = tf.Session()
    summ = tf.summary.merge_all()
    writer = tf.summary.FileWriter(LOGDIR + "pca{0}_{1}".format(extra, label))
    writer.add_graph(sess.graph)
    
    # initializing
    t0 = time.time()
    sess.run(tf.global_variables_initializer())
    sess.run([train_iterator.initializer, test_iterator.initializer])
    
    
    epoch = 0
    train_size = data.train.X.shape[0]

    
    # training
    for i in range(100000000):
        try:
            sess.run(train_step)
            
            if (i*batch_size) % 50000 == 0:
                epoch += 1
                [train_accu, train_s, test_accu, test_s, s] = sess.run(
                    [train_accuracy, train_summ, test_accuracy, test_summ, summ])
                writer.add_summary(train_s, epoch)
                writer.add_summary(test_s, epoch)
                writer.add_summary(s, epoch)
                if epoch % 100 == 0:
                    print("epoch", epoch, 
                          "training accuracy", train_accu, 
                          "test_accuracy", test_accu,
                          )
        except tf.errors.OutOfRangeError:
            break
    print("training time:", time.time() - t0)

In [78]:
for pca_var in [0.6, 0.7, 0.8, 0.9]:
    tcga = load_data.read_data_sets("./data/mRNA_PCA_{0}_variance_MinMaxScaled.csv".format(pca_var))
    for label_name in ["tissue", "gender", "tumor"]:
        print(pca_var, label_name)
        tcga.train.reset_epoch()
        train_model(tcga, label=label_name, extra=pca_var)
        break

0.6 tissue
epoch 100 training accuracy 0.906 test_accuracy 0.00461255
training time: 12.918889999389648
0.7 tissue
epoch 100 training accuracy 0.959 test_accuracy 0.00184502
training time: 13.913526773452759
0.8 tissue
epoch 100 training accuracy 0.986 test_accuracy 0.0341328
training time: 17.034378051757812
0.9 tissue


KeyboardInterrupt: 

In [None]:
tcga = load_data.read_data_sets("./data/mRNA_lognorm_MinMaxScaled.csv")
for label_name in ["tissue", "gender", "tumor"]:
    print(label_name)
    tcga.train.reset_epoch()
    train_model(tcga, label=label_name, extra="_all")