In [1]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder, LabelBinarizer
import tensorflow as tf
import pandas as pd
import numpy as np
import time
import datetime
from tensorflow.python.client import timeline

import load_data

### One hidden layer

In [2]:
def weight_variable(shape):
    """Create a ``tf.Variable`` holding a weight tensor of the given shape.

    Args:
        shape: Shape of the weight tensor, e.g. ``[size_in, size_out]``.

    Returns:
        A ``tf.Variable`` whose initial value is drawn from a truncated
        normal distribution with standard deviation 0.1.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))


def bias_variable(shape):
    """Create a ``tf.Variable`` holding a bias tensor of the given shape.

    Args:
        shape: Shape of the bias tensor, e.g. ``[size_out]``.

    Returns:
        A ``tf.Variable`` whose every element starts at the constant 0.1.
    """
    return tf.Variable(tf.constant(0.1, shape=shape))


def fc_layer(input, size_in, size_out, name="fc"):
    """Build a fully connected (affine) layer inside its own name scope.

    Args:
        input: Tensor to be multiplied by the layer's weight matrix.
        size_in: Number of input units (rows of the weight matrix).
        size_out: Number of output units (columns of the weight matrix).
        name: Name scope under which the layer's ops are grouped.

    Returns:
        ``matmul(input, weights) + biases``. No non-linearity is applied
        here; the caller is responsible for any activation function.
    """
    with tf.name_scope(name):
        weights = weight_variable([size_in, size_out])
        biases = bias_variable([size_out])
        affine_out = tf.matmul(input, weights) + biases

        # Histogram summaries so the layer can be inspected in TensorBoard.
        tf.summary.histogram("weights", weights)
        tf.summary.histogram("biases", biases)
        tf.summary.histogram("activations", affine_out)
        return affine_out
    

def prep_dataset(X, y, num_epoch=0, batch_size=0):
    """Wrap features and labels in a repeated, batched ``tf.data`` pipeline.

    Args:
        X: Feature array; sliced along its first axis.
        y: Label array; sliced along its first axis in step with ``X``.
        num_epoch: Passed to ``Dataset.repeat``.
        batch_size: Passed to ``Dataset.batch``.

    Returns:
        A dataset whose elements are dicts with keys ``"X"`` and ``"y"``,
        with up to two batches prefetched.
    """
    # NOTE(review): the defaults of 0 are forwarded unchanged to repeat() and
    # batch(); the only caller in this file always overrides both -- confirm
    # the defaults are intentional.
    return (
        tf.data.Dataset.from_tensor_slices({"X": X, "y": y})
        .repeat(num_epoch)
        .batch(batch_size)
        .prefetch(2)
    )


def train_model(data, label="tissue", batch_size=1000,
                num_epoch=1000, learning_rate=1e-3, extra=""):
    """Train a one-hidden-layer softmax classifier on ``data.train``.

    Resets the default graph, builds input pipeline -> ReLU hidden layer ->
    logits, minimises softmax cross-entropy with Adam until the input
    pipeline is exhausted, writes the graph to a TensorBoard run directory
    and prints the wall-clock training time.

    Args:
        data: Object exposing ``data.train.X`` and ``data.train.y[label]``;
            axis 1 of each gives the input and output layer widths.
            Labels are presumably one-hot encoded -- TODO confirm against
            ``load_data``.
        label: Key into ``data.train.y`` selecting the target to learn.
        batch_size: Number of examples per training step.
        num_epoch: Number of passes over the training set.
        learning_rate: Learning rate given to ``tf.train.AdamOptimizer``.
        extra: Tag inserted into the TensorBoard run directory name.

    Returns:
        None.
    """
    tf.reset_default_graph()
    log_dir = "/tmp/tcga_{0}/".format(str(datetime.datetime.today().date()))
    n_in = data.train.X.shape[1]
    n_out = data.train.y[label].shape[1]
    # Hidden width is the mean of the input and output widths, rounded down.
    n_hidden = (n_in + n_out) // 2

    # Input pipeline: honour the caller's batch_size / num_epoch instead of
    # hard-coded values.
    train_dataset = prep_dataset(data.train.X,
                                 data.train.y[label],
                                 num_epoch=num_epoch, batch_size=batch_size)
    iterator = train_dataset.make_initializable_iterator()
    next_batch = iterator.get_next()
    x, y_true = next_batch["X"], next_batch["y"]

    # Model: ReLU hidden layer followed by a linear layer producing logits.
    hidden = tf.nn.relu(fc_layer(x, n_in, n_hidden), name="hidden")
    y_pred = fc_layer(hidden, n_hidden, n_out, name="softmax")

    xent = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            labels=y_true, logits=y_pred), name="xent")
    tf.summary.scalar("xent", xent)

    train_step = tf.train.AdamOptimizer(learning_rate).minimize(xent)

    correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.argmax(y_true, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    # These summary ops are registered in the graph, but nothing evaluates or
    # writes them yet; only the graph itself is logged below.
    # TODO: re-enable periodic train/test accuracy evaluation and write the
    # resulting summaries with writer.add_summary().
    tf.summary.scalar("train_accuracy", accuracy)
    tf.summary.scalar("test_accuracy", accuracy)

    # Context manager / finally ensure the session and the event file are
    # released even if training raises; this function is called repeatedly.
    with tf.Session() as sess:
        writer = tf.summary.FileWriter(
            log_dir + "pca{0}_{1}".format(extra, label))
        try:
            writer.add_graph(sess.graph)

            t0 = time.time()
            sess.run(tf.global_variables_initializer())
            sess.run(iterator.initializer)

            # The dataset repeats num_epoch times and then raises
            # OutOfRangeError, which is the normal end-of-training signal.
            while True:
                try:
                    sess.run(train_step)
                except tf.errors.OutOfRangeError:
                    break
            print("training time:", time.time() - t0)
        finally:
            writer.close()

In [None]:
# Train once per PCA-reduced dataset; the number in the file name is
# presumably the fraction of variance retained by the PCA -- TODO confirm.
for pca_var in (0.6, 0.7, 0.8, 0.9):
    pca_csv = "./data/mRNA_PCA_{0}_variance_MinMaxScaled.csv".format(pca_var)
    tcga = load_data.read_data_sets(pca_csv)
    for label_name in ("tissue", "gender", "tumor"):
        print(pca_var, label_name)
        tcga.train.reset_epoch()
        train_model(tcga, label=label_name, extra=pca_var)
        # NOTE(review): this break exits after the first label ("tissue"),
        # so "gender" and "tumor" are never trained in this cell -- confirm
        # whether it is a leftover from debugging.
        break

In [None]:
# Train one model per label on the log-normalised, min-max scaled dataset
# (presumably the full feature set without PCA -- TODO confirm).
tcga = load_data.read_data_sets("./data/mRNA_lognorm_MinMaxScaled.csv")
for label_name in ("tissue", "gender", "tumor"):
    print(label_name)
    tcga.train.reset_epoch()
    train_model(tcga, label=label_name, extra="_all")