In [16]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder, LabelBinarizer
import tensorflow as tf
import pandas as pd
import numpy as np
import time
import datetime

import load_data

### One hidden layer

In [17]:
def weight_variable(shape):
    initial=tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)


def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)


def fc_layer(input, size_in, size_out, name="fc"):
    with tf.name_scope(name):
        w = weight_variable([size_in, size_out])
        b = bias_variable([size_out])
        act = tf.matmul(input, w) + b
        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)
        tf.summary.histogram("activations", act)
        return act
    

def train_model(data, label="tissue", learning_rate=1e-3, extra=""):
    tf.reset_default_graph()
    LOGDIR = "/tmp/tcga_{0}/".format(str(datetime.datetime.today().date()))
    N_IN = data.train.X.shape[1]
    N_OUT = data.train.y[label].shape[1]
    N_HIDDEN = int(np.mean(N_IN + N_OUT))
    
    x = tf.placeholder(tf.float32, [None, N_IN], name="x")
    y_true = tf.placeholder(tf.float32, [None, N_OUT], name="labels")
    hidden = tf.nn.relu(fc_layer(x, N_IN, N_HIDDEN), name="hidden")
    y_pred = fc_layer(hidden, N_HIDDEN, N_OUT, name="softmax")
    
    xent = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            labels=y_true, logits=y_pred), name="xent")
    tf.summary.scalar("xent", xent)

    train_step = tf.train.AdamOptimizer(learning_rate).minimize(xent)

    correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.argmax(y_true, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    train_accu_summ = tf.summary.scalar("train_accuracy", accuracy)
    test_accu_summ = tf.summary.scalar("test_accuracy", accuracy)

    sess = tf.Session()
    summ = tf.summary.merge_all()
    writer = tf.summary.FileWriter(LOGDIR + "pca{0}_{1}".format(extra, label))
    writer.add_graph(sess.graph)
    
    # training
    t0 = time.time()
    sess.run(tf.global_variables_initializer())

    for i in range(20001):
        batch_x, batch_y = data.train.next_batch(100)
        sess.run(train_step, feed_dict={x: batch_x, y_true: batch_y[label]})
        if i % 5 == 0:
            [train_accuracy, train_s, s] = sess.run([accuracy, train_accu_summ, summ],
                feed_dict={x: tcga.train.X, y_true: tcga.train.y[label]})
            [test_accuracy, test_s] = sess.run([accuracy, test_accu_summ],
                feed_dict={x: tcga.test.X, y_true: tcga.test.y[label]})
            writer.add_summary(train_s, i)
            writer.add_summary(test_s, i)
            writer.add_summary(s, i)
            if i % 1000 == 0:
                print("step", i, "training accuracy", train_accuracy, 
                      "test_accuracy", test_accuracy)                
    print("training time:", time.time() - t0)

In [18]:
for PCA_variance in [0.6, 0.7, 0.8, 0.9]:
    tcga = load_data.read_data_sets("./data/mRNA_PCA_{0}_variance_MinMaxScaled.csv".format(PCA_variance))
    for label_name in ["tissue", "gender", "tumor"]:
        print(PCA_variance, label_name)
        tcga.train.reset_epoch()
        train_model(tcga, label=label_name, extra=PCA_variance)

0.6 tissue
step 0 training accuracy 0.00207445 test_accuracy 0.000922509
step 1000 training accuracy 0.813761 test_accuracy 0.79428
step 2000 training accuracy 0.871615 test_accuracy 0.857011
step 3000 training accuracy 0.8904 test_accuracy 0.888376
step 4000 training accuracy 0.898813 test_accuracy 0.885609
step 5000 training accuracy 0.904691 test_accuracy 0.894834
step 6000 training accuracy 0.906304 test_accuracy 0.892989
step 7000 training accuracy 0.910222 test_accuracy 0.898524
step 8000 training accuracy 0.909761 test_accuracy 0.902214
step 9000 training accuracy 0.91391 test_accuracy 0.896679
step 10000 training accuracy 0.915754 test_accuracy 0.897601
step 11000 training accuracy 0.917598 test_accuracy 0.899446
step 12000 training accuracy 0.917714 test_accuracy 0.898524
step 13000 training accuracy 0.916331 test_accuracy 0.898524
step 14000 training accuracy 0.918866 test_accuracy 0.902214
step 15000 training accuracy 0.919903 test_accuracy 0.899446
step 16000 training accur

step 6000 training accuracy 0.972456 test_accuracy 0.942804
step 7000 training accuracy 0.976605 test_accuracy 0.948339
step 8000 training accuracy 0.980177 test_accuracy 0.946494
step 9000 training accuracy 0.980984 test_accuracy 0.944649
step 10000 training accuracy 0.98617 test_accuracy 0.947417
step 11000 training accuracy 0.983289 test_accuracy 0.952952
step 12000 training accuracy 0.985133 test_accuracy 0.950185
step 13000 training accuracy 0.984096 test_accuracy 0.947417
step 14000 training accuracy 0.987553 test_accuracy 0.942804
step 15000 training accuracy 0.987784 test_accuracy 0.946494
step 16000 training accuracy 0.990319 test_accuracy 0.947417
step 17000 training accuracy 0.990665 test_accuracy 0.946494
step 18000 training accuracy 0.985594 test_accuracy 0.95203
step 19000 training accuracy 0.99297 test_accuracy 0.948339
step 20000 training accuracy 0.992394 test_accuracy 0.940959
training time: 419.54449462890625
0.8 gender
step 0 training accuracy 0.49176 test_accuracy 