In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import sys
import datetime
from pprint import pprint as pp

sys.path.insert(0, '/home/molly/Desktop/DeepTCGA/')
import load_data

In [11]:
def fc_layer(A_prev, size_in, size_out, name="fully-connected"):
    """Build an affine layer ``A_prev @ w + b`` (no non-linearity) in a name scope.

    Args:
        A_prev: input tensor; it is the left operand of the matrix product.
        size_in: number of rows of the weight matrix.
        size_out: number of columns of the weight matrix and length of the bias.
        name: name scope under which the layer's ops and summaries are created.

    Returns:
        Tuple ``(act, w, b)``: the pre-activation output, the weight variable
        and the bias variable.
    """
    with tf.name_scope(name):
        # Weights start from a truncated normal (std 0.1), biases from a constant 0.1.
        weight_init = tf.truncated_normal([size_in, size_out], mean=0, stddev=0.1)
        weights = tf.Variable(weight_init)
        bias_init = tf.constant(0.1, shape=[size_out])
        biases = tf.Variable(bias_init)

        pre_activation = tf.matmul(A_prev, weights) + biases

        # One histogram summary per tensor, registered in a fixed order.
        tracked = (
            ("weights", weights),
            ("biases", biases),
            ("activations", pre_activation),
        )
        for tag, tensor in tracked:
            tf.summary.histogram(tag, tensor)
    return pre_activation, weights, biases


def build_model1(x, N_IN, N_HIDDENS):
    """Build the training graph: three ReLU hidden layers plus a linear output layer.

    Args:
        x: input tensor fed to the first layer.
        N_IN: width of the input; also the width of the reconstruction.
        N_HIDDENS: sequence whose first three entries are the hidden-layer widths.

    Returns:
        Tuple ``(x_recon, parameters)``. ``parameters`` maps ``"a<i>"``, ``"w<i>"``
        and ``"b<i>"`` (i = 1..3) to each hidden layer's pre-activation, weights
        and biases, and ``"w4"`` / ``"b4"`` to the output layer's variables.
    """
    parameters = {}
    layer_input, width_in = x, N_IN

    # Hidden layers fc1..fc3: affine transform followed by ReLU.
    for idx in (1, 2, 3):
        width_out = N_HIDDENS[idx - 1]
        pre_act, weights, biases = fc_layer(
            layer_input, width_in, width_out, name="fc{0}".format(idx))
        parameters["a{0}".format(idx)] = pre_act
        parameters["w{0}".format(idx)] = weights
        parameters["b{0}".format(idx)] = biases
        layer_input = tf.nn.relu(pre_act, name="hidden{0}".format(idx))
        width_in = width_out

    # Output layer fc4 is linear; its pre-activation is the reconstruction itself.
    x_recon, out_weights, out_biases = fc_layer(layer_input, width_in, N_IN, name="fc4")
    parameters["w4"] = out_weights
    parameters["b4"] = out_biases
    return x_recon, parameters


def back_prop(x, x_recon, learning_rate):
    """Return an Adam training op that minimizes the mean squared reconstruction error.

    Args:
        x: target tensor.
        x_recon: reconstructed tensor compared against ``x``.
        learning_rate: learning rate passed to ``tf.train.AdamOptimizer``.

    Returns:
        The op produced by ``AdamOptimizer.minimize`` on the MSE loss.
    """
    squared_error = tf.square(x_recon - x)
    loss = tf.reduce_mean(squared_error)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    return optimizer.minimize(loss)


def feed_forward1(x, parameters):
    """Run ``x`` through the network using already-created weights and biases.

    Args:
        x: input tensor.
        parameters: mapping holding ``"w1".."w4"`` and ``"b1".."b4"``.

    Returns:
        Tuple ``(x_recon, hidden2)``: the linear output of the fourth layer and
        the ReLU activation of the second layer.
    """
    activation = x
    hidden_activations = []

    # Layers 1-3 share the same shape: affine transform followed by ReLU.
    for idx in (1, 2, 3):
        weights = parameters["w{0}".format(idx)]
        biases = parameters["b{0}".format(idx)]
        activation = tf.nn.relu(tf.matmul(activation, weights) + biases)
        hidden_activations.append(activation)

    # The last layer is purely affine.
    out_weights, out_biases = parameters["w4"], parameters["b4"]
    x_recon = tf.matmul(activation, out_weights) + out_biases
    return x_recon, hidden_activations[1]


def mse(x, x_recon, name=""):
    """Build the mean-squared-error tensor and a scalar summary for it.

    Args:
        x: target tensor.
        x_recon: reconstructed tensor compared against ``x``.
        name: prefix for the summary tag; the tag is ``name + "_mse"``.

    Returns:
        Tuple ``(mse, mse_summary)``: the error tensor and its scalar summary op.
    """
    squared_error = tf.square(x_recon - x)
    error = tf.reduce_mean(squared_error)
    tag = name + "_mse"
    error_summary = tf.summary.scalar(tag, error)
    return error, error_summary

In [14]:
def train_model(data, batch_size=128, num_epoch=1000, learning_rate=1e-3, extra=""):
    """Train the 1000-2-1000 autoencoder and save the bottleneck activations.

    The graph is rebuilt from scratch on every call. Training runs over mini
    batches; every 10th epoch the MSE on the full training and validation sets
    is logged to TensorBoard and printed. After the last epoch, the second
    hidden layer's activations (width 2) for both sets are written to
    ``./results/AE/``.

    Args:
        data: dataset object exposing ``train.num_features`` and
            ``prep_train_batch(batch_size=..., fold=...)``.
            NOTE(review): assumed to return tensors/iterators in the order
            unpacked below -- confirm against ``load_data``.
        batch_size: mini-batch size passed to ``prep_train_batch``.
        num_epoch: number of passes over the training iterator.
        learning_rate: Adam learning rate.
        extra: suffix for the TensorBoard run directory (``ae_<extra>``).

    Returns:
        None. Results are the TensorBoard logs and the two saved ``.npy`` files.
    """
    import os  # function-scope import: the notebook's import cell does not provide `os`

    tf.reset_default_graph()
    LOGDIR = "/tmp/tcga_{0}".format(datetime.datetime.today().date())
    N_IN = data.train.num_features
    N_HIDDENS = [1000, 2, 1000]

    # train step
    (train_batch, train_iter, val_all, val_iter,
        train_all, train_iter_all) = data.prep_train_batch(batch_size=batch_size, fold=1)
    x = train_batch["X"]
    x_recon, parameters = build_model1(x, N_IN, N_HIDDENS)
    train_step = back_prop(x, x_recon, learning_rate)

    # mse (and bottleneck activations) on the full train / validation sets,
    # reusing the variables created by build_model1
    x_train, x_val = train_all["X"], val_all["X"]
    x_train_recon1, train_latent_op = feed_forward1(x_train, parameters)
    x_val_recon1, val_latent_op = feed_forward1(x_val, parameters)
    train_mse, train_summ = mse(x_train, x_train_recon1, name="train")
    # NOTE(review): the tag is spelled "valiation"; kept as-is so the tag name
    # matches earlier logged runs.
    val_mse, val_summ = mse(x_val, x_val_recon1, name="valiation")

    # run
    # Each run gets its own sub-directory of LOGDIR. Plain string concatenation
    # would instead create a sibling directory "/tmp/tcga_<date>ae_<extra>".
    writer = tf.summary.FileWriter(os.path.join(LOGDIR, "ae_{0}".format(extra)))
    try:
        with tf.Session() as sess:
            writer.add_graph(sess.graph)
            sess.run(tf.global_variables_initializer())

            for epoch in range(num_epoch):
                sess.run([train_iter.initializer])
                t0 = time.time()
                # One epoch: step until the training iterator is exhausted.
                while True:
                    try:
                        sess.run(train_step)
                    except tf.errors.OutOfRangeError:
                        break

                if epoch % 10 == 0:
                    sess.run([train_iter_all.initializer, val_iter.initializer])
                    [train_error, train_s, val_error, val_s] = sess.run(
                        [train_mse, train_summ, val_mse, val_summ])
                    writer.add_summary(train_s, epoch)
                    writer.add_summary(val_s, epoch)
                    print("epoch", epoch)
                    print("training mse:", train_error, "validation mse", val_error)
                    print("epoch time:", time.time()-t0)

            # Re-initialise the full-set iterators before the final pass, as the
            # periodic evaluation does; they may be exhausted or never started.
            sess.run([train_iter_all.initializer, val_iter.initializer])
            train_latent, val_latent = sess.run([train_latent_op, val_latent_op])
    finally:
        # Flush and release the event file even if training was interrupted.
        writer.close()

    # np.save does not create missing parent directories.
    os.makedirs("./results/AE", exist_ok=True)
    np.save("./results/AE/train_latent_complex2.npy", train_latent)
    np.save("./results/AE/val_latent_complex2.npy", val_latent)

In [16]:
# Load the dataset through the project-local `load_data` module.
# NOTE(review): the output recorded under this cell is a FileNotFoundError for
# './data/labels.csv', yet later cells use `tcga` -- presumably it came from a
# different run or working directory; confirm with Restart & Run All.
tcga = load_data.read_data_sets("../data/mRNA_lognorm_StandardScaled.hdf")

FileNotFoundError: File b'./data/labels.csv' does not exist

In [20]:
# Train for 40 epochs; `extra` tags the TensorBoard run directory for this run.
train_model(tcga, num_epoch=40, extra="2_hidden")

epoch 0
training mse: 1.0829372 validation mse 1.0847688
epoch time: 2.5375454425811768
epoch 10
training mse: 0.82770574 validation mse 0.8237596
epoch time: 1.0215234756469727
epoch 20
training mse: 0.8236215 validation mse 0.82046366
epoch time: 1.0327022075653076
epoch 30
training mse: 0.82131755 validation mse 0.81819594
epoch time: 1.0371551513671875


In [30]:
# NOTE(review): `train_model1` is not defined in any cell visible in this
# notebook (only `train_model` is). This call relies on a definition from
# elsewhere or on leftover kernel state -- confirm before Restart & Run All.
train_model1(tcga, num_epoch=1, extra="10_hidden")

epoch 0
training mse: 0.015309443 validation mse 0.015218355
epoch time: 3.01918888092041


In [None]:
# Scratch note (a filesystem path, not code): /home/molly/Desktop/DeepTCGA/data/