In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import datetime
from pprint import pprint as pp
import sys
import pickle
import glob
import os

sys.path.insert(0, '/home/molly/Desktop/DeepTCGA/')
import load_data

  from ._conv import register_converters as _register_converters


In [2]:
def fc_layer(A_prev, size_in, size_out, name="fully-connected"):
    """Build an affine layer ``A_prev @ w + b`` inside a TensorFlow name scope.

    No activation function is applied here; callers add their own.

    Args:
        A_prev: input to the layer; it is right-multiplied by a
            ``[size_in, size_out]`` weight matrix.
        size_in: number of rows of the weight matrix.
        size_out: number of columns of the weight matrix and length of the bias.
        name: name scope under which the layer's ops are created.

    Returns:
        A tuple ``(act, w, b)``: the affine output, the weight variable and
        the bias variable.
    """
    weight_shape = [size_in, size_out]
    bias_shape = [size_out]
    with tf.name_scope(name):
        # Weights start from a truncated normal (mean 0, stddev 0.1); the bias
        # starts as a constant 0.1 vector.
        weights = tf.Variable(
            tf.truncated_normal(weight_shape, mean=0, stddev=0.1))
        bias = tf.Variable(tf.constant(0.1, shape=bias_shape))
        pre_activation = tf.matmul(A_prev, weights) + bias
    return pre_activation, weights, bias

    
def build_model(x, N_IN, N_HIDDEN):
    """Assemble a one-hidden-layer autoencoder graph on top of ``x``.

    The encoder is ``fc_layer`` ("fc1") followed by a leaky ReLU; the decoder
    is a second ``fc_layer`` ("fc2") with no activation.

    Args:
        x: input to the encoder layer.
        N_IN: input (and reconstruction) width.
        N_HIDDEN: width of the hidden layer.

    Returns:
        A tuple ``(x_recon, parameters)`` where ``parameters`` maps
        ``"w1"``, ``"b1"``, ``"w2"``, ``"b2"`` to the layer variables.
    """
    encoder_pre, enc_w, enc_b = fc_layer(x, N_IN, N_HIDDEN, name="fc1")
    latent = tf.nn.leaky_relu(encoder_pre, name="hidden")
    reconstruction, dec_w, dec_b = fc_layer(latent, N_HIDDEN, N_IN, name="fc2")
    parameters = {"w1": enc_w, "b1": enc_b, "w2": dec_w, "b2": dec_b}
    return reconstruction, parameters


def back_prop(x, x_recon, learning_rate):
    """Create the training op that minimises the reconstruction error.

    Args:
        x: the original input.
        x_recon: the reconstruction of ``x``.
        learning_rate: learning rate handed to the Adam optimizer.

    Returns:
        The op returned by ``AdamOptimizer.minimize`` for the mean of the
        squared difference between ``x_recon`` and ``x``.
    """
    squared_error = tf.square(x_recon - x)
    loss = tf.reduce_mean(squared_error)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    return optimizer.minimize(loss)


def feed_forward(x, parameters):
    """Run ``x`` through the autoencoder using already-created variables.

    Args:
        x: input to encode and reconstruct.
        parameters: mapping with keys ``"w1"``, ``"b1"``, ``"w2"``, ``"b2"``.

    Returns:
        A tuple ``(x_recon, hidden)``: the reconstruction and the
        leaky-ReLU-activated hidden representation.
    """
    enc_w = parameters["w1"]
    enc_b = parameters["b1"]
    pre_activation = tf.matmul(x, enc_w) + enc_b
    latent = tf.nn.leaky_relu(pre_activation)

    dec_w = parameters["w2"]
    dec_b = parameters["b2"]
    reconstruction = tf.matmul(latent, dec_w) + dec_b
    return reconstruction, latent


def mse(x, x_recon, name=""):
    """Return the mean squared error between ``x`` and ``x_recon``.

    Args:
        x: the original input.
        x_recon: the reconstruction of ``x``.
        name: accepted for the caller's convenience; it is not used.

    Returns:
        ``tf.reduce_mean`` of the squared difference ``x_recon - x``.
    """
    residual = x_recon - x
    return tf.reduce_mean(tf.square(residual))


def create_folder(result_folder):
    """Ensure the results directory for one run exists and return its path.

    Args:
        result_folder: non-empty name of the sub-folder to create under
            ``../results/AE/`` (relative to the current working directory).

    Returns:
        The path ``"../results/AE/<result_folder>/"`` (with trailing slash).

    Raises:
        AssertionError: if ``result_folder`` is the empty string.
    """
    assert result_folder != "", "result_folder must be a non-empty name"
    path = "../results/AE/{0}/".format(result_folder)
    # makedirs also creates missing parent directories (os.mkdir would raise
    # if ../results/AE/ did not exist yet), and exist_ok makes repeated calls
    # a no-op without a separate, racy existence check.
    os.makedirs(path, exist_ok=True)
    return path

In [3]:
def train_model(data, n_hidden, batch_size=128, 
                num_epoch=1000, learning_rate=1e-3, result_folder=""):
    """Train a one-hidden-layer autoencoder and save its results to disk.

    The default graph is reset, the model is built on the batched training
    input, and after every epoch the MSE on the full training and validation
    inputs is recorded. Whatever happens during training (normal completion,
    an exception, or a manual interrupt), the ``finally`` block attempts to
    save the current latent representations, the error curves and the
    variable values into the folder returned by ``create_folder``.

    Args:
        data: object exposing ``data.train.num_features`` and
            ``data.prep_batch(batch_size=..., count_by="epoch")``; the latter
            must return the 6-tuple ``(train_batch, train_iter, val_all,
            val_iter, train_all, train_iter_all)`` whose batch entries are
            indexable with ``"X"``.
        n_hidden: width of the hidden layer.
        batch_size: batch size forwarded to ``data.prep_batch``.
        num_epoch: number of passes over ``train_iter``.
        learning_rate: learning rate forwarded to ``back_prop``.
        result_folder: non-empty sub-folder name forwarded to
            ``create_folder`` (the empty default is rejected there).

    Returns:
        None. Files written into the result folder: ``train_latent.npy``,
        ``val_latent.npy``, ``train_mse.npy``, ``val_mse.npy`` and
        ``parameters.pkl`` (variable values plus ``"total_epoch"``, the index
        of the last epoch that was started, or -1 if none was).
    """
    tf.reset_default_graph()
    folder = create_folder(result_folder)
    N_IN = data.train.num_features
    N_HIDDEN = n_hidden

    # train step
    (train_batch, train_iter, val_all, val_iter,
        train_all, train_iter_all) = data.prep_batch(batch_size=batch_size,
                                                     count_by="epoch")
    x = train_batch["X"]
    x_recon, parameters = build_model(x, N_IN, N_HIDDEN)
    train_step = back_prop(x, x_recon, learning_rate)

    # mse
    x_train, x_val = train_all["X"], val_all["X"]
    x_train_recon, train_latent = feed_forward(x_train, parameters)
    x_val_recon, val_latent = feed_forward(x_val, parameters)
    train_mse = mse(x_train, x_train_recon, name="train")
    val_mse = mse(x_val, x_val_recon, name="valiation")

    # Initializers of the two full-data iterators used for evaluation.
    eval_init = [train_iter_all.initializer, val_iter.initializer]

    # run
    train_errors = []
    val_errors = []
    # Bound up front so the finally block never hits an unbound name when
    # num_epoch is 0.
    epoch = -1
    # The context manager guarantees the session is closed even if saving
    # the results raises.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        try:
            for epoch in range(num_epoch):
                sess.run([train_iter.initializer])
                sess.run(eval_init)
                t0 = time.time()
                try:
                    while True:
                        sess.run(train_step)
                except tf.errors.OutOfRangeError:
                    # The training iterator is exhausted: the epoch is over.
                    pass
                train_error, val_error = sess.run([train_mse, val_mse])
                train_errors.append(train_error)
                val_errors.append(val_error)
                if epoch % 10 == 0:
                    print("epoch", epoch)
                    print("training mse:", train_error, "validation mse", val_error)
                    print("epoch time:", time.time()-t0)
        finally:
            # The per-epoch evaluation above has already pulled from the
            # full-data iterators, so re-initialize them before the final
            # pull, exactly as the loop does before each evaluation.
            # NOTE(review): prep_batch is defined elsewhere; confirm the
            # full-data iterators are meant to be re-initialized per pull.
            sess.run(eval_init)
            train_latent_value, val_latent_value = sess.run(
                [train_latent, val_latent])
            parameter_trained = sess.run(parameters)
            parameter_trained["total_epoch"] = epoch
            np.save(os.path.join(folder, "train_latent.npy"), train_latent_value)
            np.save(os.path.join(folder, "val_latent.npy"), val_latent_value)
            np.save(os.path.join(folder, "train_mse.npy"), np.array(train_errors))
            np.save(os.path.join(folder, "val_mse.npy"), np.array(val_errors))
            with open(os.path.join(folder, "parameters.pkl"), "wb") as f:
                pickle.dump(parameter_trained, f)

In [None]:
# Load the dataset through the project-local load_data helper. The returned
# object is later handed to train_model, which reads data.train.num_features
# and calls data.prep_batch on it.
# NOTE(review): judging by the file name this is log-normalised, standard-scaled
# mRNA data; confirm against load_data.
tcga = load_data.read_data_sets("../data/mRNA_lognorm_StandardScaled.hdf")

In [None]:
# Hidden-layer sizes to train; one autoencoder is trained per entry.
nodes = [5]
for node in nodes:
    # Each size writes its results into a sub-folder named after the size
    # (result_folder is forwarded to create_folder inside train_model).
    train_model(tcga, node, num_epoch=2000, result_folder=str(node))

epoch 0
training mse: 1.2010224 validation mse 1.2268168
epoch time: 1.5647170543670654
epoch 10
training mse: 0.7047268 validation mse 0.7029766
epoch time: 1.040672779083252
epoch 20
training mse: 0.693651 validation mse 0.693178
epoch time: 1.1418237686157227
epoch 30
training mse: 0.6898328 validation mse 0.69069594
epoch time: 1.1333420276641846
epoch 40
training mse: 0.6861566 validation mse 0.68947136
epoch time: 1.1392028331756592
epoch 50
training mse: 0.6841282 validation mse 0.6890901
epoch time: 1.1313567161560059
epoch 60
training mse: 0.68205935 validation mse 0.6898403
epoch time: 1.1300930976867676
epoch 70
training mse: 0.6802823 validation mse 0.69079053
epoch time: 1.147254228591919
epoch 80
training mse: 0.67901516 validation mse 0.69230014
epoch time: 1.1144766807556152
epoch 90
training mse: 0.67907095 validation mse 0.69431996
epoch time: 1.1205041408538818
epoch 100
training mse: 0.6777765 validation mse 0.69578797
epoch time: 1.1108019351959229
epoch 110
traini