In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import datetime
from pprint import pprint as pp
import sys
import pickle
import glob
import os

sys.path.insert(0, '/home/molly/Desktop/DeepTCGA/')
import load_data

  from ._conv import register_converters as _register_converters


In [2]:
def fc_layer(A_prev, size_in, size_out, name="fully-connected"):
    """Create one affine layer ``A_prev @ w + b`` inside a TF name scope.

    Args:
        A_prev: input tensor that is matrix-multiplied with the weights.
        size_in: number of rows of the weight matrix.
        size_out: number of columns of the weight matrix and length of the
            bias vector.
        name: name scope under which the variables and ops are created.

    Returns:
        Tuple ``(act, w, b)``: the pre-activation output, the weight
        variable and the bias variable.
    """
    # Xavier-style spread: std = sqrt(2 / (fan_in + fan_out)).
    init_std = (2 / (size_in + size_out)) ** 0.5
    with tf.name_scope(name):
        # Weights are created before the bias, as variable naming inside the
        # scope follows creation order.
        w_init = tf.truncated_normal([size_in, size_out], mean=0, stddev=init_std)
        w = tf.Variable(w_init)
        b_init = tf.constant(0.1, shape=[size_out])
        b = tf.Variable(b_init)
        act = tf.matmul(A_prev, w) + b
    return act, w, b

    
def build_model(x, N_IN, N_HIDDEN):
    """Assemble a one-hidden-layer autoencoder graph on top of ``x``.

    The encoder is ``fc1`` followed by a leaky ReLU; the decoder is the
    linear layer ``fc2`` mapping back to ``N_IN`` units.

    Args:
        x: input tensor fed to the first layer.
        N_IN: input width (also the width of the reconstruction).
        N_HIDDEN: width of the hidden layer.

    Returns:
        Tuple ``(x_recon, parameters)`` where ``parameters`` maps
        ``"w1"``, ``"b1"``, ``"w2"``, ``"b2"`` to the layer variables.
    """
    # Encoder
    z1, w1, b1 = fc_layer(x, N_IN, N_HIDDEN, name="fc1")
    hidden = tf.nn.leaky_relu(z1, name="hidden")
    # Decoder (no activation on the output)
    x_recon, w2, b2 = fc_layer(hidden, N_HIDDEN, N_IN, name="fc2")
    parameters = {"w1": w1, "b1": b1, "w2": w2, "b2": b2}
    return x_recon, parameters


def back_prop(x, x_recon, learning_rate):
    """Build the training op that minimises the reconstruction MSE.

    Args:
        x: target tensor.
        x_recon: reconstruction tensor compared against ``x``.
        learning_rate: learning rate handed to the Adam optimizer.

    Returns:
        The op returned by ``AdamOptimizer.minimize`` for the mean squared
        difference between ``x_recon`` and ``x``.
    """
    squared_error = tf.square(x_recon - x)
    loss = tf.reduce_mean(squared_error)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    return optimizer.minimize(loss)


def feed_forward(x, parameters):
    """Run ``x`` through the autoencoder defined by ``parameters``.

    Args:
        x: input tensor.
        parameters: mapping with keys ``"w1"``, ``"b1"``, ``"w2"``, ``"b2"``
            holding the encoder and decoder weights/biases.

    Returns:
        Tuple ``(x_recon, hidden)``: the linear reconstruction and the
        leaky-ReLU hidden activation it was computed from.
    """
    enc_w, enc_b = parameters["w1"], parameters["b1"]
    pre_activation = tf.matmul(x, enc_w) + enc_b
    hidden = tf.nn.leaky_relu(pre_activation)

    dec_w, dec_b = parameters["w2"], parameters["b2"]
    x_recon = tf.matmul(hidden, dec_w) + dec_b
    return x_recon, hidden


def mse(x, x_recon, name=""):
    """Mean squared error between ``x`` and its reconstruction.

    Args:
        x: reference tensor.
        x_recon: reconstruction tensor; subtracted element-wise from ``x``.
        name: optional name for the resulting op. The empty-string default
            means "no explicit name" and lets TensorFlow auto-name the op.

    Returns:
        The tensor produced by ``tf.reduce_mean`` over the squared
        difference (no axis is given, so the mean is taken over all
        elements).
    """
    # The name used to be accepted but ignored. An empty string is mapped to
    # None so the default keeps TensorFlow's automatic op naming instead of
    # passing "" through as an explicit name.
    return tf.reduce_mean(tf.square(x_recon - x), name=name or None)


def create_folder(result_folder):
    """Ensure ``../results/AE/<result_folder>/`` exists and return its path.

    Args:
        result_folder: non-empty name of the sub-directory for this run.

    Returns:
        The relative path string ``"../results/AE/<result_folder>/"``
        (including the trailing slash).

    Raises:
        AssertionError: if ``result_folder`` is the empty string.
    """
    assert result_folder != "", "result_folder must be a non-empty name"
    path = "../results/AE/{0}/".format(result_folder)
    # makedirs creates missing parents (e.g. ../results/AE) and, with
    # exist_ok=True, does nothing when the directory is already present.
    # Unlike a glob-based existence check, this is not confused by folder
    # names containing glob metacharacters such as "[" or "*".
    os.makedirs(path, exist_ok=True)
    return path

In [3]:
def train_model(data, n_hidden, batch_size=128,
                num_epoch=1000, learning_rate=1e-3, result_folder=""):
    """Train a one-hidden-layer autoencoder and save its results to disk.

    Resets the default graph, builds the model, trains it with Adam on
    mini-batches for up to ``num_epoch`` epochs, and records the train and
    validation MSE after every epoch (printing them every 100 epochs).
    Results gathered so far are written to the result folder even when
    training stops early, e.g. on ``KeyboardInterrupt``.

    Args:
        data: dataset object exposing ``train.num_features`` and
            ``prep_batch(batch_size=..., count_by="epoch")``.
            NOTE(review): ``prep_batch`` is defined elsewhere; it is assumed
            to return batches indexable by ``"X"`` together with their
            initializable iterators -- confirm against ``load_data``.
        n_hidden: width of the hidden (latent) layer.
        batch_size: mini-batch size forwarded to ``data.prep_batch``.
        num_epoch: maximum number of training epochs.
        learning_rate: learning rate for the Adam optimizer.
        result_folder: non-empty sub-folder name passed to ``create_folder``.

    Returns:
        None. Writes ``train_latent.npy``, ``val_latent.npy``,
        ``train_mse.npy``, ``val_mse.npy`` and ``parameters.pkl`` into the
        result folder. ``parameters.pkl`` holds the trained ``w1``, ``b1``,
        ``w2``, ``b2`` plus ``total_epoch``, the number of completed epochs
        (equal to the length of the saved MSE arrays).
    """
    tf.reset_default_graph()
    folder = create_folder(result_folder)
    n_in = data.train.num_features

    # train step
    (train_batch, train_iter, val_all, val_iter,
        train_all, train_iter_all) = data.prep_batch(batch_size=batch_size,
                                                     count_by="epoch")
    x = train_batch["X"]
    x_recon, parameters = build_model(x, n_in, n_hidden)
    train_step = back_prop(x, x_recon, learning_rate)

    # mse
    x_train, x_val = train_all["X"], val_all["X"]
    x_train_recon, train_latent = feed_forward(x_train, parameters)
    x_val_recon, val_latent = feed_forward(x_val, parameters)
    train_mse = mse(x_train, x_train_recon, name="train")
    val_mse = mse(x_val, x_val_recon, name="validation")

    # run
    train_errors = []
    val_errors = []
    # The context manager closes the session on every exit path, including
    # failures during initialization or while saving results.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        try:
            for epoch in range(num_epoch):
                sess.run([train_iter.initializer])
                sess.run([train_iter_all.initializer, val_iter.initializer])
                t0 = time.time()
                try:
                    # Step until the training iterator is exhausted; that
                    # exhaustion is what marks the end of an epoch.
                    while True:
                        sess.run(train_step)
                except tf.errors.OutOfRangeError:
                    [train_error, val_error] = sess.run([train_mse, val_mse])
                    train_errors.append(train_error)
                    val_errors.append(val_error)
                    if epoch % 100 == 0:
                        print("epoch", epoch)
                        print("training mse:", train_error, "validation mse", val_error)
                        print("epoch time:", time.time()-t0)
        finally:
            # Best-effort save: runs on normal completion and on interruption.
            train_latent_values = sess.run(train_latent)
            val_latent_values = sess.run(val_latent)
            parameter_trained = sess.run(parameters)
            # Count completed epochs rather than reusing the loop index,
            # which is off by one after a full run and undefined when the
            # loop body never executed.
            parameter_trained["total_epoch"] = len(train_errors)
            np.save(os.path.join(folder, "train_latent.npy"), train_latent_values)
            np.save(os.path.join(folder, "val_latent.npy"), val_latent_values)
            np.save(os.path.join(folder, "train_mse.npy"), np.array(train_errors))
            np.save(os.path.join(folder, "val_mse.npy"), np.array(val_errors))
            with open(os.path.join(folder, "parameters.pkl"), "wb") as f:
                pickle.dump(parameter_trained, f)

In [4]:
# Load the dataset object used by the training cells; train_model reads its
# `train.num_features` attribute and calls its `prep_batch` method.
tcga = load_data.read_data_sets("../data/mRNA_lognorm_StandardScaled.hdf")

In [None]:
# Sweep the hidden-layer width over powers of two (1, 2, 4, ...), training one
# autoencoder per width and stopping once the width would exceed the number
# of input features. The first width (1) is always trained.
node = 1
while True:
    print(node)
    train_model(tcga, node, num_epoch=2000,
                result_folder="{0}_xavier".format(node))
    node *= 2
    if node > tcga.train.num_features:
        break

1
epoch 0
training mse: 0.9288027 validation mse 0.920726
epoch time: 1.8214318752288818
epoch 100
training mse: 0.9015733 validation mse 0.89922047
epoch time: 1.1737089157104492
epoch 200
training mse: 0.9012543 validation mse 0.9009505
epoch time: 1.1910698413848877
epoch 300
training mse: 0.9007941 validation mse 0.90210164
epoch time: 1.206174373626709
epoch 400
training mse: 0.9007501 validation mse 0.90387535
epoch time: 1.2055046558380127
epoch 500
training mse: 0.90109617 validation mse 0.90474474
epoch time: 1.204657793045044
epoch 600
training mse: 0.90147597 validation mse 0.9052212
epoch time: 1.1957979202270508
epoch 700
training mse: 0.9006156 validation mse 0.9056421
epoch time: 1.1958692073822021
epoch 800
training mse: 0.9007385 validation mse 0.9067758
epoch time: 1.206639051437378
epoch 900
training mse: 0.90072614 validation mse 0.9068158
epoch time: 1.187070608139038
epoch 1000
training mse: 0.9010424 validation mse 0.9074488
epoch time: 1.1961452960968018
epoch 1

epoch 1100
training mse: 0.47096768 validation mse 0.5373689
epoch time: 1.1746745109558105
epoch 1200
training mse: 0.46929577 validation mse 0.539224
epoch time: 1.1948516368865967
epoch 1300
training mse: 0.46847075 validation mse 0.53654355
epoch time: 1.1912126541137695
epoch 1400
training mse: 0.46780154 validation mse 0.53936553
epoch time: 1.2054550647735596
epoch 1500
training mse: 0.46947774 validation mse 0.54254216
epoch time: 1.198253870010376
epoch 1600
training mse: 0.46810532 validation mse 0.5417088
epoch time: 1.2022898197174072
epoch 1700
training mse: 0.46739495 validation mse 0.54137874
epoch time: 1.164736032485962
epoch 1800
training mse: 0.4704939 validation mse 0.5427393
epoch time: 1.1891558170318604
epoch 1900
training mse: 0.4681948 validation mse 0.5423079
epoch time: 1.2490248680114746
32
epoch 0
training mse: 0.46947825 validation mse 0.4675183
epoch time: 1.2243082523345947
epoch 100
training mse: 0.3819173 validation mse 0.410375
epoch time: 1.196905374