In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import datetime
from pprint import pprint as pp
import sys
import pickle
import glob
import os

# NOTE(review): hardcoded absolute path to one user's local checkout; this cell
# only runs on a machine that has this directory. Presumably this is where the
# load_data module imported below lives -- confirm, and consider a relative or
# configurable path.
sys.path.insert(0, '/home/molly/Desktop/DeepTCGA/')
import load_data

  from ._conv import register_converters as _register_converters


In [2]:
def fc_layer(A_prev, size_in, size_out, name="fully-connected"):
    """Build an affine layer (no activation) inside the name scope ``name``.

    Args:
        A_prev: input tensor that is right-multiplied by the weight matrix.
        size_in: number of rows of the weight matrix.
        size_out: number of columns of the weight matrix and length of the bias.
        name: name scope under which the variables and ops are created.

    Returns:
        A tuple ``(act, w, b)``: the pre-activation ``A_prev @ w + b``, the
        weight variable and the bias variable.
    """
    with tf.name_scope(name):
        # Weights start from a truncated normal (mean 0, stddev 0.1).
        weights = tf.Variable(
            tf.truncated_normal([size_in, size_out], mean=0, stddev=0.1)
        )
        # Every bias entry starts at 0.1.
        bias = tf.Variable(tf.constant(0.1, shape=[size_out]))
        pre_activation = tf.matmul(A_prev, weights) + bias
        return pre_activation, weights, bias

    
def build_model(x, N_IN, N_HIDDEN):
    """Assemble the autoencoder graph: fc1 -> leaky ReLU -> fc2.

    Args:
        x: input tensor fed to the first layer.
        N_IN: input width; also the width of the reconstruction.
        N_HIDDEN: width of the hidden layer.

    Returns:
        A tuple ``(x_recon, parameters)`` where ``x_recon`` is the output of
        the second layer (no activation applied) and ``parameters`` maps
        "w1", "b1", "w2", "b2" to the corresponding variables.
    """
    # Encoder: affine map followed by a leaky ReLU.
    pre_hidden, enc_w, enc_b = fc_layer(x, N_IN, N_HIDDEN, name="fc1")
    hidden = tf.nn.leaky_relu(pre_hidden, name="hidden")

    # Decoder: affine map back to the input width.
    x_recon, dec_w, dec_b = fc_layer(hidden, N_HIDDEN, N_IN, name="fc2")

    parameters = {"w1": enc_w, "b1": enc_b, "w2": dec_w, "b2": dec_b}
    return x_recon, parameters


def back_prop(x, x_recon, learning_rate):
    """Create the training op that minimizes the mean squared reconstruction error.

    Args:
        x: target tensor (the original input).
        x_recon: reconstruction tensor produced by the model.
        learning_rate: learning rate handed to the Adam optimizer.

    Returns:
        The op returned by ``AdamOptimizer.minimize`` for the MSE loss.
    """
    squared_error = tf.square(x_recon - x)
    loss = tf.reduce_mean(squared_error)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    return optimizer.minimize(loss)


def feed_forward(x, parameters):
    """Apply the autoencoder to ``x`` using existing variables.

    Unlike ``build_model`` this creates no new variables; it reuses the ones
    stored in ``parameters`` under the keys "w1", "b1", "w2" and "b2".

    Returns:
        A tuple ``(x_recon, hidden)``: the reconstruction and the hidden
        (leaky-ReLU) activation.
    """
    # Encoder pass.
    enc_w, enc_b = (parameters[key] for key in ("w1", "b1"))
    hidden = tf.nn.leaky_relu(tf.matmul(x, enc_w) + enc_b)

    # Decoder pass.
    dec_w, dec_b = (parameters[key] for key in ("w2", "b2"))
    x_recon = tf.matmul(hidden, dec_w) + dec_b
    return x_recon, hidden


def mse(x, x_recon, name=""):
    """Return the mean of the squared element-wise difference ``x_recon - x``.

    ``name`` is accepted for the caller's convenience but is not used.
    """
    residual = x_recon - x
    return tf.reduce_mean(tf.square(residual))


def create_folder(result_folder):
    """Ensure the output directory for one run exists and return its path.

    Args:
        result_folder: name of the sub-folder under ``../results/AE/``;
            must not be the empty string.

    Returns:
        The relative path ``"../results/AE/<result_folder>/"`` (with a
        trailing slash).

    Raises:
        AssertionError: if ``result_folder`` is the empty string.
    """
    assert result_folder != "", "result_folder must not be empty"
    path = "../results/AE/{0}/".format(result_folder)
    # makedirs also creates missing parents (e.g. on a fresh checkout where
    # ../results/AE does not exist yet) and is a no-op when the directory is
    # already there, so re-running a cell does not fail.
    os.makedirs(path, exist_ok=True)
    return path

In [3]:
def train_model(data, n_hidden, batch_size=128, 
                num_epoch=1000, learning_rate=1e-3, result_folder=""):
    """Train a one-hidden-layer autoencoder and save its artifacts to disk.

    The default graph is reset, the model is built on the batches returned by
    ``data.prep_batch`` and trained for up to ``num_epoch`` epochs. Whatever
    happens during training (normal completion, an exception, a keyboard
    interrupt), the ``finally`` clause writes the following files into the
    folder returned by ``create_folder(result_folder)``:

    * ``train_latent.npy`` / ``val_latent.npy`` - hidden activations
    * ``train_mse.npy`` / ``val_mse.npy`` - per-epoch errors recorded so far
    * ``parameters.pkl`` - trained "w1", "b1", "w2", "b2" plus "total_epoch"

    Args:
        data: dataset object exposing ``train.num_features`` and
            ``prep_batch(batch_size=..., count_by=...)``.
        n_hidden: width of the hidden layer.
        batch_size: batch size forwarded to ``data.prep_batch``.
        num_epoch: number of epochs to run.
        learning_rate: learning rate for the optimizer.
        result_folder: sub-folder name for the outputs; must be non-empty
            (``create_folder`` asserts this).

    Returns:
        None. Results are only written to disk.
    """
    tf.reset_default_graph()
    folder = create_folder(result_folder)
    N_IN = data.train.num_features
    N_HIDDEN = n_hidden

    # train step
    (train_batch, train_iter, val_all, val_iter,
        train_all, train_iter_all) = data.prep_batch(batch_size=batch_size,
                                                     count_by="epoch")
    x = train_batch["X"]
    x_recon, parameters = build_model(x, N_IN, N_HIDDEN)
    train_step = back_prop(x, x_recon, learning_rate)

    # mse: evaluated with the same variables on the "all" train/validation inputs
    x_train, x_val = train_all["X"], val_all["X"]
    x_train_recon, train_latent = feed_forward(x_train, parameters)
    x_val_recon, val_latent = feed_forward(x_val, parameters)
    train_mse = mse(x_train, x_train_recon, name="train")
    val_mse = mse(x_val, x_val_recon, name="validation")

    train_errors = []
    val_errors = []
    # Index of the last epoch that was started. Initialised so the finally
    # clause below never hits an unbound name when no epoch ran
    # (e.g. num_epoch == 0); -1 means "no epoch started".
    epoch = -1

    # run: the context manager closes the session even if saving fails.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        try:
            for epoch in range(num_epoch):
                sess.run([train_iter.initializer])
                sess.run([train_iter_all.initializer, val_iter.initializer])
                t0 = time.time()
                try:
                    # Keep stepping until the training iterator signals
                    # exhaustion; that exception marks the end of the epoch.
                    while True:
                        sess.run(train_step)
                except tf.errors.OutOfRangeError:
                    [train_error, val_error] = sess.run([train_mse, val_mse])
                    train_errors.append(train_error)
                    val_errors.append(val_error)
                    if epoch % 10 == 0:
                        print("epoch", epoch)
                        print("training mse:", train_error, "validation mse", val_error)
                        # Covers the training steps plus the evaluation above.
                        print("epoch time:", time.time()-t0)
        finally:
            # NOTE(review): these runs pull from the "all" iterators again after
            # the per-epoch evaluation; this assumes prep_batch makes them
            # re-readable without re-initialisation -- confirm in load_data.
            train_latent = sess.run(train_latent)
            val_latent = sess.run(val_latent)
            parameter_trained = sess.run(parameters)
            parameter_trained["total_epoch"] = epoch
            np.save(os.path.join(folder, "train_latent.npy"), train_latent)
            np.save(os.path.join(folder, "val_latent.npy"), val_latent)
            np.save(os.path.join(folder, "train_mse.npy"), np.array(train_errors))
            np.save(os.path.join(folder, "val_mse.npy"), np.array(val_errors))
            with open(os.path.join(folder, "parameters.pkl"), "wb") as f:
                pickle.dump(parameter_trained, f)

In [4]:
# Load the dataset object that the training cell passes to train_model.
# NOTE(review): the file name suggests log-normalized, standard-scaled mRNA
# data; that is inferred from the name only -- confirm against load_data.
tcga = load_data.read_data_sets("../data/mRNA_lognorm_StandardScaled.hdf")

In [None]:
# Hidden-layer sizes to train: one autoencoder per entry, each run for 2000
# epochs with its results written to a folder named after the size.
# The commented-out list is a wider set of sizes kept for reference.
# nodes = [8000, 4000, 2000, 1000, 500, 250, 100, 50, 20, 10, 2]
nodes = [30]
for node in nodes:
    train_model(tcga, node, num_epoch=2000, result_folder=str(node))

epoch 0
training mse: 2.147404 validation mse 2.3339798
epoch time: 1.0410974025726318
epoch 10
training mse: 0.5224542 validation mse 0.5387411
epoch time: 0.9793815612792969
epoch 20
training mse: 0.42539293 validation mse 0.43312728
epoch time: 1.0048871040344238
epoch 30
training mse: 0.41680562 validation mse 0.42557195
epoch time: 0.9785070419311523
epoch 40
training mse: 0.41191417 validation mse 0.4230699
epoch time: 1.0073378086090088
epoch 50
training mse: 0.4070146 validation mse 0.4221987
epoch time: 1.0017268657684326
epoch 60
training mse: 0.40418398 validation mse 0.42401752
epoch time: 0.9873602390289307
epoch 70
training mse: 0.4005258 validation mse 0.4261755
epoch time: 1.001347303390503
epoch 80
training mse: 0.39749116 validation mse 0.42972773
epoch time: 1.006939172744751
epoch 90
training mse: 0.39512327 validation mse 0.4326905
epoch time: 1.0010077953338623
epoch 100
training mse: 0.3959415 validation mse 0.4365219
epoch time: 0.998211145401001
epoch 110
train

epoch 900
training mse: 0.3856703 validation mse 0.48687193
epoch time: 1.0198471546173096
epoch 910
training mse: 0.38486874 validation mse 0.488266
epoch time: 0.9986875057220459
epoch 920
training mse: 0.3865753 validation mse 0.483037
epoch time: 1.0087831020355225
epoch 930
training mse: 0.38560155 validation mse 0.48641622
epoch time: 1.0260546207427979
epoch 940
training mse: 0.38589373 validation mse 0.48560143
epoch time: 1.0310359001159668
epoch 950
training mse: 0.3881636 validation mse 0.4891222
epoch time: 1.0181708335876465
epoch 960
training mse: 0.385978 validation mse 0.48674446
epoch time: 1.017151117324829
epoch 970
training mse: 0.38574135 validation mse 0.48545823
epoch time: 1.0135536193847656
epoch 980
training mse: 0.38625982 validation mse 0.4883809
epoch time: 1.011533260345459
epoch 990
training mse: 0.38682503 validation mse 0.48574737
epoch time: 1.0091025829315186
epoch 1000
training mse: 0.38634038 validation mse 0.4880699
epoch time: 1.0271189212799072
e

epoch 1800
training mse: 0.38540247 validation mse 0.48204586
epoch time: 1.015505075454712
epoch 1810
training mse: 0.3842194 validation mse 0.48181108
epoch time: 1.0251555442810059
epoch 1820
training mse: 0.38428345 validation mse 0.4828677
epoch time: 1.0400404930114746
epoch 1830
training mse: 0.38608885 validation mse 0.48465884
epoch time: 1.0320336818695068
epoch 1840
training mse: 0.3851698 validation mse 0.48002112
epoch time: 1.0207934379577637
epoch 1850
training mse: 0.38514328 validation mse 0.48272198
epoch time: 1.0351228713989258
epoch 1860
training mse: 0.38718817 validation mse 0.48528612
epoch time: 1.020256519317627
epoch 1870
training mse: 0.38586614 validation mse 0.4842464
epoch time: 1.0222084522247314
epoch 1880
training mse: 0.38477266 validation mse 0.4822013
epoch time: 1.0188400745391846
epoch 1890
training mse: 0.3843815 validation mse 0.48033
epoch time: 1.0377070903778076
