In [11]:
import gzip
import numpy as np
import pandas
import pickle
from copy import deepcopy

In [None]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

In [4]:
f = gzip.open("mnist.pkl.gz","rb")

In [5]:
train_set, valid_set, test_set = pickle.load(f,encoding='latin1')

In [7]:
X_train = np.vstack((train_set[0], valid_set[0]))
Y_train = np.hstack((train_set[1], valid_set[1]))

print(X_train.shape)
print(Y_train.shape)


(60000, 784)
(60000,)


In [22]:
class PermutedMnistGenerator():
    def __init__(self, max_iter=10):
        f = gzip.open('mnist.pkl.gz', 'rb')
        train_set, valid_set, test_set = pickle.load(f,encoding='latin1')
        f.close()

        self.X_train = np.vstack((train_set[0], valid_set[0]))
        self.Y_train = np.hstack((train_set[1], valid_set[1]))
        self.X_test = test_set[0]
        self.Y_test = test_set[1]
        self.max_iter = max_iter
        self.cur_iter = 0

    def get_dims(self):
        # Get data input and output dimensions
        return self.X_train.shape[1], 10

    def next_task(self):
        if self.cur_iter >= self.max_iter:
            raise Exception('Number of tasks exceeded!')
        else:
            np.random.seed(self.cur_iter)
            perm_inds = list(range(self.X_train.shape[1]))
            np.random.shuffle(perm_inds)

            # Retrieve train data
            next_x_train = deepcopy(self.X_train)
            next_x_train = next_x_train[:,perm_inds]
            next_y_train = np.eye(10)[self.Y_train]

            # Retrieve test data
            next_x_test = deepcopy(self.X_test)
            next_x_test = next_x_test[:,perm_inds]
            next_y_test = np.eye(10)[self.Y_test]

            self.cur_iter += 1

            return next_x_train, next_y_train, next_x_test, next_y_test

In [23]:
data_gen = PermutedMnistGenerator(5)

In [14]:
in_dim, out_dim = data_gen.get_dims()
print(in_dim)
print(out_dim)

784
10


In [15]:
import tensorflow as tf

In [39]:
class Cla_NN(object):
    def __init__(self, input_size, hidden_size, output_size, training_size):
        # input and output placeholders
        self.x = tf.placeholder(tf.float32, [None, input_size])
        self.y = tf.placeholder(tf.float32, [None, output_size])
        self.task_idx = tf.placeholder(tf.int32)
        
    def assign_optimizer(self, learning_rate=0.001):
        self.train_step = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)

    def assign_session(self):
        # Initializing the variables
        init = tf.global_variables_initializer()

        # launch a session
        self.sess = tf.Session()
        self.sess.run(init)

    def train(self, x_train, y_train, task_idx, no_epochs=1000, batch_size=100, display_epoch=5):
        N = x_train.shape[0]
        if batch_size > N:
            batch_size = N

        sess = self.sess
        costs = []
        # Training cycle
        for epoch in range(no_epochs):
            perm_inds = list(range(x_train.shape[0]))
            np.random.shuffle(perm_inds)
            cur_x_train = x_train[perm_inds]
            cur_y_train = y_train[perm_inds]

            avg_cost = 0.
            total_batch = int(np.ceil(N * 1.0 / batch_size))
            # Loop over all batches
            for i in range(total_batch):
                start_ind = i*batch_size
                end_ind = np.min([(i+1)*batch_size, N])
                batch_x = cur_x_train[start_ind:end_ind, :]
                batch_y = cur_y_train[start_ind:end_ind, :]
                # Run optimization op (backprop) and cost op (to get loss value)
                _, c = sess.run(
                    [self.train_step, self.cost], 
                    feed_dict={self.x: batch_x, self.y: batch_y, self.task_idx: task_idx})
                # Compute average loss
                avg_cost += c / total_batch
            # Display logs per epoch step
            if epoch % display_epoch == 0:
                print("Epoch:", '%04d' % (epoch+1), "cost=", \
                    "{:.9f}".format(avg_cost))
            costs.append(avg_cost)
        print("Optimization Finished!")
        return costs

    def prediction(self, x_test, task_idx):
        # Test model
        prediction = self.sess.run([self.pred], feed_dict={self.x: x_test, self.task_idx: task_idx})[0]
        return prediction

    def prediction_prob(self, x_test, task_idx):
        prob = self.sess.run([tf.nn.softmax(self.pred)], feed_dict={self.x: x_test, self.task_idx: task_idx})[0]
        return prob

    def get_weights(self):
        weights = self.sess.run([self.weights])[0]
        return weights

    def close_session(self):
        self.sess.close()

In [40]:
class Vanilla_NN(Cla_NN):
    def __init__(self, input_size, hidden_size, output_size, training_size, prev_weights=None, learning_rate=0.001):

        super(Vanilla_NN, self).__init__(input_size, hidden_size, output_size, training_size)
        # init weights and biases
        self.W, self.b, self.W_last, self.b_last, self.size = self.create_weights(
                input_size, hidden_size, output_size, prev_weights)
        self.no_layers = len(hidden_size) + 1
        self.pred = self._prediction(self.x, self.task_idx)
        self.cost = - self._logpred(self.x, self.y, self.task_idx)
        self.weights = [self.W, self.b, self.W_last, self.b_last]

        self.assign_optimizer(learning_rate)
        self.assign_session()

    def _prediction(self, inputs, task_idx):
        act = inputs
        for i in range(self.no_layers-1):
            pre = tf.add(tf.matmul(act, self.W[i]), self.b[i])
            act = tf.nn.relu(pre)
        pre = tf.add(tf.matmul(act, tf.gather(self.W_last, task_idx)), tf.gather(self.b_last, task_idx))
        return pre

    def _logpred(self, inputs, targets, task_idx):
        pred = self._prediction(inputs, task_idx)
        log_lik = - tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=targets))
        return log_lik

    def create_weights(self, in_dim, hidden_size, out_dim, prev_weights):
        hidden_size = deepcopy(hidden_size)
        hidden_size.append(out_dim)
        hidden_size.insert(0, in_dim)
        no_params = 0
        no_layers = len(hidden_size) - 1
        W = []
        b = []
        W_last = []
        b_last = []
        for i in range(no_layers-1):
            din = hidden_size[i]
            dout = hidden_size[i+1]
            if prev_weights is None:
                Wi_val = tf.truncated_normal([din, dout], stddev=0.1)
                bi_val = tf.truncated_normal([dout], stddev=0.1)
            else:
                Wi_val = tf.constant(prev_weights[0][i])
                bi_val = tf.constant(prev_weights[1][i])
            Wi = tf.Variable(Wi_val)
            bi = tf.Variable(bi_val)
            W.append(Wi)
            b.append(bi)

        if prev_weights is not None:
            prev_Wlast = prev_weights[2]
            prev_blast = prev_weights[3]
            no_prev_tasks = len(prev_Wlast)
            for j in range(no_prev_tasks):
                W_j = prev_Wlast[j]
                b_j = prev_blast[j]
                Wi = tf.Variable(W_j)
                bi = tf.Variable(b_j)
                W_last.append(Wi)
                b_last.append(bi)

        din = hidden_size[-2]
        dout = hidden_size[-1]
        Wi_val = tf.truncated_normal([din, dout], stddev=0.1)
        bi_val = tf.truncated_normal([dout], stddev=0.1)
        Wi = tf.Variable(Wi_val)
        bi = tf.Variable(bi_val)
        W_last.append(Wi)
        b_last.append(bi)
            
        return W, b, W_last, b_last, hidden_size

In [41]:
x_coresets, y_coresets = [], []
x_testsets, y_testsets = [], []

all_acc = np.array([])

In [42]:
data_gen.max_iter

5

In [43]:
x_train, y_train, x_test, y_test = data_gen.next_task()
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(60000, 784)
(60000, 10)
(10000, 784)
(10000, 10)


In [44]:
x_testsets.append(x_test)
y_testsets.append(y_test)

In [45]:
head = 0
batch_size = None
bsize = x_train.shape[0] if (batch_size is None) else batch_size
print(bsize)
hidden_size = [100, 100]
task_id = 0
no_epochs = 100

60000


In [46]:
ml_model = Vanilla_NN(in_dim, hidden_size, out_dim, x_train.shape[0])

In [47]:
ml_model.train(x_train, y_train, task_id, no_epochs, bsize)

Epoch: 0001 cost= 2.447080135
Epoch: 0006 cost= 2.092496395
Epoch: 0011 cost= 1.759548306
Epoch: 0016 cost= 1.370782137
Epoch: 0021 cost= 1.003718853
Epoch: 0026 cost= 0.735086977
Epoch: 0031 cost= 0.569361389
Epoch: 0036 cost= 0.473297894
Epoch: 0041 cost= 0.414886385
Epoch: 0046 cost= 0.375195563
Epoch: 0051 cost= 0.346212775
Epoch: 0056 cost= 0.324049175
Epoch: 0061 cost= 0.306318909
Epoch: 0066 cost= 0.291606694
Epoch: 0071 cost= 0.279080749
Epoch: 0076 cost= 0.267867088
Epoch: 0081 cost= 0.257729560
Epoch: 0086 cost= 0.248397321
Epoch: 0091 cost= 0.239673004
Epoch: 0096 cost= 0.231451526
Optimization Finished!


[2.447080135345459,
 2.361976146697998,
 2.2880008220672607,
 2.220153570175171,
 2.1555726528167725,
 2.092496395111084,
 2.0295581817626953,
 1.9655098915100098,
 1.8994592428207397,
 1.8308889865875244,
 1.759548306465149,
 1.685410976409912,
 1.6087504625320435,
 1.530160903930664,
 1.4505127668380737,
 1.3707821369171143,
 1.2920174598693848,
 1.215173363685608,
 1.1411036252975464,
 1.0704680681228638,
 1.0037188529968262,
 0.9411620497703552,
 0.8829575777053833,
 0.8292375206947327,
 0.7799839973449707,
 0.735086977481842,
 0.694403886795044,
 0.6577697992324829,
 0.6249490976333618,
 0.5956205725669861,
 0.5693613886833191,
 0.5457699298858643,
 0.5246057510375977,
 0.5056639909744263,
 0.48865562677383423,
 0.4732978940010071,
 0.4593808650970459,
 0.4467409551143646,
 0.43522709608078003,
 0.42465683817863464,
 0.41488638520240784,
 0.4058114290237427,
 0.39735567569732666,
 0.38946717977523804,
 0.38210174441337585,
 0.37519556283950806,
 0.3686915338039398,
 0.362567514181

In [48]:
mf_weights = ml_model.get_weights()

In [50]:
len(mf_weights)

4

In [54]:
for val in mf_weights:
    print(len(val))

2
2
1
1


In [63]:
print(mf_weights[0][0].shape)
print(mf_weights[0][1].shape)

(784, 100)
(100, 100)


In [64]:
print(mf_weights[1][0].shape)
print(mf_weights[1][1].shape)

(100,)
(100,)


In [66]:
print(mf_weights[2][0].shape)
print(mf_weights[3][0].shape)

(100, 10)
(10,)


In [67]:
print(mf_weights[1][0])
print(mf_weights[1][1])

[ 0.14400598 -0.06807813  0.15992862  0.04808508 -0.07416071  0.03008125
  0.14971207  0.03849918  0.137127    0.09682845 -0.1083013   0.03426158
  0.14379996 -0.03349213  0.11760319  0.06680155  0.1118591   0.03698244
  0.0767782   0.04276764  0.16816375 -0.02994343  0.01764071 -0.11195186
 -0.0388854  -0.1788747   0.15569273  0.07509556  0.01936444 -0.04185652
 -0.05301884  0.09146661 -0.11130648  0.08092329 -0.14070068  0.2234264
 -0.00266611 -0.10498814  0.0369416   0.03933246 -0.03040194 -0.03673282
  0.1090413   0.09109873  0.10439321  0.11114012  0.01988952  0.09917659
  0.02881826 -0.07612276 -0.08050232  0.17495959 -0.05906625  0.02364575
  0.05356662 -0.05707119 -0.09027763 -0.08782687  0.07584154  0.14798243
 -0.01086231  0.10410107  0.09893222 -0.04288416  0.13437946  0.13890302
  0.06619617 -0.06930479  0.23255861  0.03252388  0.12364687 -0.05095023
  0.01807847  0.1402511  -0.03833523 -0.06870493 -0.13954757 -0.03303906
  0.03370167  0.05378075  0.16298634  0.01089726 -0.

In [68]:
mf_variances = None

In [69]:
ml_model.close_session()

In [70]:
class MFVI_NN(Cla_NN):
    def __init__(self, input_size, hidden_size, output_size, training_size, 
        no_train_samples=10, no_pred_samples=100, prev_means=None, prev_log_variances=None, learning_rate=0.001, 
        prior_mean=0, prior_var=1):

        super(MFVI_NN, self).__init__(input_size, hidden_size, output_size, training_size)
        m, v, self.size = self.create_weights(
            input_size, hidden_size, output_size, prev_means, prev_log_variances)
        self.W_m, self.b_m, self.W_last_m, self.b_last_m = m[0], m[1], m[2], m[3]
        self.W_v, self.b_v, self.W_last_v, self.b_last_v = v[0], v[1], v[2], v[3]
        self.weights = [m, v]

        m, v = self.create_prior(input_size, hidden_size, output_size, prev_means, prev_log_variances, prior_mean, prior_var)
        self.prior_W_m, self.prior_b_m, self.prior_W_last_m, self.prior_b_last_m = m[0], m[1], m[2], m[3]
        self.prior_W_v, self.prior_b_v, self.prior_W_last_v, self.prior_b_last_v = v[0], v[1], v[2], v[3]

        self.no_layers = len(self.size) - 1
        self.no_train_samples = no_train_samples
        self.no_pred_samples = no_pred_samples
        self.pred = self._prediction(self.x, self.task_idx, self.no_pred_samples)
        self.cost = tf.div(self._KL_term(), training_size) - self._logpred(self.x, self.y, self.task_idx)
        
        self.assign_optimizer(learning_rate)
        self.assign_session()

    def _prediction(self, inputs, task_idx, no_samples):
        return self._prediction_layer(inputs, task_idx, no_samples)

    # this samples a layer at a time
    def _prediction_layer(self, inputs, task_idx, no_samples):
        K = no_samples
        act = tf.tile(tf.expand_dims(inputs, 0), [K, 1, 1])        
        for i in range(self.no_layers-1):
            din = self.size[i]
            dout = self.size[i+1]
            eps_w = tf.random_normal((K, din, dout), 0, 1, dtype=tf.float32)
            eps_b = tf.random_normal((K, 1, dout), 0, 1, dtype=tf.float32)
            
            weights = tf.add(tf.multiply(eps_w, tf.exp(0.5*self.W_v[i])), self.W_m[i])
            biases = tf.add(tf.multiply(eps_b, tf.exp(0.5*self.b_v[i])), self.b_m[i])
            pre = tf.add(tf.einsum('mni,mio->mno', act, weights), biases)
            act = tf.nn.relu(pre)
        din = self.size[-2]
        dout = self.size[-1]
        eps_w = tf.random_normal((K, din, dout), 0, 1, dtype=tf.float32)
        eps_b = tf.random_normal((K, 1, dout), 0, 1, dtype=tf.float32)

        Wtask_m = tf.gather(self.W_last_m, task_idx)
        Wtask_v = tf.gather(self.W_last_v, task_idx)
        btask_m = tf.gather(self.b_last_m, task_idx)
        btask_v = tf.gather(self.b_last_v, task_idx)
        weights = tf.add(tf.multiply(eps_w, tf.exp(0.5*Wtask_v)), Wtask_m)
        biases = tf.add(tf.multiply(eps_b, tf.exp(0.5*btask_v)), btask_m)
        act = tf.expand_dims(act, 3)
        weights = tf.expand_dims(weights, 1)
        pre = tf.add(tf.reduce_sum(act * weights, 2), biases)

        return pre

    def _logpred(self, inputs, targets, task_idx):
        pred = self._prediction(inputs, task_idx, self.no_train_samples)
        targets = tf.tile(tf.expand_dims(targets, 0), [self.no_train_samples, 1, 1])
        log_lik = - tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=targets))
        return log_lik

    def _KL_term(self):
        kl = 0
        for i in range(self.no_layers-1):
            din = self.size[i]
            dout = self.size[i+1]
            m, v = self.W_m[i], self.W_v[i]
            m0, v0 = self.prior_W_m[i], self.prior_W_v[i]
            const_term = -0.5 * dout * din
            log_std_diff = 0.5 * tf.reduce_sum(np.log(v0) - v)
            mu_diff_term = 0.5 * tf.reduce_sum((tf.exp(v) + (m0 - m)**2) / v0)
            kl += const_term + log_std_diff + mu_diff_term

            m, v = self.b_m[i], self.b_v[i]
            m0, v0 = self.prior_b_m[i], self.prior_b_v[i]
            const_term = -0.5 * dout
            log_std_diff = 0.5 * tf.reduce_sum(np.log(v0) - v)
            mu_diff_term = 0.5 * tf.reduce_sum((tf.exp(v) + (m0 - m)**2) / v0)
            kl += const_term + log_std_diff + mu_diff_term

        no_tasks = len(self.W_last_m)
        din = self.size[-2]
        dout = self.size[-1]
        for i in range(no_tasks):
            m, v = self.W_last_m[i], self.W_last_v[i]
            m0, v0 = self.prior_W_last_m[i], self.prior_W_last_v[i]
            const_term = -0.5 * dout * din
            log_std_diff = 0.5 * tf.reduce_sum(np.log(v0) - v)
            mu_diff_term = 0.5 * tf.reduce_sum((tf.exp(v) + (m0 - m)**2) / v0)
            kl += const_term + log_std_diff + mu_diff_term

            m, v = self.b_last_m[i], self.b_last_v[i]
            m0, v0 = self.prior_b_last_m[i], self.prior_b_last_v[i]
            const_term = -0.5 * dout
            log_std_diff = 0.5 * tf.reduce_sum(np.log(v0) - v)
            mu_diff_term = 0.5 * tf.reduce_sum((tf.exp(v) + (m0 - m)**2) / v0)
            kl += const_term + log_std_diff + mu_diff_term
        return kl

    def create_weights(self, in_dim, hidden_size, out_dim, prev_weights, prev_variances):
        hidden_size = deepcopy(hidden_size)
        hidden_size.append(out_dim)
        hidden_size.insert(0, in_dim)
        no_params = 0
        no_layers = len(hidden_size) - 1
        W_m = []
        b_m = []
        W_last_m = []
        b_last_m = []
        W_v = []
        b_v = []
        W_last_v = []
        b_last_v = []
        for i in range(no_layers-1):
            din = hidden_size[i]
            dout = hidden_size[i+1]
            if prev_weights is None:
                Wi_m_val = tf.truncated_normal([din, dout], stddev=0.1)
                bi_m_val = tf.truncated_normal([dout], stddev=0.1)
                Wi_v_val = tf.constant(-6.0, shape=[din, dout])
                bi_v_val = tf.constant(-6.0, shape=[dout])
            else:
                Wi_m_val = prev_weights[0][i]
                bi_m_val = prev_weights[1][i]
                if prev_variances is None:
                    Wi_v_val = tf.constant(-6.0, shape=[din, dout])
                    bi_v_val = tf.constant(-6.0, shape=[dout])
                else:
                    Wi_v_val = prev_variances[0][i]
                    bi_v_val = prev_variances[1][i]

            Wi_m = tf.Variable(Wi_m_val)
            bi_m = tf.Variable(bi_m_val)
            Wi_v = tf.Variable(Wi_v_val)
            bi_v = tf.Variable(bi_v_val)
            W_m.append(Wi_m)
            b_m.append(bi_m)
            W_v.append(Wi_v)
            b_v.append(bi_v)

        # if there are previous tasks
        if prev_weights is not None and prev_variances is not None:
            prev_Wlast_m = prev_weights[2]
            prev_blast_m = prev_weights[3]
            prev_Wlast_v = prev_variances[2]
            prev_blast_v = prev_variances[3]
            no_prev_tasks = len(prev_Wlast_m)
            for i in range(no_prev_tasks):
                W_i_m = prev_Wlast_m[i]
                b_i_m = prev_blast_m[i]
                Wi_m = tf.Variable(W_i_m)
                bi_m = tf.Variable(b_i_m)

                W_i_v = prev_Wlast_v[i]
                b_i_v = prev_blast_v[i]
                Wi_v = tf.Variable(W_i_v)
                bi_v = tf.Variable(b_i_v)
                
                W_last_m.append(Wi_m)
                b_last_m.append(bi_m)
                W_last_v.append(Wi_v)
                b_last_v.append(bi_v)

        din = hidden_size[-2]
        dout = hidden_size[-1]

        # if point estimate is supplied
        if prev_weights is not None and prev_variances is None:
            Wi_m_val = prev_weights[2][0]
            bi_m_val = prev_weights[3][0]
        else:
            Wi_m_val = tf.truncated_normal([din, dout], stddev=0.1)
            bi_m_val = tf.truncated_normal([dout], stddev=0.1)
        Wi_v_val = tf.constant(-6.0, shape=[din, dout])
        bi_v_val = tf.constant(-6.0, shape=[dout])

        Wi_m = tf.Variable(Wi_m_val)
        bi_m = tf.Variable(bi_m_val)
        Wi_v = tf.Variable(Wi_v_val)
        bi_v = tf.Variable(bi_v_val)
        W_last_m.append(Wi_m)
        b_last_m.append(bi_m)
        W_last_v.append(Wi_v)
        b_last_v.append(bi_v)
            
        return [W_m, b_m, W_last_m, b_last_m], [W_v, b_v, W_last_v, b_last_v], hidden_size

    def create_prior(self, in_dim, hidden_size, out_dim, prev_weights, prev_variances, prior_mean, prior_var):
        hidden_size = deepcopy(hidden_size)
        hidden_size.append(out_dim)
        hidden_size.insert(0, in_dim)
        no_params = 0
        no_layers = len(hidden_size) - 1
        W_m = []
        b_m = []
        W_last_m = []
        b_last_m = []
        W_v = []
        b_v = []
        W_last_v = []
        b_last_v = []
        for i in range(no_layers-1):
            din = hidden_size[i]
            dout = hidden_size[i+1]
            if prev_weights is not None and prev_variances is not None:
                Wi_m = prev_weights[0][i]
                bi_m = prev_weights[1][i]
                Wi_v = np.exp(prev_variances[0][i])
                bi_v = np.exp(prev_variances[1][i])
            else:
                Wi_m = prior_mean
                bi_m = prior_mean
                Wi_v = prior_var
                bi_v = prior_var

            W_m.append(Wi_m)
            b_m.append(bi_m)
            W_v.append(Wi_v)
            b_v.append(bi_v)

        # if there are previous tasks
        if prev_weights is not None and prev_variances is not None:
            prev_Wlast_m = prev_weights[2]
            prev_blast_m = prev_weights[3]
            prev_Wlast_v = prev_variances[2]
            prev_blast_v = prev_variances[3]
            no_prev_tasks = len(prev_Wlast_m)
            for i in range(no_prev_tasks):
                Wi_m = prev_Wlast_m[i]
                bi_m = prev_blast_m[i]
                Wi_v = np.exp(prev_Wlast_v[i])
                bi_v = np.exp(prev_blast_v[i])
                
                W_last_m.append(Wi_m)
                b_last_m.append(bi_m)
                W_last_v.append(Wi_v)
                b_last_v.append(bi_v)

        din = hidden_size[-2]
        dout = hidden_size[-1]
        Wi_m = prior_mean
        bi_m = prior_mean
        Wi_v = prior_var
        bi_v = prior_var
        W_last_m.append(Wi_m)
        b_last_m.append(bi_m)
        W_last_v.append(Wi_v)
        b_last_v.append(bi_v)
            
        return [W_m, b_m, W_last_m, b_last_m], [W_v, b_v, W_last_v, b_last_v]

In [71]:
mf_model = MFVI_NN(in_dim, hidden_size, out_dim, x_train.shape[0], prev_means=mf_weights, prev_log_variances=mf_variances)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


In [72]:
mf_model.train(x_train, y_train, head, no_epochs, bsize)

Epoch: 0001 cost= 4.277204037
Epoch: 0006 cost= 4.177356720
Epoch: 0011 cost= 4.128238678
Epoch: 0016 cost= 4.085588932
Epoch: 0021 cost= 4.102339268
Epoch: 0026 cost= 4.066158295
Epoch: 0031 cost= 4.070765972
Epoch: 0036 cost= 4.042218685
Epoch: 0041 cost= 4.037230968
Epoch: 0046 cost= 4.022535324
Epoch: 0051 cost= 4.022497654
Epoch: 0056 cost= 3.999462366
Epoch: 0061 cost= 4.005934715
Epoch: 0066 cost= 3.991479874
Epoch: 0071 cost= 3.997089863
Epoch: 0076 cost= 3.974993944
Epoch: 0081 cost= 3.974561930
Epoch: 0086 cost= 3.962573290
Epoch: 0091 cost= 3.960590363
Epoch: 0096 cost= 3.957558155
Optimization Finished!


[4.2772040367126465,
 4.245637893676758,
 4.3589372634887695,
 4.224806308746338,
 4.164858341217041,
 4.177356719970703,
 4.210019111633301,
 4.13646125793457,
 4.159753799438477,
 4.2135396003723145,
 4.128238677978516,
 4.247989654541016,
 4.147328853607178,
 4.117985725402832,
 4.148529052734375,
 4.0855889320373535,
 4.132907390594482,
 4.108402252197266,
 4.0854973793029785,
 4.105916976928711,
 4.102339267730713,
 4.074819564819336,
 4.076578617095947,
 4.09417724609375,
 4.069313049316406,
 4.066158294677734,
 4.0602312088012695,
 4.0648112297058105,
 4.083983898162842,
 4.084514617919922,
 4.070765972137451,
 4.059077739715576,
 4.0660719871521,
 4.071957588195801,
 4.046168804168701,
 4.0422186851501465,
 4.06585693359375,
 4.058926105499268,
 4.057159423828125,
 4.046431064605713,
 4.037230968475342,
 4.04102897644043,
 4.035149097442627,
 4.037797451019287,
 4.032679080963135,
 4.02253532409668,
 4.031850814819336,
 4.027780055999756,
 4.030267238616943,
 4.047558784484863,