### **Based off of optimized.ipynb**
Implements parallelization and graph execution, and fixes the F(t-1) problem.

Currently uses `Strategy 1`

In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
tf.config.list_physical_devices('GPU')

In [None]:
# Define the model F: a small dense network that maps a vector of INPUT_SIZE
# values to a single output per row.

from keras import layers

INPUT_SIZE = 30  # width of the model's input vector
LEARNING_RATE = 0.00001  # handed to Adam in compile() below; reassigned to 0.001 in the training cell
TANH_CONSTANT = 0.01  # weight of the linear term added to tanh in the output activation

# NOTE: `input` shadows the Python builtin of the same name.
input = layers.Input(shape=(INPUT_SIZE,))
# Alternative first layers that were tried (kept for reference):
# x = layers.Dense(30, activation='elu')(input)
# x = layers.GRU(20, return_sequences=True)(input)
# x = layers.GRU(20, return_sequences=False)(input)
# x = layers.Dropout(0.2)(x)
x = layers.Dense(30, activation='elu')(input)
x = layers.Dropout(0.5)(x)
x = layers.Dense(20, activation='elu')(x)
x = layers.Dropout(0.3)(x)
x = layers.Dense(1, activation='linear')(x)
# Output activation: tanh(x) + x * TANH_CONSTANT
x = layers.Add()([tf.math.tanh(x), x * tf.constant(TANH_CONSTANT, dtype=tf.float32)])
F_model = keras.Model(inputs=input, outputs=x)
F_model.summary()

# The training loop further down updates the weights manually with assign_add();
# nothing visible in this file calls fit(), so the optimizer and loss configured
# here appear to be unused. NOTE(review): confirm before removing.
F_model.compile(optimizer=keras.optimizers.Adam(LEARNING_RATE), loss='mse')

In [None]:
# Following 2 functions take whole arrays, of shape (instances, size)

def gen_price_series(size=10000, k=3, a=0.9, instances=1):
    """Generate artificial price series from a random walk with an AR(1) trend.

    For each of ``instances`` independent series a log-price ``p`` and a trend
    ``b`` are simulated, starting from ``p[0] = b[0] = 0``:

        p[t] = p[t-1] + b[t-1] + k * eps[t]
        b[t] = a * b[t-1] + nu[t]

    where ``eps`` and ``nu`` are drawn with ``np.random.normal``. The returned
    price is ``z = exp(p / R)`` with ``R = max(p) - min(p)`` taken per series.

    Args:
        size: number of time steps per series (must be >= 1).
        k: scale of the noise added to the log-price.
        a: autoregressive coefficient of the trend.
        instances: number of independent series to generate.

    Returns:
        float32 array of shape (instances, size). Column 0 is always 1.0
        because ``p[0] = 0``.
    """
    p_series = np.empty((instances, size), dtype=np.float32)
    b_series = np.empty((instances, size), dtype=np.float32)
    p_series[:, 0] = 0
    b_series[:, 0] = 0

    for i in range(1, size):
        # The draw order (price noise first, then trend noise) is kept so that
        # runs seeded through np.random.seed() stay reproducible.
        p_series[:, i] = p_series[:, i-1] + b_series[:, i-1] + k * np.random.normal(size=instances)
        b_series[:, i] = a * b_series[:, i-1] + np.random.normal(size=instances)

    # shape: (instances,)
    R = np.max(p_series, axis=1) - np.min(p_series, axis=1)
    # A flat series (e.g. size == 1) has zero range; dividing by it would give
    # 0/0 = NaN. Such a series has p == 0 everywhere, so any non-zero divisor
    # yields the correct price exp(0) = 1.
    R = np.where(R > 0, R, np.float32(1.0))

    # Broadcasting divides every row by its own range.
    return np.exp(p_series / R[:, np.newaxis])

def calc_price_returns(zt):
    """Return the step-to-step price change of every series.

    ``zt`` is indexed as (instances, time). The result has the same shape and
    dtype float32: column 0 is NaN (there is no previous price) and column
    ``i > 0`` holds ``zt[:, i] - zt[:, i-1]``.
    """
    returns = np.empty(zt.shape, dtype=np.float32)
    returns[:, 0] = np.nan

    later, earlier = zt[:, 1:], zt[:, :-1]
    returns[:, 1:] = later - earlier
    return returns



# Following 2 functions take slices of the arrays, of shape (instances,)

def calc_return(mu, rt, Ft_curr, Ft_prev, rft=tf.constant(0.0), delta=tf.constant(0.0)):
    '''Compute the on-line trading return Rt for a single time step.

    Rt = mu * (rft + Ft_prev * (rt - rft) - delta * |Ft_curr - Ft_prev|)

    In this file the caller passes MU for mu, RISKFREE_RETURN for rft and
    TRANS_COST for delta; Ft_prev / Ft_curr are the model outputs at the
    previous and current step.
    '''
    excess_return = rt - rft
    position_change = tf.math.abs(Ft_curr - Ft_prev)

    return mu * (rft + Ft_prev * excess_return - delta * position_change)

def calc_DSR(n, Rt, At_prev, Bt_prev):
    '''Update the moving moments of the returns and compute dDt/dRt on-line.

    At and Bt are moving estimates of Rt and Rt^2, updated with rate n:
        At = At_prev + n * (Rt - At_prev)
        Bt = Bt_prev + n * (Rt^2 - Bt_prev)

    Returns (dDt_dRt, At_curr, Bt_curr) where
        dDt_dRt = (Bt_prev - At_prev * Rt) / (Bt_prev - At_prev^2)^1.5

    Only the derivative of the differential Sharpe ratio with respect to Rt is
    returned, not the ratio itself.
    '''
    variance_prev = Bt_prev - tf.math.square(At_prev)
    dDt_dRt = (Bt_prev - At_prev * Rt) / tf.math.pow(variance_prev, 1.5)

    At_curr = At_prev + n * (Rt - At_prev)
    Bt_curr = Bt_prev + n * (tf.math.square(Rt) - Bt_prev)

    return dDt_dRt, At_curr, Bt_curr


from math import prod
# Repeat grads of shape (instances,) so they fill a tensor of the given target shape
def reshape_grad(grad, shape):
    """Expand a per-instance gradient to ``shape`` by repeating each entry.

    ``grad`` is a tf.Tensor whose first dimension is the number of instances;
    ``shape`` is the target shape (the caller passes the shape of a Jacobian).
    Each entry of ``grad`` is repeated ``prod(shape) / instances`` times in a
    row and the result is reshaped to ``shape``.
    """
    copies_per_instance = int(prod(shape) / grad.shape[0])
    return tf.reshape(tf.repeat(grad, copies_per_instance), shape)

In [None]:
# Simulate trading using the outputs of the model
# Takes WHOLE arrays, of shape (instances, size)

def _apply_signal(signal, owned, money, price):
    """Apply one buy/sell/hold step and return the updated (owned, money).

    Buys when signal is +1 and nothing is owned, sells when signal is -1 and
    one unit is owned, and holds otherwise.
    """
    buy = np.clip(signal * (1 - owned), 0, 1)   # 1 if BUY, 0 if not
    sell = np.clip(-signal * owned, 0, 1)       # 1 if SELL, 0 if not
    decision = buy - sell                       # 1 if BUY, -1 if SELL, 0 if HOLD
    owned = np.clip(owned + decision, 0, 1)
    money = money - decision * price
    return owned, money


def test_performance(zt, Ft):
    """Simulate trading on the model's signals and on a one-step-lookahead signal.

    Args:
        zt: prices, shape (instances, size).
        Ft: model outputs, shape (instances, size); only their sign is used.

    Every instance starts with 1.0 in cash and no holdings. Trading runs for
    t in [INPUT_SIZE, size - 1). The "ideal" trader uses the sign of
    ``zt[:, t + 1] - zt[:, t]`` as its signal, which is why the last time step
    cannot be simulated and is set to NaN in both value series.

    Returns:
        ((final, final_ideal), (values, values_ideal)) where ``values`` and
        ``values_ideal`` have shape (instances, size) and hold cash plus the
        value of holdings at each step, and ``final`` / ``final_ideal`` have
        shape (instances,) and hold the portfolio value divided by the price at
        the last simulated step.
    """
    instances = zt.shape[0]
    size = zt.shape[1]
    signal = np.sign(Ft)

    values = np.ones((instances, size))
    owned = np.zeros((instances,))
    money = np.ones((instances,))
    values[:, 0] = money

    values_ideal = np.ones((instances, size))
    owned_ideal = np.zeros((instances,))
    money_ideal = np.ones((instances,))
    values_ideal[:, 0] = money_ideal

    for t in range(INPUT_SIZE, size - 1):
        price = zt[:, t]

        # model
        owned, money = _apply_signal(signal[:, t], owned, money, price)
        values[:, t] = money + owned * price

        # ideal: trade on the sign of the next price move
        deltas_ideal = np.sign(zt[:, t + 1] - price)
        owned_ideal, money_ideal = _apply_signal(deltas_ideal, owned_ideal, money_ideal, price)
        values_ideal[:, t] = money_ideal + owned_ideal * price

    values[:, -1] = np.nan
    values_ideal[:, -1] = np.nan

    # The original indexed `values[-1] / zt[-1]`, which picks the last INSTANCE
    # (axis 0) rather than the last time step. Use the last simulated step
    # (the final column is NaN by construction) for every instance instead.
    last = max(size - 2, 0)
    final_price = zt[:, last]
    final = values[:, last] / final_price
    final_ideal = values_ideal[:, last] / final_price

    return (final, final_ideal), (values, values_ideal)


In [None]:
@tf.function
def calc_grads(zt, rt, At, Bt, N, MU, F_model, F_prev, rft=tf.constant(0.0), transaction_cost=tf.constant(0.0), random=tf.constant(1.0)):
    '''
    Runs the model for one time step and returns every factor needed for the
    chain-rule gradient of the differential Sharpe ratio.

    Arguments, as passed by the training loop in this file:
    zt: zt[:, t - INPUT_SIZE + 1:t + 1]  (model input window)
    rt: rt[:, t]
    At: At[:, t-1]
    Bt: Bt[:, t-1]
    N, MU: forwarded to calc_DSR and calc_return respectively
    F_model: the Keras model to evaluate and differentiate
    F_prev: previous model output; the caller wraps it in a tf.Variable
    rft, transaction_cost: forwarded to calc_return
    random: accepted but not used anywhere in the body

    Returns the tuple
    (Rt, F_curr, At_new, Bt_new, dDt_dRt, dRt_dFcurr, dRt_dFprev, dF_dTheta).
    '''

    # print("TRACING calc_grads()")

    INSTANCES = zt.shape[0]

    # persistent=True because the tape is queried three times below.
    with tf.GradientTape(persistent=True) as tape:
        # NOTE(review): F_model is called without a `training` argument, so the
        # Dropout layers presumably run in inference mode here — confirm that
        # this is intended.
        F_curr = tf.reshape(F_model(zt), (INSTANCES,))

        # History note from the original author: passing rft / transaction_cost
        # positionally was at one point observed to fail and later to work
        # again without any change. The positional call below is the form
        # currently in use.
        Rt = calc_return(MU, rt, F_curr, F_prev, rft, transaction_cost)

        dDt_dRt, At_new, Bt_new = calc_DSR(N, Rt, At, Bt)

    # calculate derivatives.
    dRt_dFcurr = tape.gradient(Rt, F_curr) # shape: (instances,)
    # Relies on F_prev being watched by the tape; the caller passes a tf.Variable.
    dRt_dFprev = tape.gradient(Rt, F_prev) # shape: (instances,)
    # One Jacobian per trainable variable.
    dF_dTheta = tape.jacobian(F_curr, F_model.trainable_variables) # shape: (instances, MODEL VAR SHAPE)

    # Eager-only debugging that was used to look for NaN / all-zero gradients
    # (kept for reference; .numpy() is not available inside a tf.function):
    # a = dRt_dFcurr.numpy()
    # b = dRt_dFprev.numpy()
    # c = dF_dTheta.numpy()
    # if np.isnan(a).any() or np.isnan(b).any():
    #     print("NAN")
    # if tf.reduce_sum(dF_dTheta[0]) == 0:
    #     a = dRt_dFcurr.numpy()
    #     b = dRt_dFprev.numpy()
    #     c = [d.numpy() for d in dF_dTheta]

    

    return Rt, F_curr, At_new, Bt_new, dDt_dRt, dRt_dFcurr, dRt_dFprev, dF_dTheta

In [None]:
import warnings, sys
warnings.filterwarnings("ignore")

# GENERATION VARS
import time

'''
A NOTE ON THE PARAMETERS:

These parameters are appropriate for one data point per DAY.

The artificial price series start at a value of 1.0 and normally range between 0.5 and 2.0.
A conversion of 1.0 to $100 is used, so the principal is $100.
A transaction cost of 0.1 ($10) is therefore imposed (based on RBC standard order fees).
A risk-free return of 4.00% for a $5000 3-year gov. bond is used (converted to $100 principal),
    which converts to 0.04 / 252 (1.59E-4, $0.0159 a day).

Both the transaction cost and the risk-free return are currently set to 0.0 below.
'''

# HYPERPARAMETERS

# Step size of the manual weight update, and the per-element clipping bound
# applied to each update.
LEARNING_RATE = 0.001
MAX_GRAD = LEARNING_RATE * 100

# Arguments forwarded to calc_grads on every step.
MU = tf.constant(3.0)
N = tf.constant(0.01)
RISKFREE_RETURN = tf.constant(0.0)  # alternative: tf.constant(1.59e-4)
TRANS_COST = tf.constant(0.0)       # alternative: tf.constant(0.01)

# Price-series generation and schedule.
SERIES_LENGTH = 1000
TRADING_DELAY = 200  # no weight updates are applied before this time step
K, A = 3, 0.9

EPISODES = 100
INSTANCES = 10

# Linear epsilon decay from START_EPSILON down to END_EPSILON.
START_EPSILON, END_EPSILON = 0.5, 0.01
DECAY_PERIOD = 6000 # in number of timesteps
epsilon = START_EPSILON

BASELINE_SERIES = gen_price_series(size=SERIES_LENGTH, k=K, a=A, instances=INSTANCES)

# Train F_model by on-line gradient ascent on the differential Sharpe ratio.
# Every episode generates a fresh batch of INSTANCES price series. At each time
# step t the chain rule
#     dD/dTheta = dD/dR * (dR/dF_t * dF_t/dTheta + dR/dF_{t-1} * dF_{t-1}/dTheta)
# is evaluated per instance, averaged over the instances, clipped element-wise
# to +-MAX_GRAD and added to the model's trainable variables.
for ep in range(EPISODES):

    # ----- per-episode state -----
    # generate price series
    zt = gen_price_series(size=SERIES_LENGTH, k=K, a=A, instances=INSTANCES)
    rt = calc_price_returns(zt)
    Ft = np.zeros((INSTANCES, SERIES_LENGTH,), dtype=np.float32)

    # breaks if init at one; MUST INIT ZERO
    At = np.zeros((INSTANCES, SERIES_LENGTH,), dtype=np.float32)
    Bt = np.zeros((INSTANCES, SERIES_LENGTH,), dtype=np.float32)
    dF_dTheta_prev = None

    # diagnostic buffers
    Rt_series = np.zeros((INSTANCES, SERIES_LENGTH,))
    SR_series = np.zeros((INSTANCES, SERIES_LENGTH,))
    DSR_series = np.zeros((INSTANCES, SERIES_LENGTH,))
    dD_series = np.ones((INSTANCES, SERIES_LENGTH,))
    autodiff_series = np.ones((INSTANCES, SERIES_LENGTH,))
    bench_series = np.zeros((INSTANCES, SERIES_LENGTH,))
    grad_series = np.zeros((SERIES_LENGTH,))    # mean update before clipping
    gradC_series = np.zeros((SERIES_LENGTH,))   # mean update after clipping
    trade_count = np.zeros((INSTANCES,))

    for t in range(INPUT_SIZE, SERIES_LENGTH):
        if (epsilon > END_EPSILON) and (t >= TRADING_DELAY):
            epsilon -= (START_EPSILON - END_EPSILON) / DECAY_PERIOD
        # Exploration noise is drawn (which keeps the NumPy random stream
        # unchanged) but then disabled: it is overwritten with 0 and
        # calc_grads does not use its `random` argument.
        random = np.where(np.random.rand(INSTANCES) < epsilon, np.random.rand(INSTANCES) * 2 - 1, 1).astype(np.float32)
        random = 0

        Rt, Ft[:, t], At[:, t], Bt[:, t], dDt_dRt, dRt_dFcurr, dRt_dFprev, dF_dTheta = calc_grads(zt[:, t - INPUT_SIZE + 1:t + 1], rt[:, t], At[:, t-1], Bt[:, t-1], N, MU, F_model, tf.Variable(Ft[:, t-1]), RISKFREE_RETURN, TRANS_COST, random)

        # A sign flip of the model output counts as one trade.
        pos_changes = np.where(Ft[:, t] * Ft[:, t-1] < 0, 1, 0)
        trade_count += pos_changes

        # ----- diagnostics -----
        Rt_series[:, t] = Rt
        # Sharpe ratio of the returns seen so far in this episode.
        SR_series[:, t] = np.mean(Rt_series[:, INPUT_SIZE:t], axis=1) / np.std(Rt_series[:, INPUT_SIZE:t], axis=1)
        DSR_series[:, t] = (Bt[:, t-1] * (Rt - At[:, t-1]) - (1/2) * (At[:, t-1] * (Rt ** 2 - Bt[:, t-1]))) / ((Bt[:, t-1] - At[:, t-1] ** 2) ** 1.5)
        dD_series[:, t] = dDt_dRt
        # Model output on the last INPUT_SIZE prices of the series; its plot is
        # currently disabled further down.
        bench_series[:, t] = F_model(zt[:, -INPUT_SIZE:]).numpy().reshape(INSTANCES)

        # ----- apply the gradients -----
        if (t != INPUT_SIZE) and t >= TRADING_DELAY:
            gradient_update = []
            grad_mean = np.empty((len(dF_dTheta),), dtype=np.float32)

            for i, (jac_curr, jac_prev) in enumerate(zip(dF_dTheta, dF_dTheta_prev)):
                # expand the "scalar" derivatives to the shape of the model variables. (jacobians)
                dDt_dRt_exp = reshape_grad(dDt_dRt, jac_curr.shape)
                dRt_dFcurr_exp = reshape_grad(dRt_dFcurr, jac_curr.shape)
                dRt_dFprev_exp = reshape_grad(dRt_dFprev, jac_curr.shape)

                # multiply derivatives together.
                grad = dDt_dRt_exp * (dRt_dFcurr_exp * jac_curr + dRt_dFprev_exp * jac_prev)
                grad = tf.reduce_sum(grad, axis=0)
                grad *= LEARNING_RATE / INSTANCES # divide by instances since the gradients are summed over all instances.

                gradient_update.append(grad)
                grad_mean[i] = tf.reduce_mean(grad)
            grad_series[t] = np.mean(grad_mean)

            # gradient_update = tf.clip_by_global_norm(gradient_update, 1)[0]
            # actually add the grads (ascent: the update is ADDED to the weights)
            trainable_vars = F_model.trainable_variables
            clipped_mean = np.empty((len(dF_dTheta),), dtype=np.float32)
            for i, var in enumerate(trainable_vars):
                grad = np.clip(gradient_update[i], -MAX_GRAD, MAX_GRAD)
                clipped_mean[i] = tf.reduce_mean(grad)
                var.assign_add(grad)
            gradC_series[t] = np.mean(clipped_mean)

        if t >= TRADING_DELAY - 1:
            # need to continously update so that the first training step has a t-1 value.
            dF_dTheta_prev = dF_dTheta

    # ----- plot diagnostics -----
    if ep % 1 == 0: 
        print("Episode: ", ep)
        print("Mean gradient: ", np.mean(grad_series))
        # Fixed: this line used to print grad_series a second time.
        print("Mean clipped gradient: ", np.mean(gradC_series))
        # test performance
        deltas, val_series = test_performance(zt, Ft)

        # Only the first instance is printed and plotted.
        for i in range(1):
            print("# of trades: ", trade_count[i])
            print("mean SR: ", np.mean(SR_series[i, INPUT_SIZE + 2:]))
            print("mean DSR: ", np.mean(DSR_series[i, INPUT_SIZE + 2:]))

            fig, ax = plt.subplots(3, 3, figsize=(10, 6))
            fig.tight_layout()

            ax[0, 0].plot(zt[i])
            ax[0, 0].plot(val_series[0][i])
            ax[0, 0].set_title("price")

            ax[1, 0].plot(Ft[i] * 0, color='k')
            ax[1, 0].plot(Ft[i])
            # ax[1, 0].set_ylim([-1, 1])
            ax[1, 0].set_title("decision Ft")

            # Disabled benchmark plot:
            # ax[2, 0].plot(bench_series[i] * 0, color='k')
            # ax[2, 0].plot(bench_series[i])
            # ax[2, 0].set_ylim([-1, 1])
            # ax[2, 0].set_title("benchmark Ft")
            ax[2, 0].plot(grad_series * 0, color='k')
            ax[2, 0].plot(grad_series)
            # ax[2, 0].set_ylim([-1, 1])
            ax[2, 0].set_title("mean gradient")

            # Steps before TRADING_DELAY are blanked out in the plots below.
            SR_cut = SR_series.copy()
            SR_cut[:, :TRADING_DELAY] = np.nan
            ax[0, 1].plot(SR_cut[i])
            ax[0, 1].set_xlim([0, SERIES_LENGTH])
            ax[0, 1].set_title("sharpe ratio")

            DSR_cut = DSR_series.copy()
            DSR_cut[:, :TRADING_DELAY] = np.nan
            ax[1, 1].plot(DSR_series[i] * 0, color='k')
            ax[1, 1].plot(DSR_cut[i])
            ax[1, 1].set_xlim([0, SERIES_LENGTH])
            ax[1, 1].set_title("derivative sharpe ratio")

            dD_cut = dD_series.copy()
            dD_cut[:, :TRADING_DELAY] = np.nan
            ax[2, 1].plot(dD_cut[i])
            ax[2, 1].set_xlim([0, SERIES_LENGTH])
            ax[2, 1].set_title("dD/dR")

            ax[0, 2].hist(grad_series, bins=100)
            ax[0, 2].set_title("mean gradient")
            ax[1, 2].hist(SR_cut[i, int(SERIES_LENGTH / 2):], bins=100)
            ax[1, 2].set_title("SR")
            ax[2, 2].hist(DSR_cut[i, int(SERIES_LENGTH / 2):], bins=100)
            ax[2, 2].set_title("derivative SR")


            plt.show()

        print("==========================================================================")



In [None]:
# Inspect the returns of the last episode: overall mean / std / ratio, plus the
# running mean and running std at every time step, plotted per instance.
a = np.mean(Rt_series[:, INPUT_SIZE:t], axis=1)
b = np.std(Rt_series[:, INPUT_SIZE:t], axis=1)
c = a / b

# NaN-filled rather than uninitialised (np.ndarray): columns that are never
# written now show as gaps in the plots instead of arbitrary memory contents.
d = np.full((INSTANCES, SERIES_LENGTH,), np.nan)
e = np.full((INSTANCES, SERIES_LENGTH,), np.nan)

# Starts one step later than INPUT_SIZE: the window [INPUT_SIZE:INPUT_SIZE] is
# empty, so its mean/std would be NaN anyway (and emit a warning).
for i in range(INPUT_SIZE + 1, SERIES_LENGTH):
    window = Rt_series[:, INPUT_SIZE:i]
    d[:, i] = np.mean(window, axis=1)
    e[:, i] = np.std(window, axis=1)

for i in range(INSTANCES):
    plt.plot(Rt_series[i])
    plt.show()
    plt.plot(d[i])
    plt.show()
    plt.plot(e[i])
    plt.show()

In [None]:
# Write the current F_model to "model.h5" in the working directory.
keras.models.save_model(F_model, "model.h5")

In [None]:
# Replace the in-memory F_model with a model loaded from disk.
# NOTE: this reads "divergent_model.h5", not the "model.h5" written by the save cell.
F_model = keras.models.load_model("divergent_model.h5")

In [None]:
# Dump every trainable variable of F_model in full (summarize=-1 prints all elements).
vars = F_model.trainable_variables
for weight in vars:
    tf.print(weight, output_stream=sys.stdout, summarize=-1)
    print("\n")


# MODEL MAKING HEALTHY DECISIONS

In [None]:
# Build A_model, a smaller network with the same tanh(x) + 0.01 * x output
# activation as F_model, and print its freshly initialised weights.
# NOTE: `input` and `vars` shadow Python builtins.
input = layers.Input(shape=(INPUT_SIZE,))
# x = layers.Dense(20, activation='elu')(input)
x = layers.Dense(10, activation='elu')(input)
x = layers.Dense(1, activation='linear')(x)
x = layers.Add()([tf.math.tanh(x), x * tf.constant(0.01)])
A_model = keras.Model(inputs=input, outputs=x)
vars = A_model.trainable_variables
for i in range(len(vars)):
    # summarize=-1 prints every element instead of truncating.
    tf.print(vars[i], output_stream=sys.stdout, summarize=-1)
    print("\n")

In [None]:
# Dump every trainable variable of A_model in full.
vars = A_model.trainable_variables

# Optional manual perturbation of the last variable (disabled):
# vars[-1].assign_add([-20.,])

for weight in vars:
    tf.print(weight, output_stream=sys.stdout, summarize=-1)
    print("\n")

In [None]:
# Probe A_model's gradients on a random input. The loop body runs exactly once
# because of the unconditional `break` at the end; the commented-out lines are
# toggles for repeating the probe, comparing against F_model, or nudging the
# weights along the gradient.
while True:
    with tf.GradientTape(persistent=True) as tape:
        # One random input row with values drawn uniformly from [0.5, 2).
        input = np.random.uniform(0.5, 2, INPUT_SIZE).reshape(1, INPUT_SIZE)
        # input = test
        # input = np.array([1.3615544390032746, 0.9911959411327966, 1.860840540643432, 1.9566621882886455, 1.0216394170198488, 1.320837858199272, 0.74359758712787, 1.5175825299156631, 0.6565231677349852, 1.0815482211370182, 1.329412514047963, 0.7756110246482257, 1.9846218430759657, 0.6300063569059443, 1.6438240132480755, 1.1823738080614201, 0.7346359086574064, 1.7163596409200301, 1.5332444743534628, 1.437000217920835, 1.9873757199427518, 1.9491195850175478, 1.9819059912505954, 1.759786091395387, 0.5987470203000758, 0.68860981057173, 0.7821889719299655, 1.5952623264224697, 1.9790404298610664, 0.5405285287583309,]).reshape(1, 30)

        # Fout = F_model(input)
        Aout = A_model(input)


    # dF = tape.gradient(Fout, F_model.trainable_variables)
    # Gradient of the model output with respect to each trainable variable.
    dA = tape.gradient(Aout, A_model.trainable_variables)


    '''print GRADS'''
    # for i in range(len(dF)):
    #     tf.print(dF[i], output_stream=sys.stdout, summarize=-1)
    #     print("\n")
    # print("====================================")
    for i in range(len(dA)):
        tf.print(dA[i], output_stream=sys.stdout, summarize=-1)
        print("\n")

        
    # print(F_model(input))
    # end="\r" returns the cursor to the line start so a repeated run overwrites the value.
    print(A_model(input), end="\r")


    # if A_model(input) == 1:
    #     break

    # vars = A_model.trainable_variables
    # for i in range(len(vars)):
    #     vars[i].assign_add(0.001 * dA[i])

    break

In [None]:
# Check A_model's Jacobian against hand-computed layer inputs.
# `extractor` returns the output of every layer of A_model so that the
# intermediate activations can be inspected.
extractor = keras.Model(inputs=A_model.inputs,
                        outputs=[layer.output for layer in A_model.layers])
# extractor = keras.Model(inputs=A_model.inputs, outputs=A_model.layers[-1].output)

# Testing
test = np.random.uniform(-1, 1, INPUT_SIZE).reshape(1, INPUT_SIZE)
# Fixed input that can be swapped in for reproducible runs:
# test = np.array([1.657776,1.7772449,0.8355866,1.2937615,1.5042442,1.2313355 ,
#  0.6413103,1.6870579, 1.2624699 ,1.4338475,0.5683779 ,1.5942571 ,
#  1.8920268,1.2122076, 0.517571  ,1.1931471,0.69882214,0.7847162 ,
#  1.7646049,1.6217262, 1.6088332 ,1.1099463,0.646141  ,1.8243487 ,
#  1.785189 ,1.1896547, 0.60746706,1.7007248,0.5227392 ,1.684452  ] ).reshape(1, INPUT_SIZE)

print("TEST: ", test)
test = tf.constant(test, dtype=tf.float32)

with tf.GradientTape(persistent=True) as tape:
    # for whatever reason, calling A_model(test) instead of extractor(test) causes the gradients to be wrong,
    # even though they should be the same.
    tape.watch(test)
    output = A_model(test)
    output_e = extractor(test)
# Jacobian of the model output with respect to the kernel and bias of layers[1].
grad = tape.jacobian(output, A_model.layers[1].trainable_variables)
# grad_e = tape.gradient(output_e, extractor.layers[1].trainable_variables)


features = [e.numpy() for e in extractor(test)]
# print(features)
# Recompute the pre-activation input of each Dense layer by hand:
# kernel^T . previous_output + bias.
# Assumes features[0] is the input layer's output and features[1] the first
# Dense layer's output — TODO confirm the layer ordering.
elu_in = np.tensordot(A_model.trainable_variables[0], features[0][0], axes=([0], [0])) + A_model.trainable_variables[1].numpy()
# Restored: this definition was commented out while `tanh_in` was still printed
# below, which made the cell fail with a NameError.
tanh_in = np.tensordot(A_model.trainable_variables[2], features[1][0], axes=([0], [0])) + A_model.trainable_variables[3].numpy()
print("Dense ELU input:", elu_in, "\n")
tf.print(grad[1], output_stream=sys.stdout, summarize=-1)
# tf.print(grad_e[1], output_stream=sys.stdout, summarize=-1)
print()
print("Dense ELU output:", features[1], "\n")
print("Dense TANH input:", tanh_in)
print("Dense TANH output:", features[2], "\n")
print("Dense A_MODEL output:", output.numpy(), "\n")

# print(features)

In [None]:
# Sanity check of np.tensordot when contracting axis 0 of both operands:
# c[j] = sum_i a[i, j] * b[i]
a = np.arange(6).reshape((3, 2))
b = np.arange(3)
print(a, b, sep="\n")

c = np.tensordot(a, b, axes=(0, 0))
print(c)

In [None]:
# Stand-alone Dense layer with `size` inputs and one linear output, built
# explicitly so its weights exist before the first call; then print them.
size = 3
layer = keras.layers.Dense(1, activation='linear', input_shape=(size,))
layer.build((size,))

vars = layer.trainable_variables
for weight in vars:
    tf.print(weight, output_stream=sys.stdout, summarize=-1)
    print("\n")

In [None]:

# Compare the layer's output with a hand-computed kernel . x + bias, then print
# the gradient of the output with respect to the layer's variables.
# inp = np.ones(size).reshape(1, size)
inp = np.random.uniform(-1, 1, size)[np.newaxis, :]
with tf.GradientTape(persistent=True) as tape:
    output = layer(inp)

print(inp)
# Manual forward pass: first (only) kernel column dotted with the input row, plus bias.
intermediate = np.dot(vars[0].numpy()[:, 0], inp[0]) + vars[1].numpy()[0]
print("intermediate: ", intermediate)
print("out: ", output)

D = tape.gradient(output, layer.trainable_variables)
for partial in D:
    tf.print(partial, output_stream=sys.stdout, summarize=-1)



In [None]:
# Hand-computed x * cosh(w . x + b)^-2, i.e. the derivative of tanh(w . x + b)
# with respect to w, using the layer weights and input from the cells above.
pre_activation = np.dot(vars[0].numpy()[:, 0], inp[0]) + vars[1].numpy()[0]
a = inp[0] * np.power(np.cosh(pre_activation), -2)
print(a)