In [43]:
module GraphDifferention

    using ExportAll
    using LinearAlgebra:diagm

    abstract type GraphNode end
    abstract type Operator <: GraphNode end

    #=
        Structure types
    =#

    struct Constant{T} <: GraphNode
        output :: T
    end

    mutable struct Variable <: GraphNode
        output :: Any
        gradient :: Any
        name :: String
        Variable(output; name="?") = new(output, nothing, name)
    end

    mutable struct ScalarOperator{F} <: Operator
        inputs :: Any
        output :: Any
        gradient :: Any
        name :: String
        ScalarOperator(fun, inputs...; name="?") = new{typeof(fun)}(inputs, nothing, nothing, name)
    end

    mutable struct BroadcastedOperator{F} <: Operator
        inputs :: Any
        output :: Any
        gradient :: Any
        name :: String
        BroadcastedOperator(fun, inputs...; name="?") = new{typeof(fun)}(inputs, nothing, nothing, name)
    end

    ### Pretty-printing
    ## It helps tracking what happens

    import Base: show, summary

    show(io::IO, x::ScalarOperator{F}) where {F} = print(io, "op ", x.name, "(", F, ")");
    show(io::IO, x::BroadcastedOperator{F}) where {F} = print(io, "op.", x.name, "(", F, ")");
    show(io::IO, x::Constant) = print(io, "const ", x.output)
    show(io::IO, x::Variable) = begin
        print(io, "var ", x.name);
        print(io, "\n ┣━ ^ "); summary(io, x.output)
        print(io, "\n ┗━ ∇ ");  summary(io, x.gradient)
    end

    ### Graph building

    function visit(node::GraphNode, visited, order)
        if node ∈ visited
        else
            push!(visited, node)
            push!(order, node)
        end
        return nothing
    end
        
    function visit(node::Operator, visited, order)
        if node ∈ visited
        else
            push!(visited, node)
            for input in node.inputs
                visit(input, visited, order)
            end
            push!(order, node)
        end
        return nothing
    end
    
    function topological_sort(head::GraphNode)
        visited = Set()
        order = Vector()
        visit(head, visited, order)
        return order
    end

    ### Forward pass

    reset!(node::Constant) = nothing
    reset!(node::Variable) = node.gradient = nothing
    reset!(node::Operator) = node.gradient = nothing

    compute!(node::Constant) = nothing
    compute!(node::Variable) = nothing
    compute!(node::Operator) =
        node.output = forward(node, [input.output for input in node.inputs]...)

    function forward!(order::Vector)
        for node in order
            compute!(node)
            reset!(node)
        end
        return last(order).output
    end

    ### Backward pass

    update!(node::Constant, gradient) = nothing
    update!(node::GraphNode, gradient) = if isnothing(node.gradient)
        node.gradient = gradient else node.gradient .+= gradient
    end

    function backward!(order::Vector, seed = ones(length(last(order).output)))
        result = last(order)
        result.gradient = seed
        #@assert length(result.output) == 1 "Gradient is defined only for scalar functions"
        for node in reverse(order)
            backward!(node)
        end
        return nothing
    end

    function backward!(node::Constant) end
    function backward!(node::Variable) end
    function backward!(node::Operator)
        inputs = node.inputs
        gradients = backward(node, [input.output for input in inputs]..., node.gradient)
        for (input, gradient) in zip(inputs, gradients)
            update!(input, gradient)
        end
        return nothing
    end

    # Scalar operators

    import Base: ^
    ^(x::GraphNode, n::GraphNode) = ScalarOperator(^, x, n)
    ^(x::GraphNode, n::Number) = ScalarOperator(^, x, Constant(n))
    forward(::ScalarOperator{typeof(^)}, x, n) = return x^n
    backward(::ScalarOperator{typeof(^)}, x, n, g) = tuple(g * n * x ^ (n-1), g * log(abs(x)) * x ^ n)

    import Base: sin
    sin(x::GraphNode) = ScalarOperator(sin, x)
    forward(::ScalarOperator{typeof(sin)}, x) = return sin(x)
    backward(::ScalarOperator{typeof(sin)}, x, g) = tuple(g * cos(x))
    
    import Base: tanh
    tanh(x::GraphNode) = ScalarOperator(tanh,x)
    forward(::ScalarOperator{typeof(tanh)}, x) = return tanh.(x)
    backward(node::ScalarOperator{typeof(tanh)}, x, g) = let
        𝟏 = ones(length(node.output))
        x_sqr = node.output.^2
        derivative_tanh_vector = 𝟏 .- x_sqr
        tuple(derivative_tanh_vector .* g)
    end
    # Broadcast operators

    import Base: *
    import LinearAlgebra: mul!
    # x * y (aka matrix multiplication)
    *(A::GraphNode, x::GraphNode) = BroadcastedOperator(mul!, A, x)
    forward(::BroadcastedOperator{typeof(mul!)}, A, x) = return A * x
    backward(::BroadcastedOperator{typeof(mul!)}, A, x, g) = tuple(g * x', A' * g)

    # x .* y (element-wise multiplication)
    Base.Broadcast.broadcasted(*, x::GraphNode, y::GraphNode) = BroadcastedOperator(*, x, y)
    forward(::BroadcastedOperator{typeof(*)}, x, y) = return x .* y
    backward(node::BroadcastedOperator{typeof(*)}, x, y, g) = let
        𝟏 = ones(length(node.output))
        Jx = diagm(y .* 𝟏)
        Jy = diagm(x .* 𝟏)
        tuple(Jx' * g, Jy' * g)
    end

    softmax(x) = let     
        shiftx = x .- maximum(x)
        exps = exp.(shiftx)
        return exps ./ sum(exps)
    end
    
    softmax(x::GraphNode) = BroadcastedOperator(softmax,x)
    forward(::BroadcastedOperator{typeof(softmax)},x) = return softmax(x)
    backward(node::BroadcastedOperator{typeof(softmax)},x,g)  = let
        vector_of_derivatives = Vector()
        for i in 1:length(node.output)
            yi = node.output[i]
            ∑dLdYj_times_Yj = sum(g.* node.output)
            dLdYi = g[i]
            result  =  -yi*(∑dLdYj_times_Yj - dLdYi)
            push!(vector_of_derivatives,result)
        end
        tuple(vector_of_derivatives)
    end

    crossentropy(output,target) =  sum(-target.*log.(output))
    crossentropy(x::GraphNode,y::GraphNode) = BroadcastedOperator(crossentropy,x,y)
    forward(::BroadcastedOperator{typeof(GraphDifferention.crossentropy)},x,y) = return crossentropy(x,y)
    backward(node::BroadcastedOperator{typeof(GraphDifferention.crossentropy)},x,y,g)  = let
        𝟏 = ones(length(x))
        tuple(g.*(-(y./x) + (𝟏.-y)./(𝟏.-x)))
    end


    Base.Broadcast.broadcasted(-, x::GraphNode, y::GraphNode) = BroadcastedOperator(-, x, y)
    forward(::BroadcastedOperator{typeof(-)}, x, y) = return x .- y
    backward(::BroadcastedOperator{typeof(-)}, x, y, g) = tuple(g,-g)

    Base.Broadcast.broadcasted(+, x::GraphNode, y::GraphNode) = BroadcastedOperator(+, x, y)
    forward(::BroadcastedOperator{typeof(+)}, x, y) = return x .+ y
    backward(::BroadcastedOperator{typeof(+)}, x, y, g) = tuple(g, g)

    import Base: sum
    sum(x::GraphNode) = BroadcastedOperator(sum, x)
    forward(::BroadcastedOperator{typeof(sum)}, x) = return sum(x)
    backward(::BroadcastedOperator{typeof(sum)}, x, g) = let
        𝟏 = ones(length(x))
        J = 𝟏'
        tuple(J' * g)
    end

    Base.Broadcast.broadcasted(/, x::GraphNode, y::GraphNode) = BroadcastedOperator(/, x, y)
    forward(::BroadcastedOperator{typeof(/)}, x, y) = return x ./ y
    backward(node::BroadcastedOperator{typeof(/)}, x, y::Real, g) = let
        𝟏 = ones(length(node.output))
        Jx = diagm(𝟏 ./ y)
        Jy = (-x ./ y .^2)
        tuple(Jx' * g, Jy' * g)
    end

    import Base: max
    Base.Broadcast.broadcasted(max, x::GraphNode, y::GraphNode) = BroadcastedOperator(max, x, y)
    forward(::BroadcastedOperator{typeof(max)}, x, y) = return max.(x, y)
    backward(::BroadcastedOperator{typeof(max)}, x, y, g) = let
        Jx = diagm(isless.(y, x))
        Jy = diagm(isless.(x, y))
        tuple(Jx' * g, Jy' * g)
    end

    @exportAll()
end

Main.GraphDifferention

In [44]:
module AutomaticDifferention

    using ..GraphDifferention
    using ExportAll
    using Distributions
    using LinearAlgebra:I


    abstract type NetworkLayer end

    struct DenseLayerSoftmax <: NetworkLayer
        ordered_computation_graph :: Vector{GraphNode}
        w_handle :: GraphNode
        x_handle :: GraphNode
        output_handle :: GraphNode
        prediction_handle :: GraphNode
        DenseLayerSoftmax(input_output_pair,test_data) = let 
            
            input_num = input_output_pair.first
            output_num = input_output_pair.second

            @assert length(test_data) == output_num

            ordered_graph, w_handle, x_handle, output_handle, prediction_handle = constructSoftmaxDense(input_num,output_num,test_data)
            new(ordered_graph,w_handle,x_handle,output_handle, prediction_handle)
        end
    end

    struct RnnVanillaTanh <: NetworkLayer
        ordered_computation_graph :: Vector{GraphNode}
        w_handle :: GraphNode
        u_handle :: GraphNode
        x_handle :: GraphNode
        h_handle :: GraphNode
        output_handle :: GraphNode
        RnnVanillaTanh(input_output_pair) = let 
            
            input_num = input_output_pair.first
            output_num = input_output_pair.second

            ordered_graph, h_handle, w_handle, u_handle, x_handle, output_handle = constructVanillaTanhRnn(input_num,output_num)
            new(ordered_graph,w_handle,u_handle,x_handle,h_handle,output_handle)
        end
    end

    handle_batching_preperations!(layer::DenseLayerSoftmax) = println("Loss = $(layer.output_handle.output)")
    handle_batching_preperations!(layer::RnnVanillaTanh) = load_output_as_h!(layer)

    function constructVanillaTanhRnn(input_number,outputs_number)
        x = Variable(ones(input_number,1), name = "x-rnn")
        u = Variable(rand(Uniform(-0.01,0.01),outputs_number,input_number), name = "u-rnn")

        h = Variable(rand(Uniform(-0.01,0.01),outputs_number,1), name = "h-rnn")
        w = Variable(rand(Uniform(-0.01,0.01),outputs_number,outputs_number), name = "w-rnn")

        b = Constant(rand(Uniform(-0.01,0.01),outputs_number,1))
        o = (u*x .+ w*h) .+ b

        activation = tanh(o)
        order = topological_sort(activation)

        return order, h, w, u, x, last(order)
    end

    function constructSoftmaxDense(input_number,outputs_number,test_data)
        b = Constant(rand(Uniform(-0.01,0.01),outputs_number,1))
        x = Variable(ones(input_number,1), name = "x-dense")

        w = Variable(rand(Uniform(-0.01,0.01),outputs_number,input_number), name = "w-dense")
        test = Constant(test_data)

        o = (w*x) .+ b
        activation = softmax(o)
        loss = crossentropy(activation,test)
        order = topological_sort(loss)

        return order, w, x, last(order), activation
    end

    function load_output_as_h!(layer::RnnVanillaTanh)
        if !isnothing(layer.output_handle.output)
            layer.h_handle.output = layer.output_handle.output
        end
    end
    
    function run_through_batched_data!(batched_data,network)
        for data_batch in batched_data
            load_batch_of_data!(data_batch, network)
        end
        backward_net!(network...)
        println("Gradient:")
        @show last(network).w_handle.gradient
    end

    function backward_net!(layers...)
        @assert length(layers) > 1 "This function can be run for at least two layers."
        reversed_layers = reverse(layers)
        last_layer = reversed_layers[1]
        other_layers = reversed_layers[2:end]

        backward!(last_layer.ordered_computation_graph)
        gradient_from_last_layer = last_layer.x_handle.gradient

        for iter in eachindex(other_layers)

            backward!(other_layers[iter].ordered_computation_graph,gradient_from_last_layer)
            current_layer_gradient = other_layers[iter].x_handle.gradient

            if iter + 1 < length(other_layers)
                backward!(other_layers[iter+1],current_layer_gradient)
                gradient_from_last_layer = other_layers[iter+1].x_handle.gradient
            end
        end

        return last(other_layers).output_handle.gradient
    end

    function load_batch_of_data!(input_batch,network)
        for layer in network
            handle_batching_preperations!(layer)
        end
        forward_net!(input_batch,network...)
    end

    function backward_net!(layer)
        backward!(layer.ordered_computation_graph)
        gradient_from_last_layer = layer.x_handle.gradient

        return gradient_from_last_layer
    end

    function forward_net!(input, layers...)
        @assert length(layers) > 1 "This function can be run for at least two layers."
        first_layer = layers[1]
        other_layers = layers[2:end]
        first_layer.x_handle.output = input

        output_from_first_layer = forward!(first_layer.ordered_computation_graph)
        other_layers[1].x_handle.output = output_from_first_layer

        for iter in eachindex(other_layers)
            
            current_layer_output = forward!(other_layers[iter].ordered_computation_graph)

            if iter + 1 < length(other_layers)
                other_layers[iter+1].x_handle.output = current_layer_output
            end
        end

        return last(other_layers).output_handle.output
    end

    function forward_net!(input, layer)
        layer.x_handle.output = input

        output_from_layer = forward!(layer.ordered_computation_graph)

        return output_from_layer
    end

    function learning_step(xᵢ, ∇fxᵢ, α = 0.001)
        # steepest descent
        xᵢ₊₁ = xᵢ + α*∇fxᵢ # this will be the new weight matrix
        return xᵢ₊₁
    end

    update_learnin_step!(layer::DenseLayerSoftmax) = let 
        ∇w = layer.w_handle.gradient;
        w = layer.w_handle.output;
        layer.w_handle.output = learning_step(w,∇w)
    end

    update_learnin_step!(layer::RnnVanillaTanh) = let 
        ∇w = layer.w_handle.gradient;
        w = layer.w_handle.output;
        layer.w_handle.output = learning_step(w,∇w)

        ∇h = layer.h_handle.gradient;
        h = layer.h_handle.output;
        layer.h_handle.output = learning_step(h,∇h)
    end

    function update_net_weights!(layers)
        for layer in layers
            update_learnin_step!(layer)
        end
    end
    @exportAll
end # module AutomaticDifferention



Main.AutomaticDifferention

In [45]:
using .AutomaticDifferention
using MLDatasets, Flux
using Statistics:mean

train_data = MLDatasets.MNIST(split=:train)
test_data  = MLDatasets.MNIST(split=:test)

# Prepare data
x1dim = reshape(train_data.features, 28 * 28, :)
yhot  = Flux.onehotbatch(train_data.targets, 0:9)

in1 = x1dim[:,1][1:196]
in2 = x1dim[:,1][197:392]
in3 = x1dim[:,1][393:588]
in4 = x1dim[:,1][589:end]

yhot1 = yhot[:,1]

# Prepare network
rnn = RnnVanillaTanh(196 => 64)
dense = DenseLayerSoftmax(64 => 10, yhot1)
network = [rnn, dense]

2-element Vector{NetworkLayer}:
 RnnVanillaTanh(Main.GraphDifferention.GraphNode[var u-rnn
 ┣━ ^ 64×196 Matrix{Float64}
 ┗━ ∇ Nothing, var x-rnn
 ┣━ ^ 196×1 Matrix{Float64}
 ┗━ ∇ Nothing, op.?(typeof(LinearAlgebra.mul!)), var w-rnn
 ┣━ ^ 64×64 Matrix{Float64}
 ┗━ ∇ Nothing, var h-rnn
 ┣━ ^ 64×1 Matrix{Float64}
 ┗━ ∇ Nothing, op.?(typeof(LinearAlgebra.mul!)), op.?(typeof(+)), const [-0.0010684570063299505; -0.0008121146893986225; … ; -0.009087544330006997; -0.008334445911758998;;], op.?(typeof(+)), op ?(typeof(tanh))], var w-rnn
 ┣━ ^ 64×64 Matrix{Float64}
 ┗━ ∇ Nothing, var u-rnn
 ┣━ ^ 64×196 Matrix{Float64}
 ┗━ ∇ Nothing, var x-rnn
 ┣━ ^ 196×1 Matrix{Float64}
 ┗━ ∇ Nothing, var h-rnn
 ┣━ ^ 64×1 Matrix{Float64}
 ┗━ ∇ Nothing, op ?(typeof(tanh)))
 DenseLayerSoftmax(Main.GraphDifferention.GraphNode[var w-dense
 ┣━ ^ 10×64 Matrix{Float64}
 ┗━ ∇ Nothing, var x-dense
 ┣━ ^ 64×1 Matrix{Float64}
 ┗━ ∇ Nothing, op.?(typeof(LinearAlgebra.mul!)), const [0.008141210787035306; 0.00496599986824273;

In [46]:
run_through_batched_data!([in1, in2, in3, in4],network)
@show rnn.u_handle.gradient

64×196 Matrix{Float64}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 ⋮                        ⋮              ⋱       ⋮                        ⋮
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   

In [47]:
@show rnn.w_handle.gradient

64×64 Matrix{Float64}:
  1.11044e-5   -1.56344e-5    4.89347e-6  …  -1.52271e-5    8.082e-6
 -9.24676e-5    0.00013019   -4.07485e-5      0.000126798  -6.72998e-5
  8.29198e-5   -0.000116747   3.6541e-5      -0.000113706   6.03507e-5
  2.24796e-6   -3.16502e-6    9.9063e-7      -3.08256e-6    1.63611e-6
 -1.15813e-6    1.6306e-6    -5.10366e-7      1.58812e-6   -8.42914e-7
 -0.000126822   0.000178559  -5.58877e-5  …   0.000173907  -9.23035e-5
  7.97202e-5   -0.000112242   3.5131e-5      -0.000109318   5.8022e-5
 -0.000101836   0.000143379  -4.48768e-5      0.000139644  -7.4118e-5
  5.77299e-5   -8.12808e-5    2.54404e-5     -7.91633e-5    4.2017e-5
 -7.35979e-5    0.000103622  -3.24331e-5      0.000100923  -5.35661e-5
  ⋮                                       ⋱                
  3.05268e-6   -4.29802e-6    1.34525e-6  …  -4.18605e-6    2.2218e-6
 -1.53544e-5    2.16183e-5   -6.76638e-6      2.10551e-5   -1.11753e-5
 -5.74262e-5    8.08532e-5   -2.53065e-5      7.87468e-5   -4.17959e-5


In [48]:
@show dense.w_handle.gradient

10×64 Matrix{Any}:
 -0.00039551    0.00229417  -0.000599908  …  -0.00908922  -0.00274198
 -0.000394195   0.00228654  -0.000597912     -0.00905898  -0.00273286
 -0.000389247   0.00225784  -0.000590407     -0.00894528  -0.00269856
 -0.000387545   0.00224797  -0.000587825     -0.00890616  -0.00268676
 -0.000392614   0.00227737  -0.000595515     -0.00902266  -0.00272191
  0.00351726   -0.020402     0.00533496   …   0.0808301    0.0243844
 -0.000388983   0.00225631  -0.000590007     -0.00893921  -0.00269673
 -0.000388356   0.00225267  -0.000589056     -0.0089248   -0.00269238
 -0.000388641   0.00225433  -0.000589489     -0.00893136  -0.00269436
 -0.00039217    0.0022748   -0.000594841     -0.00901245  -0.00271882

In [49]:
update_net_weights!(network)
@show rnn.u_handle.gradient

64×196 Matrix{Float64}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 ⋮                        ⋮              ⋱       ⋮                        ⋮
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   

In [50]:
@show rnn.w_handle.gradient

64×64 Matrix{Float64}:
  1.11044e-5   -1.56344e-5    4.89347e-6  …  -1.52271e-5    8.082e-6
 -9.24676e-5    0.00013019   -4.07485e-5      0.000126798  -6.72998e-5
  8.29198e-5   -0.000116747   3.6541e-5      -0.000113706   6.03507e-5
  2.24796e-6   -3.16502e-6    9.9063e-7      -3.08256e-6    1.63611e-6
 -1.15813e-6    1.6306e-6    -5.10366e-7      1.58812e-6   -8.42914e-7
 -0.000126822   0.000178559  -5.58877e-5  …   0.000173907  -9.23035e-5
  7.97202e-5   -0.000112242   3.5131e-5      -0.000109318   5.8022e-5
 -0.000101836   0.000143379  -4.48768e-5      0.000139644  -7.4118e-5
  5.77299e-5   -8.12808e-5    2.54404e-5     -7.91633e-5    4.2017e-5
 -7.35979e-5    0.000103622  -3.24331e-5      0.000100923  -5.35661e-5
  ⋮                                       ⋱                
  3.05268e-6   -4.29802e-6    1.34525e-6  …  -4.18605e-6    2.2218e-6
 -1.53544e-5    2.16183e-5   -6.76638e-6      2.10551e-5   -1.11753e-5
 -5.74262e-5    8.08532e-5   -2.53065e-5      7.87468e-5   -4.17959e-5


In [51]:
@show dense.w_handle.gradient

10×64 Matrix{Any}:
 -0.00039551    0.00229417  -0.000599908  …  -0.00908922  -0.00274198
 -0.000394195   0.00228654  -0.000597912     -0.00905898  -0.00273286
 -0.000389247   0.00225784  -0.000590407     -0.00894528  -0.00269856
 -0.000387545   0.00224797  -0.000587825     -0.00890616  -0.00268676
 -0.000392614   0.00227737  -0.000595515     -0.00902266  -0.00272191
  0.00351726   -0.020402     0.00533496   …   0.0808301    0.0243844
 -0.000388983   0.00225631  -0.000590007     -0.00893921  -0.00269673
 -0.000388356   0.00225267  -0.000589056     -0.0089248   -0.00269238
 -0.000388641   0.00225433  -0.000589489     -0.00893136  -0.00269436
 -0.00039217    0.0022748   -0.000594841     -0.00901245  -0.00271882

In [52]:
run_through_batched_data!([in1, in2, in3, in4],network)

@show rnn.u_handle.gradient

64×196 Matrix{Float64}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 ⋮                        ⋮              ⋱       ⋮                        ⋮
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   

In [53]:
@show rnn.w_handle.gradient


64×64 Matrix{Float64}:
  1.06865e-5   -1.5046e-5     4.71451e-6  …  -1.46488e-5    7.77346e-6
 -9.00499e-5    0.000126786  -3.9727e-5       0.000123438  -6.55032e-5
  8.22988e-5   -0.000115872   3.63074e-5     -0.000112813   5.9865e-5
  8.11535e-6   -1.1426e-5     3.58022e-6     -1.11243e-5    5.90319e-6
 -3.21213e-6    4.5225e-6    -1.41708e-6      4.40309e-6   -2.33653e-6
 -0.000131461   0.00018509   -5.79961e-5  …   0.000180203  -9.56261e-5
  7.84105e-5   -0.000110398   3.4592e-5      -0.000107483   5.70366e-5
 -9.52004e-5    0.000134037  -4.19992e-5      0.000130498  -6.92498e-5
  6.50935e-5   -9.16482e-5    2.8717e-5      -8.92284e-5    4.73497e-5
 -7.54611e-5    0.000106245  -3.32909e-5      0.00010344   -5.48912e-5
  ⋮                                       ⋱                
 -2.45706e-6    3.45941e-6   -1.08397e-6  …   3.36807e-6   -1.78729e-6
 -9.35445e-6    1.31706e-5   -4.12687e-6      1.28228e-5   -6.80453e-6
 -6.06972e-5    8.54584e-5   -2.67775e-5      8.3202e-5    -4.4151

In [54]:
@show dense.w_handle.gradient

10×64 Matrix{Any}:
 -0.00039542    0.00229354  -0.000599344  …  -0.00908875  -0.00274226
 -0.000394104   0.00228591  -0.00059735      -0.00905851  -0.00273314
 -0.000389157   0.00225722  -0.000589852     -0.0089448   -0.00269883
 -0.000387455   0.00224735  -0.000587272     -0.00890568  -0.00268703
 -0.000392524   0.00227675  -0.000594955     -0.00902219  -0.00272218
  0.00351645   -0.0203964    0.00532994   …   0.0808258    0.0243868
 -0.000388893   0.00225569  -0.000589452     -0.00893874  -0.002697
 -0.000388266   0.00225205  -0.000588501     -0.00892432  -0.00269265
 -0.000388552   0.00225371  -0.000588934     -0.00893088  -0.00269463
 -0.00039208    0.00227417  -0.000594282     -0.00901198  -0.0027191

In [56]:
for i = 1:5
    run_through_batched_data!([in1, in2, in3, in4],network)
    update_net_weights!(network)
end