### Implementation of Neural Ordinary Differential Equations in Julia

We encounter neural ordinary differential equations (neural ODEs) when studying deep learning, which is a subset of machine learning. To let the computer learn automatically, we use algorithms built on neural network architectures, which are inspired by the biological neural networks found in the human brain.

In this part I will give a brief introduction to neural ODEs and show an implementation in Julia.

In [1]:
### =============================================
### Layer Struct 构造单独的一层神经元
### =============================================

# A Layer bundles what a single layer of neurons needs to turn its input into
# an output:
#   W – weights
#   b – bias
#   z – intermediate state: storage for the result computed before the
#       activation function is applied, kept for the backward pass
#   σ – activation function
# Every field has its own type parameter.
struct Layer{WS, BS, Z, F}
    W::WS
    b::BS
    z::Z
    σ::F
end

In [2]:
# Convenience constructor: build a Layer from the input dimension `in`, the
# output dimension `out` and an activation function `σ`.
#   weights            – out×in Float32 matrix of `rand` values shifted by -0.5
#   biases             – Float32 zeros of length `out`
#   intermediate state – empty vector of Float32 arrays
function Layer(in::Int, out::Int, σ::Function)
    weights = rand(Float32, out, in) .- 0.5f0
    biases = zeros(Float32, out)
    state = Array{Float32}[]
    return Layer(weights, biases, state, σ)
end


Layer

In [None]:
# Forward pass of one layer: returns σ applied element-wise to W * X .+ b.
# The value computed before the activation replaces whatever `l.z` held, so
# that back propagation can read it later.
function (l::Layer)(X)
    pre_activation = l.W * X .+ l.b
    push!(empty!(l.z), pre_activation)  # keep exactly one stored state
    return broadcast(l.σ, pre_activation)
end

In [None]:
# Gradient-descent step for one layer, performed in place: the gradients,
# scaled by the learning rate, are subtracted from the layer's parameters.
#   l  – layer whose `W` and `b` are modified
#   dW – gradient with respect to the weights
#   db – gradient with respect to the bias
#   η  – learning rate
# Returns the updated layer.
function update!(l::Layer, dW, db, η)
    # Four positional parameters: this is how back_propagate! calls update!.
    # (Wrapping the parameter list in a second pair of parentheses would declare
    # a single tuple argument instead.)
    l.W .-= η .* dW  # fused broadcast, no temporary array for η * dW
    l.b .-= η .* db
    return l
end

In [9]:
# Gradients (partial derivatives) of the cost for one layer. Back propagation
# uses them to adjust the parameters so that the cost decreases.
#   l           – the layer; `l.z[1]` is read as the pre-activation value stored
#                 by the layer's forward call
#   ∂Cost∂a_out – gradient of the cost with respect to the layer's output
#   a_in        – input the layer received, one sample per column
# Returns (∂Cost∂W, ∂Cost∂b, ∂Cost∂a_in): the gradients with respect to the
# weights, the bias and the layer input.
# NOTE: relies on a `derive` method for the activation function `l.σ`, which
# has to be defined elsewhere.
function derive(l::Layer, ∂Cost∂a_out, a_in)
    dσ = derive(l.σ)  # derivative of the activation function

    # Gradient with respect to the intermediate result z (chain rule through σ)
    ∂Cost∂z = ∂Cost∂a_out .* dσ.(l.z[1])

    # Weight gradient accumulated over the batch. The sum over samples k of
    # ∂Cost∂z[:, k] * a_in[:, k]' is exactly this matrix product; computing it
    # as one product avoids a temporary matrix per sample and also works for
    # an empty batch.
    ∂Cost∂W = ∂Cost∂z * a_in'

    # Bias gradient: sum of the columns of ∂Cost∂z
    ∂Cost∂b = vec(sum(∂Cost∂z; dims=2))

    # Gradient with respect to the input coming from the previous layer
    ∂Cost∂a_in = l.W' * ∂Cost∂z
    return ∂Cost∂W, ∂Cost∂b, ∂Cost∂a_in
end

derive (generic function with 1 method)

In [11]:
# Back propagation through a single layer: compute the layer's gradients,
# apply them to its weights and bias with learning rate η, and hand back the
# gradient of the cost with respect to the layer's input.
function back_propagate!(l::Layer, ∂Cost∂a_out, a_in, η)
    dW, db, da_in = derive(l, ∂Cost∂a_out, a_in)
    update!(l, dW, db, η)  # adjust the layer's parameters
    return da_in           # cost gradient with respect to this layer's input
end

back_propagate! (generic function with 1 method)

In [12]:
# A Model holds several layers together with an array in which the layer
# outputs are stored for the backward pass.
#
# Example: given two layers
#     layer1 = Layer(3, 5, relu)
#     layer2 = Layer(5, 2, sigmoid)
# a model is created with
#     model = Model(layer1, layer2)
struct Model{LS, OS}
    layers::LS  # the layers, kept as the tuple of constructor arguments
    a::OS       # layer outputs

    # Constructor: accepts any number of layers and starts with no stored outputs.
    function Model(layers...)
        outputs = Array{Float32}[]
        return new{typeof(layers), Vector{Array{Float32}}}(layers, outputs)
    end
end


In [14]:
# Forward pass of the whole model (makes a Model callable). `m.a` is reset to
# hold only X; then every layer, in order, is applied to the most recent entry
# and its output is appended. The final entry, i.e. the model output, is
# removed from `m.a` and returned, so `m.a` is left holding the input of each
# layer, which the backward pass needs.
function (m::Model)(X)
    acts = m.a
    empty!(acts)
    push!(acts, X)  # model input

    foreach(m.layers) do layer
        push!(acts, layer(last(acts)))
    end

    return pop!(acts)
end

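# Example
# Suppose `model` is a model with two layers and
# X = [1.0, 2.0, 3.0]
# Calling model(X) then performs the following steps:
# 1. model.a is emptied and X is stored in it
# 2. layer1 processes X; its output is stored in model.a
#    layer2 processes layer1's output; its output is stored in model.a
# 3. the last element of model.a, i.e. layer2's output, is removed from model.a
#    and returned as the final output of the model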

In [15]:
# Back propagation through the whole model: the layers are visited from last
# to first and each one updates its own parameters.
#   m       – the model; `m.a` must hold the layer inputs stored by the
#             preceding forward call `m(X)` (one entry is popped per layer)
#   ∂Cost∂aL – gradient of the cost with respect to the model output
#   η       – learning rate
# Returns nothing.
function back_propagate!(m::Model, ∂Cost∂aL, η)
    # Start from the gradient at the model output. This binding has to exist
    # before the loop: the first iteration reads it, and a variable first
    # assigned inside the loop body would be local to the loop and undefined.
    ∂Cost∂a_out = ∂Cost∂aL
    for layer in reverse(m.layers)
        a_in = pop!(m.a)  # retrieve the input this layer received
        ∂Cost∂a_out = back_propagate!(layer, ∂Cost∂a_out, a_in, η)
    end
    return nothing
end

# 这段代码实现了对神经网络模型的反向传播
# 通过逐层反向传播计算梯度并更新参数 模型得以优化 损失函数逐渐减小
# 每层的输入激活值和梯度依次传递 实现了完整的反向传播过程

back_propagate! (generic function with 2 methods)

In [16]:
# Train the model for one pass over `dataset`. For every batch: forward pass,
# cost evaluation, back propagation (which updates the model parameters).
#   m       – model to train (modified in place)
#   Cost    – cost function, called as Cost(out, Y); a user-defined
#             `derive(Cost)` must return its derivative, called as dCost(out, Y)
#   dataset – iterable of batches, each destructurable as (X, Y)
#   η       – learning rate
# Returns the average cost over all batches (NaN for an empty dataset).
function train!(m::Model, Cost, dataset, η)
    costs = Float32[]     # cost of each batch
    dCost = derive(Cost)  # derivative of the cost function

    for (X, Y) in dataset
        out = m(X)
        push!(costs, Cost(out, Y))

        # Back propagation, starting from the cost gradient at the model output
        back_propagate!(m, dCost(out, Y), η)
    end

    # One cost was recorded per batch, so length(costs) is the batch count;
    # unlike length(dataset) it also works for iterables without a length.
    return sum(costs) / length(costs)
end
    

train! (generic function with 1 method)