In [320]:
import numpy as np

## Linear regression formulation
Assuming a 1D Lagragian of the form $L_\theta(x,y,u,v) = \frac{1}{2} (u^2+v^2)- \sum_{k_1,k_2} \theta_{k_1,k_2} x^{k_1} y^{k_2}$ the EL-equations are given by
$$
    \ddot{x} = - \sum_{k_1=1,k_2=0}^{N,N} \theta_{k_1,k_2} k_1 x^{k_1-1} y^{k_2} \\
    \ddot{y} = - \sum_{k_1=0,k_2=1}^{N,N} \theta_{k_1,k_2} k_2 x^{k_1} y^{k_2-1}
$$   

This inspires the cost function $C = C^x + C^y$ where
$$
    C^x(\theta ) = \frac{1}{2} \sum_i \left( \ddot{x}_i + \sum_{k_1=1,k_2=0}^{N,N} \theta_{k_1,k_2} k_1 x_i^{k_1-1} y_i^{k_2} \right)^2 \\
    C^y(\theta ) = \frac{1}{2} \sum_i \left( \ddot{y}_i + \sum_{k_1=1,k_2=0}^{N,N} \theta_{k_1,k_2} k_2 x_i^{k_1} y_i^{k_2-1} \right)^2
$$

Taking the gradient we find
$\partial C / \partial \theta_{k_1,k_2} = \partial C^x / \partial \theta_{k_1,k_2} + \partial C^y / \partial \theta_{k_1,k_2} $ with
$$
    \frac{\partial C^x}{\partial \theta_{j_1,j_2} } = \sum_i \left( \ddot{x}_i + \sum_{k_1=1,k_2=0}^{N,N} \theta_{k_1,k_2} k_1 x_i^{k_1-1} y_i^{k_2} \right) x^{j_1-1} y^{j_2}\\
    \frac{\partial C^y}{\partial \theta_{j_1,j_2} } = \sum_i \left( \ddot{y}_i + \sum_{k_1=1,k_2=0}^{N,N} \theta_{k_1,k_2} k_2 x_i^{k_1} y_i^{k_2-1} \right) x^{j_1} y^{j_2-1}
$$

## Nonlinear regression formulation
Again, we assume a Lagrangian of the form $L_\theta(x,y,u,v) = \frac{1}{2} (u^2 + v^2) - V(x,y;\theta)$ with
$$
    V(x,y;\theta) = \sum_{k_1,k_2} \theta_{k_1,k_2} x^{k_1} y^{k_2}.
$$
Given observations $\{ (q_i(0),\dot{q}_i(0) , q_i(1) ) \}_i$We consider the cost function
$$
    Q(\theta) = \sum_{i} \| q_i(1) - \hat{q}_i(1) \|_{L^1}
$$
Where $\hat{q}_i(1)$ is obtained from $q_i(0)$ and $\dot{q}_i(0)$ by solving the Euler-Lagrange equations 

The gradient of $Q$ is
$$
    \frac{ \partial Q}{\partial \theta} =
       - \sum_i {\rm sign}( x_i(1) - \hat{x}_i(1) ) \frac{ \partial \hat{x}_i(1) }{\partial \theta}
       + {\rm sign}( y_i(1) - \hat{y}_i(1) ) \frac{ \partial \hat{y}_i(1) }{\partial \theta}
$$

Explicitly, we get $\partial_\theta \hat{x}(1)$ and $\partial_\theta \hat{y}(1)$ from solving the ODE
$$
    \frac{dx}{dt} = u \\
    \frac{dy}{dt} = v \\
    \frac{du}{dt} = - \sum_{k_1=1,k_2=0} \theta_{k_1,k_2} k_1 x^{k_1-1} y^{k_2} \\
    \frac{dv}{dt} = - \sum_{k_1=0,k_2=1} \theta_{k_1,k_2} k_2 x^{k_1} y^{k_2-1} \\
    \frac{d}{dt} \left( \frac{\partial x}{\partial \theta_{k_1,k_2}} \right) = \frac{\partial u}{\partial \theta_{k_1,k_2} } \\
    \frac{d}{dt} \left( \frac{\partial y}{\partial \theta_{k_1,k_2}} \right) = \frac{\partial v}{\partial \theta_{k_1,k_2} } \\
    \frac{d}{dt} \left( \frac{\partial u}{\partial \theta_{k_1,k_2}} \right) = - k_1 x^{k_1-1} y^{k_2} \\
    \frac{d }{dt} \left( \frac{\partial v}{\partial \theta_{k_1,k_2}} \right) = - k_2 x^{k_1} y^{k_2-1}
$$
with the initial condition $\delta q(0) = 0$ and $\delta \dot{q}(0) = 0$
We can use stochastic gradient descent to minimize $Q$.

In [321]:
N=5

def state_to_vars(s):
    # returns variables x,y,u,v,delta_x,delta_y,delta_u,delta_v
    global N
    x = s[0]
    y = s[1]
    u = s[2]
    v = s[3]
    N_sq = N*N
    r = np.arange(4,N_sq+4)
    delta_x = s[r].reshape((N,N))
    delta_y = s[r+N_sq].reshape((N,N))
    delta_u = s[r+2*N_sq].reshape((N,N))
    delta_v = s[r+3*N_sq].reshape((N,N))
    return x,y,u,v,delta_x,delta_y,delta_u,delta_v

def vars_to_state(x,y,u,v,delta_x,delta_y,delta_u,delta_v):
    # returns a state
    f = lambda x: list( x.flatten() )
    return np.array([x,y,u,v] + f(delta_x) + f(delta_y) + f(delta_u) + f(delta_v))

def ode_func(theta,s):
    # a function which can be input into odeint for a fixed theta
    global N
    x,y,u,v,delta_x,delta_y,delta_u,delta_v = state_to_vars(s)
    x_dot = u
    y_dot = v
    pow_x = np.vander([x],N=N,increasing=True)[0] #pow_x[j] = x^j
    pow_y = np.vander([y],N=N,increasing=True)[0]
    u_dot = - np.einsum('ij,i,i,j',theta[1:N,0:N], np.arange(1,N),pow_x[0:N-1],pow_y[0:N])
    v_dot = - np.einsum('ij,j,i,j',theta[0:N,1:N], np.arange(1,N),pow_x[0:N],pow_y[0:N-1])
    delta_x_dot = delta_u
    delta_y_dot = delta_v
    delta_u_dot = np.zeros_like(theta)
    delta_u_dot[1:N,0:N] = - np.einsum( 'i,i,j->ij',np.arange(1,N) , pow_x[0:N-1], pow_y )
    delta_v_dot = np.zeros_like(theta)
    delta_v_dot[0:N,1:N] = - np.einsum( 'j,i,j->ij',np.arange(1,N) , pow_x, pow_y[0:N-1] )
    s_out = vars_to_state(x_dot,y_dot,u_dot,v_dot,\
                          delta_x_dot,delta_y_dot,\
                          delta_u_dot,delta_v_dot)
    return s_out
    
    
def partial_Q(theta,x0,y0,u0,v0,x1,y1):
    #produces the Q_i and the gradient of Q_i(\theta,x)
    global N
    from scipy.integrate import odeint
    s0 = np.zeros(4+4*N*N)
    s0[0:4] = np.array([x0,y0,u0,v0])
    s1 = odeint( lambda s,t: ode_func(theta,s) , s0 , [0.0,0.1], rtol=0.01)[1]
    x,y,u,v,delta_x,delta_y,delta_u,delta_v = state_to_vars(s1)
    Q = x
    dQ = delta_x
    return Q,dQ
    

In [322]:
print "Testing the Gradient:"
theta = np.random.randn(N,N)
x0,y0,u0,v0,x1,y1 = list(np.random.randn(6))
Q,dQ = partial_Q(theta,x0,y0,u0,v0,x1,y1)
theta_perturbed = theta.copy()
h = 0.00001
i = 2
j = 4
theta_perturbed[i,j] += h
P,dP = partial_Q(theta_perturbed,x0,y0,u0,v0,x1,y1)
print (P - Q)/(h)
print (dQ[i,j] + dP[i,j])/2

Testing the Gradient:
-0.00114560798625
-0.00139262155152


##Training
Now we need to implement Stochastic Gradient Descent (SGD).  The basic idea is to increment $\theta$ by
$\theta \mapsto \theta - \eta \nabla Q_i(\theta)$
over some dataset $\{ (x0,y0,u0,v0,x1,y1)_i \}_{i=1}^N$
Let us choose randomly sample $1000$ data-points for training.