# Setup
In this notebook, we set up the data structures and functions that are used throughout our set of reinforcement learning experiments.

## Import libraries

In [5]:
import numpy as np 
import plotly.graph_objects as go
from tqdm.notebook import tqdm
import plotly.express as px
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300
import matplotlib.pyplot as plt
import seaborn as sns
import os
from wand.image import Image as WImage
# sns.set(palette="husl",font_scale=1)
# %config InlineBackend.figure_format = 'retina'
import copy
np.random.seed(0)
%load_ext line_profiler

## Define constants

In [3]:
# periodic domain size: the simulation box is L x L
L = 2*np.pi

# boundaries of the simulation box in x and in z
x0, x1 = 0, L
z0, z1 = 0, L

# size of the reinforcement learning problem
N_states = 12 # number of states - one per (coarse-grained direction, coarse-grained vorticity) pair
N_actions = 4 # number of actions - one per coarse-grained preferred swimming direction

# numerical parameters
dt = 0.01 # timestep size

## Utility functions

In [4]:
def moving_average(a, n=3):
    """Return the running mean of `a` over a sliding window of `n` samples.

    The result has len(a) - n + 1 entries; entry i is the mean of a[i:i+n].
    """
    # running totals; the sum over a window is the difference of two running totals
    totals = np.cumsum(a, dtype=float)
    totals[n:] = totals[n:] - totals[:-n]

    # the first complete window ends at index n - 1
    window_sums = totals[n - 1:]
    return window_sums / n

# Runge-Kutta 4(5) (Dormand-Prince) integration for one fixed-size step
    # see https://stackoverflow.com/questions/54494770/how-to-set-fixed-step-size-with-scipy-integrate
def DoPri45Step(f,t,x,h):
    """Evaluate one Dormand-Prince step for dx/dt = f(t, x) with step size h.

    Returns (v4, v5): the 4th- and 5th-order weighted slopes over the step. The state itself is not
    advanced here; the caller forms the new state as x + h*v5 (or x + h*v4).
    """
    # stage slopes; each stage point is built from the slopes computed before it
    s1 = f(t, x)

    x2 = x + h*(1./5*s1)
    s2 = f(t + 1./5*h, x2)

    x3 = x + h*(3./40*s1 + 9./40*s2)
    s3 = f(t + 3./10*h, x3)

    x4 = x + h*(44./45*s1 - 56./15*s2 + 32./9*s3)
    s4 = f(t + 4./5*h, x4)

    x5 = x + h*(19372./6561*s1 - 25360./2187*s2 + 64448./6561*s3 - 212./729*s4)
    s5 = f(t + 8./9*h, x5)

    x6 = x + h*(9017./3168*s1 - 355./33*s2 + 46732./5247*s3 + 49./176*s4 - 5103./18656*s5)
    s6 = f(t + h, x6)

    # 5th-order slope; the last stage is evaluated at the 5th-order end point
    slope5 = 35./384*s1 + 500./1113*s3 + 125./192*s4 - 2187./6784*s5 + 11./84*s6
    s7 = f(t + h, x + h*slope5)

    # embedded 4th-order slope
    slope4 = 5179./57600*s1 + 7571./16695*s3 + 393./640*s4 - 92097./339200*s5 + 187./2100*s6 + 1./40*s7

    return slope4, slope5

## Define useful data structures
### Define a dictionary of the possible states and their assigned indices

In [2]:
# coarse-grained swimmer directions and coarse-grained levels of vorticity
direction_states = ["right","down","left","up"]
vort_states = ["w+", "w0", "w-"]

# every (direction, vorticity level) pair, with the vorticity level varying fastest
product_states = [(direction, vort) for direction in direction_states for vort in vort_states]

# maps a (direction, vorticity level) pair to its position in product_states
state_lookup_table = dict(zip(product_states, range(len(product_states))))
# print(product_states) # to view mapping

### Define an agent class for reinforcement learning

In [6]:
class Agent:
    """Abstract base class for a reinforcement learning agent.

    Stores the per-stage rewards and the elapsed time. The four methods below define the interface
    a concrete agent must provide; each raises NotImplementedError until a subclass overrides it.
    """

    def __init__(self, Ns):
        self.r = np.zeros(Ns) # reward for each stage
        self.t = 0            # time

    def calc_reward(self, n=None):
        """Store the reward obtained at stage `n`. Must be implemented by the subclass."""
        raise NotImplementedError("calc_reward must be implemented by a subclass of Agent")

    def update_state(self):
        """Recompute the agent's coarse-grained state. Must be implemented by the subclass."""
        raise NotImplementedError("update_state must be implemented by a subclass of Agent")

    def take_random_action(self):
        """Select an action at random. Must be implemented by the subclass."""
        raise NotImplementedError("take_random_action must be implemented by a subclass of Agent")

    def take_greedy_action(self, Q):
        """Select the best action according to `Q`. Must be implemented by the subclass."""
        raise NotImplementedError("take_greedy_action must be implemented by a subclass of Agent")

### Define swimmer class derived from agent

In [7]:
class Swimmer(Agent):
    """Swimmer advected by the flow returned by `tgv` inside the periodic box of size L.

    Attributes
    ----------
    X, X_total : position wrapped into the periodic box / unwrapped position, both [x, z].
    theta, p   : polar angle of the orientation in the x-z plane and the unit vector [px, pz].
    U, W       : translational velocity and rate of change of p (filled in by calc_velocity).
    ka         : preferred swimming direction, i.e. the currently selected action.
    w          : flow vorticity from the most recent call to tgv.
    my_state   : coarse-grained state (direction, vorticity level), a key of state_lookup_table.
    history_X, history_X_total : positions visited during the current episode.
    Φ, Ψ       : parameters of the most recent update_kinematics call. Φ multiplies p in the
                 translational velocity; Ψ divides the term that turns p towards ka.
    """

    def __init__(self, Ns):
        # call init for superclass
        super().__init__(Ns)

        # not known until the first call to update_kinematics
        self.Φ = None
        self.Ψ = None

        # draw the initial condition and derive vorticity and coarse-grained state from it
        self.reinitialize()

    def reinitialize(self):
        """Start a new episode from a random position and orientation.

        Draws x, z and theta (in that order) from the global numpy random state, clears the stored
        trajectory and the clock, and refreshes `w` and `my_state` so that they describe the new
        initial condition rather than the end of the previous episode.
        """
        # local position within the periodic box. X = [x, z]^T with 0 <= x < 2 pi and 0 <= z < 2 pi
        self.X = np.array([np.random.uniform(0, L), np.random.uniform(0, L)])

        # absolute position. -inf. <= x_total < inf. and -inf. <= z_total < inf.
        # independent copy, so wrapping X in place can never alter X_total
        self.X_total = self.X.copy()

        # particle orientation
        self.theta = np.random.uniform(0, 2*np.pi) # polar angle theta in the x-z plane
        self.p = np.array([np.cos(self.theta), np.sin(self.theta)]) # p = [px, pz]^T

        # translational and rotational velocity
        self.U = np.zeros(2)
        self.W = np.zeros(2)

        # preferred swimming direction (equal to [1,0], [0,1], [-1,0], or [0,-1])
        self.ka = np.array([0,1])

        # history of local and global position. Only store information for this episode.
        self.history_X = [self.X]
        self.history_X_total = [self.X_total]

        self.t = 0

        # local vorticity at the current location
        _, _, self.w = tgv(self.X[0], self.X[1])

        # update coarse-grained state
        self.update_state()

    def update_kinematics(self, Φ, Ψ, D0 = 0, Dr = 0, int_method = "euler"):
        """Advance position, orientation and time by one timestep dt.

        Φ, Ψ       - parameters of the equations of motion (see calc_velocity)
        D0, Dr     - translational / rotational diffusivity; only used by the "euler" method
        int_method - "euler" (explicit Euler) or "rk45" (one Dormand-Prince step, no diffusion)

        Raises ValueError for any other int_method.
        """
        # calc_velocity_rk45 is called by the integrator with (t, y) only, so it reads these
        self.Φ, self.Ψ = Φ, Ψ

        if int_method == "rk45":
            y0 = np.concatenate((self.X,self.p))
            _, v5 = DoPri45Step(self.calc_velocity_rk45,self.t,y0,dt)
            y = y0 + dt*v5
            self.X = y[:2].copy()
            self.p = y[2:].copy()
            dx = self.X - self.history_X[-1] # displacement measured before wrapping
            self.X_total = self.X_total + dx

            # check if still in the periodic box
            self.check_in_box()

            # ensure the vector p has unit length
            self.p /= (self.p[0]**2 + self.p[1]**2)**(1/2)

            # update polar angle
            self._update_theta()

            # store positions
            self.history_X.append(self.X)
            self.history_X_total.append(self.X_total)

        elif int_method == "euler":
            # calculate new translational and rotational velocity
            self.calc_velocity(Φ, Ψ)

            self.update_position(int_method, D0)
            self.update_orientation(int_method, Dr)
        else:
            raise ValueError("Integration method must be 'euler' or 'rk45'")

        self.t = self.t + dt

    def calc_velocity_rk45(self, t, y):
        """Right-hand side for DoPri45Step: d/dt of y = [x, z, px, pz].

        Uses the Φ and Ψ stored by update_kinematics. As a side effect, self.w is set to the
        vorticity at the point (x, z) being evaluated.
        """
        x = y[0]
        z = y[1]
        px = y[2]
        pz = y[3]
        ux, uz, self.w = tgv(x, z)

        U0 = ux + self.Φ*px
        U1 = uz + self.Φ*pz

        ka_dot_p = self.ka[0]*px + self.ka[1]*pz
        W0 = 1/2/self.Ψ*(self.ka[0] - ka_dot_p*px) + 1/2*pz*self.w
        W1 = 1/2/self.Ψ*(self.ka[1] - ka_dot_p*pz) + 1/2*-px*self.w

        return np.array([U0, U1, W0, W1])

    def update_position(self, int_method, D0):
        """Explicit Euler update of the position using self.U, plus a random step if D0 > 0.

        int_method is accepted for symmetry with update_orientation but is not used.
        """
        dx = dt*self.U
        if D0 > 0: dx = dx + np.sqrt(2*D0*dt)*np.random.normal(size=2)
        self.X = self.X + dx
        self.X_total = self.X_total + dx

        # check if still in the periodic box
        self.check_in_box()

        # store positions
        self.history_X.append(self.X)
        self.history_X_total.append(self.X_total)

    def update_orientation(self, int_method, Dr):
        """Explicit Euler update of the orientation using self.W, plus a random kick if Dr > 0.

        int_method is accepted for symmetry with update_position but is not used.
        """
        self.p = self.p + dt*self.W

        # ensure the vector p has unit length
        self.p /= (self.p[0]**2 + self.p[1]**2)**(1/2)

        # if rotational diffusion is present
        if Dr > 0:
            px = self.p[0]
            pz = self.p[1]
            cross = px*pz
            # A = I - p p^T removes the component of the random kick that is parallel to p
            A = np.array([[1-px**2, -cross], [-cross, 1-pz**2]])
            v = np.sqrt(2*Dr*dt)*np.random.normal(size=2)
            self.p[0] = self.p[0] + A[0,0]*v[0] + A[0,1]*v[1]
            self.p[1] = self.p[1] + A[1,0]*v[0] + A[1,1]*v[1]
            self.p /= (self.p[0]**2 + self.p[1]**2)**(1/2)

        # update polar angle
        self._update_theta()

    def _update_theta(self):
        """Set theta to the polar angle of p, mapped from arctan2's [-pi, pi] into [0, 2 pi]."""
        # the modulo leaves non-negative angles untouched and adds 2 pi to negative ones; it also
        # handles p = (-1, -0.0), for which arctan2 returns -pi
        self.theta = np.arctan2(self.p[1], self.p[0]) % (2*np.pi)

    def calc_velocity(self, Φ, Ψ):
        """Evaluate the flow at X and store the translational (U) and rotational (W) velocity.

        Also stores the local vorticity in self.w. NOTE(review): with the "euler" method the
        position is advanced after this call, so self.w refers to the position before the step.
        """
        ux, uz, self.w = tgv(self.X[0], self.X[1])

        # careful - computing in the following way is significantly slower: self.U = np.array(ux, uz) + Φ*self.p
        self.U[0] = ux + Φ*self.p[0]
        self.U[1] = uz + Φ*self.p[1]

        px = self.p[0]
        pz = self.p[1]
        ka_dot_p = self.ka[0]*px + self.ka[1]*pz
        self.W[0] = 1/2/Ψ*(self.ka[0] - ka_dot_p*px) + 1/2*pz*self.w
        self.W[1] = 1/2/Ψ*(self.ka[1] - ka_dot_p*pz) + 1/2*-px*self.w

    def check_in_box(self):
        """Shift X by one box length L in any coordinate that has left the periodic box."""
        if self.X[0] < x0:
            self.X[0] += L
        elif self.X[0] > x1:
            self.X[0] -= L
        if self.X[1] < z0:
            self.X[1] += L
        elif self.X[1] > z1:
            self.X[1] -= L

    def calc_reward(self, n):
        """Store the reward for stage n: the change in unwrapped z over the most recent step."""
        self.r[n] = self.history_X_total[-1][1]-self.history_X_total[-2][1]

    def update_state(self):
        """Coarse-grain w and theta into my_state = (direction, vorticity level).

        Raises ValueError if w or theta falls outside every bin (for example when it is NaN).
        """
        if self.w < -0.33:
            w_state = "w-"
        elif self.w >= -0.33 and self.w <= 0.33:
            w_state = "w0"
        elif self.w > 0.33:
            w_state = "w+"
        else:
            raise ValueError(f"Invalid value of w detected: {self.w}")

        if self.theta >= np.pi/4 and self.theta < 3*np.pi/4:
            p_state = "up"
        elif self.theta >= 3*np.pi/4 and self.theta < 5*np.pi/4:
            p_state = "left"
        elif self.theta >= 5*np.pi/4 and self.theta < 7*np.pi/4:
            p_state = "down"
        elif (self.theta >= 7*np.pi/4 and self.theta <= 2*np.pi) or (self.theta >= 0 and self.theta < np.pi/4):
            p_state = "right"
        else:
            raise ValueError(f"Invalid value of theta detected: {self.theta}")

        self.my_state = (p_state, w_state)

    def _set_action(self, action_index):
        """Point ka in the direction encoded by action_index (0 up, 1 down, 2 right, else left)."""
        if action_index == 0:   # up
            self.ka = np.array([0, 1])
        elif action_index == 1: # down
            self.ka = np.array([0, -1])
        elif action_index == 2: # right
            self.ka = np.array([1, 0])
        else:                   # left
            self.ka = np.array([-1, 0])

    def take_greedy_action(self, Q):
        """Select the action with the largest Q value for the current state; return its index."""
        state_index = state_lookup_table[self.my_state]
        action_index = np.argmax(Q[state_index])  # find largest entry in this row of Q (i.e. this state)
        self._set_action(action_index)
        return action_index

    def take_random_action(self):
        """Select one of the four actions uniformly at random; return its index."""
        action_index = np.random.randint(4)
        self._set_action(action_index)
        return action_index

## Define Taylor-Green vortex

In [8]:
# given position, return local velocity and vorticity 
def tgv(x, z):
    """Taylor-Green vortex: return (ux, uz, w) at position (x, z).

    ux, uz are the velocity components and w is the vorticity. Works elementwise on arrays.
    """
    cos_x = np.cos(x)
    cos_z = np.cos(z)
    velocity_x = -1/2*cos_x*np.sin(z)
    velocity_z = 1/2*np.sin(x)*cos_z
    vorticity = -cos_x*cos_z
    return velocity_x, velocity_z, vorticity

In [3]:
# visualize the flow field

# x = np.linspace(0,L,100)
# z = np.linspace(0,L,100)
# xv, zv = np.meshgrid(x, z)
# ux, uz, w = tgv(xv, zv)

# fig = go.Figure(data = go.Contour(x = x, y = z, z=w))

# fig.update_layout(
#     title=r"$\text{Vorticity }(w)$",
#     xaxis_title="$x$",
#     yaxis_title="$z$"
# )

# fig.show() 

Does the above make sense? Consider $x=0$, $z=0$. There, the vorticity is large and negative. This makes sense in light of the streamplot shown below (considering the right-hand rule and noting that positive $y$ points into the page, since the coordinate system is right-handed). 

In [4]:
# plt.streamplot(x,z,ux,uz)

## Reinforcement learning procedures

The function shown below defines our reinforcement learning procedure, using Q-learning, double Q-learning, or expected SARSA.

In [12]:
def training(alpha0, Φ, Ψ, Ns=4000, Ne=5000, gamma=0.999, eps0=0.0, D0=0, Dr=0, n_updates=1000, \
             RIC=False, method="Qlearning", lr_decay=None, omega=0.85, eps_decay=False, Qin=None):
    """Train a smart swimmer with tabular reinforcement learning, next to a naive reference swimmer.

    Parameters
    ----------
    alpha0    - learning rate (or starting learning rate when employing LR decay)
    Φ, Ψ      - swimmer parameters, passed on to Swimmer.update_kinematics
    Ns        - number of steps in an episode
    Ne        - number of episodes
    gamma     - discount factor, i.e. how much we weigh future to present rewards. Close to 0 = myopic view.
    eps0      - fraction of the time we allow for exploration in selecting the following action. 0 = always greedy.
    D0        - translational diffusivity
    Dr        - rotational diffusivity
    n_updates - how often (in episodes) to store the trajectories of both particles
    RIC       - Reset of Initial Conditions. First time a state-action pair is encountered, set Q[s,a] = reward
    method    - "doubleQ" (double Q-learning), "expSARSA" (expected SARSA); anything else is Q-learning
    lr_decay  - if truthy, use polynomial learning rate decay: lr = alpha0/(1 + #(s,a))**omega
    omega     - exponent used by the learning rate decay and by the epsilon decay
    eps_decay - whether to decay epsilon: eff_eps = eps0/k**omega for the k-th step overall
    Qin       - initial Q matrix. Useful for testing performance after an extensive exploration phase.
                It is copied, so the caller's array is left untouched. For "doubleQ" both tables start from it.

    Returns
    -------
    Qout, Σ, smart, naive, hist_R_tot_smart, hist_R_tot_naive, smart_stored_histories,
    naive_stored_histories, state_action_counter, chosen_actions, avg_Q_history, initial_coords,
    theta_history. For "doubleQ", Qout and avg_Q_history hold the sum Q1 + Q2. chosen_actions and
    theta_history are recorded during the last episode only.

    Side effect: after the last episode, Qout is written with np.save to a file under "Policies/".
    """
    # if using the expected SARSA method, turn on epsilon decay since eps = 0 is simply Q-learning anyway
    if method=="expSARSA":
        eps_decay = True
        if eps0 == 0: eps0 = 1

    double_q = method=="doubleQ"
    exp_sarsa = method=="expSARSA"

    # Total reward for each episode
    hist_R_tot_smart = np.zeros(Ne)
    hist_R_tot_naive = np.zeros(Ne)

    # learning gain per episode
    Σ = np.zeros(Ne)

    smart_stored_histories = []       # store position = f(t) every so often for an episode (smart particles)
    naive_stored_histories = []       # store position = f(t) every so often for an episode (naive particles)

    # number of times each state-action pair has been explored
    state_action_counter = np.zeros((N_states,N_actions))

    # initialize a naive and a smart gyrotactic particle
    naive = Swimmer(Ns)
    smart = Swimmer(Ns)

    # initialize Q matrix to a large value, which doubles as the "never updated" marker used by RIC.
    # Each row is a state, each column is an action, ka.
    Q_init = L*Ns
    if Qin is not None:
        Q_start = np.array(Qin, dtype=float)  # copy: never modify the caller's array
    else:
        Q_start = Q_init*np.ones((N_states, N_actions))

    if double_q:
        Q1 = Q_start.copy()
        Q2 = Q_start.copy()
        Q = None
    else:
        Q = Q_start

    # store average Q for each episode to track convergence
    avg_Q_history = np.zeros((Ne,N_states,N_actions))

    # store initial position and orientation for each episode
    initial_coords = np.zeros((Ne,3))

    # selected actions and particle orientation, filled in during the last episode
    chosen_actions = np.zeros(Ns)
    theta_history = np.zeros(Ns)

    # defined up front so that the return statement is valid even when Ne == 0 or Ns == 0
    Qout = Q1 + Q2 if double_q else Q
    alpha = alpha0
    eff_eps = eps0

    # iterate over episodes
    k = 0
    for ep in tqdm(range(Ne)):

        # assign random orientation and position
        smart.reinitialize()
        naive.reinitialize() # result is discarded below; kept so the random number stream is unchanged

        # make sure vorticity and coarse-grained state describe the new initial condition, so the
        # first action of the episode is not chosen from the final state of the previous episode
        _, _, smart.w = tgv(smart.X[0], smart.X[1])
        smart.update_state()

        naive = copy.deepcopy(smart) # have naive and smart share initial conditions for visualization purposes

        # store initialization
        initial_coords[ep,0:2] = smart.X
        initial_coords[ep,2] = smart.theta

        # iterate over stages within an episode
        for stage in range(Ns):

            # select an action eps-greedily. Note naive never changes its action/strategy (i.e. trying to swim up)
            Qinput = Q1 + Q2 if double_q else Q
            k = k + 1 # k-th update

            eff_eps = eps0/k**omega if eps_decay else eps0 # decrease amount of exploration as time proceeds
            if np.random.uniform(0, 1) < eff_eps:
                action = smart.take_random_action()
            else:
                action = smart.take_greedy_action(Qinput)

            # record action and orientation on last episode
            if ep == Ne - 1:
                chosen_actions[stage] = action
                theta_history[stage] = smart.theta

            # record index of the prior state
            old_s = state_lookup_table[smart.my_state]

            # given selected action, update the state
            naive.update_kinematics(Φ, Ψ, D0, Dr)
            smart.update_kinematics(Φ, Ψ, D0, Dr)
            smart.update_state()      # only need to update smart particle since naive has ka = [0, 1] always

            # calculate reward based on new state
            naive.calc_reward(stage)
            smart.calc_reward(stage)
            reward = smart.r[stage]

            new_s = state_lookup_table[smart.my_state]

            # the pair that was just explored is (state the action was taken in, action)
            state_action_counter[old_s,action] += 1

            # employ learning rate decay if applicable
            alpha = alpha0/(1+state_action_counter[old_s,action])**omega if lr_decay else alpha0

            # update Q matrix
            if double_q:
                # pick at random which table to update; the other one evaluates the target
                if np.random.uniform(0, 1) < 0.5:
                    Q_upd, Q_eval = Q1, Q2
                else:
                    Q_upd, Q_eval = Q2, Q1

                if RIC and Q_upd[old_s, action] == Q_init: # apply Reset of Initial Conditions (RIC)
                    Q_upd[old_s, action] = reward
                else:
                    # double Q-learning: the updated table selects the next action, the other
                    # table supplies its value
                    best_next = np.argmax(Q_upd[new_s])
                    Q_upd[old_s, action] = Q_upd[old_s, action] + alpha*(reward + \
                            gamma*Q_eval[new_s, best_next]-Q_upd[old_s, action])
            elif exp_sarsa:
                # calculate V, the expected Q value for the next state-action pair
                V = 0
                greedy_action = np.argmax(Q[new_s]) # would-be greedy action for new state
                for new_action in range(N_actions):
                    pi = (1 - eff_eps) + eff_eps/N_actions if new_action == greedy_action else eff_eps/N_actions
                    V = V + pi*Q[new_s, new_action]

                if RIC and Q[old_s, action] == Q_init:
                    Q[old_s, action] = reward
                else:
                    Q[old_s, action] = Q[old_s, action] + alpha*(reward + gamma*V - Q[old_s,action])
            else:
                if RIC and Q[old_s, action] == Q_init:
                    Q[old_s, action] = reward
                else:
                    Q[old_s, action] = Q[old_s, action] + alpha*(reward + \
                            gamma*np.max(Q[new_s,:])-Q[old_s,action])

            # store average Q for each episode to track convergence
            avg_Q_history[ep] = avg_Q_history[ep] + Q1 + Q2 if double_q else avg_Q_history[ep] + Q

        avg_Q_history[ep] = avg_Q_history[ep]/Ns

        # calculate Rtot for this episode
        R_tot_naive = np.sum(naive.r)
        R_tot_smart = np.sum(smart.r)

        # calculate learning gain for this episode
        Σ[ep] = R_tot_smart/R_tot_naive - 1
        hist_R_tot_smart[ep] = R_tot_smart
        hist_R_tot_naive[ep] = R_tot_naive

        # store trajectory every so often
        if ep%n_updates==0 or ep==Ne-1:
            smart_history_X_total = np.array(smart.history_X_total)
            smart_stored_histories.append((ep,smart_history_X_total))
            naive_history_X_total = np.array(naive.history_X_total)
            naive_stored_histories.append((ep,naive_history_X_total))

        # save optimal policy
        if ep==Ne-1:
            # alpha and eff_eps are the values used in the very last update, i.e. the decayed
            # values when learning rate / epsilon decay is switched on
            filename = "Policies/Q_alpha_" + str(alpha).replace(".","d") + "_Ns_" + str(Ns) + "_Ne_" + str(Ne) + \
                    "_Φ_" + str(Φ).replace(".","d") + "_Ψ_" + str(Ψ).replace(".","d") + "_eps_" \
                    + str(eff_eps).replace(".","d") + "_epsdecay_" + str(eps_decay)
            if lr_decay: filename = filename + "_omega_" + str(omega)
            if double_q: filename = filename + "_" + str(method)
            if RIC: filename = filename + "_RIC_" + str(RIC)
            Qout = Q1 + Q2 if double_q else Q
            os.makedirs("Policies", exist_ok=True) # np.save does not create missing directories
            np.save(filename, Qout)

    return Qout, Σ, smart, naive, hist_R_tot_smart, hist_R_tot_naive, smart_stored_histories, naive_stored_histories, \
        state_action_counter, chosen_actions, avg_Q_history, initial_coords, theta_history 

The following function samples a single trajectory for a swimmer that acts greedily on a given input Q. 

In [12]:
def sample_trajectory(Φ, Ψ, Q, Ns=4000, D0=0, Dr=0): 
    """Simulate one episode of Ns steps for a smart swimmer that always acts greedily on Q.

    A naive swimmer that starts from the same initial condition and never changes its action is
    simulated alongside for comparison.

    Φ, Ψ   - swimmer parameters, passed on to Swimmer.update_kinematics
    Q      - Q matrix used to select the greedy action in each state
    Ns     - number of steps in the episode
    D0, Dr - translational / rotational diffusivity

    Returns smart, naive, R_tot_smart, R_tot_naive, chosen_actions, where the R_tot values are the
    summed rewards of the episode and chosen_actions[i] is the action index selected at step i.
    """
    # initialize a naive and a smart gyrotactic particle
    smart = Swimmer(Ns)
    naive = Swimmer(Ns) # discarded on the next line; kept so the random number stream is unchanged
    naive = copy.deepcopy(smart) # have naive and smart share initial conditions for visualization purposes

    # save selected actions
    chosen_actions = np.zeros(Ns)

    # iterate over stages within an episode
    for stage in range(Ns): 

        # always select greedy action since we aren't exploring here
        action = smart.take_greedy_action(Q)
        chosen_actions[stage] = action

        # given selected action, update the state
        naive.update_kinematics(Φ, Ψ, D0, Dr)
        smart.update_kinematics(Φ, Ψ, D0, Dr)
        smart.update_state()      # need to update so we know what actions to take

        # calculate reward based on new state
        naive.calc_reward(stage)
        smart.calc_reward(stage)

    # calculate Rtot for this episode (once, after the last stage)
    R_tot_naive = np.sum(naive.r)
    R_tot_smart = np.sum(smart.r)

    return smart, naive, R_tot_smart, R_tot_naive, chosen_actions

## Postprocessing
### Create new directory to store figures

In [1]:
def create_figure_dir(directory_name):
    """Make sure the folder 'Figures/<directory_name>' exists.

    A missing folder is created. If the folder is already there, the user is asked whether its
    contents may be overwritten; any answer other than "y" raises an Exception.
    """
    target = 'Figures/' + directory_name

    if not os.path.exists(target):
        os.makedirs(target)
        return

    ans = input("Warning: this folder already exists. Overwrite its contents? (y/n)\n")
    if ans != "y":
        raise Exception("Stopping to prevent overwriting previous figures")

### Plot the total reward as a function of the episode number

In [2]:
def plot_total_reward_vs_episode(hist_R_tot_smart, hist_R_tot_naive, N=500):
    """Plot the moving average of the total reward per episode for the smart and naive particles.

    hist_R_tot_smart, hist_R_tot_naive - total reward for each episode for the smart and naive particles
    N - how many episodes to conduct the moving average over

    The figure is shown and saved as total-reward.pdf in the folder named by the module-level
    variable `directory_name`, which must be defined before this function is called.
    """
    n_episodes = len(hist_R_tot_naive)
    labelled_totals = (("Smart", hist_R_tot_smart), ("Naive", hist_R_tot_naive))

    fig = go.Figure()
    for label, totals in labelled_totals:
        # the first averaged value belongs to episode N, the last one to episode n_episodes
        episodes = np.arange(N, n_episodes+1)
        fig.add_trace(go.Scatter(x=episodes, y=moving_average(totals, N), mode='lines', name=label))

    layout = dict(
        title="Total reward vs. episode #",
        xaxis_title="Episode #",
        yaxis_title="Total reward, Rtot",
    )
    fig.update_layout(**layout)
    fig.show()

    # save figure
    fig.write_image("Figures/" + directory_name + "/total-reward.pdf")

### Plot learning gain as a function of episode 

In [3]:
def plot_learning_gain(hist_R_tot_smart, hist_R_tot_naive, N=500):
    """Plot the learning gain (smoothed smart reward / smoothed naive reward - 1) in percent.

    hist_R_tot_smart, hist_R_tot_naive - total reward for each episode for the smart and naive particles
    N - how many episodes to conduct the moving average over

    The figure is shown and saved as learning-gain.pdf in the folder named by the module-level
    variable `directory_name`, which must be defined before this function is called.
    """
    n_episodes = len(hist_R_tot_naive)

    smoothed_smart = moving_average(hist_R_tot_smart, N)
    smoothed_naive = moving_average(hist_R_tot_naive, N)
    gain = smoothed_smart/smoothed_naive - 1

    fig = go.Figure()
    # multiply by 100 to get %
    fig.add_trace(go.Scatter(x=np.arange(N, n_episodes+1), y=gain*100, mode='lines'))

    layout = dict(
        title="Learning gain over time",
        xaxis_title="Episode, E",
        yaxis_title=r"$\text{Learning gain, }\Sigma$",
    )
    fig.update_layout(**layout)
    fig.show()

    # save figure
    fig.write_image("Figures/" + directory_name + "/learning-gain.pdf")

### Plot trajectories of smart and naive particle for selected episodes

In [10]:
def plot_select_trajectories(smart_stored_histories, naive_stored_histories, style='matplotlib'):
    """Plot the stored trajectories of the smart and the naive particle, one figure per episode.

    smart_stored_histories, naive_stored_histories - lists of (episode, positions) tuples, where
        positions has one row [x, z] per stored time point
    style - "plotly" or anything else for matplotlib. The last stored episode is always drawn with
        plotly, because that figure is written to disk.

    The last figure is saved as final-epsiode.pdf in the folder named by the module-level variable
    `directory_name`, which must be defined before this function is called with a non-empty list.
    Nothing is plotted or saved when no trajectories were stored.
    """
    if len(smart_stored_histories) == 0:
        return

    last = len(smart_stored_histories) - 1
    for i in range(len(smart_stored_histories)):
        ep, smart_history_X_total = smart_stored_histories[i]
        ep, naive_history_X_total = naive_stored_histories[i]
        Ns = smart_history_X_total.shape[0]-1

        # colour every point by its time index
        time_index = np.linspace(0,Ns,Ns+1)

        if i == last: style = "plotly"
        if style == "plotly":
            fig = go.Figure(go.Scatter(x=smart_history_X_total[:,0], y=smart_history_X_total[:,1],mode='markers',
                name = "smart",
                marker=dict(size=4,
                    color=time_index, #set color equal to a variable
                    colorscale='blues', # one of plotly colorscales
                    showscale=True,
                    colorbar=dict(title="Smart")
                )))
            fig.add_trace(go.Scatter(x=naive_history_X_total[:,0], y=naive_history_X_total[:,1],mode='markers',
                name = "naive",
                marker=dict(size=4,
                    color=time_index, #set color equal to a variable
                    colorscale='reds', # one of plotly colorscales
                    showscale=True,
                    colorbar=dict(title="Naive",x=1.15)
                )))
            fig.update_layout(
                title="Trajectory for episode " + str(ep),
                xaxis_title="$x$",
                yaxis_title="$z$",
                legend_orientation="h",
                showlegend = False
            )
            fig.show()
        else:
            plt.figure(figsize=(14,6))
            plt.scatter(smart_history_X_total[:,0], smart_history_X_total[:,1], c=time_index, \
                        cmap='Blues')
            plt.scatter(naive_history_X_total[:,0], naive_history_X_total[:,1], c=time_index, \
                        cmap='Reds')
            plt.xlabel("x")
            plt.ylabel("z")
            plt.title("Trajectory for episode " + str(ep))
            plt.show()

    # fig is the plotly figure of the last stored episode
    fig.write_image("Figures/" + directory_name + "/final-epsiode.pdf") 

## Miscellaneous
### Difference between using explicit euler and RK45 is minimal

In [13]:
# import copy
# test_eul = Swimmer()
# test_rk45 = copy.deepcopy(test_eul)

# for i in range(Ns):
#     test_eul.update_kinematics("euler")
#     test_rk45.update_kinematics("rk45")

# history_eul = np.array(test_eul.history_X_total)
# history_rk45 = np.array(test_rk45.history_X_total)

# fig = go.Figure(go.Scatter(x=history_eul[:,0], y=history_eul[:,1],mode='markers',
#     name = "smart",
#     marker=dict(size=4,
#         color=np.linspace(0,Ns,Ns+1), #set color equal to a variable
#         colorscale='blues', # one of plotly colorscales
#         showscale=True,
#         colorbar=dict(title="Euler")
#     )))
# fig.add_trace(go.Scatter(x=history_rk45[:,0], y=history_rk45[:,1],mode='markers',
#     name = "naive",
#     marker=dict(size=4,
#         color=np.linspace(0,Ns,Ns+1), #set color equal to a variable
#         colorscale='reds', # one of plotly colorscales
#         showscale=True,
#         colorbar=dict(title="RK45",x=1.15)
#     )))
# fig.update_layout(
#     title="Explicit euler vs. RK45",
#     xaxis_title="$x$",
#     yaxis_title="$z$",
#     legend_orientation="h",
#     showlegend = False
# )
# fig.show()
    

### List of colorscales for reference

In [14]:
# list of colorscales
# ['aggrnyl', 'agsunset', 'algae', 'amp', 'armyrose', 'balance',
#              'blackbody', 'bluered', 'blues', 'blugrn', 'bluyl', 'brbg',
#              'brwnyl', 'bugn', 'bupu', 'burg', 'burgyl', 'cividis', 'curl',
#              'darkmint', 'deep', 'delta', 'dense', 'earth', 'edge', 'electric',
#              'emrld', 'fall', 'geyser', 'gnbu', 'gray', 'greens', 'greys',
#              'haline', 'hot', 'hsv', 'ice', 'icefire', 'inferno', 'jet',
#              'magenta', 'magma', 'matter', 'mint', 'mrybm', 'mygbm', 'oranges',
#              'orrd', 'oryel', 'peach', 'phase', 'picnic', 'pinkyl', 'piyg',
#              'plasma', 'plotly3', 'portland', 'prgn', 'pubu', 'pubugn', 'puor',
#              'purd', 'purp', 'purples', 'purpor', 'rainbow', 'rdbu', 'rdgy',
#              'rdpu', 'rdylbu', 'rdylgn', 'redor', 'reds', 'solar', 'spectral',
#              'speed', 'sunset', 'sunsetdark', 'teal', 'tealgrn', 'tealrose',
#              'tempo', 'temps', 'thermal', 'tropic', 'turbid', 'twilight',
#              'viridis', 'ylgn', 'ylgnbu', 'ylorbr', 'ylorrd']