# RL experiment for Φ = 0.3, Ψ = 3.0
For Φ = 0.3, Ψ = 3.0, the gyrotactic particle swims relatively slowly and is easily reoriented by vorticity.

In [1]:
# Execute main.ipynb in this kernel before anything else in this notebook runs.
# NOTE(review): later cells use names that are never defined in this notebook (e.g. training,
# sample_trajectory, tgv, create_figure_dir, np, plt, sns, product_states, N_actions, N_states);
# presumably they are all provided by main.ipynb — confirm.
%run main.ipynb

In [2]:
# Whether or not to enable the report-specific plot style configurations when creating plots
# (the figure cells below pick a smaller figure size when this flag is set).
paper = True

In [3]:
# --- Swimmer parameters ---------------------------------------------------------------
# Swimming number = v_s/u_0.
my_Φ = 0.3
# Stability number = B w_0. B is the characteristic time a perturbed cell takes to return
# to orientation ka if w = 0. Smaller means swimming more aligned with ka.
my_Ψ = 3.0

# --- Learning settings passed to training() --------------------------------------------
method = "expSARSA"
my_alpha0 = 1.0
my_eps0 = 1.0
eps_decay = True
# NOTE(review): omega presumably controls the shape of the epsilon decay (Trial 2 uses
# omega=1.0 and is described as linear decay) — confirm against training().
my_omega = 0.5

## Trial 1

In [None]:
# Trial 1: train with the configuration from the parameter cell (omega = my_omega).
# The returned histories, counters and Q table are consumed by the cells below.
(
    Q, Σ, smart, naive,
    hist_R_tot_smart, hist_R_tot_naive,
    smart_stored_histories, naive_stored_histories,
    state_action_counter, chosen_actions, avg_Q_hist,
    initial_coords, theta_history,
) = training(
    alpha0=my_alpha0,
    Φ=my_Φ,
    Ψ=my_Ψ,
    method=method,
    eps0=my_eps0,
    eps_decay=eps_decay,
    omega=my_omega,
)

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))

In [None]:
# Name of the sub-folder (below "Figures/") that the Trial 1 figure files are saved into.
directory_name = "phi_0d3_psi_3_expSARSA"
create_figure_dir(directory_name)

In [None]:
# Performance check for the agent trained with this parameter set:
# total reward per episode, smart vs. naive.
plot_total_reward_vs_episode(
    hist_R_tot_smart,
    hist_R_tot_naive,
    N=500,
)

We may have too much exploration present if we consider the following:

In [None]:
# How evenly were the state-action pairs explored? Compare the least-visited pair with the
# uniform share every pair would receive under pure exploration.
total_visits = np.sum(state_action_counter)
least_visits = np.min(state_action_counter)
least_share = round(least_visits/total_visits*100, 2)
print("The state that was visited least had " + str(least_visits) + " encounters, which is "
      + str(least_share) + "% of the total states visited")

uniform_visits = np.round(total_visits/N_actions/N_states)
uniform_share = np.round(1/N_actions/N_states*100, 2)
print("Pure exploration would have each pair visited ", uniform_visits,
      " times, i.e. ", uniform_share, "% of the time")

# heatmap normalized by total number of encounters
ax = sns.heatmap(
    state_action_counter/total_visits,
    linewidth=0.5,
    xticklabels=["up", "down", "right", "left"],
    yticklabels=product_states,
    cmap='viridis',
)

## Trial 2
Let's restart with linear decay of $\epsilon$

In [None]:
# Trial 2: same configuration as Trial 1 except omega = 1.0. This overwrites the
# Trial 1 results held in the same variable names.
(
    Q, Σ, smart, naive,
    hist_R_tot_smart, hist_R_tot_naive,
    smart_stored_histories, naive_stored_histories,
    state_action_counter, chosen_actions, avg_Q_hist,
    initial_coords, theta_history,
) = training(
    alpha0=my_alpha0,
    Φ=my_Φ,
    Ψ=my_Ψ,
    method=method,
    eps0=my_eps0,
    eps_decay=eps_decay,
    omega=1.0,
)

In [None]:
# Name of the sub-folder (below "Figures/") that the Trial 2 figure files are saved into.
directory_name = "phi_0d3_psi_3_expSARSA_omega_1"
create_figure_dir(directory_name)

In [None]:
# Check performance after training this agent with this set of parameters:
# total reward per episode, smart vs. naive.
# The plot style follows the notebook-level `paper` flag instead of a hard-coded True, so
# switching the flag at the top of the notebook affects this figure like the other ones.
plot_total_reward_vs_episode(hist_R_tot_smart, hist_R_tot_naive, N=1000, paper=paper)

In [None]:
# Learning gain of the smart particle relative to the naive one.
# The plot style follows the notebook-level `paper` flag instead of a hard-coded True.
plot_learning_gain(hist_R_tot_smart, hist_R_tot_naive, N=1000, paper=paper)

In [None]:
# Plot the trajectories stored during training for the smart and the naive particle.
plot_select_trajectories(
    smart_stored_histories,
    naive_stored_histories,
)

Let's overlay their relative performance on this last episode onto a streamline plot:

In [None]:
# Background: quiver plot of the velocity field returned by tgv, sampled on a 35 x 20 grid
# spanning the region visited by the smart particle.
plt.figure(figsize=(14, 6))
history_X = np.array(smart.history_X_total)
x = np.linspace(history_X[1:, 0].min(), history_X[1:, 0].max(), 35)
z = np.linspace(history_X[1:, 1].min(), history_X[1:, 1].max(), 20)
X, Z = np.meshgrid(x, z)
ux, uz, w = tgv(X, Z)
plt.quiver(X, Z, ux, uz)

# Foreground: naive trajectory in orange, smart trajectory coloured by the chosen action.
naive_history_X = np.array(naive.history_X_total)
plt.scatter(naive_history_X[1:, 0], naive_history_X[1:, 1], s=3, color='orange')
# Four discrete colours; vmin/vmax of -0.5/3.5 centre each colour band on an action index 0..3.
cmap = plt.get_cmap('viridis', 4)
ax = plt.scatter(
    history_X[1:, 0], history_X[1:, 1],
    s=3, c=chosen_actions[:], cmap=cmap, vmin=-.5, vmax=3.5,
)

cbar = plt.colorbar(ticks=np.arange(4))
cbar.ax.set_yticklabels(['Up', 'Down', 'Right', 'Left'])
plt.xlabel(r'$x$')
plt.ylabel(r'$z$')
plt.savefig(f"Figures/{directory_name}/final-ep-actions.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Background: quiver plot of the velocity field returned by tgv, sampled on a 35 x 20 grid
# spanning the region visited by the smart particle.
plt.figure(figsize=(14,6))
history_X = np.array(smart.history_X_total)
x = np.linspace(np.min(history_X[1:,0]), np.max(history_X[1:,0]), 35)
z = np.linspace(np.min(history_X[1:,1]), np.max(history_X[1:,1]), 20)
X, Z = np.meshgrid(x, z)
ux, uz, w = tgv(X, Z)
plt.quiver(X, Z, ux, uz)

# Foreground: smart trajectory coloured by theta, converted from radians to degrees.
ax = plt.scatter(history_X[1:,0], history_X[1:,1],s=3,c=theta_history[:]/np.pi*180)
cbar = plt.colorbar()
# rotation=0 keeps the symbol upright; the previous rotation=180 rendered it upside down.
cbar.set_label(r'$\theta$', rotation=0)
plt.xlabel(r'$x$')
plt.ylabel(r'$z$')
plt.savefig("Figures/" + directory_name + "/final-ep-theta.pdf", format="pdf", bbox_inches="tight")
plt.show()

We can also make comparisons for a particular episode:

In [None]:
# Compare smart and naive trajectories for one of the episodes stored during training.
i = 4
ep, history_X = smart_stored_histories[i]

# Background: quiver plot of the velocity field returned by tgv on a 25 x 25 grid around
# the smart trajectory.
plt.figure(figsize=(3.5, 2) if paper else (14, 6))
x = np.linspace(0.75*np.min(history_X[1:,0]), np.max(history_X[1:,0]*1.1), 25)
z = np.linspace(0.75*np.min(history_X[1:,1]), np.max(history_X[1:,1]*1.1), 25)
# np.meshgrid(x, z) returns the x-grid first. The previous `Z, X = ...` unpacking swapped
# the two grids, so the arrows were drawn over the transposed domain (z-range along the
# horizontal axis) instead of underneath the trajectory.
X, Z = np.meshgrid(x, z)
ux, uz, w = tgv(X, Z)
plt.quiver(X, Z, ux, uz)

# Foreground: naive trajectory in orange, smart trajectory coloured by action.
cmap = plt.get_cmap('viridis', 4)
_, naive_history_X = naive_stored_histories[i]
plt.scatter(naive_history_X[1:,0], naive_history_X[1:,1],s=1,color='orange')
# NOTE(review): chosen_actions is the array returned by training(), which the cells above
# pair with the final episode; the colours here may therefore not correspond to stored
# episode `ep` — confirm whether per-episode actions are stored anywhere.
ax = plt.scatter(history_X[1:,0], history_X[1:,1],s=3,c=chosen_actions[:], cmap=cmap,
                 vmin = -.5, vmax = 3.5)
cbar = plt.colorbar(ticks=np.arange(0,4))
cbar.ax.set_yticklabels(['Up', 'Down', 'Right','Left'])
plt.xlabel(r'$x$')
plt.ylabel(r'$z$')
# Use an episode-specific file name; saving as "final-ep-actions.pdf" overwrote the
# final-episode figure written by the earlier cell.
plt.savefig(f"Figures/{directory_name}/ep-{ep}-actions.pdf", format="pdf", bbox_inches="tight")
plt.show()

## Learned policy

To visualize the policy learned in this case, we can calculate the matrix Qnorm, which is Q with each of its rows normalized so as to range from 0 to 1. In this way, we can visualize the best action (column) for each state (row).

In [None]:
# Row-wise min-max normalisation of Q: within each state (row) the lowest-valued action maps
# to 0 and the highest-valued action maps to 1, so the best action per state stands out.
max_per_row = np.max(Q, 1)
min_per_row = np.min(Q, 1)
row_range = (max_per_row - min_per_row)[:, None]
Q_shifted = np.asarray(Q, dtype=float) - min_per_row[:, None]
# A row whose Q-values are all equal has zero range; dividing would produce NaNs and a
# runtime warning. Such rows express no preference between actions and are left at 0.
Qnorm = np.divide(Q_shifted, row_range, out=np.zeros_like(Q_shifted), where=row_range > 0)

ax = sns.heatmap(Qnorm, linewidth=0.5, xticklabels=["up", "down", "right", "left"],
                 yticklabels=product_states, cmap='inferno')
plt.savefig("Figures/" + directory_name + "/final-policy.pdf", format="pdf", bbox_inches="tight")
plt.show()

Of course, we can also visualize Q directly to get a sense of the relative value of the state-action pairs.

In [None]:
# Heatmap of the raw (un-normalized) Q table: one row per state, one column per action.
ax = sns.heatmap(
    Q,
    linewidth=0.5,
    xticklabels=["up", "down", "right", "left"],
    yticklabels=product_states,
    cmap='inferno',
)
plt.savefig(f"Figures/{directory_name}/final-policy-unnorm.pdf", format="pdf", bbox_inches="tight")

For the report, we'd like to visualize a case where the smart particle outperforms the naive one. Let's iterate until we find such a case:

In [None]:
# Sample trajectories with the learned Q until the smart particle's total return exceeds
# the naive particle's by at least `min_gain`. This overwrites smart, naive and
# chosen_actions from training with the values of the sampled trajectory.
min_gain = 1.0  # approximate difference in total return once we have converged
max_attempts = 1000  # cap so that "Run All" cannot hang if no such trajectory is ever drawn

delta = 0
for _attempt in range(max_attempts):
    smart, naive, R_tot_smart, R_tot_naive, chosen_actions, history_theta = sample_trajectory(
        Φ=my_Φ, Ψ=my_Ψ, Q=Q, Ns=4000, D0=0, Dr=0)
    delta = R_tot_smart - R_tot_naive
    if not delta < min_gain:
        break
else:
    raise RuntimeError(
        "No sampled trajectory reached a return difference of " + str(min_gain)
        + " within " + str(max_attempts) + " attempts (last difference: " + str(delta) + ")")

In [None]:
# Positions of both particles along the sampled trajectory.
naive_history_X = np.array(naive.history_X_total)
history_X = np.array(smart.history_X_total)

# Background: quiver plot of the velocity field returned by tgv on a 25 x 25 grid around
# the smart trajectory (figure size depends on the `paper` flag).
plt.figure(figsize=(3.5, 2) if paper else (14, 6))
x = np.linspace(1.1 * history_X[1:, 0].min(), 1.1 * history_X[1:, 0].max(), 25)
z = np.linspace(0.9 * history_X[1:, 1].min(), 1.1 * history_X[1:, 1].max(), 25)
X, Z = np.meshgrid(x, z)
ux, uz, w = tgv(X, Z)
plt.quiver(X, Z, ux, uz)

# Foreground: naive trajectory in orange, smart trajectory coloured by the chosen action.
plt.scatter(naive_history_X[1:, 0], naive_history_X[1:, 1], s=1, color='orange')
# Four discrete colours; vmin/vmax of -0.5/3.5 centre each colour band on an action index 0..3.
cmap = plt.get_cmap('viridis', 4)
ax = plt.scatter(
    history_X[1:, 0], history_X[1:, 1],
    s=3, c=chosen_actions[:], cmap=cmap, vmin=-.5, vmax=3.5,
)
cbar = plt.colorbar(ticks=np.arange(4))
cbar.ax.set_yticklabels(['Up', 'Down', 'Right', 'Left'])
cbar.ax.set_title(r'$k_a$')
plt.xlabel(r'$x$')
plt.ylabel(r'$z$')
plt.savefig(f"Figures/{directory_name}/final-ep-actions.png", format="png", dpi=600, bbox_inches="tight")
plt.show()

In [None]:
# Positions of both particles along the sampled trajectory.
history_X = np.array(smart.history_X_total)
naive_history_X = np.array(naive.history_X_total)

# Background: quiver plot of the velocity field returned by tgv on a 25 x 25 grid around
# the smart trajectory (figure size depends on the `paper` flag).
plt.figure(figsize=(3.5, 2) if paper else (14, 6))
x = np.linspace(1.1*np.min(history_X[1:,0]), np.max(history_X[1:,0]*1.1), 25)
z = np.linspace(0.9*np.min(history_X[1:,1]), np.max(history_X[1:,1]*1.1), 25)
X, Z = np.meshgrid(x, z)
ux, uz, w = tgv(X, Z)
plt.quiver(X, Z, ux, uz)

# Foreground: naive trajectory in orange, smart trajectory coloured by theta in degrees.
plt.scatter(naive_history_X[1:,0], naive_history_X[1:,1],s=1,color='orange')
# Colour by the angles of the trajectory that was just sampled (history_theta, returned by
# sample_trajectory). The previous code used theta_history, which still held the angles
# from training and therefore did not belong to the positions plotted here.
theta_deg = np.asarray(history_theta)/np.pi*180
ax = plt.scatter(history_X[1:,0], history_X[1:,1],s=3,c=theta_deg,cmap='twilight_shifted')
cbar = plt.colorbar()
cbar.ax.set_title(r'$\theta$')
plt.xlabel(r'$x$')
plt.ylabel(r'$z$')
plt.savefig("Figures/" + directory_name + "/final-ep-theta.png", format="png", dpi=600, bbox_inches="tight")
plt.show()