In [1]:
import potentiometer_simpler
import control_delta
import env_interaction
from identity_policy import IdentityLowLevelPolicy
import numpy as np

if __name__=="__main__":
    # Create the potentiometer environment and set its resistance values.
    env = potentiometer_simpler.Potentiometer()
    env.set_resistance(5000, 10000)

    pol = IdentityLowLevelPolicy(env._dim_action) # this will be dim 2 in your case

    # "informed" starts REPS from the environment's ground-truth controller
    # parameters; any other value uses the generic initialization below.
    reps_init_mode = "uninformed"
    if reps_init_mode == "informed":
        max_num_reps_attempts = 2
        max_reps_param_updates = 12
    else:
        # Open question: does the number of episodes affect convergence?
        max_num_reps_attempts = 5
        max_reps_param_updates = 20

    num_policy_rollouts_before_reps_update = 10 * pol.num_params()
    env_convergence_criteria = {"env_solved": 0.9}

    # NOTE: the mean vector and the covariance matrix must have matching
    # dimensions; the output recorded in this notebook shows
    # "ValueError: mean and cov must have same length" for a length-4 mean
    # combined with a 1x1 covariance.
    if reps_init_mode == "informed":
        policy_params_mean_init = env.controller_gt_params
        policy_params_var_init = np.eye(pol.num_params()) * 0.01
    else:
        policy_params_mean_init = np.zeros(pol.num_params())+0.3

        # TODO: Change std dev and see effect on exploration
        policy_params_var_init = np.eye(pol.num_params()) * 0.15

    # Run the solver for BOTH initialization modes. This call used to live
    # inside the `else` branch above, so the "informed" mode never ran REPS
    # and the result names read by later cells were left undefined.
    reps_converged, low_level_policy_params_mean, \
        low_level_policy_params_var, solve_env_info = \
            env_interaction.solve_env_using_reps(env,
                                pol,   # this is the pol variable above
                                policy_params_mean_init,
                                policy_params_var_init,
                                num_policy_rollouts_before_reps_update,
                                max_reps_param_updates,
                                env_convergence_criteria,
                                max_num_reps_attempts=max_num_reps_attempts,
                                debug_info=True,
                                verbose=True,
                                )

REPS attempt 1 of 5: 
policy_params_mean_init_this_attempt:
[0.66 0.2  0.   0.4 ]
policy_params_var_init_this_attempt:
[[0.15]]


ValueError: mean and cov must have same length

In [None]:
# Scratch check: element-wise sum of a length-4 zero vector and an explicit
# 4-element vector; the cell displays the resulting array.
np.add(np.zeros(4), np.array([0.66, 0.2, 0, 0.4]))

In [None]:
# NOTE(review): `pouipoi` is not defined in any cell visible here, so this
# assignment presumably raises NameError -- possibly a deliberate way to halt
# "Run All" before the cells below. Confirm the intent, then remove or replace it.
a = pouipoi

In [None]:
import pickle

# Bundle the four values returned by the REPS solve into one dictionary
# (same keys, same order) and pickle it into the working directory.
reps_policy = dict(
    reps_converged=reps_converged,
    low_level_policy_params_mean=low_level_policy_params_mean,
    low_level_policy_params_var=low_level_policy_params_var,
    solve_env_info=solve_env_info,
)

with open('REPS_1_theta_policy.pickle', 'wb') as f:
    pickle.dump(reps_policy, f)

In [None]:
# Read a pickled object from the Expt_0 sub-directory into `reps_policy_loaded`
# (presumably a dictionary like the one written by the save cell -- verify).
# NOTE(review): this path ('./Expt_0/REPS_1_theta_policy.pickle') is not the
# path the save cell writes to ('REPS_1_theta_policy.pickle' in the working
# directory) -- confirm which file is intended.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('./Expt_0/REPS_1_theta_policy.pickle', 'rb') as handle:
    reps_policy_loaded = pickle.load(handle)

In [None]:
from rl_utils.analysis import reps_solve_info_analysis

In [None]:
import matplotlib.pyplot as plt
from itertools import cycle

verbose = True
# Used only as a label in the printed summary below; nothing is read from
# this path in this cell (the data comes from the in-memory `solve_env_info`).
path_to_reps_info = 'REPS_1_theta_policy.pickle'

# Final result and per-attempt histories recorded by the solver.
reps_converged = solve_env_info["reps_converged"]
policy_params_mean = solve_env_info["policy_params_mean"]
mean_param_hist = solve_env_info["history"]["policy_params_mean"]
var_diag_param_hist = solve_env_info["history"]["policy_params_var_diag"]
mean_rew_hist = solve_env_info["history"]["mean_reward"]

num_params = len(solve_env_info["policy_params_mean"])
num_reps_attempts = solve_env_info["num_reps_attempts"]

# The history holds one sequence per REPS attempt and the attempts may have
# different lengths (ragged), so concatenate them into one flat series.
assert len(mean_rew_hist) == num_reps_attempts
mean_rew_hist_all_attempts = np.hstack(
    [mean_rew_hist[a] for a in range(num_reps_attempts)]
)
iter_param_updates = range(len(mean_rew_hist_all_attempts))

if verbose:
    print(f'REPS solve info for "{path_to_reps_info}":')
    print(f" -> Solved: {reps_converged}")
    print(f" -> Parameters (mean): {policy_params_mean}")

# One subplot for the reward plus one per policy parameter.
num_subplots = num_params + 1
# squeeze=False makes plt.subplots always return a 2-D array of Axes. With the
# default squeezing a single subplot comes back as a bare Axes object and
# `ax[p]` below would fail; taking the only column keeps `ax` a 1-D array.
fig, ax_grid = plt.subplots(num_subplots, 1, sharex=True, squeeze=False)
ax = ax_grid[:, 0]
prop_cycle = plt.rcParams["axes.prop_cycle"]
colors = cycle(prop_cycle.by_key()["color"])
for p in range(num_subplots):

    if p == 0:
        # Top panel: mean reward per parameter update.
        ax[p].plot(
            iter_param_updates,
            mean_rew_hist_all_attempts,
            ".-",
            color=next(colors),
        )
        ax[p].grid()
        ax[p].set_ylabel("Reward")

    else:
        # Remaining panels: parameter idx_p's mean with a +/- one standard
        # deviation band.
        idx_p = p - 1
        # this might be a ragged array, so we flatten it
        assert len(mean_param_hist) == num_reps_attempts
        mean_param_hist_all_attempts = np.hstack(
            [
                np.array(mean_param_hist[a])[:, idx_p]
                for a in range(num_reps_attempts)
            ]
        )
        # assert len(mean_param_hist_all_attempts) == (num_reps_param_updates + 1)

        assert len(var_diag_param_hist) == num_reps_attempts
        var_diag_param_hist_all_attempts = np.hstack(
            [
                np.array(var_diag_param_hist[a])[:, idx_p]
                for a in range(num_reps_attempts)
            ]
        )

        # The history stores variances; the band is drawn in standard deviations.
        stdev_diag_param_hist_all_attempts = np.sqrt(
            var_diag_param_hist_all_attempts
        )

        assert len(mean_param_hist_all_attempts) == len(
            stdev_diag_param_hist_all_attempts
        )
        assert len(mean_param_hist_all_attempts) == len(iter_param_updates)

        mean_p_stdev = (
            mean_param_hist_all_attempts + stdev_diag_param_hist_all_attempts
        )
        mean_m_stdev = (
            mean_param_hist_all_attempts - stdev_diag_param_hist_all_attempts
        )

        # Same colour for the mean line and its shaded band.
        this_color = next(colors)

        ax[p].plot(
            iter_param_updates,
            mean_param_hist_all_attempts,
            ".-",
            color=this_color,
        )
        ax[p].fill_between(
            iter_param_updates,
            mean_p_stdev,
            mean_m_stdev,
            alpha=0.25,
            color=this_color,
        )
        ax[p].grid()
        ax[p].set_ylabel(f"Parameter {idx_p}")

    # Only the bottom panel carries the shared x-axis label.
    if p == (num_subplots - 1):
        ax[p].set_xlabel("parameter update iteration")

plt.xlim((iter_param_updates[0], iter_param_updates[-1]))
plt.show()


In [None]:
# Re-read the mean and diagonal-variance histories from the solver info and
# display both as the cell's output.
mean_param_hist, var_diag_param_hist = (
    solve_env_info["history"][key]
    for key in ("policy_params_mean", "policy_params_var_diag")
)
mean_param_hist, var_diag_param_hist

In [None]:
from scipy.stats import norm

# Grid on [-1, 1); (x_axis + 1) * 180 below maps it onto [0, 360).
x_axis = np.arange(-1, 1, 0.001)

# One curve per recorded update of the first REPS attempt (index 0).
for m,v in zip(mean_param_hist[0], var_diag_param_hist[0]):
    # `v` comes from the "policy_params_var_diag" history, i.e. a variance,
    # while the third argument of norm.pdf (scale) is a standard deviation --
    # hence the square root (previously the variance was passed directly).
    norm_vals = np.array(norm.pdf(x_axis, m, np.sqrt(v)))
    # NOTE(review): the same (value + 1) * 180 map is applied to the density
    # values as to the x grid -- confirm this y-scaling is intended.
    plt.plot((x_axis+1)*180, (norm_vals+1)*180)

plt.ylim(0,20000)
plt.xlim(180,360)
# `m` is the mean from the last loop iteration, i.e. the final recorded update.
plt.title(f"Ideal Resistance Value for Theta = 312\nObtained Mean Resistance Value for Theta: {(m[0]+1)*180}")
plt.show()

In [None]:
# Distribution at the first recorded update of the first REPS attempt.
m,v = mean_param_hist[0][0], var_diag_param_hist[0][0]
# `v` is a variance; norm.pdf expects a standard deviation as its scale, so
# take the square root (previously the variance was passed directly).
norm_vals = np.array(norm.pdf(x_axis, m, np.sqrt(v)))
plt.plot((x_axis+1)*180, (norm_vals+1)*180)