In [1]:
import pickle
import subprocess
import tensorflow as tf
import gym
from sklearn.model_selection import train_test_split, ParameterGrid
import numpy as np
from tqdm import tqdm_notebook, tqdm, trange
from collections import defaultdict
import load_policy

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, InputLayer
from tensorflow.keras.layers import Dense, Flatten, BatchNormalization, Lambda
from tensorflow.keras.optimizers import Adam

In [3]:
from scipy.stats import sem
import matplotlib.pyplot as plt

In [4]:
# Figure defaults for this notebook: serif text and extra-small tick labels on both axes.
plt.rc('font', family = "serif")
for tick_axis in ("xtick", "ytick") :
    plt.rc(tick_axis, labelsize = 'x-small')

In [5]:
# Gym environment ids; the same string is also used to build the expert policy
# path ("experts/<id>.pkl") and the expert data path ("expert_data/<id>.pkl").
experts = [
    "Ant-v2",
    "HalfCheetah-v2",
    "Hopper-v2",
    "Humanoid-v2",
    "Reacher-v2",
    "Walker2d-v2",
]

### Generating expert data :

In [6]:
def runExpert(expert, rollouts) :
    """Run `run_expert.py` for one expert policy and wait for it to finish.

    Parameters
    ----------
    expert : environment id; used both for the policy file
        ("experts/<expert>.pkl") and as the environment-name argument.
    rollouts : number of rollouts, passed through `--num_rollouts`.

    Raises
    ------
    subprocess.CalledProcessError
        If the script exits with a non-zero status, so a failed generation
        step is not silently followed by training on missing or stale data.
    """
    # The command is passed as an argument list (no shell), so `expert` and
    # `rollouts` are never interpreted by a shell, whatever characters they contain.
    command = [
        "python",
        "run_expert.py",
        "experts/{0}.pkl".format(expert),
        str(expert),
        "--num_rollouts",
        str(rollouts),
    ]
    subprocess.run(command, check = True)

In [None]:
# Generate expert rollouts for every environment in `experts`.
rollouts = 50
for expert in experts :
    runExpert(expert = expert, rollouts = rollouts)

### Getting observations, actions from expert data :

In [7]:
def getExpertData(expert) :
    """Return the unpickled contents of "expert_data/<expert>.pkl".

    NOTE: `pickle.load` can execute arbitrary code stored in the file, so only
    load expert data files that come from a trusted source.
    """
    pickle_path = "expert_data/{}.pkl".format(expert)
    with open(pickle_path, "rb") as handle :
        return pickle.load(handle)

### Observations and Actions :
Observation is an array of shape (m, dim_obs), where m is the number of observations and dim_obs is the dimension of each observation vector.
Action is an array of shape (m, 1, dim_act), where dim_act is the dimension of each action vector.

For the Ant-v2 case, dim_obs = 111 and dim_act = 8

### Behavior Cloning :

In [8]:
def prepareData(expert) :
    """Load one expert's data and split it into train and held-out sets.

    The observations and actions are read by key, so the result does not
    depend on the order (or number) of entries in the pickled dict.

    Parameters
    ----------
    expert : environment id, forwarded to `getExpertData`.

    Returns
    -------
    (X_train, X_test, y_train, y_test) : observations and 2-d actions, with
        10% of the samples held out (fixed `random_state = 1`).
    """
    expert_data = getExpertData(expert)
    observations = expert_data["observations"]
    actions = expert_data["actions"]
    # Collapse the actions to 2-d: (number of samples, size of the last axis).
    actions = actions.reshape((actions.shape[0], actions.shape[-1]))
    # Train-validation split
    X_train, X_test, y_train, y_test = train_test_split(observations, actions, test_size = 0.1, random_state = 1)

    return(X_train, X_test, y_train, y_test)

In [9]:
def behaviorCloning(observations, actions, lr = 0.001, epochs = 70, batch_size = 128, verbose = 0) :
    """Fit a small fully connected network that maps observations to actions.

    The network has two hidden ReLU layers of 64 units each and a linear
    output layer; it is trained with Adam on a mean-squared-error loss.
    Observations are fed to the network as given (no normalisation is applied).

    Parameters
    ----------
    observations : array; every axis after the first defines the input shape.
    actions : array; the length of its last axis sets the number of output units.
    lr : passed as the first positional argument to `Adam`.
    epochs, batch_size, verbose : forwarded to `model.fit`.

    Returns
    -------
    The compiled and fitted `Sequential` model.
    """
    # Shapes used for the input and output layers of the network
    input_shape = observations.shape[1:]
    n_outputs = actions.shape[-1]

    model = Sequential()
    model.add(InputLayer(input_shape = input_shape))
    model.add(Dense(units = 64, activation = "relu"))
    model.add(Dense(units = 64, activation = "relu"))
    model.add(Dense(units = n_outputs))

    optimizer = Adam(lr)

    model.compile(optimizer = optimizer, loss = "mse", metrics = ["mse"])

    model.fit(x = observations, y = actions, epochs = epochs, batch_size = batch_size, verbose = verbose)

    return(model)

In [10]:
def getReturns(expert, model, rollouts) :
    """Roll out `model` in the `expert` environment and collect episode returns.

    Parameters
    ----------
    expert : gym environment id passed to `gym.make`.
    model : object with a `predict` method; it is called with the current
        observation reshaped to (1, obs_dim) and its output is reshaped to
        (batch, 1, action_dim) before being passed to `env.step`.
    rollouts : number of episodes to run.

    Returns
    -------
    List with the summed reward of each episode, in rollout order.
    """
    model_returns = []
    env = gym.make(expert)
    # Close the environment even if a rollout fails, so repeated calls (e.g. in
    # a hyperparameter sweep) do not leave environments open.
    try :
        max_steps = env.spec.timestep_limit
        for _ in tqdm_notebook(range(rollouts), desc = "Rollout") :
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done :
                obs = obs.reshape((1, obs.shape[0]))
                action = model.predict(obs)
                action = action.reshape((action.shape[0], 1, action.shape[1]))
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                # Stop at the environment's step limit even if `done` was never set.
                if steps >= max_steps :
                    break
            model_returns.append(totalr)
    finally :
        env.close()
    return(model_returns)

In [11]:
import pandas as pd

### Hyperparameters :

#### Learning Rate :

In [None]:
# Settings for the learning-rate sweep: training epochs per model, evaluation
# rollouts per model, and the learning rates 10^-6 ... 10^0.
epochs = 50
rollouts = 50
lrs = [10 ** exponent for exponent in range(-6, 1)]

In [None]:
# For every learning rate, train one behaviour-cloning model per environment and
# store its per-rollout returns: lr_rewards[expert] gets one list per learning rate.
lr_rewards = defaultdict(list)
for lr in lrs :
    print("Learning Rate : {}".format(lr))
    for expert in tqdm_notebook(experts, desc = "Expert") :
        expert_data = getExpertData(expert)
        observations = expert_data["observations"]
        # np.squeeze drops any length-1 axes from the stored actions.
        actions = np.squeeze(expert_data["actions"])
        expert_returns = expert_data["returns"]
        model = behaviorCloning(observations, actions, lr = lr, epochs = epochs)
        model_returns = getReturns(expert, model, rollouts)
        lr_rewards[expert].append(model_returns)

In [None]:
# Mean return (with standard error over rollouts) against log10(learning rate),
# one curve per environment; the legend sits to the right of the axes.
fig, ax = plt.subplots(figsize = (4, 3))
log_lrs = np.log10(lrs)
errorbar_style = dict(fmt = "", capsize = 2, alpha = 0.7, linewidth = 0.7, elinewidth = 0.7)
for expert in experts :
    rewards = lr_rewards[expert]
    reward_matrix = np.array(rewards)
    mean_reward = np.mean(reward_matrix, axis = 1)
    stderr_reward = sem(reward_matrix, axis = 1)
    ax.errorbar(log_lrs, mean_reward, yerr = stderr_reward, label = expert, **errorbar_style)
ax.set_xticks(log_lrs)
ax.set_xlabel("Log Learning Rate")
ax.set_ylabel("Mean Reward")
# Shrink the axes to 80% width to leave room for the legend outside the plot.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
legend = ax.legend(loc = 'center left', bbox_to_anchor = (1, 0.5))
fig.savefig("learningrate.png", bbox_inches = "tight", dpi = 300, bbox_extra_artists = (legend,))
plt.show()

##### Learning rate chosen to be 1e-3

#### Epochs :

In [None]:
# Epoch counts to sweep: 10, 20, ..., 100.
epochs_list = np.arange(start = 10, stop = 101, step = 10)

In [None]:
# For every epoch count, train one behaviour-cloning model per environment and
# store its per-rollout returns: epochs_rewards[expert] gets one list per epoch count.
# The loop variable is named `n_epochs` so that the notebook-level `epochs`
# setting is not overwritten as a side effect of running this cell.
epochs_rewards = defaultdict(list)
for n_epochs in epochs_list :
    print("Epochs : {}".format(n_epochs))
    for expert in tqdm_notebook(experts, desc = "Expert") :
        expert_data = getExpertData(expert)
        observations, actions, expert_returns = expert_data["observations"], expert_data["actions"], expert_data["returns"]
        # np.squeeze drops any length-1 axes from the stored actions.
        actions = np.squeeze(actions)
        model = behaviorCloning(observations, actions, epochs = n_epochs)
        model_returns = getReturns(expert, model, rollouts)
        epochs_rewards[expert].append(model_returns)

In [None]:
# Mean return (with standard error over rollouts) against the number of training
# epochs, one curve per environment; the legend sits to the right of the axes.
fig, ax = plt.subplots(figsize = (4, 3))
errorbar_style = dict(fmt = "", capsize = 2, alpha = 0.7, linewidth = 0.7, elinewidth = 0.7)
for expert in experts :
    rewards = epochs_rewards[expert]
    reward_matrix = np.array(rewards)
    mean_reward = np.mean(reward_matrix, axis = 1)
    stderr_reward = sem(reward_matrix, axis = 1)
    ax.errorbar(epochs_list, mean_reward, yerr = stderr_reward, label = expert, **errorbar_style)
ax.set_xticks(epochs_list)
ax.set_xlabel("Epochs")
ax.set_ylabel("Mean Reward")
# Shrink the axes to 80% width to leave room for the legend outside the plot.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
legend = ax.legend(loc = 'center left', bbox_to_anchor = (1, 0.5))
fig.savefig("epochs.png", bbox_inches = "tight", dpi = 300, bbox_extra_artists = (legend,))
plt.show()

In [None]:
# Behaviour cloning with the default hyperparameters: one row per environment
# holding [id, mean expert return, mean model return, std expert return, std model return].
results = []
for expert in experts :
    print("Expert : {}".format(expert))
    expert_data = getExpertData(expert)
    observations = expert_data["observations"]
    # np.squeeze drops any length-1 axes from the stored actions.
    actions = np.squeeze(expert_data["actions"])
    expert_returns = expert_data["returns"]
    model = behaviorCloning(observations, actions)
    model_returns = getReturns(expert, model, rollouts)
    summary_row = [
        expert,
        np.mean(expert_returns),
        np.mean(model_returns),
        np.std(expert_returns),
        np.std(model_returns),
    ]
    results.append(summary_row)

In [None]:
# Tabulate the behaviour-cloning results, one row per environment.
result_columns = [
    "Expert",
    "Mean Expert Reward",
    "Mean Model Reward",
    "Std Expert Reward",
    "Std Model Reward",
]
df = pd.DataFrame(data = results, columns = result_columns)
df

In [None]:
# Persist the results table (index included) so later cells can reload it.
df.to_csv(path_or_buf = "expert_data/behaviorcloning_results.csv")

### DAgger Algorithm :

In [12]:
def _runEpisode(env, model, max_steps) :
    """Run `model` in `env` for one episode; return (total reward, visited observations)."""
    visited = []
    total_reward = 0.
    obs = env.reset()
    done = False
    steps = 0
    while not done :
        # Record the raw observation before it is reshaped into a batch of one.
        visited.append(obs)
        model_action = model.predict(obs.reshape((1, obs.shape[0])))
        model_action = model_action.reshape((model_action.shape[0], 1, model_action.shape[1]))
        obs, r, done, _ = env.step(model_action)
        total_reward += r
        steps += 1
        # Stop at the environment's step limit even if `done` was never set.
        if steps >= max_steps :
            break
    return(total_reward, visited)


def DAgger(expert, observations, actions, parameters, iterations, rollouts = 50) :
    """Run the DAgger loop for one environment.

    Each iteration trains a behaviour-cloning model on the current dataset,
    evaluates it for `rollouts` episodes, runs one more episode to collect the
    observations the model visits, labels those observations with the expert
    policy, and appends the labelled pairs to the dataset.

    Parameters
    ----------
    expert : gym environment id; also used to locate "experts/<expert>.pkl".
    observations, actions : initial training set (arrays that can be
        concatenated along the first axis).
    parameters : dict with the keys "lr", "epochs" and "batch_size",
        forwarded to `behaviorCloning`.
    iterations : number of DAgger iterations.
    rollouts : evaluation episodes per iteration (defaults to 50).

    Returns
    -------
    (obs_dataset, act_dataset, rewards) : three lists with one entry per
        iteration - the observations and actions the model of that iteration
        was trained on, and the list of its evaluation returns.
    """
    obs_dataset = []
    act_dataset = []
    rewards = []
    lr, epochs, batch_size = parameters["lr"], parameters["epochs"], parameters["batch_size"]
    train_observations, train_actions = observations, actions
    # One environment is shared by all iterations and always closed at the end.
    env = gym.make(expert)
    try :
        max_steps = env.spec.timestep_limit
        for _ in tqdm_notebook(range(iterations), desc = "DAgger Iteration") :
            #Train model on training set D
            model = behaviorCloning(train_observations, train_actions, lr, epochs, batch_size)

            #Using trained model to get mean rewards
            returns = [_runEpisode(env, model, max_steps)[0] for _ in range(rollouts)]
            rewards.append(returns)

            #Start with initial observation and run model to get [o1, o2, o3]
            _, new_observations = _runEpisode(env, model, max_steps)

            #Use these observations as input to expert and get expert actions
            with tf.Session() :
                policy_fn = load_policy.load_policy("experts/{}.pkl".format(expert))
                new_actions = [policy_fn(nobs[None, :]) for nobs in new_observations]

            #Get labeled set D_exp = [(o1, a1), (o2, a2), ...]
            #Aggregate this to training set (D = D + D_exp) and retrain model
            obs_dataset.append(train_observations)
            act_dataset.append(train_actions)
            train_observations = np.concatenate((train_observations, np.array(new_observations)))
            # Give the expert labels the same trailing shape as the existing
            # actions. Unlike np.squeeze, this keeps the sample axis when the
            # trajectory has a single step.
            # Assumes the expert returns one action vector per observation - TODO confirm.
            labelled_actions = np.array(new_actions).reshape((len(new_observations),) + train_actions.shape[1:])
            train_actions = np.concatenate((train_actions, labelled_actions))
    finally :
        env.close()

    return(obs_dataset, act_dataset, rewards)

In [13]:
# Per-environment DAgger outputs, keyed by environment id.
obs_dataset, act_dataset, rewards_dataset = {}, {}, {}

In [None]:
# Run 50 DAgger iterations on Hopper-v2 and store the outputs under its id.
exp = ["Hopper-v2"]
dagger_parameters = {"lr" : 0.001, "epochs" : 70, "batch_size" : 128}
for expert in exp :
    print("Expert : {}".format(expert))
    expert_data = getExpertData(expert)
    observations = expert_data["observations"]
    # np.squeeze drops any length-1 axes from the stored actions.
    actions = np.squeeze(expert_data["actions"])
    expert_returns = expert_data["returns"]
    dagger_output = DAgger(expert, observations, actions, dagger_parameters, 50)
    obs_dataset[expert], act_dataset[expert], rewards_dataset[expert] = dagger_output

In [None]:
# Hopper-v2: mean evaluation return of the first 20 DAgger iterations (error
# bars are the standard error over rollouts), with the expert and
# behaviour-cloning means from `df` drawn as dashed reference lines.
expert = "Hopper-v2"
first_iterations = rewards_dataset[expert][:20]
mean_rewards = list(map(np.mean, first_iterations))
stderr_rewards = list(map(sem, first_iterations))
expert_row = df[df["Expert"] == expert]
expert_performance = expert_row["Mean Expert Reward"].values[0]
bc_performance = expert_row["Mean Model Reward"].values[0]
x = np.arange(1, 21)

fig, ax = plt.subplots(figsize = (4, 3))
ax.errorbar(x, mean_rewards, yerr = stderr_rewards, fmt = "", color = "blue", ecolor = "black",
            capsize = 2, alpha = 0.7, linewidth = 0.7, elinewidth = 0.7, label = "DAgger")
reference_lines = (
    (expert_performance, "green", "Expert"),
    (bc_performance, "red", "Behavior Cloning"),
)
for baseline_value, baseline_color, baseline_label in reference_lines :
    ax.plot([1, 21], [baseline_value, baseline_value], linestyle = "--", color = baseline_color, linewidth = 0.7, label = baseline_label)
ax.set_xticks(np.arange(0, 21, 2))
ax.set_xlabel("DAgger Iterations")
ax.set_ylabel("Mean Reward")
ax.legend()
fig.savefig("Hopper-v2-dagger20.png", bbox_inches = "tight", dpi = 300)
plt.show()

In [None]:
# Run 50 DAgger iterations on Humanoid-v2 and store the outputs under its id.
exp = ["Humanoid-v2"]
dagger_parameters = {"lr" : 0.001, "epochs" : 70, "batch_size" : 128}
for expert in exp :
    print("Expert : {}".format(expert))
    expert_data = getExpertData(expert)
    observations = expert_data["observations"]
    # np.squeeze drops any length-1 axes from the stored actions.
    actions = np.squeeze(expert_data["actions"])
    expert_returns = expert_data["returns"]
    dagger_output = DAgger(expert, observations, actions, dagger_parameters, 50)
    obs_dataset[expert], act_dataset[expert], rewards_dataset[expert] = dagger_output

Expert : Humanoid-v2


HBox(children=(IntProgress(value=0, description='DAgger Iteration', max=50, style=ProgressStyle(description_wi…

obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 376) (1, 376)
obs (1, 37

In [None]:
# Reload the saved behaviour-cloning results; the first CSV column is the index.
df = pd.read_csv(filepath_or_buffer = "expert_data/behaviorcloning_results.csv", index_col = 0)
df

In [None]:
# Humanoid-v2: mean evaluation return of every DAgger iteration (error bars are
# the standard error over rollouts), with the expert and behaviour-cloning
# means from `df` drawn as dashed reference lines.
expert = "Humanoid-v2"
iteration_rewards = rewards_dataset[expert]
mean_rewards = list(map(np.mean, iteration_rewards))
stderr_rewards = list(map(sem, iteration_rewards))
expert_row = df[df["Expert"] == expert]
expert_performance = expert_row["Mean Expert Reward"].values[0]
bc_performance = expert_row["Mean Model Reward"].values[0]
x = np.arange(1, 51)

fig, ax = plt.subplots(figsize = (4, 3))
ax.errorbar(x, mean_rewards, yerr = stderr_rewards, fmt = "", color = "blue", ecolor = "black",
            capsize = 2, alpha = 0.7, linewidth = 0.7, elinewidth = 0.7, label = "DAgger")
reference_lines = (
    (expert_performance, "green", "Expert"),
    (bc_performance, "red", "Behavior Cloning"),
)
for baseline_value, baseline_color, baseline_label in reference_lines :
    ax.plot([1, 51], [baseline_value, baseline_value], linestyle = "--", color = baseline_color, linewidth = 0.7, label = baseline_label)
ax.set_xticks(np.arange(0, 53, 4))
ax.set_xlabel("DAgger Iterations")
ax.set_ylabel("Mean Reward")
ax.legend()
fig.savefig("Humanoid-v2-dagger50.png", bbox_inches = "tight", dpi = 300)
plt.show()

In [None]:
# Humanoid-v2: mean evaluation return of the first 20 DAgger iterations (error
# bars are the standard error over rollouts), with the expert and
# behaviour-cloning means from `df` drawn as dashed reference lines.
expert = "Humanoid-v2"
first_iterations = rewards_dataset[expert][:20]
mean_rewards = list(map(np.mean, first_iterations))
stderr_rewards = list(map(sem, first_iterations))
expert_row = df[df["Expert"] == expert]
expert_performance = expert_row["Mean Expert Reward"].values[0]
bc_performance = expert_row["Mean Model Reward"].values[0]
x = np.arange(1, 21)

fig, ax = plt.subplots(figsize = (4, 3))
ax.errorbar(x, mean_rewards, yerr = stderr_rewards, fmt = "", color = "blue", ecolor = "black",
            capsize = 2, alpha = 0.7, linewidth = 0.7, elinewidth = 0.7, label = "DAgger")
reference_lines = (
    (expert_performance, "green", "Expert"),
    (bc_performance, "red", "Behavior Cloning"),
)
for baseline_value, baseline_color, baseline_label in reference_lines :
    ax.plot([1, 21], [baseline_value, baseline_value], linestyle = "--", color = baseline_color, linewidth = 0.7, label = baseline_label)
ax.set_xticks(np.arange(0, 21, 2))
ax.set_xlabel("DAgger Iterations")
ax.set_ylabel("Mean Reward")
ax.legend()
fig.savefig("Humanoid-v2-dagger20.png", bbox_inches = "tight", dpi = 300)
plt.show()

In [None]:
# NOTE(review): rebinds `x` (used as the plot x-axis in earlier cells) to 1, and
# nothing visible after this cell reads it - looks like leftover scratch; confirm before removing.
x = 1