# Cartpole

## A2C Agent 

In [135]:
import gym
from a2c import A2CAgent 
import time
import numpy as np

# Train an A2C agent on CartPole and record, per episode, the total reward and
# the wall-clock time elapsed since training started.

# Create Gym environment
a2c_env = "CartPole-v1"
env = gym.make(a2c_env)

# Check agent class for initialization parameters and initialize agent.
# Hyperparameters are only set for CartPole-v1; for any other environment name
# gamma/lr would have to already exist in the notebook namespace.
if a2c_env == "CartPole-v1":
    gamma = 0.95
    lr = 1e-3

agent = A2CAgent(env, gamma, lr)

# Define training parameters
max_episodes = 500
max_steps = 500  # hard cap on the number of environment steps per episode

episode_rewards = []  # total reward of each episode
run_time = []         # seconds elapsed since start_time after each episode
start_time = time.time()
for episode in range(max_episodes):
    trajectory = []
    state = env.reset()
    episode_reward = 0
    for step in range(max_steps):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        trajectory.append((state, action, reward, next_state, done))
        episode_reward += reward
        if done:
            break
        state = next_state

    # Record the episode once, after the rollout, whether it ended because the
    # environment reported `done` or because the step cap was exhausted. The
    # previous in-loop check `step == max_steps` could never be true (range
    # stops at max_steps - 1), so a capped episode was silently dropped from
    # episode_rewards while run_time still grew, desynchronising the two lists.
    episode_rewards.append(episode_reward)
    print("Episode " + str(episode) + ": " + str(episode_reward))

    # NOTE(review): the meaning of the second argument (0) is defined by
    # A2CAgent.update, which is not visible here -- confirm against a2c.py.
    agent.update(trajectory, 0)
    elapse = time.time() - start_time
    run_time.append(elapse)

a2c_rewards = episode_rewards
a2c_runtime = run_time

Episode 0: 18.0
Episode 1: 12.0
Episode 2: 15.0
Episode 3: 37.0
Episode 4: 10.0
Episode 5: 32.0
Episode 6: 28.0
Episode 7: 37.0
Episode 8: 52.0
Episode 9: 17.0
Episode 10: 21.0
Episode 11: 28.0
Episode 12: 62.0
Episode 13: 64.0
Episode 14: 15.0
Episode 15: 20.0
Episode 16: 13.0
Episode 17: 32.0
Episode 18: 21.0
Episode 19: 22.0
Episode 20: 92.0
Episode 21: 50.0
Episode 22: 42.0
Episode 23: 84.0
Episode 24: 34.0
Episode 25: 103.0
Episode 26: 61.0
Episode 27: 28.0
Episode 28: 65.0
Episode 29: 30.0
Episode 30: 58.0
Episode 31: 25.0
Episode 32: 122.0
Episode 33: 41.0
Episode 34: 115.0
Episode 35: 85.0
Episode 36: 78.0
Episode 37: 66.0
Episode 38: 121.0
Episode 39: 75.0
Episode 40: 219.0
Episode 41: 148.0
Episode 42: 158.0
Episode 43: 169.0
Episode 44: 132.0
Episode 45: 61.0
Episode 46: 29.0
Episode 47: 168.0
Episode 48: 192.0
Episode 49: 198.0
Episode 50: 293.0
Episode 51: 337.0
Episode 52: 107.0
Episode 53: 107.0
Episode 54: 168.0
Episode 55: 252.0
Episode 56: 268.0
Episode 57: 92.0
Episo

Episode 441: 209.0
Episode 442: 198.0
Episode 443: 351.0
Episode 444: 173.0
Episode 445: 168.0
Episode 446: 226.0
Episode 447: 191.0
Episode 448: 295.0
Episode 449: 164.0
Episode 450: 245.0
Episode 451: 210.0
Episode 452: 186.0
Episode 453: 152.0
Episode 454: 258.0
Episode 455: 272.0
Episode 456: 344.0
Episode 457: 160.0
Episode 458: 148.0
Episode 459: 160.0
Episode 460: 172.0
Episode 461: 176.0
Episode 462: 157.0
Episode 463: 333.0
Episode 464: 207.0
Episode 465: 343.0
Episode 466: 183.0
Episode 467: 156.0
Episode 468: 220.0
Episode 469: 163.0
Episode 470: 137.0
Episode 471: 254.0
Episode 472: 112.0
Episode 473: 125.0
Episode 474: 108.0
Episode 475: 146.0
Episode 476: 117.0
Episode 477: 107.0
Episode 478: 170.0
Episode 479: 181.0
Episode 480: 173.0
Episode 481: 140.0
Episode 482: 111.0
Episode 483: 228.0
Episode 484: 150.0
Episode 485: 199.0
Episode 486: 203.0
Episode 487: 133.0
Episode 488: 138.0
Episode 489: 109.0
Episode 490: 142.0
Episode 491: 208.0
Episode 492: 136.0
Episode 493:

In [None]:
# Dump the A2C learning curve to a timestamped CSV file: column 0 holds the
# elapsed run time, column 1 the matching episode reward.
out = np.column_stack([a2c_runtime, a2c_rewards])
name = f'./log_files/a2c/{a2c_env}-{time.time()}.csv'
with open(name, mode='ab') as csv_file:
    np.savetxt(csv_file, out, delimiter=',')

## DRPO Agent (KL) 

In [132]:
import gym
from drpo import DRTRPOAgent 
import time
import numpy as np

# Train a DR-TRPO agent with the KL policy loss on CartPole. Each "episode"
# performs one rollout per discrete action (forcing that action on the first
# step), collects the first-step advantage of every rollout, and then updates
# the agent once.

# Create Gym environment
kl_env = "CartPole-v1"
env = gym.make(kl_env)

# Check agent class for initialization parameters and initialize agent.
# Hyperparameters are only set for CartPole-v1; for any other environment name
# gamma/lr/beta would have to already exist in the notebook namespace.
if kl_env == "CartPole-v1":
    gamma = 0.95
    lr = 1e-2
    beta = 1.0

agent = DRTRPOAgent(env, gamma, lr)

############################### MC Updates  (Full Episode) ###############################

# Define training parameters
max_episodes = 500
max_steps = 500  # hard cap on the number of environment steps per rollout

episode_rewards = []  # reward averaged over the per-action rollouts
run_time = []         # seconds elapsed since start_time after each episode
start_time = time.time()
for episode in range(max_episodes):
    # NOTE(review): from the second episode on, first_state is whatever `state`
    # the previous rollout left behind, while env.reset() below puts the
    # environment into a fresh state of its own. The state handed to the agent
    # on step 0 may therefore not be the state the environment is really in --
    # confirm this is intended.
    if episode == 0:
        first_state = env.reset()
    else:
        first_state = state
    state_adv = []        # first-step advantage for each forced first action
    total_value_loss = 0

    episode_reward = 0
    # loop through the first action
    for i in range(env.action_space.n):
        env.reset()
        state = first_state
        action = i
        trajectory = []

        for step in range(max_steps):
            # Step 0 uses the forced action i; afterwards the agent's policy acts.
            if step != 0:
                action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, action, reward, next_state, done))
            episode_reward += reward
            if done or step == max_steps-1:
                break
            state = next_state

        adv, value_loss = agent.compute_adv_mc(trajectory)
        state_adv.append(adv[0])
        total_value_loss += value_loss

    # add randomness for better exploration: every 10th episode each action's
    # advantage is perturbed by uniform noise in [-0.25, 0.25). This covers
    # every entry of state_adv instead of only indices 0 and 1, so it stays in
    # step with the env.action_space.n rollouts above (identical for 2 actions).
    if episode % 10 == 0:
        for k in range(len(state_adv)):
            state_adv[k] += (np.random.random()-0.5)*0.5

    policy_loss = agent.compute_policy_loss_kl(first_state, state_adv, beta)
    # NOTE(review): only the value loss of the LAST rollout is passed on;
    # total_value_loss is accumulated above but never used -- confirm which
    # one DRTRPOAgent.update is supposed to receive.
    agent.update(value_loss, policy_loss)
    elapse = time.time() - start_time
    run_time.append(elapse)

    avg_episode_reward = episode_reward/env.action_space.n
    episode_rewards.append(avg_episode_reward)
    print("Episode " + str(episode) + ": " + str(avg_episode_reward))

dr_trpo_kl_rewards = episode_rewards
dr_trpo_kl_runtime = run_time

Episode 0: 22.5
Episode 1: 17.0
Episode 2: 13.5
Episode 3: 17.0
Episode 4: 19.5
Episode 5: 15.5
Episode 6: 21.0
Episode 7: 30.0
Episode 8: 43.5
Episode 9: 36.0
Episode 10: 34.5
Episode 11: 36.0
Episode 12: 18.0
Episode 13: 23.5
Episode 14: 29.5
Episode 15: 24.0
Episode 16: 20.0
Episode 17: 33.5
Episode 18: 36.0
Episode 19: 34.5
Episode 20: 29.0
Episode 21: 34.5
Episode 22: 57.5
Episode 23: 32.0
Episode 24: 48.5
Episode 25: 61.5
Episode 26: 91.0
Episode 27: 76.0
Episode 28: 49.5
Episode 29: 75.5
Episode 30: 98.5
Episode 31: 63.0
Episode 32: 67.0
Episode 33: 78.5
Episode 34: 69.0
Episode 35: 72.0
Episode 36: 42.5
Episode 37: 91.0
Episode 38: 52.0
Episode 39: 114.0
Episode 40: 56.0
Episode 41: 51.0
Episode 42: 49.5
Episode 43: 83.5
Episode 44: 82.5
Episode 45: 68.0
Episode 46: 51.5
Episode 47: 57.0
Episode 48: 47.0
Episode 49: 48.0
Episode 50: 34.5
Episode 51: 42.5
Episode 52: 50.5
Episode 53: 53.0
Episode 54: 44.0
Episode 55: 52.5
Episode 56: 75.0
Episode 57: 51.0
Episode 58: 62.0
Episod

Episode 442: 500.0
Episode 443: 500.0
Episode 444: 500.0
Episode 445: 500.0
Episode 446: 500.0
Episode 447: 500.0
Episode 448: 500.0
Episode 449: 500.0
Episode 450: 500.0
Episode 451: 500.0
Episode 452: 500.0
Episode 453: 500.0
Episode 454: 500.0
Episode 455: 500.0
Episode 456: 500.0
Episode 457: 500.0
Episode 458: 500.0
Episode 459: 500.0
Episode 460: 470.0
Episode 461: 500.0
Episode 462: 500.0
Episode 463: 500.0
Episode 464: 500.0
Episode 465: 437.0
Episode 466: 500.0
Episode 467: 500.0
Episode 468: 500.0
Episode 469: 500.0
Episode 470: 500.0
Episode 471: 466.0
Episode 472: 500.0
Episode 473: 476.0
Episode 474: 500.0
Episode 475: 500.0
Episode 476: 436.5
Episode 477: 500.0
Episode 478: 500.0
Episode 479: 448.0
Episode 480: 500.0
Episode 481: 500.0
Episode 482: 499.0
Episode 483: 500.0
Episode 484: 500.0
Episode 485: 500.0
Episode 486: 500.0
Episode 487: 500.0
Episode 488: 426.0
Episode 489: 450.5
Episode 490: 466.5
Episode 491: 500.0
Episode 492: 455.0
Episode 493: 500.0
Episode 494:

In [31]:
# Dump the DR-TRPO (KL) learning curve to a timestamped CSV file: column 0
# holds the elapsed run time, column 1 the matching averaged episode reward.
out = np.column_stack([dr_trpo_kl_runtime, dr_trpo_kl_rewards])
name = f'./log_files/dr_trpo_kl/{kl_env}-{time.time()}.csv'
with open(name, mode='ab') as csv_file:
    np.savetxt(csv_file, out, delimiter=',')

## DRPO Agent (Wasserstein)

In [88]:
import gym
from drpo import DRTRPOAgent 
import time
import numpy as np

# Train a DR-TRPO agent with the Wasserstein policy loss on CartPole. Each
# "episode" performs one rollout per discrete action (forcing that action on
# the first step), collects the first-step advantage of every rollout, derives
# beta from the running advantage gap, and then updates the agent once.

wass_env = "CartPole-v1"
# Create Gym environment
env = gym.make(wass_env)

# Check agent class for initialization parameters and initialize agent.
# Hyperparameters are only set for CartPole-v1; for any other environment name
# gamma/lr would have to already exist in the notebook namespace.
if wass_env == "CartPole-v1":
    gamma = 0.95
    lr = 1e-2

agent = DRTRPOAgent(env, gamma, lr)

############################### MC Updates  (Full Episode) ###############################

# Define training parameters
max_episodes = 500
max_steps = 500     # hard cap on the number of environment steps per rollout
total_adv_diff = 0  # running sum of |adv(action 1) - adv(action 0)| over episodes

episode_rewards = []  # reward averaged over the per-action rollouts
run_time = []         # seconds elapsed since start_time after each episode
start_time = time.time()
for episode in range(max_episodes):
    # NOTE(review): from the second episode on, first_state is whatever `state`
    # the previous rollout left behind, while env.reset() below puts the
    # environment into a fresh state of its own. The state handed to the agent
    # on step 0 may therefore not be the state the environment is really in --
    # confirm this is intended.
    if episode == 0:
        first_state = env.reset()
    else:
        first_state = state
    state_adv = []        # first-step advantage for each forced first action
    total_value_loss = 0

    episode_reward = 0
    # loop through the first action
    for i in range(env.action_space.n):
        env.reset()
        state = first_state
        action = i
        trajectory = []

        for step in range(max_steps):
            # Step 0 uses the forced action i; afterwards the agent's policy acts.
            if step != 0:
                action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, action, reward, next_state, done))
            episode_reward += reward
            if done or step == max_steps-1:
                break
            state = next_state

        adv, value_loss = agent.compute_adv_mc(trajectory)
        state_adv.append(adv[0])
        total_value_loss += value_loss

    # beta is the running mean of the absolute advantage gap between actions 1
    # and 0 (this part assumes a two-action environment). After this episode
    # the sum holds episode + 1 terms, so divide by episode + 1; dividing by
    # `episode` hit a division by zero on the very first episode (inf/nan for
    # array or tensor advantages, ZeroDivisionError for plain floats).
    total_adv_diff += abs(state_adv[1] - state_adv[0])
    beta = total_adv_diff/(episode + 1)

    # add randomness for better exploration: beta is shifted by uniform noise
    # in [-0.1, 0.2) every episode, and every 10th episode both advantages are
    # perturbed by uniform noise in [-0.25, 0.25).
    beta += np.random.random()*0.3-0.1
    if episode % 10 == 0:
        state_adv[0] += (np.random.random()-0.5)*0.5
        state_adv[1] += (np.random.random()-0.5)*0.5

    policy_loss = agent.compute_policy_loss_wass(first_state, state_adv, beta)
    # NOTE(review): only the value loss of the LAST rollout is passed on;
    # total_value_loss is accumulated above but never used -- confirm which
    # one DRTRPOAgent.update is supposed to receive.
    agent.update(value_loss, policy_loss)
    elapse = time.time() - start_time
    run_time.append(elapse)

    avg_episode_reward = episode_reward/env.action_space.n
    episode_rewards.append(avg_episode_reward)
    print("Episode " + str(episode) + ": " + str(avg_episode_reward))

dr_trpo_wass_rewards = episode_rewards
dr_trpo_wass_runtime = run_time

Episode 0: 19.0
Episode 1: 12.5
Episode 2: 12.0
Episode 3: 12.5
Episode 4: 12.0
Episode 5: 11.0
Episode 6: 12.5
Episode 7: 12.0
Episode 8: 12.0
Episode 9: 14.0
Episode 10: 12.5
Episode 11: 16.5
Episode 12: 16.0
Episode 13: 21.5
Episode 14: 19.0
Episode 15: 30.5
Episode 16: 26.5
Episode 17: 33.5
Episode 18: 33.5
Episode 19: 44.5
Episode 20: 45.0
Episode 21: 41.5
Episode 22: 49.5
Episode 23: 39.0
Episode 24: 44.5
Episode 25: 51.0
Episode 26: 57.5
Episode 27: 74.5
Episode 28: 74.5
Episode 29: 95.0
Episode 30: 61.0
Episode 31: 70.0
Episode 32: 38.0
Episode 33: 46.0
Episode 34: 51.0
Episode 35: 42.5
Episode 36: 29.0
Episode 37: 46.5
Episode 38: 47.0
Episode 39: 32.5
Episode 40: 40.5
Episode 41: 40.5
Episode 42: 31.5
Episode 43: 20.0
Episode 44: 27.0
Episode 45: 22.5
Episode 46: 25.0
Episode 47: 20.5
Episode 48: 30.0
Episode 49: 18.5
Episode 50: 17.5
Episode 51: 21.5
Episode 52: 14.5
Episode 53: 19.0
Episode 54: 21.0
Episode 55: 30.0
Episode 56: 27.0
Episode 57: 26.0
Episode 58: 21.0
Episode

Episode 446: 500.0
Episode 447: 500.0
Episode 448: 492.5
Episode 449: 351.5
Episode 450: 500.0
Episode 451: 500.0
Episode 452: 381.5
Episode 453: 356.0
Episode 454: 406.0
Episode 455: 422.0
Episode 456: 500.0
Episode 457: 373.5
Episode 458: 484.5
Episode 459: 500.0
Episode 460: 500.0
Episode 461: 464.5
Episode 462: 406.5
Episode 463: 330.5
Episode 464: 480.0
Episode 465: 444.0
Episode 466: 500.0
Episode 467: 399.5
Episode 468: 400.0
Episode 469: 456.0
Episode 470: 478.0
Episode 471: 500.0
Episode 472: 500.0
Episode 473: 500.0
Episode 474: 443.5
Episode 475: 413.5
Episode 476: 444.5
Episode 477: 459.5
Episode 478: 500.0
Episode 479: 451.0
Episode 480: 307.0
Episode 481: 315.0
Episode 482: 362.5
Episode 483: 500.0
Episode 484: 498.0
Episode 485: 500.0
Episode 486: 383.5
Episode 487: 500.0
Episode 488: 476.5
Episode 489: 314.5
Episode 490: 481.5
Episode 491: 500.0
Episode 492: 500.0
Episode 493: 467.0
Episode 494: 414.0
Episode 495: 500.0
Episode 496: 500.0
Episode 497: 482.0
Episode 498:

In [None]:
# Dump the DR-TRPO (Wasserstein) learning curve to a timestamped CSV file:
# column 0 holds the elapsed run time, column 1 the matching averaged episode
# reward.
out = np.column_stack([dr_trpo_wass_runtime, dr_trpo_wass_rewards])
name = f'./log_files/dr_trpo_wass/{wass_env}-{time.time()}.csv'
with open(name, mode='ab') as csv_file:
    np.savetxt(csv_file, out, delimiter=',')