# Cartpole

## A2C Agent 

In [6]:
import gym
from a2c import A2CAgent 
import time
import numpy as np

# Create Gym environment
a2c_env = "CartPole-v1"
env = gym.make(a2c_env)

# Check agent class for initialization parameters and initialize agent
if a2c_env == "CartPole-v1":
    gamma = 0.95
    lr = 1e-3

agent = A2CAgent(env, gamma, lr)

# Define training parameters
max_episodes = 500
max_steps = 500

episode_rewards = []
run_time = []
start_time = time.time()
for episode in range(max_episodes):
    trajectory = []
    state = env.reset()
    episode_reward = 0
    for step in range(max_steps):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        trajectory.append((state, action, reward, next_state, done))
        episode_reward += reward  
        if done or step == max_steps:
            episode_rewards.append(episode_reward)
            print("Episode " + str(episode) + ": " + str(episode_reward))
            break
        state = next_state
    agent.update(trajectory, 0)
    elapse = time.time() - start_time
    run_time.append(elapse)
    
a2c_rewards = episode_rewards
a2c_runtime = run_time

Episode 0: 21.0
Episode 1: 16.0
Episode 2: 20.0
Episode 3: 15.0
Episode 4: 44.0
Episode 5: 41.0
Episode 6: 42.0
Episode 7: 45.0
Episode 8: 33.0
Episode 9: 60.0
Episode 10: 38.0
Episode 11: 40.0
Episode 12: 69.0
Episode 13: 68.0
Episode 14: 24.0
Episode 15: 49.0
Episode 16: 37.0
Episode 17: 111.0
Episode 18: 38.0
Episode 19: 106.0
Episode 20: 31.0
Episode 21: 51.0
Episode 22: 86.0
Episode 23: 50.0
Episode 24: 57.0
Episode 25: 67.0
Episode 26: 34.0
Episode 27: 118.0
Episode 28: 85.0
Episode 29: 97.0
Episode 30: 71.0
Episode 31: 31.0
Episode 32: 96.0
Episode 33: 39.0
Episode 34: 28.0
Episode 35: 50.0
Episode 36: 38.0
Episode 37: 31.0
Episode 38: 76.0
Episode 39: 56.0
Episode 40: 91.0
Episode 41: 25.0
Episode 42: 54.0
Episode 43: 119.0
Episode 44: 48.0
Episode 45: 65.0
Episode 46: 102.0
Episode 47: 28.0
Episode 48: 114.0
Episode 49: 29.0
Episode 50: 110.0
Episode 51: 137.0
Episode 52: 80.0
Episode 53: 99.0
Episode 54: 44.0
Episode 55: 86.0
Episode 56: 91.0
Episode 57: 31.0
Episode 58: 39.0

Episode 441: 215.0
Episode 442: 205.0
Episode 443: 211.0
Episode 444: 213.0
Episode 445: 192.0
Episode 446: 209.0
Episode 447: 197.0
Episode 448: 207.0
Episode 449: 211.0
Episode 450: 195.0
Episode 451: 187.0
Episode 452: 207.0
Episode 453: 229.0
Episode 454: 232.0
Episode 455: 285.0
Episode 456: 203.0
Episode 457: 206.0
Episode 458: 188.0
Episode 459: 181.0
Episode 460: 208.0
Episode 461: 207.0
Episode 462: 238.0
Episode 463: 182.0
Episode 464: 193.0
Episode 465: 279.0
Episode 466: 222.0
Episode 467: 208.0
Episode 468: 280.0
Episode 469: 251.0
Episode 470: 241.0
Episode 471: 242.0
Episode 472: 222.0
Episode 473: 239.0
Episode 474: 294.0
Episode 475: 196.0
Episode 476: 189.0
Episode 477: 210.0
Episode 478: 185.0
Episode 479: 203.0
Episode 480: 191.0
Episode 481: 212.0
Episode 482: 214.0
Episode 483: 203.0
Episode 484: 171.0
Episode 485: 227.0
Episode 486: 216.0
Episode 487: 191.0
Episode 488: 183.0
Episode 489: 205.0
Episode 490: 204.0
Episode 491: 241.0
Episode 492: 239.0
Episode 493:

In [7]:
name = './log_files/a2c/' + a2c_env + '-' + str(time.time()) + '.csv' 
out = np.column_stack((a2c_runtime, a2c_rewards))
with open(name, 'ab') as f:
    np.savetxt(f, out, delimiter=',')

## DR TRPO Agent (KL) 

In [31]:
import gym
from a2c_dr_trpo import DRTRPOAgent 
import time
import numpy as np

# Create Gym environment
kl_env = "CartPole-v1"
env = gym.make(kl_env)

# Check agent class for initialization parameters and initialize agent
if kl_env == "CartPole-v1":
    gamma = 0.95
    lr = 1e-2
    beta = 0.8

agent = DRTRPOAgent(env, gamma, lr)

############################### MC Updates  (Full Episode) ###############################

# Define training parameters
max_episodes = 500
max_steps = 500

episode_rewards = []
run_time = []
start_time = time.time()
for episode in range(max_episodes):
    first_state = env.reset()
    state_adv = []
    total_value_loss = 0
    
    episode_reward = 0
    # loop through the first action
    for i in range(env.action_space.n):
        env.reset()
        state = first_state
        action = i
        trajectory = []
        
        for step in range(max_steps):
            if step != 0:
                action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, action, reward, next_state, done))
            episode_reward += reward  
            if done or step == max_steps-1:
                break
            state = next_state
            
        adv, value_loss = agent.compute_adv_mc(trajectory)
        state_adv.append(adv[0])
        total_value_loss += value_loss
    
    policy_loss = agent.compute_policy_loss_kl(state, state_adv, beta)
    agent.update(value_loss, policy_loss)
    elapse = time.time() - start_time
    run_time.append(elapse)
    
    avg_episode_reward = episode_reward/env.action_space.n
    episode_rewards.append(avg_episode_reward)
    print("Episode " + str(episode) + ": " + str(avg_episode_reward))

dr_trpo_kl_rewards = episode_rewards
dr_trpo_kl_runtime = run_time

Episode 0: 33.5
Episode 1: 13.5
Episode 2: 15.0
Episode 3: 13.0
Episode 4: 16.5
Episode 5: 23.5
Episode 6: 31.0
Episode 7: 23.0
Episode 8: 25.0
Episode 9: 29.0
Episode 10: 32.5
Episode 11: 20.5
Episode 12: 21.5
Episode 13: 42.0
Episode 14: 40.5
Episode 15: 35.5
Episode 16: 28.5
Episode 17: 36.5
Episode 18: 38.0
Episode 19: 41.0
Episode 20: 44.0
Episode 21: 45.5
Episode 22: 50.5
Episode 23: 78.5
Episode 24: 99.5
Episode 25: 63.5
Episode 26: 54.0
Episode 27: 138.0
Episode 28: 100.5
Episode 29: 94.0
Episode 30: 134.5
Episode 31: 80.5
Episode 32: 99.0
Episode 33: 162.5
Episode 34: 125.5
Episode 35: 91.5
Episode 36: 103.0
Episode 37: 143.0
Episode 38: 213.5
Episode 39: 291.0
Episode 40: 500.0
Episode 41: 500.0
Episode 42: 440.5
Episode 43: 449.5
Episode 44: 431.5
Episode 45: 403.5
Episode 46: 492.5
Episode 47: 447.5
Episode 48: 500.0
Episode 49: 430.5
Episode 50: 349.0
Episode 51: 500.0
Episode 52: 482.5
Episode 53: 302.0
Episode 54: 387.0
Episode 55: 381.5
Episode 56: 452.5
Episode 57: 387

Episode 440: 377.0
Episode 441: 384.0
Episode 442: 242.0
Episode 443: 335.5
Episode 444: 372.0
Episode 445: 340.5
Episode 446: 381.5
Episode 447: 400.0
Episode 448: 439.5
Episode 449: 333.5
Episode 450: 374.5
Episode 451: 340.0
Episode 452: 416.0
Episode 453: 315.0
Episode 454: 492.0
Episode 455: 335.0
Episode 456: 391.0
Episode 457: 346.5
Episode 458: 500.0
Episode 459: 395.5
Episode 460: 283.0
Episode 461: 463.0
Episode 462: 500.0
Episode 463: 274.0
Episode 464: 352.5
Episode 465: 463.0
Episode 466: 401.5
Episode 467: 362.0
Episode 468: 359.0
Episode 469: 372.0
Episode 470: 363.0
Episode 471: 373.0
Episode 472: 199.5
Episode 473: 328.0
Episode 474: 311.5
Episode 475: 347.5
Episode 476: 421.0
Episode 477: 346.5
Episode 478: 330.5
Episode 479: 371.5
Episode 480: 431.5
Episode 481: 241.5
Episode 482: 350.0
Episode 483: 371.5
Episode 484: 341.5
Episode 485: 310.5
Episode 486: 380.0
Episode 487: 300.5
Episode 488: 490.0
Episode 489: 411.5
Episode 490: 309.5
Episode 491: 299.5
Episode 492:

In [32]:
name = './log_files/dr_trpo_kl/' + kl_env + '-' + str(time.time()) + '.csv' 
out = np.column_stack((dr_trpo_kl_runtime, dr_trpo_kl_rewards))
with open(name, 'ab') as f:
    np.savetxt(f, out, delimiter=',')

## DR TRPO Agent (Wasserstein)

In [13]:
import gym
from a2c_dr_trpo import DRTRPOAgent 
import time
import numpy as np

wass_env = "CartPole-v1"
# Create Gym environment
env = gym.make(wass_env)

# Check agent class for initialization parameters and initialize agent
if wass_env == "CartPole-v1":
    gamma = 0.95
    lr = 1e-2
    
agent = DRTRPOAgent(env, gamma, lr)

############################### MC Updates  (Full Episode) ###############################

# Define training parameters
max_episodes = 500
max_steps = 500
total_adv_diff = 0

episode_rewards = []
run_time = []
start_time = time.time()
for episode in range(max_episodes):
    first_state = env.reset()
    state_adv = []
    total_value_loss = 0
    
    episode_reward = 0
    # loop through the first action
    for i in range(env.action_space.n):
        env.reset()
        state = first_state
        action = i
        trajectory = []
        
        for step in range(max_steps):
            if step != 0:
                action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, action, reward, next_state, done))
            episode_reward += reward  
            if done or step == max_steps-1:
                break
            state = next_state
            
        adv, value_loss = agent.compute_adv_mc(trajectory)
        state_adv.append(adv[0])
        total_value_loss += value_loss
    
    total_adv_diff += abs(state_adv[1] - state_adv[0])
    beta = total_adv_diff/episode
    beta += np.random.random()*0.3-0.1
    policy_loss = agent.compute_policy_loss_wass(state, state_adv, beta)

    
    agent.update(value_loss, policy_loss)
    elapse = time.time() - start_time
    run_time.append(elapse)
    
    avg_episode_reward = episode_reward/env.action_space.n
    episode_rewards.append(avg_episode_reward)
    print("Episode " + str(episode) + ": " + str(avg_episode_reward))

dr_trpo_wass_rewards = episode_rewards
dr_trpo_wass_runtime = run_time

Episode 0: 23.5
Episode 1: 18.5
Episode 2: 14.0
Episode 3: 15.0
Episode 4: 15.0
Episode 5: 15.0
Episode 6: 20.0
Episode 7: 17.0
Episode 8: 15.5
Episode 9: 15.5
Episode 10: 20.0
Episode 11: 26.0
Episode 12: 20.0
Episode 13: 25.0
Episode 14: 20.5
Episode 15: 21.0
Episode 16: 26.0
Episode 17: 14.5
Episode 18: 14.5
Episode 19: 18.0
Episode 20: 14.5
Episode 21: 16.5
Episode 22: 16.0
Episode 23: 15.0
Episode 24: 12.0
Episode 25: 15.0
Episode 26: 13.0
Episode 27: 15.5
Episode 28: 13.0
Episode 29: 17.0
Episode 30: 21.0
Episode 31: 22.5
Episode 32: 25.5
Episode 33: 28.0
Episode 34: 31.0
Episode 35: 32.0
Episode 36: 36.5
Episode 37: 39.0
Episode 38: 62.5
Episode 39: 50.0
Episode 40: 51.0
Episode 41: 86.0
Episode 42: 60.0
Episode 43: 55.0
Episode 44: 34.5
Episode 45: 49.5
Episode 46: 88.5
Episode 47: 45.5
Episode 48: 47.5
Episode 49: 42.0
Episode 50: 37.0
Episode 51: 32.5
Episode 52: 42.0
Episode 53: 29.5
Episode 54: 39.0
Episode 55: 33.5
Episode 56: 39.5
Episode 57: 40.5
Episode 58: 35.5
Episode

Episode 445: 390.5
Episode 446: 344.0
Episode 447: 373.0
Episode 448: 357.5
Episode 449: 313.0
Episode 450: 294.0
Episode 451: 377.5
Episode 452: 379.0
Episode 453: 413.0
Episode 454: 354.5
Episode 455: 436.0
Episode 456: 392.5
Episode 457: 350.0
Episode 458: 414.0
Episode 459: 455.0
Episode 460: 395.0
Episode 461: 291.0
Episode 462: 349.0
Episode 463: 361.0
Episode 464: 386.5
Episode 465: 331.0
Episode 466: 303.0
Episode 467: 383.0
Episode 468: 215.5
Episode 469: 421.5
Episode 470: 419.0
Episode 471: 394.0
Episode 472: 380.5
Episode 473: 382.5
Episode 474: 324.0
Episode 475: 339.5
Episode 476: 362.5
Episode 477: 347.0
Episode 478: 381.5
Episode 479: 325.0
Episode 480: 380.5
Episode 481: 389.5
Episode 482: 395.5
Episode 483: 438.0
Episode 484: 369.5
Episode 485: 374.5
Episode 486: 336.0
Episode 487: 411.0
Episode 488: 408.5
Episode 489: 401.5
Episode 490: 360.0
Episode 491: 387.5
Episode 492: 360.0
Episode 493: 327.5
Episode 494: 453.5
Episode 495: 358.5
Episode 496: 342.5
Episode 497:

In [13]:
name = './log_files/dr_trpo_wass/' + wass_env + '-' + str(time.time()) + '.csv' 
out = np.column_stack((dr_trpo_wass_runtime, dr_trpo_wass_rewards))
with open(name, 'ab') as f:
    np.savetxt(f, out, delimiter=',')