# Cartpole

## A2C Agent 

In [None]:
import gym
from a2c import A2CAgent 
import time
import numpy as np

# Create Gym environment
a2c_env = "CartPole-v1"
env = gym.make(a2c_env)

# Check agent class for initialization parameters and initialize agent
if a2c_env == "CartPole-v1":
    gamma = 0.95
    lr = 1e-3

agent = A2CAgent(env, gamma, lr)

# Define training parameters
max_episodes = 500
max_steps = 500

episode_rewards = []
run_time = []
start_time = time.time()
for episode in range(max_episodes):
    trajectory = []
    state = env.reset()
    episode_reward = 0
    for step in range(max_steps):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        trajectory.append((state, action, reward, next_state, done))
        episode_reward += reward  
        if done or step == max_steps:
            episode_rewards.append(episode_reward)
            print("Episode " + str(episode) + ": " + str(episode_reward))
            break
        state = next_state
    agent.update(trajectory, 0)
    elapse = time.time() - start_time
    run_time.append(elapse)
    
a2c_rewards = episode_rewards
a2c_runtime = run_time

In [None]:
name = './log_files/a2c/' + a2c_env + '-' + str(time.time()) + '.csv' 
out = np.column_stack((a2c_runtime, a2c_rewards))
with open(name, 'ab') as f:
    np.savetxt(f, out, delimiter=',')

## DRPO Agent (KL) 

In [11]:
import gym
from drpo import DRTRPOAgent 
import time
import numpy as np

# Create Gym environment
kl_env = "CartPole-v1"
env = gym.make(kl_env)

# Check agent class for initialization parameters and initialize agent
if kl_env == "CartPole-v1":
    gamma = 0.95
    lr = 1e-2
    beta = 0.8

agent = DRTRPOAgent(env, gamma, lr)

############################### MC Updates  (Full Episode) ###############################

# Define training parameters
max_episodes = 500
max_steps = 500

episode_rewards = []
run_time = []
start_time = time.time()
for episode in range(max_episodes):
    if episode == 0:
        first_state = env.reset()
    else:
        first_state = state
    state_adv = []
    total_value_loss = 0
    
    episode_reward = 0
    # loop through the first action
    for i in range(env.action_space.n):
        env.reset()
        state = first_state
        action = i
        trajectory = []
        
        for step in range(max_steps):
            if step != 0:
                action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, action, reward, next_state, done))
            episode_reward += reward  
            if done or step == max_steps-1:
                break
            state = next_state
            
        adv, value_loss = agent.compute_adv_mc(trajectory)
        state_adv.append(adv[0])
        total_value_loss += value_loss
        
    # restart the agent if stuck
    avg_episode_reward = episode_reward/env.action_space.n
    if (episode >= 5) and (avg_episode_reward <= 15):
        agent = DRTRPOAgent(env, gamma, lr)
        
    # add randomness for better exploration
    if (avg_episode_reward <= 300) and (episode % 10 == 0):
        state_adv[0] += (np.random.random()-0.5)*0.5
        state_adv[1] += (np.random.random()-0.5)*0.5
    
    policy_loss = agent.compute_policy_loss_kl(first_state, state_adv, beta)
    agent.update(value_loss, policy_loss)
    elapse = time.time() - start_time
    run_time.append(elapse)
    
    episode_rewards.append(avg_episode_reward)
    print("Episode " + str(episode) + ": " + str(avg_episode_reward))

dr_trpo_kl_rewards = episode_rewards
dr_trpo_kl_runtime = run_time

Episode 0: 19.0
Episode 1: 21.0
Episode 2: 10.5
Episode 3: 10.5
Episode 4: 11.5
Episode 5: 12.0
Episode 6: 56.0
Episode 7: 22.0
Episode 8: 27.0
Episode 9: 36.5
Episode 10: 21.0
Episode 11: 19.0
Episode 12: 13.5
Episode 13: 15.5
Episode 14: 12.5
Episode 15: 13.0
Episode 16: 20.5
Episode 17: 23.5
Episode 18: 76.5
Episode 19: 51.0
Episode 20: 47.5
Episode 21: 43.0
Episode 22: 23.5
Episode 23: 31.0
Episode 24: 36.0
Episode 25: 62.5
Episode 26: 68.0
Episode 27: 137.5
Episode 28: 200.5
Episode 29: 135.5
Episode 30: 141.5
Episode 31: 136.5
Episode 32: 120.0
Episode 33: 130.0
Episode 34: 115.0
Episode 35: 110.0
Episode 36: 100.0
Episode 37: 79.5
Episode 38: 57.5
Episode 39: 94.5
Episode 40: 60.5
Episode 41: 49.0
Episode 42: 39.5
Episode 43: 34.5
Episode 44: 48.0
Episode 45: 48.5
Episode 46: 56.5
Episode 47: 112.0
Episode 48: 98.0
Episode 49: 115.0
Episode 50: 125.5
Episode 51: 133.5
Episode 52: 125.5
Episode 53: 137.0
Episode 54: 125.0
Episode 55: 136.0
Episode 56: 164.0
Episode 57: 179.0
Epis

Episode 439: 500.0
Episode 440: 417.0
Episode 441: 447.5
Episode 442: 362.0
Episode 443: 401.0
Episode 444: 500.0
Episode 445: 486.5
Episode 446: 421.0
Episode 447: 351.0
Episode 448: 447.0
Episode 449: 427.0
Episode 450: 499.5
Episode 451: 407.5
Episode 452: 455.5
Episode 453: 437.5
Episode 454: 500.0
Episode 455: 392.5
Episode 456: 418.5
Episode 457: 478.0
Episode 458: 463.0
Episode 459: 487.0
Episode 460: 394.0
Episode 461: 500.0
Episode 462: 450.5
Episode 463: 443.5
Episode 464: 415.5
Episode 465: 412.0
Episode 466: 500.0
Episode 467: 488.0
Episode 468: 500.0
Episode 469: 422.0
Episode 470: 500.0
Episode 471: 445.0
Episode 472: 464.5
Episode 473: 443.0
Episode 474: 448.5
Episode 475: 432.5
Episode 476: 391.0
Episode 477: 490.0
Episode 478: 444.5
Episode 479: 423.5
Episode 480: 500.0
Episode 481: 457.5
Episode 482: 492.5
Episode 483: 432.5
Episode 484: 344.5
Episode 485: 409.0
Episode 486: 357.5
Episode 487: 500.0
Episode 488: 447.0
Episode 489: 355.0
Episode 490: 461.5
Episode 491:

In [12]:
name = './log_files/dr_trpo_kl/' + kl_env + '-' + str(time.time()) + '.csv' 
out = np.column_stack((dr_trpo_kl_runtime, dr_trpo_kl_rewards))
with open(name, 'ab') as f:
    np.savetxt(f, out, delimiter=',')

## DRPO Agent (Wasserstein)

In [16]:
import gym
from drpo import DRTRPOAgent 
import time
import numpy as np

wass_env = "CartPole-v1"
# Create Gym environment
env = gym.make(wass_env)

# Check agent class for initialization parameters and initialize agent
if wass_env == "CartPole-v1":
    gamma = 0.95
    lr = 1e-2
    
agent = DRTRPOAgent(env, gamma, lr)

############################### MC Updates  (Full Episode) ###############################

# Define training parameters
max_episodes = 500
max_steps = 500
total_adv_diff = 0

episode_rewards = []
run_time = []
start_time = time.time()
for episode in range(max_episodes):
    if episode == 0:
        first_state = env.reset()
    else:
        first_state = state
    state_adv = []
    total_value_loss = 0
    
    episode_reward = 0
    # loop through the first action
    for i in range(env.action_space.n):
        env.reset()
        state = first_state
        action = i
        trajectory = []
        
        for step in range(max_steps):
            if step != 0:
                action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, action, reward, next_state, done))
            episode_reward += reward  
            if done or step == max_steps-1:
                break
            state = next_state
            
        adv, value_loss = agent.compute_adv_mc(trajectory)
        state_adv.append(adv[0])
        total_value_loss += value_loss
    
    total_adv_diff += abs(state_adv[1] - state_adv[0])
    # larger beta, better stability; smaller beta, better exploration
    beta = total_adv_diff/episode 
    beta += np.random.random()*0.3-0.1
    
    avg_episode_reward = episode_reward/env.action_space.n
    # add randomness for better exploration
    if (episode % 10 == 0) and (avg_episode_reward <= 350): 
        state_adv[0] += (np.random.random()-0.5)*0.5
        state_adv[1] += (np.random.random()-0.5)*0.5
        
    # restart the agent if stuck
    if (episode >= 5) and (avg_episode_reward <= 10):
        agent = DRTRPOAgent(env, gamma, lr)
    
    policy_loss = agent.compute_policy_loss_wass(first_state, state_adv, beta)
    agent.update(value_loss, policy_loss)
    elapse = time.time() - start_time
    run_time.append(elapse)
    
    episode_rewards.append(avg_episode_reward)
    print("Episode " + str(episode) + ": " + str(avg_episode_reward))

dr_trpo_wass_rewards = episode_rewards
dr_trpo_wass_runtime = run_time

Episode 0: 18.5
Episode 1: 14.5
Episode 2: 12.0
Episode 3: 10.5
Episode 4: 11.0
Episode 5: 11.5
Episode 6: 11.5
Episode 7: 11.0
Episode 8: 10.5
Episode 9: 15.5
Episode 10: 15.5
Episode 11: 14.0
Episode 12: 13.0
Episode 13: 16.5
Episode 14: 15.0
Episode 15: 25.0
Episode 16: 27.0
Episode 17: 31.0
Episode 18: 36.5
Episode 19: 80.0
Episode 20: 56.5
Episode 21: 62.5
Episode 22: 73.0
Episode 23: 35.0
Episode 24: 41.0
Episode 25: 39.5
Episode 26: 34.0
Episode 27: 23.5
Episode 28: 30.0
Episode 29: 22.0
Episode 30: 24.0
Episode 31: 20.0
Episode 32: 21.5
Episode 33: 28.0
Episode 34: 40.5
Episode 35: 54.5
Episode 36: 53.0
Episode 37: 67.0
Episode 38: 119.0
Episode 39: 65.5
Episode 40: 122.5
Episode 41: 81.5
Episode 42: 101.0
Episode 43: 153.0
Episode 44: 87.0
Episode 45: 110.5
Episode 46: 77.5
Episode 47: 84.5
Episode 48: 69.0
Episode 49: 83.0
Episode 50: 57.5
Episode 51: 159.0
Episode 52: 127.5
Episode 53: 88.5
Episode 54: 110.0
Episode 55: 67.0
Episode 56: 63.5
Episode 57: 138.0
Episode 58: 197

Episode 440: 500.0
Episode 441: 462.0
Episode 442: 422.5
Episode 443: 332.0
Episode 444: 400.5
Episode 445: 469.0
Episode 446: 445.0
Episode 447: 412.5
Episode 448: 383.5
Episode 449: 472.0
Episode 450: 361.5
Episode 451: 422.0
Episode 452: 475.5
Episode 453: 338.5
Episode 454: 500.0
Episode 455: 437.0
Episode 456: 338.5
Episode 457: 478.0
Episode 458: 420.0
Episode 459: 392.0
Episode 460: 420.5
Episode 461: 428.0
Episode 462: 380.0
Episode 463: 442.5
Episode 464: 457.0
Episode 465: 466.5
Episode 466: 375.0
Episode 467: 416.5
Episode 468: 406.5
Episode 469: 500.0
Episode 470: 435.0
Episode 471: 462.0
Episode 472: 430.0
Episode 473: 369.0
Episode 474: 456.5
Episode 475: 422.5
Episode 476: 436.0
Episode 477: 382.0
Episode 478: 491.0
Episode 479: 416.5
Episode 480: 409.5
Episode 481: 425.5
Episode 482: 463.0
Episode 483: 401.5
Episode 484: 469.0
Episode 485: 392.0
Episode 486: 500.0
Episode 487: 470.0
Episode 488: 375.5
Episode 489: 461.5
Episode 490: 401.5
Episode 491: 484.0
Episode 492:

In [17]:
name = './log_files/dr_trpo_wass/' + wass_env + '-' + str(time.time()) + '.csv' 
out = np.column_stack((dr_trpo_wass_runtime, dr_trpo_wass_rewards))
with open(name, 'ab') as f:
    np.savetxt(f, out, delimiter=',')