# CartPole

## A2C Agent 

In [None]:
import gym
from a2c import A2CAgent 
import time
import numpy as np

# Train the A2C baseline and record, per episode, the total reward and the
# wall-clock time elapsed since training started.

# Create Gym environment
a2c_env = "CartPole-v1"
env = gym.make(a2c_env)

# Check agent class for initialization parameters and initialize agent.
# Hyperparameters are only defined for CartPole-v1; any other environment id
# needs its own values added here before the agent can be constructed.
if a2c_env == "CartPole-v1":
    gamma = 0.95
    lr = 1e-3

agent = A2CAgent(env, gamma, lr)

# Define training parameters
max_episodes = 500
max_steps = 500

episode_rewards = []  # one total reward per episode
run_time = []         # seconds since start_time, sampled after each update
start_time = time.time()
for episode in range(max_episodes):
    trajectory = []
    state = env.reset()
    episode_reward = 0
    for step in range(max_steps):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        trajectory.append((state, action, reward, next_state, done))
        episode_reward += reward
        # `step` tops out at max_steps - 1, so that is the step-budget test
        # (the previous `step == max_steps` could never be true).
        if done or step == max_steps - 1:
            break
        state = next_state

    # Record after the inner loop so that every episode contributes exactly
    # one entry, keeping episode_rewards and run_time the same length even
    # when the episode ends by exhausting the step budget.
    episode_rewards.append(episode_reward)
    print("Episode " + str(episode) + ": " + str(episode_reward))

    # NOTE(review): the meaning of the second argument (0) is defined by
    # A2CAgent.update -- confirm against the agent class.
    agent.update(trajectory, 0)
    elapse = time.time() - start_time
    run_time.append(elapse)

a2c_rewards = episode_rewards
a2c_runtime = run_time

In [None]:
# Write the A2C results as CSV rows of (elapsed seconds, episode reward) to a
# file named after the environment and the current timestamp.
name = f'./log_files/a2c/{a2c_env}-{time.time()}.csv'
out = np.column_stack([a2c_runtime, a2c_rewards])
with open(name, 'ab') as f:
    np.savetxt(f, out, delimiter=',')

## DRPO Agent (KL) 

In [16]:
import gym
import gym_electricitymarket
from drpo import DRTRPOAgent 
import time
import numpy as np

# Train a DRTRPOAgent with the KL policy loss. For every episode, one rollout
# is run per discrete action (that action is forced on the first step, the
# agent chooses afterwards); the first-step advantage of each rollout is
# collected and passed to the policy loss.

# Create Gym environment
kl_env = 'ElectricityMarketDiscreteDQN-v0'
env = gym.make(kl_env)

# Check agent class for initialization parameters and initialize agent

# When the learning rate is large, the policy neural network can overflow and produce NaNs.
# A possible fix is to reduce lr or increase beta to lower the learning rate.

# Hyperparameters are only defined for this environment id; any other id
# needs its own gamma / lr / beta added here.
if kl_env == "ElectricityMarketDiscreteDQN-v0":
    gamma = 0.95
    lr = 5e-2
    beta = 8
    
agent = DRTRPOAgent(env, gamma, lr)

# Define training parameters
max_episodes = 1500
max_steps = 30

episode_rewards = []  # average reward over the per-action rollouts, per episode
run_time = []         # seconds since start_time, sampled after each update
start_time = time.time()
for episode in range(max_episodes):
    if episode == 0:
        first_state = env.reset()
    else:
        # Start from the state at which the previous episode's last rollout
        # took its final step (`state` is not advanced on the terminating step).
        first_state = state
    state_adv = []
    total_value_loss = 0
    
    episode_reward = 0
    # loop through the first action
    for i in range(env.action_space.n):
        # NOTE(review): the observation returned by reset() is discarded and
        # `first_state` is used instead; whether the environment's internal
        # state then matches `first_state` depends on the env -- confirm.
        env.reset()
        state = first_state
        action = i
        trajectory = []
        
        for step in range(max_steps):
            # Step 0 uses the forced action `i`; later steps ask the agent.
            if step != 0:
                action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, action, reward, next_state, done))
            episode_reward += reward  
            if done or step == max_steps-1:
                break
            state = next_state
            
        adv, value_loss = agent.compute_adv_mc(trajectory)
        # Keep only the first-step advantage, i.e. the one for forced action `i`.
        state_adv.append(adv[0])
        total_value_loss += value_loss
        
    # episode_reward sums over all per-action rollouts, so normalise by their count.
    avg_episode_reward = episode_reward/env.action_space.n        
    # add randomness for better exploration
#     if (avg_episode_reward <= 300) and (episode % 10 == 0):
#         state_adv[0] += (np.random.random()-0.5)*0.5
#         state_adv[1] += (np.random.random()-0.5)*0.5
    
#     state_adv[0] += 0.5
    
    # restart the agent if stuck
#     if (episode >= 5) and (avg_episode_reward <= 15):
#         agent = DRTRPOAgent(env, gamma, lr)   
    
    policy_loss = agent.compute_policy_loss_kl(first_state, state_adv, beta)
    # NOTE(review): `value_loss` here is the loss of the LAST rollout only;
    # `total_value_loss` is accumulated above but never used -- confirm which
    # one the update is meant to receive.
    agent.update(value_loss, policy_loss)
    elapse = time.time() - start_time
    run_time.append(elapse)
    
    episode_rewards.append(avg_episode_reward)
    print("Episode " + str(episode) + ": " + str(avg_episode_reward))

Episode 0: -491.76343876458304
Episode 1: -504.00273862777925
Episode 2: -497.4807915381953
Episode 3: -481.71015216735924
Episode 4: -420.9890770395866
Episode 5: -419.741918642372
Episode 6: -365.5660323583432
Episode 7: -305.4510198763947
Episode 8: -274.03296315902793
Episode 9: -233.3684673583299
Episode 10: -237.79265755833174
Episode 11: -272.2631288416685
Episode 12: -294.52618657083315
Episode 13: -293.37382169861274
Episode 14: -304.3364966416682
Episode 15: -272.76994733403194
Episode 16: -242.48407003889247
Episode 17: -237.2404553458364
Episode 18: -245.59054345278093
Episode 19: -316.37072381458546
Episode 20: -316.4325354708408
Episode 21: -237.4084014687513
Episode 22: -260.234595161112
Episode 23: -243.09391232985956
Episode 24: -317.60685074861345
Episode 25: -374.94749762916916
Episode 26: -198.6663194631924
Episode 27: 59.532303385418835
Episode 28: 80.12525553402645
Episode 29: -212.23709180069417
Episode 30: -385.02585035486715
Episode 31: -535.2151469534707
Episo

Episode 260: 239.45672430559986
Episode 261: 239.45672430559986
Episode 262: 239.45672430559986
Episode 263: 239.45672430559986
Episode 264: 239.45672430559986
Episode 265: 239.45672430559986
Episode 266: 239.45672430559986
Episode 267: 239.45672430559986
Episode 268: 239.45672430559986
Episode 269: 239.45672430559986
Episode 270: 239.45672430559986
Episode 271: 239.45672430559986
Episode 272: 239.45672430559986
Episode 273: 239.45672430559986
Episode 274: 239.45672430559986
Episode 275: 239.45672430559986
Episode 276: 239.45672430559986
Episode 277: 239.45672430559986
Episode 278: 239.45672430559986
Episode 279: 239.45672430559986
Episode 280: 239.45672430559986
Episode 281: 239.45672430559986
Episode 282: 239.45672430559986
Episode 283: 239.45672430559986
Episode 284: 239.45672430559986
Episode 285: 239.45672430559986
Episode 286: 239.45672430559986
Episode 287: 239.45672430559986
Episode 288: 239.45672430559986
Episode 289: 239.45672430559986
Episode 290: 239.45672430559986
Episode 

Episode 517: 239.45672430559986
Episode 518: 239.45672430559986
Episode 519: 239.45672430559986
Episode 520: 239.45672430559986
Episode 521: 239.45672430559986
Episode 522: 239.45672430559986
Episode 523: 239.45672430559986
Episode 524: 239.45672430559986
Episode 525: 239.45672430559986
Episode 526: 239.45672430559986
Episode 527: 239.45672430559986
Episode 528: 239.45672430559986
Episode 529: 239.45672430559986
Episode 530: 239.45672430559986
Episode 531: 239.45672430559986
Episode 532: 239.45672430559986
Episode 533: 239.45672430559986
Episode 534: 239.45672430559986
Episode 535: 239.45672430559986
Episode 536: 239.45672430559986
Episode 537: 239.45672430559986
Episode 538: 239.45672430559986
Episode 539: 239.45672430559986
Episode 540: 239.45672430559986
Episode 541: 239.45672430559986
Episode 542: 239.45672430559986
Episode 543: 239.45672430559986
Episode 544: 239.45672430559986
Episode 545: 239.45672430559986
Episode 546: 239.45672430559986
Episode 547: 239.45672430559986
Episode 

KeyboardInterrupt: 

In [19]:
# Keep the KL run's results under dedicated names so later cells that reuse
# episode_rewards / run_time do not hide them.
dr_trpo_kl_rewards, dr_trpo_kl_runtime = episode_rewards, run_time

In [20]:
# Write the KL results as CSV rows of (elapsed seconds, average episode
# reward) to a file named after the environment and the current timestamp.
name = f'./log_files/dr_trpo_kl/{kl_env}-{time.time()}.csv'
out = np.column_stack([dr_trpo_kl_runtime, dr_trpo_kl_rewards])
with open(name, 'ab') as f:
    np.savetxt(f, out, delimiter=',')

## DRPO Agent (Wasserstein)

In [119]:
import gym
from drpo import DRTRPOAgent 
import time
import numpy as np

# Train a DRTRPOAgent with the Wasserstein policy loss. For every episode, one
# rollout is run per discrete action (that action is forced on the first step,
# the agent chooses afterwards); the first-step advantage of each rollout is
# collected and passed to the policy loss.

wass_env = "CartPole-v1"
# Create Gym environment
env = gym.make(wass_env)

# Check agent class for initialization parameters and initialize agent.
# Hyperparameters are only defined for CartPole-v1; any other environment id
# needs its own values added here.
if wass_env == "CartPole-v1":
    gamma = 0.95
    lr = 1e-2

agent = DRTRPOAgent(env, gamma, lr)

# Define training parameters
max_episodes = 150
max_steps = 500
total_adv_diff = 0  # running sum of |adv(action 1) - adv(action 0)|

episode_rewards = []  # average reward over the per-action rollouts, per episode
run_time = []         # seconds since start_time, sampled after each update
start_time = time.time()
for episode in range(max_episodes):
    if episode == 0:
        first_state = env.reset()
    else:
        # Start from the state at which the previous episode's last rollout
        # took its final step (`state` is not advanced on the terminating step).
        first_state = state
    state_adv = []
    total_value_loss = 0

    episode_reward = 0
    # loop through the first action
    for i in range(env.action_space.n):
        # NOTE(review): the observation returned by reset() is discarded and
        # `first_state` is used instead; whether the environment's internal
        # state then matches `first_state` depends on the env -- confirm.
        env.reset()
        state = first_state
        action = i
        trajectory = []

        for step in range(max_steps):
            # Step 0 uses the forced action `i`; later steps ask the agent.
            if step != 0:
                action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, action, reward, next_state, done))
            episode_reward += reward
            if done or step == max_steps-1:
                break
            state = next_state

        adv, value_loss = agent.compute_adv_mc(trajectory)
        # Keep only the first-step advantage, i.e. the one for forced action `i`.
        state_adv.append(adv[0])
        total_value_loss += value_loss

    # Indexing [0] and [1] assumes a two-action environment (as CartPole is).
    total_adv_diff += abs(state_adv[1] - state_adv[0])
    # larger beta, better stability; smaller beta, better exploration
    # total_adv_diff holds episode + 1 terms at this point, so that is the
    # divisor for the running mean; dividing by `episode` divided by zero on
    # the first iteration.
    beta = total_adv_diff / (episode + 1)
    beta += np.random.random()*0.3-0.1

    # episode_reward sums over all per-action rollouts, so normalise by their count.
    avg_episode_reward = episode_reward/env.action_space.n
    # add randomness for better exploration
    if (episode % 10 == 0) and (avg_episode_reward <= 350):
        state_adv[0] += (np.random.random()-0.5)*0.5
        state_adv[1] += (np.random.random()-0.5)*0.5

    state_adv[0] += 0.5

    # restart the agent if stuck
    # NOTE(review): after a restart, `value_loss` below was still produced by
    # the previous agent instance -- confirm update() behaves as intended.
    if (episode >= 5) and (avg_episode_reward <= 15):
        agent = DRTRPOAgent(env, gamma, lr)

    policy_loss = agent.compute_policy_loss_wass(first_state, state_adv, beta)
    # NOTE(review): `value_loss` is the loss of the LAST rollout only;
    # `total_value_loss` is accumulated above but never used -- confirm which
    # one the update is meant to receive.
    agent.update(value_loss, policy_loss)
    elapse = time.time() - start_time
    run_time.append(elapse)

    episode_rewards.append(avg_episode_reward)
    print("Episode " + str(episode) + ": " + str(avg_episode_reward))

dr_trpo_wass_rewards = episode_rewards
dr_trpo_wass_runtime = run_time

Episode 0: 20.0
Episode 1: 15.0
Episode 2: 13.5
Episode 3: 12.5
Episode 4: 11.0
Episode 5: 11.0
Episode 6: 15.5
Episode 7: 20.0
Episode 8: 20.5
Episode 9: 20.5
Episode 10: 40.0
Episode 11: 50.5
Episode 12: 86.5
Episode 13: 75.0
Episode 14: 62.5
Episode 15: 56.5
Episode 16: 46.5
Episode 17: 75.0
Episode 18: 46.0
Episode 19: 52.5
Episode 20: 50.0
Episode 21: 35.0
Episode 22: 26.0
Episode 23: 27.0
Episode 24: 17.5
Episode 25: 16.5
Episode 26: 14.5
Episode 27: 20.5
Episode 28: 18.0
Episode 29: 18.0
Episode 30: 19.0
Episode 31: 24.0
Episode 32: 19.0
Episode 33: 27.0
Episode 34: 32.0
Episode 35: 59.5
Episode 36: 71.5
Episode 37: 63.0
Episode 38: 105.0
Episode 39: 41.0
Episode 40: 33.5
Episode 41: 30.0
Episode 42: 26.5
Episode 43: 26.5
Episode 44: 19.0
Episode 45: 28.0
Episode 46: 27.0
Episode 47: 31.5
Episode 48: 38.0
Episode 49: 44.0
Episode 50: 47.0
Episode 51: 83.5
Episode 52: 142.5
Episode 53: 95.0
Episode 54: 76.5
Episode 55: 93.5
Episode 56: 53.0
Episode 57: 91.5
Episode 58: 74.5
Episo

In [120]:
# Write the Wasserstein results as CSV rows of (elapsed seconds, average
# episode reward) to a file named after the environment and the current
# timestamp.
name = f'./log_files/dr_trpo_wass/{wass_env}-{time.time()}.csv'
out = np.column_stack([dr_trpo_wass_runtime, dr_trpo_wass_rewards])
with open(name, 'ab') as f:
    np.savetxt(f, out, delimiter=',')

## DRPO Agent (Sinkhorn)

In [None]:
import gym
from drpo import DRTRPOAgent 
import time
import numpy as np

# Train a DRTRPOAgent with the Sinkhorn policy loss. For every episode, one
# rollout is run per discrete action (that action is forced on the first step,
# the agent chooses afterwards); the first-step advantage of each rollout is
# collected and passed to the policy loss.

sink_env = "CartPole-v1"
# Create Gym environment
env = gym.make(sink_env)

# Check agent class for initialization parameters and initialize agent.
# The guard must test this cell's own `sink_env`; it previously tested
# `wass_env`, which only exists if the Wasserstein cell has been run.
if sink_env == "CartPole-v1":
    gamma = 0.95
    lr = 1e-2

agent = DRTRPOAgent(env, gamma, lr)

# Define training parameters
max_episodes = 200
max_steps = 500
total_adv_diff = 0  # running sum of |adv(action 1) - adv(action 0)|

episode_rewards = []  # average reward over the per-action rollouts, per episode
run_time = []         # seconds since start_time, sampled after each update
start_time = time.time()
for episode in range(max_episodes):
    if episode == 0:
        first_state = env.reset()
    else:
        # Start from the state at which the previous episode's last rollout
        # took its final step (`state` is not advanced on the terminating step).
        first_state = state
    state_adv = []
    total_value_loss = 0

    episode_reward = 0
    # loop through the first action
    for i in range(env.action_space.n):
        # NOTE(review): the observation returned by reset() is discarded and
        # `first_state` is used instead; whether the environment's internal
        # state then matches `first_state` depends on the env -- confirm.
        env.reset()
        state = first_state
        action = i
        trajectory = []

        for step in range(max_steps):
            # Step 0 uses the forced action `i`; later steps ask the agent.
            if step != 0:
                action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, action, reward, next_state, done))
            episode_reward += reward
            if done or step == max_steps-1:
                break
            state = next_state

        adv, value_loss = agent.compute_adv_mc(trajectory)
        # Keep only the first-step advantage, i.e. the one for forced action `i`.
        state_adv.append(adv[0])
        total_value_loss += value_loss

    # Indexing [0] and [1] assumes a two-action environment (as CartPole is).
    total_adv_diff += abs(state_adv[1] - state_adv[0])
    # larger beta, better stability; smaller beta, better exploration
    # total_adv_diff holds episode + 1 terms at this point, so that is the
    # divisor for the running mean; dividing by `episode` divided by zero on
    # the first iteration. This adaptive value is currently overridden by the
    # fixed `beta = 50` below; it is still computed so the np.random draw
    # sequence is unchanged.
    beta = total_adv_diff / (episode + 1)
    beta += np.random.random()*0.3-0.1

    # episode_reward sums over all per-action rollouts, so normalise by their count.
    avg_episode_reward = episode_reward/env.action_space.n
    # add randomness for better exploration
    if (episode % 10 == 0) and (avg_episode_reward <= 350):
        state_adv[0] += (np.random.random()-0.5)*0.5
        state_adv[1] += (np.random.random()-0.5)*0.5

    # restart the agent if stuck
    # NOTE(review): after a restart, `value_loss` below was still produced by
    # the previous agent instance -- confirm update() behaves as intended.
    if (episode >= 5) and (avg_episode_reward <= 10):
        agent = DRTRPOAgent(env, gamma, lr)

    # Fixed beta for the Sinkhorn loss; replaces the adaptive value above.
    beta = 50
    policy_loss = agent.compute_policy_loss_sinkhorn(first_state, state_adv, beta)
    # NOTE(review): `value_loss` is the loss of the LAST rollout only;
    # `total_value_loss` is accumulated above but never used -- confirm which
    # one the update is meant to receive.
    agent.update(value_loss, policy_loss)
    elapse = time.time() - start_time
    run_time.append(elapse)

    episode_rewards.append(avg_episode_reward)
    print("Episode " + str(episode) + ": " + str(avg_episode_reward))

dr_trpo_sink_rewards = episode_rewards
dr_trpo_sink_runtime = run_time

In [None]:
# Write the Sinkhorn results as CSV rows of (elapsed seconds, average episode
# reward) to a file named after the environment and the current timestamp.
name = f'./log_files/dr_trpo_sink/{sink_env}-{time.time()}.csv'
out = np.column_stack([dr_trpo_sink_runtime, dr_trpo_sink_rewards])
with open(name, 'ab') as f:
    np.savetxt(f, out, delimiter=',')