# Cartpole

## A2C Agent 

In [None]:
import gym
from a2c import A2CAgent 
import time
import numpy as np

# Train an A2C agent on CartPole-v1 and record, for every episode, the total
# reward and the wall-clock time elapsed since training started.

# Create Gym environment
a2c_env = "CartPole-v1"
env = gym.make(a2c_env)

# Check agent class for initialization parameters and initialize agent
if a2c_env == "CartPole-v1":
    gamma = 0.95
    lr = 1e-3

agent = A2CAgent(env, gamma, lr)

# Define training parameters
max_episodes = 500
max_steps = 500  # cap on environment steps per episode

episode_rewards = []  # one total reward per episode
run_time = []         # seconds since start_time, sampled after each update
start_time = time.time()
for episode in range(max_episodes):
    trajectory = []
    state = env.reset()
    episode_reward = 0
    for step in range(max_steps):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        trajectory.append((state, action, reward, next_state, done))
        episode_reward += reward
        if done:
            break
        state = next_state

    # Record the episode once the rollout is over, whether it ended because
    # the environment reported `done` or because the step cap was reached.
    # Doing this unconditionally keeps episode_rewards and run_time the same
    # length, which the later np.column_stack of the two lists requires.
    episode_rewards.append(episode_reward)
    print("Episode " + str(episode) + ": " + str(episode_reward))

    agent.update(trajectory, 0)
    elapse = time.time() - start_time
    run_time.append(elapse)

# Keep A2C-specific aliases; later cells reuse episode_rewards / run_time.
a2c_rewards = episode_rewards
a2c_runtime = run_time

In [None]:
# Write the A2C learning curve to a CSV file with two columns:
# elapsed run time and episode reward. The current timestamp is embedded in
# the file name so each run gets its own log.
name = f"./log_files/a2c/{a2c_env}-{time.time()}.csv"
out = np.column_stack([a2c_runtime, a2c_rewards])
with open(name, mode='ab') as f:
    np.savetxt(f, out, delimiter=',')

## DRPO Agent (KL) 

In [2]:
import gym
import gym_electricitymarket
from drpo import DRTRPOAgent 
import time
import numpy as np

# Train a DRTRPOAgent with the KL policy loss. Each episode performs one
# rollout per discrete action: the rollout is forced to start with that
# action and afterwards follows the agent's own policy.

# Create Gym environment
kl_env = 'ElectricityMarketDiscreteDQN-v0'
env = gym.make(kl_env)

# Check agent class for initialization parameters and initialize agent

# When the learning rate is large, the policy neural network can overflow and
# produce NaNs. A possible fix is to reduce lr, or to increase beta, which
# lowers the effective learning rate.

if kl_env == "ElectricityMarketDiscreteDQN-v0":
    gamma = 0.95
    lr = 1e-2
    beta = 8

agent = DRTRPOAgent(env, gamma, lr)

# Define training parameters
max_episodes = 1500
max_steps = 30  # cap on environment steps per rollout

episode_rewards = []  # mean reward per rollout, one entry per episode
run_time = []         # seconds since start_time, sampled after each update
start_time = time.time()
for episode in range(max_episodes):
    if episode == 0:
        first_state = env.reset()
    else:
        # Continue from the last `state` left behind by the previous
        # episode's final rollout.
        first_state = state
    state_adv = []  # adv[0] of each rollout, indexed by the forced first action
    # NOTE(review): total_value_loss is accumulated below but never used;
    # agent.update() receives only the value_loss of the last rollout.
    # Confirm which of the two is intended.
    total_value_loss = 0

    episode_reward = 0  # summed over all rollouts of this episode
    # loop through the first action
    for i in range(env.action_space.n):
        # NOTE(review): the state returned by env.reset() is discarded and
        # first_state is used instead -- confirm the environment's internal
        # state is meant to differ from `state` here.
        env.reset()
        state = first_state
        action = i
        trajectory = []

        for step in range(max_steps):
            # Step 0 uses the forced action i; later steps follow the policy.
            if step != 0:
                action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, action, reward, next_state, done))
            episode_reward += reward
            if done or step == max_steps-1:
                break
            state = next_state

        adv, value_loss = agent.compute_adv_mc(trajectory)
        state_adv.append(adv[0])
        total_value_loss += value_loss

    avg_episode_reward = episode_reward/env.action_space.n
    # add randomness for better exploration
#     if (avg_episode_reward <= 300) and (episode % 10 == 0):
#         state_adv[0] += (np.random.random()-0.5)*0.5
#         state_adv[1] += (np.random.random()-0.5)*0.5

#     state_adv[0] += 0.5

    # restart the agent if stuck
#     if (episode >= 5) and (avg_episode_reward <= 15):
#         agent = DRTRPOAgent(env, gamma, lr)

    policy_loss = agent.compute_policy_loss_kl(first_state, state_adv, beta)
    agent.update(value_loss, policy_loss)
    elapse = time.time() - start_time
    run_time.append(elapse)

    episode_rewards.append(avg_episode_reward)
    print("Episode " + str(episode) + ": " + str(avg_episode_reward))

Episode 0: -497.4252074263878
Episode 1: -483.0215132368023
Episode 2: -497.2963056597233
Episode 3: -490.68502547639036
Episode 4: -485.0753926694442
Episode 5: -487.3645913555585
Episode 6: -484.91585864930846
Episode 7: -469.752737040972
Episode 8: -467.17962752778
Episode 9: -452.67319262777545
Episode 10: -426.8950803368036
Episode 11: -416.6232095631957
Episode 12: -395.5657715812533
Episode 13: -385.72991919792025
Episode 14: -375.4774829000003
Episode 15: -348.10248611666793
Episode 16: -332.0362797847208
Episode 17: -304.8082848708315
Episode 18: -299.47632802916536
Episode 19: -277.3325855722204
Episode 20: -270.7764667013873
Episode 21: -251.92331735902414
Episode 22: -227.775523430554
Episode 23: -216.13877688610842
Episode 24: -202.9080506923587
Episode 25: -185.19371057777622
Episode 26: -174.06608393472018
Episode 27: -160.65654007569276
Episode 28: -154.9727689020818
Episode 29: -149.7162191729155
Episode 30: -148.32500677291605
Episode 31: -145.74313281736113
Episode 3

Episode 259: 594.9844268055828
Episode 260: 594.0128815722483
Episode 261: 594.1263316847488
Episode 262: 593.8940320354429
Episode 263: 593.6713163451648
Episode 264: 594.2071637611374
Episode 265: 594.0052356111373
Episode 266: 594.2350889944707
Episode 267: 593.5283405597479
Episode 268: 594.0803021062765
Episode 269: 594.1841455451652
Episode 270: 594.766375636138
Episode 271: 593.7755591097483
Episode 272: 593.5772138944703
Episode 273: 594.492251336138
Episode 274: 594.0683989250261
Episode 275: 594.7342647222491
Episode 276: 593.5365189097481
Episode 277: 594.2637668083598
Episode 278: 594.2053632611376
Episode 279: 594.1149226208595
Episode 280: 594.8156382639158
Episode 281: 593.2843817229425
Episode 282: 593.9955825284984
Episode 283: 593.7808945403037
Episode 284: 594.0276922729428
Episode 285: 594.7578299333607
Episode 286: 594.3587395729435
Episode 287: 593.942849619471
Episode 288: 594.7314810500267
Episode 289: 594.5895259694712
Episode 290: 593.696609282665
Episode 291:

Episode 525: 594.4919120173877
Episode 526: 594.892466395166
Episode 527: 594.2409775028042
Episode 528: 595.0749011805829
Episode 529: 594.4361116055821
Episode 530: 594.1991650486374
Episode 531: 594.1548317139155
Episode 532: 594.2643952139153
Episode 533: 594.5646302618321
Episode 534: 595.0014183194718
Episode 535: 594.9878293055827
Episode 536: 594.5887722090546
Episode 537: 593.8970660646094
Episode 538: 594.3116784715543
Episode 539: 594.834000092388
Episode 540: 594.2573908979432
Episode 541: 594.7634713361379
Episode 542: 594.3469862562765
Episode 543: 594.8241342028047
Episode 544: 594.6449903778044
Episode 545: 594.9075429028048
Episode 546: 594.3174828284989
Episode 547: 593.970965495165
Episode 548: 594.719319202805
Episode 549: 594.8861006055824
Episode 550: 594.5489559694711
Episode 551: 593.9259142583595
Episode 552: 594.1413178722485
Episode 553: 595.0505759722495
Episode 554: 594.7923241757213
Episode 555: 594.6176649507212
Episode 556: 594.6530865944712
Episode 557:

Episode 791: 594.9318370201662
Episode 792: 594.5227211972489
Episode 793: 594.8229116097492
Episode 794: 594.9042259784994
Episode 795: 594.8335293618325
Episode 796: 594.5397650486378
Episode 797: 594.1237033472487
Episode 798: 594.8704695694715
Episode 799: 594.4723941736379
Episode 800: 594.4530582611378
Episode 801: 594.5919669055821
Episode 802: 594.4216541014156
Episode 803: 594.7359233639156
Episode 804: 595.0933102722495
Episode 805: 594.6732440444712
Episode 806: 595.0591143055829
Episode 807: 594.2825985319708
Episode 808: 594.254247741693
Episode 809: 594.839669461138
Episode 810: 594.2261426396099
Episode 811: 594.8868599333606
Episode 812: 594.8414820916937
Episode 813: 594.5003328673881
Episode 814: 594.5079074923879
Episode 815: 594.801610272249
Episode 816: 594.9707813916939
Episode 817: 594.5117734479434
Episode 818: 594.9746101944719
Episode 819: 594.2944395229432
Episode 820: 594.7789718055824
Episode 821: 595.011320813916
Episode 822: 594.7463151416937
Episode 823:

Episode 1055: 595.1788120201662
Episode 1056: 594.3680667055822
Episode 1057: 594.5143814361378
Episode 1058: 594.0296690319708
Episode 1059: 594.7439293555825
Episode 1060: 593.6593316500258
Episode 1061: 594.0019025236372
Episode 1062: 594.8363659784992
Episode 1063: 593.9964005958594
Episode 1064: 594.2605195694712
Episode 1065: 594.9294322222494
Episode 1066: 595.062254438916
Episode 1067: 594.776294986138
Episode 1068: 594.6358331361379
Episode 1069: 594.2038304500263
Episode 1070: 594.541929772249
Episode 1071: 594.5160817805823
Episode 1072: 594.7809034528046
Episode 1073: 593.9155303034985
Episode 1074: 594.6207124368323
Episode 1075: 594.7221002118323
Episode 1076: 594.9479337534995
Episode 1077: 594.7980943611379
Episode 1078: 594.8016283139157
Episode 1079: 594.5257364666935
Episode 1080: 594.799466127805
Episode 1081: 595.0610745139162
Episode 1082: 594.7699696250268
Episode 1083: 594.3644671583598
Episode 1084: 593.4945400139147
Episode 1085: 594.56030556461
Episode 1086: 

Episode 1312: 594.1188696194708
Episode 1313: 594.8142130778048
Episode 1314: 594.3273114146099
Episode 1315: 594.4124593361377
Episode 1316: 594.3111695451654
Episode 1317: 594.3680715479431
Episode 1318: 594.8665388951658
Episode 1319: 594.9682967861382
Episode 1320: 594.1406264139154
Episode 1321: 594.8488357639158
Episode 1322: 594.6796103757214
Episode 1323: 593.9923169632206
Episode 1324: 594.7431401361379
Episode 1325: 594.5498563694713
Episode 1326: 594.9222992528047
Episode 1327: 594.0425184173871
Episode 1328: 594.8779800111382
Episode 1329: 594.715263267388
Episode 1330: 594.451702619471
Episode 1331: 594.7824888090547
Episode 1332: 594.7378487333605
Episode 1333: 593.9804968590539
Episode 1334: 594.4936974111379
Episode 1335: 594.9623760833606
Episode 1336: 594.3123281083597
Episode 1337: 594.5282213423878
Episode 1338: 594.3347759729434
Episode 1339: 594.4645168555824
Episode 1340: 595.0406013889162
Episode 1341: 594.7893317028046
Episode 1342: 594.5368430583602
Episode 13

KeyboardInterrupt: 

In [3]:
# Keep KL-specific references to the training curves, because later cells
# rebind episode_rewards / run_time to fresh lists.
dr_trpo_kl_rewards, dr_trpo_kl_runtime = episode_rewards, run_time

In [4]:
# Write the DRPO (KL) learning curve to a CSV file with two columns:
# elapsed run time and average episode reward. The current timestamp is
# embedded in the file name so each run gets its own log.
name = f"./log_files/dr_trpo_kl/{kl_env}-{time.time()}.csv"
out = np.column_stack([dr_trpo_kl_runtime, dr_trpo_kl_rewards])
with open(name, mode='ab') as f:
    np.savetxt(f, out, delimiter=',')

## DRPO Agent (Wasserstein)

In [119]:
import gym
from drpo import DRTRPOAgent 
import time
import numpy as np

# Train a DRTRPOAgent with the Wasserstein policy loss. Each episode performs
# one rollout per discrete action: the rollout is forced to start with that
# action and afterwards follows the agent's own policy.

wass_env = "CartPole-v1"
# Create Gym environment
env = gym.make(wass_env)

# Check agent class for initialization parameters and initialize agent
if wass_env == "CartPole-v1":
    gamma = 0.95
    lr = 1e-2

agent = DRTRPOAgent(env, gamma, lr)

# Define training parameters
max_episodes = 150
max_steps = 500      # cap on environment steps per rollout
total_adv_diff = 0   # running sum of |state_adv[1] - state_adv[0]|

episode_rewards = []  # mean reward per rollout, one entry per episode
run_time = []         # seconds since start_time, sampled after each update
start_time = time.time()
for episode in range(max_episodes):
    if episode == 0:
        first_state = env.reset()
    else:
        # Continue from the last `state` left behind by the previous
        # episode's final rollout.
        first_state = state
    state_adv = []  # adv[0] of each rollout, indexed by the forced first action
    # NOTE(review): total_value_loss is accumulated below but never used;
    # agent.update() receives only the value_loss of the last rollout.
    # Confirm which of the two is intended.
    total_value_loss = 0

    episode_reward = 0  # summed over all rollouts of this episode
    # loop through the first action
    for i in range(env.action_space.n):
        # NOTE(review): the state returned by env.reset() is discarded and
        # first_state is used instead -- confirm this is intended.
        env.reset()
        state = first_state
        action = i
        trajectory = []

        for step in range(max_steps):
            # Step 0 uses the forced action i; later steps follow the policy.
            if step != 0:
                action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, action, reward, next_state, done))
            episode_reward += reward
            if done or step == max_steps-1:
                break
            state = next_state

        adv, value_loss = agent.compute_adv_mc(trajectory)
        state_adv.append(adv[0])
        total_value_loss += value_loss

    # Indexing [0] and [1] assumes exactly two actions, as in CartPole.
    total_adv_diff += abs(state_adv[1] - state_adv[0])
    # larger beta, better stability; smaller beta, better exploration
    # Divide by max(episode, 1) so that episode 0 does not divide by zero;
    # every later episode gets exactly total_adv_diff/episode as before.
    beta = total_adv_diff/max(episode, 1)
    beta += np.random.random()*0.3-0.1

    avg_episode_reward = episode_reward/env.action_space.n
    # add randomness for better exploration
    if (episode % 10 == 0) and (avg_episode_reward <= 350):
        state_adv[0] += (np.random.random()-0.5)*0.5
        state_adv[1] += (np.random.random()-0.5)*0.5

    # Constant offset applied to the first action's advantage every episode.
    state_adv[0] += 0.5

    # restart the agent if stuck
    # NOTE(review): the value_loss / state_adv used below were produced by the
    # agent being replaced here -- confirm update() on a fresh agent is intended.
    if (episode >= 5) and (avg_episode_reward <= 15):
        agent = DRTRPOAgent(env, gamma, lr)

    policy_loss = agent.compute_policy_loss_wass(first_state, state_adv, beta)
    agent.update(value_loss, policy_loss)
    elapse = time.time() - start_time
    run_time.append(elapse)

    episode_rewards.append(avg_episode_reward)
    print("Episode " + str(episode) + ": " + str(avg_episode_reward))

# Keep Wasserstein-specific aliases; later cells reuse the generic names.
dr_trpo_wass_rewards = episode_rewards
dr_trpo_wass_runtime = run_time

Episode 0: 20.0
Episode 1: 15.0
Episode 2: 13.5
Episode 3: 12.5
Episode 4: 11.0
Episode 5: 11.0
Episode 6: 15.5
Episode 7: 20.0
Episode 8: 20.5
Episode 9: 20.5
Episode 10: 40.0
Episode 11: 50.5
Episode 12: 86.5
Episode 13: 75.0
Episode 14: 62.5
Episode 15: 56.5
Episode 16: 46.5
Episode 17: 75.0
Episode 18: 46.0
Episode 19: 52.5
Episode 20: 50.0
Episode 21: 35.0
Episode 22: 26.0
Episode 23: 27.0
Episode 24: 17.5
Episode 25: 16.5
Episode 26: 14.5
Episode 27: 20.5
Episode 28: 18.0
Episode 29: 18.0
Episode 30: 19.0
Episode 31: 24.0
Episode 32: 19.0
Episode 33: 27.0
Episode 34: 32.0
Episode 35: 59.5
Episode 36: 71.5
Episode 37: 63.0
Episode 38: 105.0
Episode 39: 41.0
Episode 40: 33.5
Episode 41: 30.0
Episode 42: 26.5
Episode 43: 26.5
Episode 44: 19.0
Episode 45: 28.0
Episode 46: 27.0
Episode 47: 31.5
Episode 48: 38.0
Episode 49: 44.0
Episode 50: 47.0
Episode 51: 83.5
Episode 52: 142.5
Episode 53: 95.0
Episode 54: 76.5
Episode 55: 93.5
Episode 56: 53.0
Episode 57: 91.5
Episode 58: 74.5
Episo

In [120]:
# Write the DRPO (Wasserstein) learning curve to a CSV file with two columns:
# elapsed run time and average episode reward. The current timestamp is
# embedded in the file name so each run gets its own log.
name = f"./log_files/dr_trpo_wass/{wass_env}-{time.time()}.csv"
out = np.column_stack([dr_trpo_wass_runtime, dr_trpo_wass_rewards])
with open(name, mode='ab') as f:
    np.savetxt(f, out, delimiter=',')

## DRPO Agent (Sinkhorn)

In [None]:
import gym
from drpo import DRTRPOAgent 
import time
import numpy as np

# Train a DRTRPOAgent with the Sinkhorn policy loss. Each episode performs
# one rollout per discrete action: the rollout is forced to start with that
# action and afterwards follows the agent's own policy.

sink_env = "CartPole-v1"
# Create Gym environment
env = gym.make(sink_env)

# Check agent class for initialization parameters and initialize agent
# (guard on this cell's own sink_env so the cell also runs on its own)
if sink_env == "CartPole-v1":
    gamma = 0.95
    lr = 1e-2

agent = DRTRPOAgent(env, gamma, lr)

# Define training parameters
max_episodes = 200
max_steps = 500  # cap on environment steps per rollout
# beta is held fixed for every episode. An adaptive value derived from the
# advantage gap used to be computed here and was immediately overwritten by
# this constant (and divided by zero at episode 0), so it has been dropped.
beta = 50

episode_rewards = []  # mean reward per rollout, one entry per episode
run_time = []         # seconds since start_time, sampled after each update
start_time = time.time()
for episode in range(max_episodes):
    if episode == 0:
        first_state = env.reset()
    else:
        # Continue from the last `state` left behind by the previous
        # episode's final rollout.
        first_state = state
    state_adv = []  # adv[0] of each rollout, indexed by the forced first action
    # NOTE(review): total_value_loss is accumulated below but never used;
    # agent.update() receives only the value_loss of the last rollout.
    # Confirm which of the two is intended.
    total_value_loss = 0

    episode_reward = 0  # summed over all rollouts of this episode
    # loop through the first action
    for i in range(env.action_space.n):
        # NOTE(review): the state returned by env.reset() is discarded and
        # first_state is used instead -- confirm this is intended.
        env.reset()
        state = first_state
        action = i
        trajectory = []

        for step in range(max_steps):
            # Step 0 uses the forced action i; later steps follow the policy.
            if step != 0:
                action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, action, reward, next_state, done))
            episode_reward += reward
            if done or step == max_steps-1:
                break
            state = next_state

        adv, value_loss = agent.compute_adv_mc(trajectory)
        state_adv.append(adv[0])
        total_value_loss += value_loss

    avg_episode_reward = episode_reward/env.action_space.n
    # add randomness for better exploration
    # (indexing [0] and [1] assumes exactly two actions, as in CartPole)
    if (episode % 10 == 0) and (avg_episode_reward <= 350):
        state_adv[0] += (np.random.random()-0.5)*0.5
        state_adv[1] += (np.random.random()-0.5)*0.5

    # restart the agent if stuck
    # NOTE(review): the value_loss / state_adv used below were produced by the
    # agent being replaced here -- confirm update() on a fresh agent is intended.
    if (episode >= 5) and (avg_episode_reward <= 10):
        agent = DRTRPOAgent(env, gamma, lr)

    policy_loss = agent.compute_policy_loss_sinkhorn(first_state, state_adv, beta)
    agent.update(value_loss, policy_loss)
    elapse = time.time() - start_time
    run_time.append(elapse)

    episode_rewards.append(avg_episode_reward)
    print("Episode " + str(episode) + ": " + str(avg_episode_reward))

# Keep Sinkhorn-specific aliases for the save cell that follows.
dr_trpo_sink_rewards = episode_rewards
dr_trpo_sink_runtime = run_time

In [None]:
# Write the DRPO (Sinkhorn) learning curve to a CSV file with two columns:
# elapsed run time and average episode reward. The current timestamp is
# embedded in the file name so each run gets its own log.
name = f"./log_files/dr_trpo_sink/{sink_env}-{time.time()}.csv"
out = np.column_stack([dr_trpo_sink_runtime, dr_trpo_sink_rewards])
with open(name, mode='ab') as f:
    np.savetxt(f, out, delimiter=',')