In [1]:
import gym
import numpy as np
import torch

In [2]:
env = gym.make('CartPole-v0')
print(env.action_space, env.observation_space)

Discrete(2) Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)


In [3]:
n_state = env.observation_space.shape[0]
n_action = env.action_space.n
n_state, n_action

(4, 2)

# Random search

In [4]:
def run_episode(env, weight):
    actions = []
    state = env.reset()
#     print('reset - ', state)
    total_reward = 0
    is_done = False
    while not is_done:
        state = torch.from_numpy(state).float()
        action = torch.argmax(torch.matmul(state, weight))
        #print(torch.matmul(state, weight), action)
        state, reward, is_done, _ = env.step(action.item())
        total_reward += reward
        actions.append(action)
    return total_reward

In [5]:
n_episode = 1000

best_total_reward = 0
best_weight = None

total_rewards = []

for episode in range(n_episode):
    weight = torch.rand(n_state, n_action)
    total_reward = run_episode(env, weight)
    print('Episode {}: {}'.format(episode+1, total_reward))
    if total_reward > best_total_reward:
        best_weight = weight
        best_total_reward = total_reward
    total_rewards.append(total_reward)

print('Average total reward over {} episode: {}'.format(n_episode, sum(total_rewards) / n_episode))

Episode 1: 63.0
Episode 2: 200.0
Episode 3: 28.0
Episode 4: 66.0
Episode 5: 81.0
Episode 6: 200.0
Episode 7: 9.0
Episode 8: 74.0
Episode 9: 200.0
Episode 10: 79.0
Episode 11: 8.0
Episode 12: 8.0
Episode 13: 65.0
Episode 14: 50.0
Episode 15: 21.0
Episode 16: 10.0
Episode 17: 56.0
Episode 18: 10.0
Episode 19: 10.0
Episode 20: 9.0
Episode 21: 8.0
Episode 22: 10.0
Episode 23: 200.0
Episode 24: 9.0
Episode 25: 9.0
Episode 26: 10.0
Episode 27: 129.0
Episode 28: 10.0
Episode 29: 19.0
Episode 30: 8.0
Episode 31: 86.0
Episode 32: 9.0
Episode 33: 11.0
Episode 34: 9.0
Episode 35: 11.0
Episode 36: 9.0
Episode 37: 9.0
Episode 38: 10.0
Episode 39: 37.0
Episode 40: 10.0
Episode 41: 21.0
Episode 42: 10.0
Episode 43: 200.0
Episode 44: 91.0
Episode 45: 200.0
Episode 46: 21.0
Episode 47: 9.0
Episode 48: 9.0
Episode 49: 38.0
Episode 50: 36.0
Episode 51: 10.0
Episode 52: 8.0
Episode 53: 8.0
Episode 54: 9.0
Episode 55: 9.0
Episode 56: 9.0
Episode 57: 143.0
Episode 58: 200.0
Episode 59: 22.0
Episode 60: 25.0

Episode 471: 9.0
Episode 472: 59.0
Episode 473: 60.0
Episode 474: 47.0
Episode 475: 89.0
Episode 476: 9.0
Episode 477: 9.0
Episode 478: 10.0
Episode 479: 175.0
Episode 480: 68.0
Episode 481: 9.0
Episode 482: 9.0
Episode 483: 9.0
Episode 484: 10.0
Episode 485: 9.0
Episode 486: 23.0
Episode 487: 10.0
Episode 488: 10.0
Episode 489: 59.0
Episode 490: 9.0
Episode 491: 30.0
Episode 492: 200.0
Episode 493: 10.0
Episode 494: 200.0
Episode 495: 188.0
Episode 496: 200.0
Episode 497: 10.0
Episode 498: 80.0
Episode 499: 57.0
Episode 500: 70.0
Episode 501: 10.0
Episode 502: 10.0
Episode 503: 9.0
Episode 504: 149.0
Episode 505: 64.0
Episode 506: 92.0
Episode 507: 9.0
Episode 508: 10.0
Episode 509: 9.0
Episode 510: 62.0
Episode 511: 9.0
Episode 512: 195.0
Episode 513: 9.0
Episode 514: 9.0
Episode 515: 56.0
Episode 516: 200.0
Episode 517: 9.0
Episode 518: 8.0
Episode 519: 10.0
Episode 520: 10.0
Episode 521: 10.0
Episode 522: 53.0
Episode 523: 9.0
Episode 524: 122.0
Episode 525: 106.0
Episode 526: 10.0

Episode 983: 10.0
Episode 984: 38.0
Episode 985: 45.0
Episode 986: 9.0
Episode 987: 140.0
Episode 988: 33.0
Episode 989: 69.0
Episode 990: 200.0
Episode 991: 10.0
Episode 992: 10.0
Episode 993: 131.0
Episode 994: 200.0
Episode 995: 31.0
Episode 996: 9.0
Episode 997: 61.0
Episode 998: 200.0
Episode 999: 9.0
Episode 1000: 23.0
Average total reward over 1000 episode: 50.637


In [7]:
n_episode_eval = 100
total_rewards_eval = []
for episode in range(n_episode_eval):
    total_reward = run_episode(env, best_weight)
#     print('Episode {}: {}'.format(episode+1, total_reward))
    total_rewards_eval.append(total_reward)

print('Average total reward over {} episode: {}'.format(n_episode, sum(total_rewards_eval) / n_episode_eval))

Average total reward over 1000 episode: 140.37


In [8]:
best_weight

tensor([[0.9113, 0.6964],
        [0.4157, 0.5779],
        [0.7755, 0.5194],
        [0.6823, 0.8948]])

In [10]:
n_episode = 1000
n_training = 1000
n_episode_training = []
for _ in range(n_training):
    for episode in range(n_episode):
        weight = torch.rand(n_state, n_action)
        total_reward = run_episode(env, weight)
        if total_reward == 200:
            n_episode_training.append(episode+1)
            break

print('Expectation of training episodes needed: ', sum(n_episode_training) / n_training)

Expectation of training episodes needed:  14.242


# Hill climbing

In [11]:
n_episode = 1000
best_weight = torch.rand(n_state, n_action)
best_total_reward = 0
total_rewards = []

noise_scale = 0.1

for episode in range(n_episode):
    weight = best_weight + noise_scale * torch.rand(n_state, n_action)
    total_reward = run_episode(env, weight)
    if total_reward >= best_total_reward:
        best_total_reward = total_reward
        best_weight = weight
    total_rewards.append(total_reward)
#     print('Episode {}: {}'.format(episode + 1, total_reward))

print('Average total reward over {} episode: {}'.format(n_episode, sum(total_rewards) / n_episode))

Average total reward over 1000 episode: 20.661


## Do noise adaptation

* set the initial noise factor

* if the quality of the episode was improved, reduce the noise figure factor

* In our case, the coefficient is halved, but never becomes less than 0.0001;

* if the quality of the episode deteriorates, increase the noise figure.

* In our case, the coefficient doubles, but never becomes more than 2.

In [12]:
best_weight = torch.rand(n_state, n_action)
noise_scale = 0.01

best_total_reward = 0
total_rewards = []
for episode in range(n_episode):
    weight = best_weight + noise_scale * torch.rand(n_state, n_action)
    total_reward = run_episode(env, weight)
    if total_reward >= best_total_reward:
        best_total_reward = total_reward
        best_weight = weight
        noise_scale = max(noise_scale / 2, 1e-4)
    else:
        noise_scale = min(noise_scale * 2, 2)

    print('Episode {}: {}'.format(episode + 1, total_reward))
    total_rewards.append(total_reward)

print('Average total reward over {} episode: {}'.format(n_episode, sum(total_rewards) / n_episode))

Episode 1: 9.0
Episode 2: 8.0
Episode 3: 10.0
Episode 4: 8.0
Episode 5: 8.0
Episode 6: 9.0
Episode 7: 8.0
Episode 8: 9.0
Episode 9: 9.0
Episode 10: 9.0
Episode 11: 9.0
Episode 12: 9.0
Episode 13: 10.0
Episode 14: 9.0
Episode 15: 9.0
Episode 16: 9.0
Episode 17: 8.0
Episode 18: 8.0
Episode 19: 10.0
Episode 20: 9.0
Episode 21: 10.0
Episode 22: 10.0
Episode 23: 9.0
Episode 24: 9.0
Episode 25: 9.0
Episode 26: 10.0
Episode 27: 10.0
Episode 28: 10.0
Episode 29: 9.0
Episode 30: 10.0
Episode 31: 9.0
Episode 32: 9.0
Episode 33: 9.0
Episode 34: 10.0
Episode 35: 9.0
Episode 36: 10.0
Episode 37: 8.0
Episode 38: 9.0
Episode 39: 9.0
Episode 40: 10.0
Episode 41: 10.0
Episode 42: 11.0
Episode 43: 8.0
Episode 44: 10.0
Episode 45: 10.0
Episode 46: 10.0
Episode 47: 10.0
Episode 48: 10.0
Episode 49: 10.0
Episode 50: 10.0
Episode 51: 10.0
Episode 52: 10.0
Episode 53: 9.0
Episode 54: 9.0
Episode 55: 9.0
Episode 56: 10.0
Episode 57: 10.0
Episode 58: 11.0
Episode 59: 8.0
Episode 60: 8.0
Episode 61: 40.0
Episod

Episode 473: 29.0
Episode 474: 38.0
Episode 475: 34.0
Episode 476: 35.0
Episode 477: 23.0
Episode 478: 56.0
Episode 479: 55.0
Episode 480: 26.0
Episode 481: 66.0
Episode 482: 28.0
Episode 483: 48.0
Episode 484: 49.0
Episode 485: 61.0
Episode 486: 42.0
Episode 487: 74.0
Episode 488: 61.0
Episode 489: 66.0
Episode 490: 46.0
Episode 491: 65.0
Episode 492: 29.0
Episode 493: 28.0
Episode 494: 40.0
Episode 495: 44.0
Episode 496: 41.0
Episode 497: 35.0
Episode 498: 24.0
Episode 499: 57.0
Episode 500: 36.0
Episode 501: 60.0
Episode 502: 29.0
Episode 503: 73.0
Episode 504: 68.0
Episode 505: 46.0
Episode 506: 24.0
Episode 507: 46.0
Episode 508: 50.0
Episode 509: 64.0
Episode 510: 26.0
Episode 511: 45.0
Episode 512: 34.0
Episode 513: 35.0
Episode 514: 59.0
Episode 515: 52.0
Episode 516: 23.0
Episode 517: 38.0
Episode 518: 42.0
Episode 519: 58.0
Episode 520: 61.0
Episode 521: 27.0
Episode 522: 24.0
Episode 523: 66.0
Episode 524: 58.0
Episode 525: 25.0
Episode 526: 25.0
Episode 527: 42.0
Episode 52

Episode 992: 47.0
Episode 993: 76.0
Episode 994: 35.0
Episode 995: 37.0
Episode 996: 39.0
Episode 997: 39.0
Episode 998: 60.0
Episode 999: 53.0
Episode 1000: 43.0
Average total reward over 1000 episode: 40.106


In [13]:
n_episode_eval = 100
total_rewards_eval = []
for episode in range(n_episode_eval):
    total_reward = run_episode(env, best_weight)
#     print('Episode {}: {}'.format(episode+1, total_reward))
    total_rewards_eval.append(total_reward)


print('Average total reward over {} episode: {}'.format(n_episode, sum(total_rewards_eval) / n_episode_eval))

Average total reward over 1000 episode: 46.91


### Criteria for stop: 100 consecutive episodes, the average reward is not less than 195

In [14]:
best_weight = torch.rand(n_state, n_action)
noise_scale = 0.01

best_total_reward = 0
total_rewards = []
for episode in range(n_episode):
    weight = best_weight + noise_scale * torch.rand(n_state, n_action)
    total_reward = run_episode(env, weight)
    if total_reward >= best_total_reward:
        best_total_reward = total_reward
        best_weight = weight
        noise_scale = max(noise_scale / 2, 1e-4)
    else:
        noise_scale = min(noise_scale * 2, 2)
    print('Episode {}: {}'.format(episode + 1, total_reward))
    total_rewards.append(total_reward)
    if episode >= 99 and sum(total_rewards[-100:]) >= 19500:
        break

Episode 1: 63.0
Episode 2: 75.0
Episode 3: 77.0
Episode 4: 60.0
Episode 5: 91.0
Episode 6: 54.0
Episode 7: 62.0
Episode 8: 65.0
Episode 9: 69.0
Episode 10: 82.0
Episode 11: 69.0
Episode 12: 50.0
Episode 13: 113.0
Episode 14: 67.0
Episode 15: 96.0
Episode 16: 103.0
Episode 17: 82.0
Episode 18: 58.0
Episode 19: 9.0
Episode 20: 28.0
Episode 21: 32.0
Episode 22: 9.0
Episode 23: 9.0
Episode 24: 9.0
Episode 25: 15.0
Episode 26: 81.0
Episode 27: 79.0
Episode 28: 9.0
Episode 29: 59.0
Episode 30: 168.0
Episode 31: 116.0
Episode 32: 200.0
Episode 33: 200.0
Episode 34: 200.0
Episode 35: 162.0
Episode 36: 189.0
Episode 37: 200.0
Episode 38: 200.0
Episode 39: 192.0
Episode 40: 193.0
Episode 41: 200.0
Episode 42: 200.0
Episode 43: 200.0
Episode 44: 200.0
Episode 45: 200.0
Episode 46: 200.0
Episode 47: 200.0
Episode 48: 200.0
Episode 49: 200.0
Episode 50: 200.0
Episode 51: 200.0
Episode 52: 200.0
Episode 53: 200.0
Episode 54: 200.0
Episode 55: 200.0
Episode 56: 200.0
Episode 57: 200.0
Episode 58: 200

# Additing previous state for making steps

## random search

In [15]:
import numpy as np
def run_episode_knowing_previous_step(env, weight):
    actions = []
    previous_state = env.reset()
    total_reward = 0
    is_done = False
    
    #first step is regular random
    #initial weight 
    initial_weight = torch.rand(n_state, n_action)
    previous_state = torch.from_numpy(previous_state).float() # current state from reset
    action = torch.argmax(torch.matmul(previous_state, initial_weight)) # make the action 
    current_state, reward, is_done, _ = env.step(action.item()) #save state as previous 
    total_reward += reward
    #second step
    
    
    
    while not is_done:
       
        state = torch.from_numpy(np.concatenate((previous_state, current_state), axis=0)).float()
        action = torch.argmax(torch.matmul(state, weight))
        previous_state = current_state
        current_state, reward, is_done, _ = env.step(action.item())
        total_reward += reward
        #print(previous_state, current_state)
        
    return total_reward

In [16]:
current_state = env.reset()
print(env.observation_space)
print(torch.from_numpy(current_state).float())

Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
tensor([-0.0145,  0.0423, -0.0353,  0.0439])


In [17]:
n_episode = 1000

best_total_reward = 0
best_weight = None

total_rewards = []

for episode in range(n_episode):
    weight = torch.rand(n_state*2, n_action)
    total_reward = run_episode_knowing_previous_step(env, weight)
    print('Episode {}: {}'.format(episode+1, total_reward))
    if total_reward > best_total_reward:
        best_weight = weight
        best_total_reward = total_reward
    total_rewards.append(total_reward)

print('Average total reward over {} episode: {}'.format(n_episode, sum(total_rewards) / n_episode))

Episode 1: 10.0
Episode 2: 25.0
Episode 3: 34.0
Episode 4: 54.0
Episode 5: 58.0
Episode 6: 9.0
Episode 7: 28.0
Episode 8: 9.0
Episode 9: 8.0
Episode 10: 9.0
Episode 11: 27.0
Episode 12: 82.0
Episode 13: 28.0
Episode 14: 24.0
Episode 15: 75.0
Episode 16: 10.0
Episode 17: 27.0
Episode 18: 103.0
Episode 19: 46.0
Episode 20: 91.0
Episode 21: 9.0
Episode 22: 8.0
Episode 23: 10.0
Episode 24: 24.0
Episode 25: 10.0
Episode 26: 36.0
Episode 27: 24.0
Episode 28: 10.0
Episode 29: 11.0
Episode 30: 43.0
Episode 31: 22.0
Episode 32: 23.0
Episode 33: 157.0
Episode 34: 10.0
Episode 35: 17.0
Episode 36: 26.0
Episode 37: 9.0
Episode 38: 10.0
Episode 39: 51.0
Episode 40: 53.0
Episode 41: 10.0
Episode 42: 30.0
Episode 43: 10.0
Episode 44: 14.0
Episode 45: 76.0
Episode 46: 49.0
Episode 47: 144.0
Episode 48: 21.0
Episode 49: 9.0
Episode 50: 56.0
Episode 51: 63.0
Episode 52: 47.0
Episode 53: 29.0
Episode 54: 10.0
Episode 55: 8.0
Episode 56: 10.0
Episode 57: 21.0
Episode 58: 59.0
Episode 59: 200.0
Episode 60:

Episode 524: 68.0
Episode 525: 16.0
Episode 526: 91.0
Episode 527: 8.0
Episode 528: 9.0
Episode 529: 200.0
Episode 530: 124.0
Episode 531: 45.0
Episode 532: 37.0
Episode 533: 9.0
Episode 534: 42.0
Episode 535: 9.0
Episode 536: 28.0
Episode 537: 148.0
Episode 538: 10.0
Episode 539: 9.0
Episode 540: 8.0
Episode 541: 200.0
Episode 542: 10.0
Episode 543: 200.0
Episode 544: 10.0
Episode 545: 9.0
Episode 546: 101.0
Episode 547: 9.0
Episode 548: 90.0
Episode 549: 60.0
Episode 550: 10.0
Episode 551: 9.0
Episode 552: 172.0
Episode 553: 61.0
Episode 554: 175.0
Episode 555: 87.0
Episode 556: 9.0
Episode 557: 12.0
Episode 558: 9.0
Episode 559: 16.0
Episode 560: 10.0
Episode 561: 27.0
Episode 562: 33.0
Episode 563: 8.0
Episode 564: 185.0
Episode 565: 21.0
Episode 566: 10.0
Episode 567: 10.0
Episode 568: 24.0
Episode 569: 45.0
Episode 570: 43.0
Episode 571: 47.0
Episode 572: 9.0
Episode 573: 10.0
Episode 574: 30.0
Episode 575: 200.0
Episode 576: 10.0
Episode 577: 58.0
Episode 578: 35.0
Episode 579: 

In [18]:
n_episode = 1000
best_weight = torch.rand(n_state*2, n_action)
best_total_reward = 0
total_rewards = []

noise_scale = 0.1

for episode in range(n_episode):
    weight = best_weight + noise_scale * torch.rand(n_state*2, n_action)
    total_reward = run_episode_knowing_previous_step(env, weight)
    if total_reward >= best_total_reward:
        best_total_reward = total_reward
        best_weight = weight
    total_rewards.append(total_reward)
#     print('Episode {}: {}'.format(episode + 1, total_reward))

print('Average total reward over {} episode: {}'.format(n_episode, sum(total_rewards) / n_episode))

Average total reward over 1000 episode: 38.303


# I've added previous state in
# `def run_episode_knowing_previous_step(env, weight)` 
# but it doesn't work well. What I missed?