# **Reinforcement Learning**

Package/Dependency Installs *(Requirement for Colab)*


In [1]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[classic_control]
!pip3 install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame==2.1.0
  Downloading pygame-2.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pygame
  Attempting uninstall: pygame
    Found existing installation: pygame 2.3.0
    Uninstalling pygame-2.3.0:
      Successfully uninstalled pygame-2.3.0
Successfully installed pygame-2.1.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Imports and Packages *(all)*

In [2]:
#PyTorch
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim

#OpenAI Gym
import gym
from gym import logger as gymlogger
from gym.wrappers import RecordVideo
gym.logger.set_level(40)  #log errors encountered only

#Misc
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import time
from collections import deque
from itertools import count

#Display Window
import glob
import io
import base64
import IPython
from IPython.display import HTML
from IPython import display as ipythondisplay
is_ipython = 'inline' in matplotlib.get_backend()
from IPython.display import clear_output 

Display Setup (Environment Monitor)

In [3]:
#Start a virtual display of 1400x900 that is not shown on screen (visible=0).
#Presumably required so the environments can render on a headless runtime such
#as Colab - TODO confirm.
#Note: `display` here is the pyvirtualdisplay handle; IPython's display module is
#imported separately under the name `ipythondisplay`.
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7f2142c589a0>

In [4]:
def show_video():
  """Embed the most recently written recording from ./video in the cell output.

  Looks for ``video/*.mp4`` relative to the working directory. If at least one
  file exists, the newest one (by modification time) is base64-encoded and
  displayed as an inline, looping HTML5 <video> element. If none exists, an
  error message is printed instead.

  The newest file is chosen on purpose: glob returns matches in arbitrary
  order and the folder accumulates recordings from earlier recorders, so
  taking the first match could replay a stale video.
  """
  import os  #local import: `os` is not among the notebook's top-level imports

  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = max(mp4list, key=os.path.getmtime)
    #read-only access is sufficient; the context manager closes the handle
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />. 
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("error finding video file.")

def wrap_env(env):
  """Return `env` wrapped in a RecordVideo wrapper that writes to './video'."""
  return RecordVideo(env, './video')

# **Q-Learning**


> Implementation of Algorithm 6.1





**Training Algorithm** | Learns to beat a particular instance of the 'Frozen Lake' game by OpenAI

In [5]:
#CHANGE RUNTIME PARAMETERS HERE
#document_video: when True, every `video_frequency`-th episode is recorded and
#played back during training.
document_video = True
video_frequency = 1000 #Number of episodes between playbacks (if enabled)
env_set = ["FrozenLake-v1", True, "4x4"] 
           #environment id, is_slippery flag, map name (passed to gym.make)

#Initial (video-wrapped) environment; used here to read the space sizes.
env = wrap_env(gym.make(env_set[0], is_slippery=env_set[1], \
                        map_name=env_set[2]))
action_space_size = env.action_space.n     #number of discrete actions
state_space_size = env.observation_space.n #number of discrete states

#One row per state, one column per action; all Q-values start at zero.
q_table = np.zeros((state_space_size, action_space_size))

#HYPERPARAMETERS
num_episodes = 10000            #the summary output splits this into blocks of 1000
max_steps_per_episode = 100     #hard cap on steps taken within one episode
learning_rate = 0.1             #weight given to the new estimate in the Q update
discount_rate = 0.99            #discount applied to the best next-state Q-value
exploration_rate = 1            #current probability of taking a random action
max_exploration_rate = 1        #starting value of the decay schedule
min_exploration_rate = 0.01     #value the decay schedule approaches
exploration_decay_rate = 0.001  #exponential decay rate, applied per episode

#Total reward collected in each episode, in order.
rewards_all_episodes = []

#Q-Learning / Training
#Each episode: act epsilon-greedily, update the Q-table after every step, then
#decay the exploration rate. Every `video_frequency`-th episode is recorded in
#a freshly wrapped environment and played back.
for episode in range(num_episodes):

  #decided once per episode; reused for the playback after the episode ends
  record_episode = episode % video_frequency == 0 and document_video == True
  if record_episode:
    env = wrap_env(gym.make(env_set[0], is_slippery=env_set[1],
                            map_name=env_set[2]))

  state = env.reset()         #starts/resets simulation
  done = False                #episode state (boolean)
  rewards_current_episode = 0 #reset rewards

  for step in range(max_steps_per_episode):
    #epsilon-greedy: a uniform draw at or below the exploration rate explores
    explore = random.uniform(0, 1) <= exploration_rate
    if explore:
      action = env.action_space.sample()   #EXPLORATION
    else:
      action = np.argmax(q_table[state, :]) #EXPLOITATION

    #take step, return env info
    new_state, reward, done, info = env.step(action)

    #update Q-Table: blend the old value with the bootstrapped target
    best_next_value = np.max(q_table[new_state, :])
    td_target = reward + discount_rate * best_next_value
    q_table[state, action] = (q_table[state, action] * (1 - learning_rate)
                              + learning_rate * td_target)

    state = new_state
    rewards_current_episode += reward

    if done:
      break  #next episode

  #play back the episode that was just recorded
  if record_episode:
    env.close()
    clear_output()
    print("Episode ", episode, " Playback:\n")
    time.sleep(0.3)
    show_video()
    print("Q-Table\n", q_table)

  #update exploration rate (exponential decay towards the minimum)
  exploration_rate = min_exploration_rate + (max_exploration_rate
    - min_exploration_rate) * np.exp(-exploration_decay_rate*episode)
  rewards_all_episodes.append(rewards_current_episode)

#SUMMARY OUTPUT
#Average reward for each consecutive block of 1000 episodes.
#Integer division gives np.split an integer section count.
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes),
                                         num_episodes // 1000)

print("\n ~ Runtime statistics ~ \nAverage reward per thousand episodes\n")
#The running upper bound is deliberately NOT named `count`: that name is the
#imported itertools.count, which the Deep Q-Learning loop calls later on.
block_end = 1000
for r in rewards_per_thousand_episodes:
    print(block_end-1000, " - ", block_end, " : ", str(sum(r/1000)))
    block_end += 1000

Episode  9000  Playback:



Q-Table
 [[0.57637661 0.50258773 0.51893294 0.50770926]
 [0.43736448 0.27651697 0.41431132 0.50628917]
 [0.45198378 0.40051857 0.43757885 0.46422105]
 [0.34261089 0.29921415 0.32138857 0.4475932 ]
 [0.60812673 0.37552107 0.45512101 0.46627602]
 [0.         0.         0.         0.        ]
 [0.31920598 0.15931488 0.18843752 0.19568857]
 [0.         0.         0.         0.        ]
 [0.34113544 0.26103188 0.40412011 0.63549114]
 [0.44082114 0.66370624 0.46275787 0.42404768]
 [0.6174056  0.48652345 0.35085219 0.31295377]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.50733672 0.35662982 0.73986328 0.41039065]
 [0.73845594 0.86599368 0.75384309 0.7461729 ]
 [0.         0.         0.         0.        ]]

 ~ Runtime statistics ~ 
Average reward per thousand episodes

0  -  1000  :  0.057000000000000044
1000  -  2000  :  0.20600000000000016
2000  -  3000  :  0.44200000000000034
3000  -  4000  :  0.5660000000000004
4000  -  5000  :  0.65200

*Aside: Plotting High-Res Figures*

In [None]:
#Plot the average reward of each consecutive block of `episodes_to_move_avg`
#episodes from the Q-learning run, at high resolution.
episodes_to_move_avg = 100
#integer division gives np.split an integer section count
data_split = np.split(np.array(rewards_all_episodes),
                      num_episodes // episodes_to_move_avg)
#x: first episode index of each block, y: mean reward within that block
avgs_x = [index * episodes_to_move_avg for index in range(len(data_split))]
avgs_y = [sum(block / episodes_to_move_avg) for block in data_split]

plt.figure(2, dpi=900)
plt.clf()
plt.xlabel('Episode')
plt.ylabel('Average Reward For Period')
plt.plot(avgs_x, avgs_y)
plt.show()  #render the figure (the previous empty plt.plot() drew nothing)

**Play Using Determined Q-Table**

In [7]:
#Play one recorded episode acting greedily with respect to the learned Q-table.
env = wrap_env(gym.make(env_set[0], is_slippery=env_set[1],
                        map_name=env_set[2]))
state = env.reset() #start simulation
done = False
while not done:
    #greedy action: index of the largest Q-value in the current state's row
    #(the previous np.argmax(lambda ...) never looked at the Q-table)
    action = np.argmax(q_table[state, :])
    state, reward, done, info = env.step(action)

env.close()
show_video()

# **Deep Q-Learning**

> Solving the 'Cart-Pole' problem (by OpenAI) | Implementation of Algorithm 7.1 

> **note:** runtime needs to be cleared and restarted if FrozenLake cells have been run previously - video wrapper tends to malfunction and display incorrect videos if environment type changes.






In [None]:
#obtains a rolling average of rewards to plot live during training
def get_moving_average(period, values):
    """Return the rolling mean of `values` over windows of `period` entries.

    The result is a NumPy array with one entry per input value. The first
    `period - 1` entries are zero (no full window yet). If there are fewer
    than `period` values, the result is all zeros.
    """
    series = torch.tensor(values, dtype=torch.float)
    n_points = len(series)

    #not enough data for a single full window
    if n_points < period:
        return torch.zeros(n_points).numpy()

    windows = series.unfold(dimension=0, size=period, step=1)
    window_means = windows.mean(dim=1).flatten(start_dim=0)
    leading_zeros = torch.zeros(period - 1)
    return torch.cat((leading_zeros, window_means)).numpy()

#matplot setup for live rolling average stats
def plot(values, moving_avg_period, clear, dpi_in):
    """Redraw figure 2 with the raw `values` and their rolling average.

    values            -- per-episode values to plot
    moving_avg_period -- window length passed to get_moving_average
    clear             -- if True (and the inline backend is active), the cell
                         output is cleared once the next output arrives
    dpi_in            -- resolution used for the figure
    """
    smoothed = get_moving_average(moving_avg_period, values)

    plt.figure(2, dpi=dpi_in)
    plt.clf()
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(values)
    plt.plot(smoothed)
    plt.pause(0.001)

    if clear == True and is_ipython:
        ipythondisplay.clear_output(wait=True)

In [None]:
#converts input into tensor
def to_tensor(arr, dtype=torch.float, reshape=False):
    """Convert `arr` to a torch tensor of the given dtype.

    `arr` is first turned into a NumPy array. With `reshape=True` the result
    is reshaped into a single column of shape (-1, 1).
    """
    tensor = torch.from_numpy(np.array(arr)).type(dtype)
    return tensor.reshape(-1, 1) if reshape else tensor

In [None]:
class DQN(nn.Module):
    """Fully connected Q-network: two hidden ReLU layers, one output per action.

    env        -- environment whose observation_space.shape and action_space.n
                  fix the input and output sizes
    layer_size -- width of both hidden layers
    """

    def __init__(self, env, layer_size):
        super().__init__()
        n_inputs = int(np.prod(env.observation_space.shape))
        n_actions = env.action_space.n

        layers = [
            nn.Linear(n_inputs, layer_size),
            nn.ReLU(),
            nn.Linear(layer_size, layer_size),
            nn.ReLU(),
            nn.Linear(layer_size, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, observation):
        """Return the network output (one value per action) for `observation`."""
        q_values = self.net(observation)
        return q_values

In [None]:
class Memory(object):
    """Bounded replay buffer; once full, the oldest experiences are dropped."""

    def __init__(self, maxlen):
        self.memory = deque(maxlen=maxlen)

    def __len__(self):
        return len(self.memory)

    def store(self, experience):
        """Append one experience to the buffer."""
        self.memory.append(experience)

    def sample(self, n_samples):
        """Draw `n_samples` distinct experiences and return them column-wise.

        The result is a zip over the experience fields, so it can be unpacked
        into one sequence per field.
        """
        batch = random.sample(self.memory, n_samples)
        return zip(*batch)

In [None]:
class EpsilonGreedyStrategy():
    """Exponentially decaying exploration schedule between `start` and `end`."""

    def __init__(self, start, end, decay):
        self.start, self.end, self.decay = start, end, decay

    def get_exploration_rate(self, episode):
        """Return end + (start - end) * exp(-episode * decay)."""
        span = self.start - self.end
        decay_factor = np.exp(-episode*self.decay)
        return self.end + span*decay_factor

**Training Algorithm** | Solves 'CartPole-v1'

> *potential epilepsy warning for Google Colab*



In [None]:
#Hyper Parameters
memory_capacity = 10000      #maximum number of transitions kept in replay memory
learning_rate = .0001        #step size passed to the Adam optimizer
discount_factor = 0.99       #discount applied to the predicted next-state Q-value
EPSILON_START = 1.0          #initial exploration rate
EPSILON_END = 0.001          #exploration rate the schedule decays towards
EPSILON_DECAY = 0.0001       #decay rate; the training loop applies it per environment step
NUM_EPISODES = 1000          #training stops once this many episodes have finished
replay_memory_minimum = 500  #random-action transitions stored before training starts
MOVING_AVERAGE_INT = 25  #Episodes to consider in rolling average

env = gym.make("CartPole-v1")

#Q-network with two hidden layers of 128 units, plus the exploration schedule
model = DQN(env, 128)
strategy = EpsilonGreedyStrategy(EPSILON_START, EPSILON_END, EPSILON_DECAY)

# Optimizer and loss function initialisation
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()
memory = Memory(memory_capacity)
episode_durations = []  #total reward of each finished episode, in order

#Initial state and counters for the replay-memory warm-up that follows
observation = env.reset()
episode_reward = 0
episode = 0

#INITIALISING REPLAY MEMORY
#Fill the buffer with `replay_memory_minimum` transitions from random actions
#so that minibatches can be sampled as soon as training starts.
for step in range(replay_memory_minimum):
    action = env.action_space.sample()
    new_state, reward, done, info = env.step(action)
    episode_reward += reward
    if done:
        #terminal transitions store an all-zero successor state
        new_state = np.zeros(env.observation_space.shape)
    transition = [observation, action, reward, int(done), new_state]
    memory.store(transition)
    #start a new episode after a terminal step, otherwise carry on
    observation = env.reset() if done else new_state

#MAIN TRAINING LOOP
#One environment step and one gradient step per iteration, until NUM_EPISODES
#episodes have finished. Afterwards a single greedy episode is recorded and
#played back.
BATCH_SIZE = 128  #transitions sampled from replay memory per gradient step

observation = env.reset()
episode_reward = 0
episode = 0
step = 0  #environment steps taken so far; drives the exploration schedule

#Training mode is set once: nothing inside the loop switches it off, and the
#network consists only of Linear and ReLU layers.
model.train()

#A plain counter replaces itertools.count so this cell does not depend on the
#global name `count` still referring to the imported function.
while episode < NUM_EPISODES:
    #epsilon-greedy action selection (the rate decays per environment step)
    if np.random.random() <= strategy.get_exploration_rate(step):
        action = env.action_space.sample()
    else:
        with torch.no_grad():  #no gradients needed to pick an action
            state = to_tensor(observation).unsqueeze(0)
            action = model(state).argmax().item()

    # Take action and store in replay memory
    new_state, reward, done, info = env.step(action)
    episode_reward += reward
    if done:
        #terminal transitions store an all-zero successor state
        new_state = np.zeros(env.observation_space.shape)
    memory.store([observation, action, reward, int(done), new_state])

    if done:
        episode_durations.append(episode_reward)
        plot(episode_durations, MOVING_AVERAGE_INT, True, 100)
        #the next episode must start from the freshly reset state, not from
        #the all-zero terminal placeholder
        observation = env.reset()
        episode_reward = 0
        episode += 1
    else:
        observation = new_state

    # Sample a minibatch of stored transitions
    states, actions, rewards, dones, new_states = memory.sample(BATCH_SIZE)

    # Wrap all values
    states = to_tensor(states)
    actions = to_tensor(actions, torch.int64, reshape=True)
    new_states = to_tensor(new_states)
    rewards = to_tensor(rewards, reshape=True)
    dones = to_tensor(dones, reshape=True)

    # Get current q-values for the actions that were actually taken
    qs = model(states)
    qs = torch.gather(qs, dim=1, index=actions)

    # Compute target q-values (no gradient flows through the target)
    with torch.no_grad():
        predicted_qs, _ = model(new_states).max(dim=1)
        predicted_qs = predicted_qs.reshape(-1, 1)
        #Form of the piecewise function shown in the DeepMind 2015 paper:
        #the bootstrap term is dropped for terminal transitions (dones == 1)
        target_qs = rewards + discount_factor * (1 - dones) * predicted_qs

    # Compute loss and update the network
    loss = loss_fn(qs, target_qs)
    optimizer.zero_grad()
    loss.backward()   #backpropagate the loss
    optimizer.step()  #apply the parameter update

    step += 1

#FINAL RUN: record one greedy episode with the trained model.
#A fresh environment is created and the wrapped instance is kept, so the
#episode is recorded and starts from a proper reset.
env.close()
env = wrap_env(gym.make("CartPole-v1"))
obs = env.reset()
done = False
with torch.no_grad():
    while not done:
        state = to_tensor(obs).unsqueeze(0)
        action = model(state).argmax().item()
        obs, reward, done, info = env.step(action)
env.close()
plot(episode_durations, MOVING_AVERAGE_INT, False, 100)
print("Final Run:")
show_video()
print("See Cell below to run more.")

**Play Using Trained Model**


In [None]:
#Record one episode played greedily by the trained model and report its reward.
env = wrap_env(gym.make('CartPole-v1'))
obs = env.reset()
total_reward = 0
done = False
while not done:
    #greedy action: index of the largest network output for the current state
    action = model(to_tensor(obs).unsqueeze(0)).argmax().item()
    obs, reward, done, info = env.step(action)
    total_reward += reward
env.close()
show_video()
print("Total Reward/Runtime = ", total_reward)
if total_reward >= 500:
    print("Optimal Solution for CartPole-v1!")

*Aside: Plotting High-Res Figures*

In [None]:
#Redraw the training curve at 900 dpi with a 50-episode rolling average,
#without clearing the cell output afterwards.
plot(episode_durations, 50, False, 900)