In [None]:
# Google Colab
!pip3 install box2d-py

In [None]:
# Local Machine
!pip3 install torch
!pip3 install gym
!pip3 install box2d-py
!pip3 install pyglet

## Import Packages

In [None]:
import time
import numpy as np
import matplotlib.pyplot as plt
import gym
import sys
import torch
from torch import nn
from torch import optim
print("PyTorch:\t{}".format(torch.__version__))

In [None]:
# Seed the global NumPy and PyTorch random number generators so that
# action sampling and network initialisation are repeatable between runs.
seed = 31
np.random.seed(seed)
torch.manual_seed(seed)

## GYM Environments
* `CartPole-v1`
<img src="cartpole.jpg"
     alt="World"
     width="500" />


* `LunarLander-v2`
<img src="LunarLander.png"
     alt="World"
     width="500" />
[source](https://shiva-verma.medium.com/solving-lunar-lander-openaigym-reinforcement-learning-785675066197)

## Environment 1

In [None]:
env_1 = gym.make('CartPole-v1')

In [None]:
env_1.seed(seed)

### Action Space A:
* left
* right

In [None]:
action_list = ["left", "right"]
print(env_1.action_space)

### Observation (State) Space S:
* position of cart
* velocity of cart
* angle of pole
* rotation rate of pole

In [None]:
print(env_1.observation_space)

### Transition:
A transition is a tuple consisting of 
* the current state,
* the action used, 
* the received reward,
* and the new state 

![alt text](https://upload.wikimedia.org/wikipedia/commons/thumb/1/1b/Reinforcement_learning_diagram.svg/300px-Reinforcement_learning_diagram.svg.png).

Start the game (Episode)

In [None]:
state_0 = env_1.reset()
print(state_0)

Make one interaction with the environment

In [None]:
action = env_1.action_space.sample()
print("action" , action_list[action])

In [None]:
state, reward, done, info = env_1.step(action) 
print("reward", reward)
print("done", done) 

### Episode

In [None]:
# Roll out a few episodes in which every action is drawn uniformly at random.
n_episode, horizon = 3, 200

for i_episode in range(n_episode):
    # Fresh buffers for every episode: only the last episode's data
    # is still available once the outer loop has finished.
    states, actions, rewards = [], [], []
    state = env_1.reset()
    for t in range(horizon):
        env_1.render()
        time.sleep(0.05)  # short pause between rendered frames
        action = env_1.action_space.sample()
        state, reward, done, info = env_1.step(action)
        # Record the transition; the stored state is the one observed
        # *after* the action was applied.
        actions.append(action)
        rewards.append(reward)
        states.append(state)
        if not done:
            continue
        print(f"Episode finished after {t + 1} timesteps")
        break
    print("total Reward", np.sum(rewards))

# Turn the buffers of the final episode into numpy arrays for plotting.
states, actions, rewards = [np.array(buf) for buf in (states, actions, rewards)]

In [None]:
env_1.close()

## Visualization

#### Plot the States 

In [None]:
duration = np.arange(states.shape[0])
print(duration)

In [None]:
# One panel per state component of the recorded episode, plus one for the reward.
fig1, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(figsize=(25, 45), nrows=5, ncols=1)

# (axis, series, legend label, panel title)
panels = [
    (ax1, states[:, 0], 'Pos_x', 'cart position in x axis'),
    (ax2, states[:, 1], 'Vel_x', 'cart velocity x axis'),
    (ax3, states[:, 2], 'Ang', 'pole angle'),
    (ax4, states[:, 3], 'Vel_ang', 'pole angular velocity'),
    (ax5, rewards[:], 'reward ', 'reward'),
]

for ax, series, label, title in panels:
    ax.plot(duration, series, label=label, marker='o')
    ax.set_title(title)
    ax.legend(loc='best', fontsize=15)
plt.show()

In [None]:
plt.figure(figsize = (8,8))
plt.hist(actions)
plt.title("actions histogram")
plt.show()

### Policy (Deep RL)

In [None]:
class policy_estimator():
    """Feed-forward softmax policy for an environment with discrete actions.

    The network maps an observation vector to a probability distribution over
    the actions:
    Linear(n_inputs, 256) -> ReLU -> Linear(256, 128) -> ReLU ->
    Linear(128, n_outputs) -> Softmax over the last dimension.

    Args:
        env: object exposing ``observation_space.shape`` (its first entry is
            used as the input size) and ``action_space.n`` (number of actions).

    Attributes:
        n_inputs: size of one observation vector.
        n_outputs: number of discrete actions.
        network: the ``nn.Sequential`` model; pass ``network.parameters()``
            to an optimizer to train the policy.
    """
    def __init__(self, env):
        self.n_inputs = env.observation_space.shape[0]
        self.n_outputs = env.action_space.n
        # Define network
        self.network = nn.Sequential(
            nn.Linear(self.n_inputs, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, self.n_outputs),
            nn.Softmax(dim=-1))

    def predict(self, state):
        """Return the action probabilities for one state or a batch of states.

        Args:
            state: a single observation or a batch of observations, given as
                a numpy array, a (nested) sequence of numbers/arrays, or a
                torch tensor of any numeric dtype.

        Returns:
            A float32 tensor whose last dimension has ``n_outputs`` entries
            that sum to 1. The result is still attached to the autograd graph
            (no ``torch.no_grad()`` here) so that it can be used in a loss.
        """
        if not torch.is_tensor(state):
            # Go through one contiguous float32 array first: building a tensor
            # directly from a list of numpy arrays is very slow.
            state = np.asarray(state, dtype=np.float32)
        # as_tensor also casts tensors of another dtype (e.g. float64, int64),
        # which the legacy torch.FloatTensor constructor rejects.
        state = torch.as_tensor(state, dtype=torch.float32)
        action_probs = self.network(state)
        return action_probs

In [None]:
policy = policy_estimator(env_1)

### Discount Rewards

<img src="https://imgur.com/g3mYTzn.png" width="400" height="400" />


### Causality
$\sum_{t'=t}^H R(s_{t'},a_{t'})$

In [None]:
"""
takes a list of rewards {r_0, r_1, ..., r_t', ... r_T},
and returns a list where the entry in each index t' is
sum_{t'=t}^T gamma^(t'-t) * r_{t'}
"""
gamma = 0.9
# Toy episode: six steps, each with reward 1.
rewards = [1, 1, 1, 1, 1, 1]

# NOTE: the sums printed here weight r_{t'} by gamma^(t') -- the exponent is
# counted from the start of the episode -- which is what discount_rewards
# computes before it subtracts the mean. This is not the gamma^(t'-t)
# weighting written in the description at the top of this cell.
# Entry t is sum_{t'=t}^{T} gamma^(t') * r_{t'}; the values are derived from
# `gamma` and `rewards` so the example stays in sync when either is changed.
demo_returns = [
    sum(gamma**k * rewards[k] for k in range(t, len(rewards)))
    for t in range(len(rewards))
]
print(*demo_returns, sep="\n")

In [None]:
def discount_rewards(rewards, gamma=0.99):
    """Return the mean-centred discounted reward sums of one episode.

    Every reward r_i is first weighted by gamma**i, with i counted from the
    start of the episode. Entry t of the result is the sum of the weighted
    rewards from step t to the last step, minus the mean over all entries.
    """
    weighted = np.array([gamma**step * reward for step, reward in enumerate(rewards)])
    # Suffix sums: cumulative sum over the reversed sequence, reversed back.
    suffix_sums = np.cumsum(weighted[::-1])[::-1]
    return suffix_sums - suffix_sums.mean()

## Hyper-Parameters

In [None]:
gamma=0.99
num_episodes = 1200 # run agent for this many episodes
batch_size= 10
lr = 0.001 # learning rate for actor

## Optimizer

In [None]:
# Define optimizer
optimizer = torch.optim.Adam(policy.network.parameters(), 
                        lr=lr)    

## Reinforce

In [None]:
action_space = np.arange(env_1.action_space.n)
action_space

In [None]:
# REINFORCE training loop.
# Episodes are collected with the current stochastic policy. As soon as
# `batch_size` complete episodes have been gathered, one gradient step is
# taken on the mean of -(centred discounted return) * log pi(a | s) over all
# transitions in the batch, and the batch is emptied.
total_rewards = []   # undiscounted return of every episode
batch_rewards = []   # discounted, mean-centred returns of the current batch
batch_actions = []   # actions taken in the current batch
batch_states  = []   # states the actions were taken in
# Number of complete episodes in the current batch. It starts (and is reset)
# at 0 so that an update uses exactly `batch_size` episodes; starting at 1
# would trigger the update one episode early and never for batch_size == 1.
batch_counter = 0
ep = 0
epoch = 0
while ep < num_episodes:
    s_0 = env_1.reset()
    states = []
    rewards = []
    actions = []
    done = False
    while not done:
        # Sample an action from the distribution given by the current policy
        action_probs = policy.predict(s_0).detach().numpy()
        action = np.random.choice(action_space, p=action_probs)
        s_1, r, done, _ = env_1.step(action)
        states.append(s_0)
        rewards.append(r)
        actions.append(action)
        s_0 = s_1

    # The episode is over: add its transitions to the current batch
    batch_rewards.extend(discount_rewards(rewards, gamma))
    batch_states.extend(states)
    batch_actions.extend(actions)
    batch_counter += 1
    total_rewards.append(sum(rewards))

    # If batch is complete, update network
    if batch_counter >= batch_size:
        print("Epoch:", epoch)
        optimizer.zero_grad()
        # Stack into one numpy array first: creating a tensor directly from
        # a list of arrays is very slow.
        state_tensor = torch.as_tensor(np.array(batch_states), dtype=torch.float32)
        reward_tensor = torch.as_tensor(np.array(batch_rewards), dtype=torch.float32)
        # Actions are used as indices for gather, so they must be int64
        action_tensor = torch.as_tensor(np.array(batch_actions), dtype=torch.int64)
        # Calculate loss: pick the log-probability of each taken action and
        # weight it with the corresponding return
        logprob = torch.log(policy.predict(state_tensor))
        selected_logprobs = reward_tensor * torch.gather(
            logprob, 1, action_tensor.unsqueeze(1)).squeeze(1)
        loss = -selected_logprobs.mean()

        # Calculate gradients
        loss.backward()
        # Apply gradients
        optimizer.step()

        batch_rewards = []
        batch_actions = []
        batch_states = []
        batch_counter = 0
        epoch += 1

    avg_rewards = np.mean(total_rewards[-100:])
    # Print running average
    print("Episode: ", ep + 1)
    print("Average of last 100 Episode:", avg_rewards)
    ep += 1

## Training Results
Plot the total reward of every training episode.

In [None]:
# plot rewards
episode = np.arange(len(total_rewards))
plt.figure(figsize=(10,8))
plt.plot(episode, total_rewards)
plt.xlabel('Episode')
plt.ylabel('Episode Reward')
plt.show()

## Evaluation

In [None]:
# Run the trained policy for a few rendered episodes.
n_episode, horizon = 3, 200

for i_episode in range(n_episode):
    # Fresh buffers for every episode: only the last episode's data
    # is still available once the outer loop has finished.
    states, actions, rewards = [], [], []
    state = env_1.reset()
    for t in range(horizon):
        env_1.render()
        time.sleep(0.05)  # short pause between rendered frames
        # Act greedily: take the most probable action instead of sampling.
        action_probs = policy.predict(state).detach().numpy()
        action = np.argmax(action_probs)
        state, reward, done, info = env_1.step(action)
        # Record the transition; the stored state is the one observed
        # *after* the action was applied.
        actions.append(action)
        rewards.append(reward)
        states.append(state)
        if not done:
            continue
        print(f"Episode finished after {t + 1} timesteps")
        break
    print("total Reward", np.sum(rewards))

# Turn the buffers of the final episode into numpy arrays.
states, actions, rewards = [np.array(buf) for buf in (states, actions, rewards)]

In [None]:
env_1.close()

In [None]:
plt.figure(figsize = (8,8))
plt.hist(actions)
plt.title("actions histogram")
plt.show()

## Environment 2

In [None]:
env_2 = gym.make('LunarLander-v2')
env_2.seed(seed)

### Action Space A:
* Four discrete actions available: 
  * do nothing, 
  * fire left orientation engine, 
  * fire main engine, 
  * fire right orientation engine

In [None]:
action_list = ["do nothing", "fire left orientation engine", "fire main engine" , "fire right orientation engine"]
print(env_2.action_space)

### Observation (State) Space S:
* position on the x axis and the y axis (height)
* velocity along the x and y axes
* lander angle and angular velocity
* left and right contact points


In [None]:
print(env_2.observation_space)

### Reward Function R(s,a):
* The reward for moving from the top of the screen to the landing pad and coming to zero speed is about 100..140 points.
* If the lander moves away from the landing pad, it loses that reward again.
* The episode finishes if the lander crashes or comes to rest, receiving an additional -100 or +100 points, respectively.
* Each leg with ground contact is worth +10 points.
* Firing the main engine costs 0.3 points per frame.
* The environment counts as solved at 200 points.


### Episode

In [None]:
from utils import on_environment

n_episode = 3
horizon   = 200
policy = "random"

states,actions,rewards = on_environment(env_2,policy,n_episode,horizon)

In [None]:
# One x value per recorded step.
duration = np.arange(states.shape[0])

# One panel per state component, plus one for the reward.
fig1, (ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9) = plt.subplots(
    figsize=(25, 45), nrows=9, ncols=1)

# (axis, series, legend label, panel title)
panels = [
    (ax1, states[:, 0], 'Pos_x', 'lander position in x axis'),
    (ax2, states[:, 1], 'Pos_y', 'lander position in y axis(hieght)'),
    (ax3, states[:, 2], 'Vel_x', 'lander velocity x axis'),
    (ax4, states[:, 3], 'Vel_y', 'lander velocity y axis'),
    (ax5, states[:, 4], 'Ang', 'lander angle'),
    (ax6, states[:, 5], 'Vel_ang', 'lander angular velocity'),
    (ax7, states[:, 6], 'Cont_rig', 'right contact points'),
    (ax8, states[:, 7], 'Cont_lef ', 'left contact points'),
    (ax9, rewards[:], 'reward ', 'reward'),
]

for ax, series, label, title in panels:
    ax.plot(duration, series, label=label, marker='o')
    ax.set_title(title)
    ax.legend(loc='best', fontsize=15)
plt.show()

In [None]:
plt.figure(figsize = (8,8))
plt.hist(actions)
plt.title("actions histogram")
plt.show()

## Define Policy

In [None]:
policy = policy_estimator(env_2)

In [None]:
args = dict()
args["gamma"]=0.99
args["num_episodes"] = 1200 # run agent for this many episodes
args["batch_size"]= 10
args["lr"] = 0.001 # learning rate for actor

### Optimizer

In [None]:
# Define optimizer
optimizer = torch.optim.Adam(policy.network.parameters(), 
                        lr=args["lr"])    

In [None]:
from utils import reinforce
policy,total_rewards = reinforce(env_2,policy,optimizer,args)

In [None]:
# plot rewards
episode = np.arange(len(total_rewards))
plt.figure(figsize=(10,8))
plt.plot(episode, total_rewards)
plt.xlabel('Episode')
plt.ylabel('Episode Reward')
plt.show()

## Evaluation

In [None]:
n_episode = 3
horizon   = 200
states,actions,rewards = on_environment(env_2,policy,n_episode,horizon)

## Plot the Results

In [None]:
# One panel per state component, plus one for the reward.
fig1, (ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9) = plt.subplots(
    figsize=(25, 45), nrows=9, ncols=1)

# One x value per recorded step.
duration = np.arange(states.shape[0])

# (axis, series, legend label, panel title)
panels = [
    (ax1, states[:, 0], 'Pos_x', 'lander position in x axis'),
    (ax2, states[:, 1], 'Pos_y', 'lander position in y axis(hieght)'),
    (ax3, states[:, 2], 'Vel_x', 'lander velocity x axis'),
    (ax4, states[:, 3], 'Vel_y', 'lander velocity y axis'),
    (ax5, states[:, 4], 'Ang', 'lander angle'),
    (ax6, states[:, 5], 'Vel_ang', 'lander angular velocity'),
    (ax7, states[:, 6], 'Cont_rig', 'right contact points'),
    (ax8, states[:, 7], 'Cont_lef ', 'left contact points'),
    (ax9, rewards[:], 'reward ', 'reward'),
]

for ax, series, label, title in panels:
    ax.plot(duration, series, label=label, marker='o')
    ax.set_title(title)
    ax.legend(loc='best', fontsize=15)
plt.show()

In [None]:
plt.figure(figsize = (8,8))
plt.hist(actions)
plt.title("actions histogram")
plt.show()

## Actor Critic

In [None]:
 !pip install stable_baselines3

In [None]:
from stable_baselines3 import A2C

policy = A2C("MlpPolicy", env_2, learning_rate=0.007, verbose=1)
policy.learn(total_timesteps=20000)

In [None]:
# Run the trained A2C model for a few rendered episodes.
n_episode, horizon = 3, 250

for i_episode in range(n_episode):
    # Fresh buffers for every episode: only the last episode's data
    # is still available once the outer loop has finished.
    states, actions, rewards = [], [], []
    state = env_2.reset()
    for t in range(horizon):
        env_2.render()
        time.sleep(0.05)  # short pause between rendered frames
        # The model chooses the action; the second return value is not used.
        action, _states = policy.predict(state)
        state, reward, done, info = env_2.step(action)
        # Record the transition; the stored state is the one observed
        # *after* the action was applied.
        actions.append(action)
        rewards.append(reward)
        states.append(state)
        if not done:
            continue
        print(f"Episode finished after {t + 1} timesteps")
        break

    print("total Reward", np.sum(rewards))

# Turn the buffers of the final episode into numpy arrays, then release the env.
states, actions, rewards = [np.array(buf) for buf in (states, actions, rewards)]
env_2.close()

In [None]:
# One panel per state component, plus one for the reward.
fig1, (ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9) = plt.subplots(
    figsize=(25, 45), nrows=9, ncols=1)

# One x value per recorded step.
duration = np.arange(states.shape[0])

# (axis, series, legend label, panel title)
panels = [
    (ax1, states[:, 0], 'Pos_x', 'lander position in x axis'),
    (ax2, states[:, 1], 'Pos_y', 'lander position in y axis(hieght)'),
    (ax3, states[:, 2], 'Vel_x', 'lander velocity x axis'),
    (ax4, states[:, 3], 'Vel_y', 'lander velocity y axis'),
    (ax5, states[:, 4], 'Ang', 'lander angle'),
    (ax6, states[:, 5], 'Vel_ang', 'lander angular velocity'),
    (ax7, states[:, 6], 'Cont_rig', 'right contact points'),
    (ax8, states[:, 7], 'Cont_lef ', 'left contact points'),
    (ax9, rewards[:], 'reward ', 'reward'),
]

for ax, series, label, title in panels:
    ax.plot(duration, series, label=label, marker='o')
    ax.set_title(title)
    ax.legend(loc='best', fontsize=15)
plt.show()

In [None]:
plt.figure(figsize = (8,8))
plt.hist(actions)
plt.title("actions histogram")
plt.show()