# -----Presentation-----

In [None]:
# Installation
# !pip install gym

In [None]:
# Environments to choose from: print the ID of every registered environment,
# one per line.
from gym import envs

for spec in envs.registry.all():
    print(spec.id)

In [None]:
# Environment Set-up
import gym

# Build the MountainCar environment and put it in its starting state.
env = gym.make('MountainCar-v0')
env.reset()

# No action is taken here: the same starting state is drawn 1000 times,
# then the environment is closed.
for _ in range(1000):
    env.render()
env.close()

In [None]:
# Environment Dimensions:
# observation_space describes the state values the environment reports;
# action_space describes the actions the agent may send to it.
print('State Space: ', env.observation_space)
print('Action Space: ', env.action_space)

In [None]:
# The (2,) in the state space means that each observation is a vector of
# two values (a Box of shape (2,)).
# The Discrete(3) means that the action space consists of 
# three discrete actions.

In [None]:
# This will give us further information about the state vector values:
# index 0 of the bounds is the car's position range, index 1 its velocity range.
# (MountainCar has a car, not a cart, so the messages say "car".)
low, high = env.observation_space.low, env.observation_space.high
print("The car position is limited to a range of [", low[0], ', ', high[0], '].')
print("The car's velocity is limited to a range of [", low[1], ', ', high[1], '].')

In [None]:
# What is the initial state of the environment?
# reset() starts a new episode and returns its first observation, printed here.
print(env.reset())

In [None]:
# Let's see the effect of taking an action using the step() method.
# The argument is the index of one of the environment's discrete actions;
# the tuple that step() returns is printed as-is.
print(env.step(2))

In [None]:
# The output is equivalent to the tuple (array([A, B]), C, D, {E})
# A is the first observation state value
# B is the second observation state value
# C is the Reward
# D is a Boolean indicator on whether the episode has terminated yet (goal reached or 200 steps elapsed)
# E is any additional information which is not applicable to this problem

In [None]:
# Example of actions executed on a random basis:
import gym

randenv = gym.make('MountainCar-v0')
randenv.reset()
for _ in range(1000):
    randenv.render()
    # step() returns (observation, reward, done, info). Once done is True the
    # episode is over, so reset before taking any further steps.
    observation, reward, done, info = randenv.step(randenv.action_space.sample())
    if done:
        randenv.reset()
randenv.close()

In [None]:
# Example of Q-Learning applied to the problem.
# This uses a specific reinforcement-learning (Q-learning) technique called the
# epsilon-greedy exploration strategy with epsilon decay.
# The details of setting up a reinforcement learning program are not important here;
# seeing the results of applying it is.
# There are numerous reinforcement learning techniques that can be applied to 
# any of these environments.
import numpy as np
import gym
import matplotlib.pyplot as plt

# Create the MountainCar environment that is passed to QLearning below and
# reset it so that it starts from a fresh state.
env = gym.make('MountainCar-v0')
env.reset()

def _discretize(state, obs_low, scale):
    """Return the integer Q-table index tuple for a continuous observation."""
    return tuple(np.round((state - obs_low) * scale, 0).astype(int))


# Define Q-learning function
def QLearning(env, learning, discount, epsilon, min_eps, episodes):
    """Train a tabular Q-learning agent with decaying epsilon-greedy exploration.

    Uses the classic Gym calling convention: ``env.reset()`` returns the
    observation and ``env.step(action)`` returns
    ``(observation, reward, done, info)``.

    Args:
        env: Environment whose ``observation_space`` has two components
            (``low``/``high`` bounds) and whose ``action_space.n`` gives the
            number of discrete actions.
        learning: Learning rate applied to each temporal-difference update.
        discount: Discount factor applied to the best next-state Q value.
        epsilon: Initial probability of choosing a random action.
        min_eps: Lower bound that epsilon decays towards.
        episodes: Number of training episodes. Zero or fewer runs no
            episodes and returns an empty list.

    Returns:
        A list with one mean total reward per completed batch of 100
        episodes. Episodes after the last full batch are not reported.
    """
    obs_low = env.observation_space.low
    # Bins per unit: the first observation component is scaled by 10,
    # the second by 100.
    scale = np.array([10, 100])

    # Determine size of discretized state space
    num_states = (env.observation_space.high - obs_low) * scale
    num_states = np.round(num_states, 0).astype(int) + 1

    # Initialize Q table
    Q = np.random.uniform(low=-1, high=1,
                          size=(num_states[0], num_states[1],
                                env.action_space.n))

    # Initialize variables to track rewards
    reward_list = []
    ave_reward_list = []

    # Calculate episodic reduction in epsilon. Guarded so that zero episodes
    # yields an empty result instead of a division by zero.
    reduction = (epsilon - min_eps) / episodes if episodes > 0 else 0

    # Run Q learning algorithm
    for i in range(episodes):
        # Initialize parameters
        done = False
        tot_reward = 0

        # Discretize the starting state
        state_idx = _discretize(env.reset(), obs_low, scale)

        while not done:
            # Render environment for the last 20 episodes
            if i >= (episodes - 20):
                env.render()

            # Determine next action - epsilon greedy strategy
            if np.random.random() < 1 - epsilon:
                action = np.argmax(Q[state_idx])
            else:
                action = np.random.randint(0, env.action_space.n)

            # Get next state and reward
            state2, reward, done, _ = env.step(action)
            state2_idx = _discretize(state2, obs_low, scale)

            q_index = state_idx + (action,)
            if done and state2[0] >= 0.5:
                # Terminal state reached at the goal position: there is no
                # future value, so the Q value is just the reward.
                Q[q_index] = reward
            else:
                # Adjust Q value for current state
                Q[q_index] += learning * (reward
                                          + discount * np.max(Q[state2_idx])
                                          - Q[q_index])

            # Update variables
            tot_reward += reward
            state_idx = state2_idx

        # Decay epsilon, clamped so rounding error cannot push it below min_eps
        if epsilon > min_eps:
            epsilon = max(min_eps, epsilon - reduction)

        # Track rewards
        reward_list.append(tot_reward)

        # Store and report the mean reward of every completed 100-episode batch
        if (i + 1) % 100 == 0:
            ave_reward = np.mean(reward_list)
            ave_reward_list.append(ave_reward)
            reward_list = []
            print('Episode {} Average Reward: {}'.format(i+1, ave_reward))

    env.close()

    return ave_reward_list

# Run Q-learning algorithm
rewards = QLearning(env, learning=0.2, discount=0.9, epsilon=0.8,
                    min_eps=0, episodes=5000)

# Plot Rewards: each entry of `rewards` is the mean of a 100-episode batch,
# so the x-axis runs 100, 200, 300, ...
plt.plot(np.arange(1, len(rewards) + 1) * 100, rewards)
plt.xlabel('Episodes')
plt.ylabel('Average Reward')
plt.title('Average Reward vs Episodes')
plt.savefig('rewards.jpg')
plt.close()

# -----Guide------


# OpenAI Gym 

Gym is a toolkit for developing and comparing reinforcement learning algorithms.  Gym provides the environment; you provide the algorithm.

#### Basics
Reinforcement learning consists of two basic concepts:

* *Environment* - A task or simulation that is constructed that needs interaction to solve.

* *Agent* - An AI algorithm that is developed to interact with the environment.

![image info](https://miro.medium.com/max/674/0*6yvI8Ul2ETKO-Ils.png)
 

The illustration above shows a simplified diagram of how the agent and environment interact.  The environment provides the agent its state and reward system.  The agent will then send an action to the environment.  The environment will respond with a change in the state and reward the agent based on the reward system criteria.  The agent will continue to respond and provide actions to maximize its rewards as it begins optimizing its machine learning process.

#### Examples:

Below are some links to videos of some environments being simulated:

[SpaceInvaders-v0](https://gym.openai.com/videos/2019-10-21--mqt8Qj1mwo/SpaceInvaders-v0/original.mp4)

[Ant-v2](https://gym.openai.com/videos/2019-10-21--mqt8Qj1mwo/Ant-v2/original.mp4)

## Installation:  

You will need Python 3.5+ installed to get started.  Install gym simply by using pip:

In [None]:
%pip install gym

If you are looking to create and use your own environment the appropriate start would be:

In [None]:
#git clone https://github.com/openai/gym
#cd gym
#pip install -e .

To create your own environment, the following link will assist you: https://github.com/openai/gym/blob/master/docs/creating-environments.md

 ## Environments: 

These two are examples of default environments that are available in gym:

In [None]:
import gym
env = gym.make('MountainCar-v0')
env.reset()
for _ in range(500):
    env.render()
    # step() returns (observation, reward, done, info). Once done is True the
    # episode is over, so reset before taking any further steps.
    observation, reward, done, info = env.step(env.action_space.sample())
    if done:
        env.reset()
env.close()

In [None]:
import gym
env = gym.make('CartPole-v0')
env.reset()
for _ in range(500):
    env.render()
    # step() returns (observation, reward, done, info). Once done is True the
    # episode is over, so reset before taking any further steps.
    observation, reward, done, info = env.step(env.action_space.sample())
    if done:
        env.reset()
env.close()

To list all the environments available to use, there is a *registry.all()* method that returns all of them.

In [None]:
# Import the environment registry module and print the raw collection
# returned by registry.all().
from gym import envs
print(envs.registry.all())

More refined, readable list below:

In [None]:
# Collect the ID of every registered environment, then print the IDs in
# sorted order, one per line.
envids = [registered.id for registered in envs.registry.all()]
for envid in sorted(envids):
    print(envid)

In [None]:
import gym
# Use a distinct name here: rebinding `envs` would shadow the `gym.envs`
# module imported by the earlier cells and break them when they are re-run.
all_envs = gym.envs.registry.all()
print('Total envs available:', len(all_envs))

Some of the environments depend on Atari and MuJoCo. The corresponding imports are:

*from gym.envs.atari.env import AtariEnv*

*from gym.envs.mujoco_env import MujocoEnv*

## Observations:

To have the agent interact with the environment in a designed and non-random way, we will need to understand the step function of the environment.  The step function returns four values:

1. **Observation** (object) - an environment-specific *object* that represents the observation of the environment like a board state in a board game or pixel data from a camera.


2. **Reward** (float) - the amount of reward granted for the previous action, as set by the rules of the environment.  The agent always looks to maximize the reward.


3. **Done** (boolean) - This indicates when it is time to *reset* the environment again.  It marks the definitive end of the episode, after which the agent should not take further actions until the environment is reset. Example: lost a life, pole tipped too far, and so on.


4. **Info** (dict) - Provides the diagnostic information that can be useful to debug the solution but the agent cannot use or evaluate this information. 


The process starts by calling *reset()*, which returns an initial observation.  Checking *if done* after each step lets the loop end the episode at the right time.  Example below:

In [None]:
import gym
env = gym.make('CartPole-v0')
# Run 20 episodes, each capped at 100 timesteps.
for i_episode in range(20):
    # reset() starts the episode and returns its first observation.
    observation = env.reset()
    for t in range(100):
        env.render()
        print(observation)
        # Pick a random action; step() returns the next observation, the
        # reward, the done flag and the diagnostic info dict.
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        # Stop stepping as soon as the environment reports the episode is over.
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break
env.close()


The code should render an animation of each episode.  It also prints every observation and, through the *print("Episode finished after {} timesteps".format(t+1))* call, a line reporting how many timesteps each finished episode lasted.

## Spaces

Every environment consists of two important spaces: an **"action_space"** and an **"observation_space"**


In [None]:
# Example of action_space:
# The environment created here is reused by the following cells.
import gym
env = gym.make('MountainCar-v0')
print(env.action_space)

In [None]:
# Example of observation_space (uses the `env` created in the previous cell):
print(env.observation_space)

In [None]:
# Furthermore we can find out the bounds of the environment's observation_space by:
# .high holds the upper bound of each observation component.
print(env.observation_space.high)

In [None]:
# .low holds the lower bound of each observation component.
print(env.observation_space.low)

## Example

Below is an example:

In [None]:
# Drive CartPole-v1 with random actions for 1000 rendered steps, starting a
# new episode whenever the current one ends.
env = gym.make('CartPole-v1')
observation = env.reset()
for _ in range(1000):
    env.render()
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)

    # Nothing more to do while the episode is still running.
    if not done:
        continue
    observation = env.reset()
env.close()

# Helpful Links for Further Studying

Environments:
https://github.com/openai/gym/blob/master/docs/environments.md

Creating Environments:
https://github.com/openai/gym/blob/master/docs/creating-environments.md

Wrappers: (to transform observation and/or action space.  Can perform pre/postprocessing on the data that is exchanged between the agent and the environment.)
https://github.com/openai/gym/blob/master/docs/wrappers.md

Agents: (Facilitates the running of the algorithm against an environment.)

https://towardsdatascience.com/create-your-own-reinforcement-learning-environment-beb12f4151ef

# Lastly, What is possible with OpenAI Gym?

What if you could teach a robot hand to solve a Rubik's Cube?!
Check out the link below:

https://www.youtube.com/watch?v=kVmp0uGtShk