In [0]:
!pip install gym > /dev/null 2>&1
!pip install tflearn  > /dev/null 2>&1
!pip install tensorflow > /dev/null 2>&1

## Hi, I'm AI GYM

`gym` is a python module created by Open AI to introduce reinforcement learning to game playing!

It's famous for using reinforcement learning to beat lots of old Atari games (see below)
  
 `gym` is a wrapper for different game environments. Each environment has the following properties:
  

*   ``step()``
    * evolves one time step forward
* ``action()``
    * something to do
* ``render()``
  * make a display
*   ``reset()``
    * revert to intial state
* ``seed()``
    * sets local RNG


In [0]:
from IPython.display import HTML
HTML('<img src="https://miro.medium.com/max/884/1*qFHnCDhep6OmqkbVN6NY_g.gif">')

In [0]:
import gym

In [0]:
env = gym.make('CartPole-v0')  # create your environment here
env.reset();  # prepare the environment for use

Cartpole looks like this:

There are two controls: 
  
  0=`move left`
  
  1= `move right`
  
  The goal is to balance the pole as long as possible.
  
<img src=https://gym.openai.com/videos/2019-05-31--eRh4Fbp8G5/CartPole-v1/poster.jpg>

In [0]:
## Our zeroth order learning strategy is do nothing. 
##    We'll just try a random operation

nsteps = 100  # try this many times
i_stop=0;
action_list = []
for i in range(nsteps):
    #env.render()  # display the environment at this step
    action = env.action_space.sample()  # choose a random action
    result=env.step(action)  # now loop through the action 
    #action_list.append(action)
    if result[2]:
        print('out of bounds after '+str(i-i_stop+1)+' steps')
        env.reset()
        i_stop=i
        #print(action_list) #' '.join( [str(i) for i in action_list]) )
        #action_list = []

out of bounds after 17 steps
out of bounds after 14 steps
out of bounds after 23 steps
out of bounds after 36 steps
out of bounds after 12 steps


## Let's beef up our learning with reinforcement

In [0]:
import gym
import random
import numpy as np
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.estimator import regression
from statistics import median, mean
from collections import Counter
pass;

In [0]:
LR = 1e-3
num_games = 50    # number of trials / episodes
goal_steps = 200  # number of steps per trial
score_requirement = 50  # the minimum score to achieve
initial_games = 10000   # number of games to use for training
env = gym.make('CartPole-v0')  # create your environment here

#### Define some helper functions
1. Data-generation
2. Neural network model
3. Training function

In [0]:
def initial_population():
    # [OBS, MOVES]
    training_data = []
    # all scores:
    scores = []
    # just the scores that met our threshold:
    accepted_scores = []
    # iterate through however many games we want:
    for _ in range(initial_games):
        score = 0
        # moves specifically from this environment:
        game_memory = []
        # previous observation that we saw
        prev_observation = []
        # for each frame in 200
        for _ in range(goal_steps):
            # choose random action (0 or 1)
            action = random.randrange(0,2)
            # do it!
            observation, reward, done, info = env.step(action)

            # notice that the observation is returned FROM the action
            # so we'll store the previous observation here, pairing
            # the prev observation to the action we'll take.
            if len(prev_observation) > 0 :
                game_memory.append([prev_observation, action])
            prev_observation = observation
            score+=reward
            if done: break

        # IF our score is higher than our threshold, we'd like to save
        # every move we made
        # NOTE the reinforcement methodology here.
        # all we're doing is reinforcing the score, we're not trying
        # to influence the machine in any way as to HOW that score is
        # reached.
        if score >= score_requirement:
            accepted_scores.append(score)
            for data in game_memory:
                # convert to one-hot (this is the output layer for our neural network)
                if data[1] == 1:
                    output = [0,1]
                elif data[1] == 0:
                    output = [1,0]
                # saving our training data
                training_data.append([data[0], output])

        # reset env to play again
        env.reset()
        # save overall scores
        scores.append(score)

    # just in case you wanted to reference later
    training_data_save = np.array(training_data)
    np.save('saved.npy',training_data_save)

    # some stats here, to further illustrate the neural network magic!
    print('Average of all score:',mean(scores))
    print('Average accepted score:',mean(accepted_scores))
    print('Median score for accepted scores:',median(accepted_scores))
    print(Counter(accepted_scores))

    return training_data



# Create a simple multilayer perceptron model
def neural_network_model(input_size):
    # our input layer which will be dependent on the number of inputs
    # we want to create
    network = input_data(shape=[None, input_size, 1], name='input')

    # our first hidden layer
    network = fully_connected(network, 128, activation='relu')
    network = dropout(network, 0.8)

    # our second hiddent layer
    network = fully_connected(network, 256, activation='relu')
    network = dropout(network, 0.8)

    # our third hidden layer
    network = fully_connected(network, 512, activation='relu')
    network = dropout(network, 0.8)

    # our fourth hidden layer
    network = fully_connected(network, 256, activation='relu')
    network = dropout(network, 0.8)

    # our fifth hidden layer
    network = fully_connected(network, 128, activation='relu')
    network = dropout(network, 0.8)

    # the output layer
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=LR, loss='categorical_crossentropy', name='targets')
    model = tflearn.DNN(network, tensorboard_dir='log')

    return model


def train_model(training_data, model=False):
    X = np.array([i[0] for i in training_data]).reshape(-1,len(training_data[0][0]),1)
    y = [i[1] for i in training_data]

    if not model:
        model = neural_network_model(input_size = len(X[0]))

    model.fit({'input': X}, {'targets': y}, n_epoch=5, snapshot_step=500, show_metric=True, run_id='openai_learning')
    return model

In [0]:
env.reset()
training_data = initial_population()
#model = train_model(training_data)

Average of all score: 22.3799
Average accepted score: 60.97969543147208
Median score for accepted scores: 57.5
Counter({52.0: 35, 50.0: 32, 51.0: 31, 53.0: 25, 55.0: 24, 54.0: 22, 62.0: 16, 61.0: 16, 57.0: 16, 59.0: 14, 58.0: 13, 64.0: 13, 56.0: 12, 67.0: 11, 65.0: 9, 63.0: 9, 60.0: 9, 72.0: 9, 66.0: 7, 71.0: 5, 73.0: 5, 69.0: 4, 74.0: 4, 68.0: 4, 70.0: 4, 81.0: 4, 76.0: 4, 84.0: 3, 91.0: 3, 77.0: 3, 79.0: 3, 83.0: 3, 113.0: 2, 78.0: 2, 80.0: 2, 90.0: 2, 86.0: 2, 75.0: 2, 82.0: 1, 89.0: 1, 96.0: 1, 85.0: 1, 103.0: 1, 88.0: 1, 93.0: 1, 94.0: 1, 109.0: 1, 92.0: 1})


In [0]:
training_data[0]

[array([-0.03998216, -0.21915937,  0.03802265,  0.31112833]), [0, 1]]

In [0]:
scores = list()
choices = list()

for trial in range(1, num_games+1):
    score = 0
    game_memory = list()
    prev_observations = list()
    env.reset() # prepare the environment for this trial / episode

    for step in range(1, goal_steps+1):
        #env.render()

        # previously, we performed a random action:
        # `action = env.action_space.sample()`
        #
        # now, we're going to say that if the angle is positive, move right (1)
        # otherwise, move left (0)
        if len(prev_observations) == 0:
            # create a random action between 0 and 1
            action = random.randrange(0, 2)
        else:
            action = np.argmax(model.predict(prev_observations.reshape(-1, len(prev_observations), 1))[0])

        choices.append(action)

        # get what was observed, the reward, if the trial was completed
        # and information about the run.
        new_observation, reward, done, info = env.step(action)
        prev_observations = new_observation

        game_memory.append([new_observation, action])
        score += reward


        # from before we know that the application can sometimes terminate before
        # the trial is complete. If that should occur, we want to stop.
        if done:
            print('Trial {0} finished after {1} timesteps. Score: {2}'.format(trial, step, score))
            break
    scores.append(score)

Trial 1 finished after 111 timesteps. Score: 111.0
Trial 2 finished after 169 timesteps. Score: 169.0
Trial 3 finished after 154 timesteps. Score: 154.0
Trial 4 finished after 200 timesteps. Score: 200.0
Trial 5 finished after 200 timesteps. Score: 200.0
Trial 6 finished after 200 timesteps. Score: 200.0
Trial 7 finished after 109 timesteps. Score: 109.0
Trial 8 finished after 145 timesteps. Score: 145.0
Trial 9 finished after 185 timesteps. Score: 185.0
Trial 10 finished after 200 timesteps. Score: 200.0
Trial 11 finished after 143 timesteps. Score: 143.0
Trial 12 finished after 200 timesteps. Score: 200.0
Trial 13 finished after 163 timesteps. Score: 163.0
Trial 14 finished after 154 timesteps. Score: 154.0
Trial 15 finished after 200 timesteps. Score: 200.0
Trial 16 finished after 196 timesteps. Score: 196.0
Trial 17 finished after 200 timesteps. Score: 200.0
Trial 18 finished after 192 timesteps. Score: 192.0
Trial 19 finished after 200 timesteps. Score: 200.0
Trial 20 finished aft

In [0]:
print('Average Score:',sum(scores)/float(len(scores)))
print('choice 1:{}  choice 0:{}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices)))
print(score_requirement)

Average Score: 162.2
choice 1:0.4938347718865598  choice 0:0.5061652281134402
50


## Compare CartPole to the control theory approach:

[YouTube link](https://youtu.be/1_UobILf3cc?t=694)
See 8:08 and 11:35

## Alternative games
1. Taxi driver

  Actions: 
    There are 6 discrete deterministic actions:
    - 0: move south
    - 1: move north
    - 2: move east 
    - 3: move west 
    - 4: pickup passenger
    - 5: dropoff passenger
    
  Rewards: 
    * -1 for each action 
    * +20 for delievering the passenger
    * -10 for executing actions "pickup" and "dropoff" illegally.
    


In [0]:
env = gym.make('Taxi-v2')

env.reset()
env.render()

+---------+
|R:[43m [0m| : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [0]:
## play around a bit
env.step(1); env.render()
env.step(1); env.step(1); env.step(1); env.render()
env.step(4)

+---------+
|R:[43m [0m| : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R:[43m [0m| : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)


(26, -10, False, {'prob': 1.0})

## Build your own!
* See source code at /usr/local/lib/python3.6/dist-packages/gym/envs/toy_text/taxi.py

In [0]:
from gym import spaces
from gym.utils import seeding


class GuessingGame(gym.Env):
    """Number guessing game

    The object of the game is to guess within 1% of the randomly chosen number
    within 200 time steps

    After each step the agent is provided with one of four possible observations
    which indicate where the guess is in relation to the randomly chosen number

    0 - No guess yet submitted (only after reset)
    1 - Guess is lower than the target
    2 - Guess is equal to the target
    3 - Guess is higher than the target

    The rewards are:
    0 if the agent's guess is outside of 1% of the target
    1 if the agent's guess is inside 1% of the target

    The episode terminates after the agent guesses within 1% of the target or
    200 steps have been taken

    The agent will need to use a memory of previously submitted actions and observations
    in order to efficiently explore the available actions

    The purpose is to have agents optimise their exploration parameters (e.g. how far to
    explore from previous actions) based on previous experience. Because the goal changes
    each episode a state-value or action-value function isn't able to provide any additional
    benefit apart from being able to tell whether to increase or decrease the next guess.

    The perfect agent would likely learn the bounds of the action space (without referring
    to them explicitly) and then follow binary tree style exploration towards to goal number
    """
    def __init__(self):
        self.range = 1000  # Randomly selected number is within +/- this value
        self.bounds = 10000

        self.action_space = spaces.Box(low=np.array([-self.bounds]), high=np.array([self.bounds]),
                                       dtype=np.float32)
        self.observation_space = spaces.Discrete(4)

        self.number = 0
        self.guess_count = 0
        self.guess_max = 200
        self.observation = 0

        self.seed()
        self.reset()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        assert self.action_space.contains(action)

        if action < self.number:
            self.observation = 1

        elif action == self.number:
            self.observation = 2

        elif action > self.number:
            self.observation = 3

        reward = 0
        done = False

        if (self.number - self.range * 0.01) < action < (self.number + self.range * 0.01):
            reward = 1
            done = True

        self.guess_count += 1
        if self.guess_count >= self.guess_max:
            done = True

        return self.observation, reward, done, {"number": self.number, "guesses": self.guess_count}

    def reset(self):
        self.number = self.np_random.uniform(-self.range, self.range)
        self.guess_count = 0
        self.observation = 0
        return self.observation


In [0]:
!cat "/usr/local/lib/python3.6/dist-packages/gym/core.py"

from gym import logger

import gym
from gym import error
from gym.utils import closer

env_closer = closer.Closer()

# Env-related abstractions

class Env(object):
    """The main OpenAI Gym class. It encapsulates an environment with
    arbitrary behind-the-scenes dynamics. An environment can be
    partially or fully observed.

    The main API methods that users of this class need to know are:

        step
        reset
        render
        close
        seed

    And set the following attributes:

        action_space: The Space object corresponding to valid actions
        observation_space: The Space object corresponding to valid observations
        reward_range: A tuple corresponding to the min and max possible rewards

    Note: a default reward range set to [-inf,+inf] already exists. Set it if you want a narrower range.

    The methods are accessed publicly as "step", "reset", etc.. The
    non-underscored versions are wrapper methods to which we may add
    functionalit

In [0]:
 observation, reward, done, info = env.step(1)