# CartPole-v0

## Load packages & Setup

In [1]:
import gym
import numpy as np
import time

env = gym.make('CartPole-v0')
state = env.reset()
env.render()
time.sleep(1)
env.close()

## Define a testing function

In [2]:
def test(func):
    NUM_EPOCH = 100
    memory_score  = np.zeros((NUM_EPOCH,))
    done = False
    for epoch in np.arange(NUM_EPOCH):
        score = 0
        state = env.reset()
        done = False
        while not done:
            action = func(state)
            state, reward, done, _ = env.step(action)
            score += 1
            if epoch == NUM_EPOCH-1: # render only the last epoch
                env.render()
        memory_score[epoch] = score
    print('Average Score : ' + str(np.mean(memory_score)) + ' +-' + str('{0:.2f}').format(np.std(memory_score)))
    env.close()

## Method 1 : (no training) Random action
### Random action

In [3]:
def randomAction(state):
    return np.random.randint(1)
test(randomAction)

Average Score : 9.42 +-0.71


## Method 2 : (no training) Conditional action
### Push the Cart toward the falling direction

In [4]:
def conditionalAction(state):
    # state[0] : cart position
    # state[1] : cart velocity
    # state[2] : pole angle
    # state[3] : pole velocity at tip
    angle = state[2] 
    if angle > 0:
        return 1
    else:
        return 0
test(conditionalAction)

Average Score : 43.06 +-8.56


## Method 3 : Learn from lucky episodes
### Store lucky episodes and use them to train the neural network 

In [5]:
import tensorflow as tf
if tf.__version__ < '2.3.0' :
    raise Exception('tensorflow version mismatch')

### Generate Model
* Model Description
|Layer|input node|output node|num param|activation function|
| --- | --- | --- | --- | --- |
|Input Layer | 4 | 10 | 50 | relu |
|Hidden Layer| 10 | 10 | 110 | relu |
|Output Layer| 10 |1 | 11 | linear |


In [6]:
agent = tf.keras.Sequential([
    tf.keras.layers.Dense(10,input_shape=(4,),activation='relu'),
    tf.keras.layers.Dense(10,activation='relu'),
    tf.keras.layers.Dense(1)
])
agent.compile(optimizer='adam',
             loss='mean_squared_error',
             metrics='mean_squared_error')

In [7]:
agent.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10)                50        
_________________________________________________________________
dense_1 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 171
Trainable params: 171
Non-trainable params: 0
_________________________________________________________________


In [8]:
def learnLucky(state):
    if agent(np.expand_dims(state,0)) > 0:
        return 1
    else:
        return 0

### Algorithm
1. During initial learning stages (ex. first 30 stages), **Conditional action** method is used to collect lucky episodes
2. While running **NUM_RANDOM_EPISODES** number of random episodes, store the best episode in to the **memory_best_episode**
3. After selecting the lucky episode, train the agent(neural network) with the episode
4. Repeat 1~3 steps
5. After initial learning stages, use **learnLucky** function which now has sufficiently trained agent to obstain certain level of performance
6. Now utilize **learnLucky** function to obtain lucky episode
7. After selecting the lucky episode, train the agent(neural network) with the episode
8. Repeat 6~7 steps

In [9]:
NUM_LEARNING = 100
NUM_RANDOM_EPISODES = 100
MAXIMUM_REWARD = 200

for learn in np.arange(NUM_LEARNING):
    memory_best_episode = np.zeros((MAXIMUM_REWARD,5))
    memory_best_score = 0
    for ep in np.arange(NUM_RANDOM_EPISODES): # Run random episodes to pick a lucky episode
        score = 0
        temporal_memory_episode = np.zeros((MAXIMUM_REWARD,5))
        state = env.reset()
        done = False
        while not done:
            if learn < 50:
                action = conditionalAction(state)
            else:
                action = learnLucky(state)
            temporal_memory_episode[score,:] = np.hstack((state,action)) # memorize
            state, reward, done, _ = env.step(action)
            score += 1
        if memory_best_score < score:
            memory_best_episode = temporal_memory_episode.copy()
            memory_best_score = score
            ep = 2
            if score == MAXIMUM_REWARD:
                ep = 4
                break
    # after memorizing the best episode, start learn
    print("Learning Stage #" + str('{0:03d}').format(learn) + " | Learning from the best score : " + str(memory_best_score))
    agent.fit(memory_best_episode[0:memory_best_score,0:4],memory_best_episode[0:memory_best_score,4]*2-1,epochs=ep,verbose=0)
    
test(learnLucky)

Learning Stage #000 | Learning from the best score : 67
Learning Stage #001 | Learning from the best score : 62
Learning Stage #002 | Learning from the best score : 61
Learning Stage #003 | Learning from the best score : 62
Learning Stage #004 | Learning from the best score : 68
Learning Stage #005 | Learning from the best score : 72
Learning Stage #006 | Learning from the best score : 62
Learning Stage #007 | Learning from the best score : 68
Learning Stage #008 | Learning from the best score : 62
Learning Stage #009 | Learning from the best score : 70
Learning Stage #010 | Learning from the best score : 68
Learning Stage #011 | Learning from the best score : 62
Learning Stage #012 | Learning from the best score : 67
Learning Stage #013 | Learning from the best score : 66
Learning Stage #014 | Learning from the best score : 63
Learning Stage #015 | Learning from the best score : 67
Learning Stage #016 | Learning from the best score : 61
Learning Stage #017 | Learning from the best sco

## Method 4 : Q-Learning, Actor-Critic method, Deep Q-Learning, etc....