# CartPole

Read the environment description:
https://gym.openai.com/envs/CartPole-v1/

In [39]:
import gym
import numpy as np
import random

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Using TensorFlow backend.


# Neural network

In [235]:
# Q-network: 4 input features -> two hidden ReLU layers of 32 units -> 2 linear
# outputs (one value per action index). Compiled with MSE loss and an Adam
# optimizer left at its default settings.
model = Sequential([
    Dense(32, activation='relu', input_dim=4),
    Dense(32, activation='relu'),
    Dense(2, activation='linear'),
])
model.compile(loss='mse', optimizer=Adam())

def predict_q(state):
    """Return the global ``model``'s outputs for a single state.

    Args:
        state: NumPy array holding one observation. It is reshaped into a
            one-row batch before being handed to ``model.predict``.

    Returns:
        The model's prediction for that state, flattened to one dimension.
    """
    batch = state.reshape(1, -1)
    q_values = model.predict(batch)
    return q_values.reshape(-1)

In [213]:
# Smoke test: run the network on one made-up 4-feature state.
model.predict(x=np.array([[1, 2, 3, 4]]))

array([[1.0949209, 1.0251367]], dtype=float32)

In [209]:
# Smoke test: a single silent fit call that pulls both outputs for the
# made-up state toward 1.
model.fit(
    x=np.array([[1, 2, 3, 4]]),
    y=np.array([[1, 1]]),
    verbose=0,
)

<keras.callbacks.History at 0x2223f95f6a0>

# Q-learning / Sarsa

In [260]:
env = gym.make('CartPole-v1')

# --- Hyperparameters ---
# Exploration probability read by act_eps_greedy.
epsilon = 0.9
# NOTE(review): learning_rate is not referenced anywhere else in the visible
# notebook; Adam() below is constructed without arguments -- confirm intent.
learning_rate = 0.1
# Discount factor applied to the next step's Q-value when building targets.
gamma = 0.99

# Fresh Q-network: 4 inputs -> 32 ReLU -> 32 ReLU -> 2 linear outputs.
model = Sequential([
    Dense(32, activation='relu', input_dim=4),
    Dense(32, activation='relu'),
    Dense(2, activation='linear'),
])
model.compile(loss='mse', optimizer=Adam())

# Counter of completed training epochs.
epochs = 0

def act_eps_greedy(state):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` (module-level) a uniformly random action in
    {0, 1} is returned; otherwise the index of the largest value returned by
    ``predict_q(state)``.
    """
    explore = random.random() < epsilon
    if not explore:
        return np.argmax(predict_q(state))
    return random.randint(0, 1)

def run_episode():
    """Play one episode with the epsilon-greedy policy and record every step.

    Returns:
        A list with one ``[state, action, reward, done]`` entry per step, where
        ``state`` is the observation the action was chosen from. The final
        entry is the step on which the environment reported ``done``.
    """
    observations = []
    # Reset exactly once: the returned observation is the episode's start state.
    state = env.reset()
    done = False
    while not done:
        action = act_eps_greedy(state)
        (next_state, reward, done, _) = env.step(action)
        observations.append([state, action, reward, done])
        state = next_state
    return observations

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [261]:
# Sample one episode to inspect the recorded [state, action, reward, done] rows.
run_episode()

[[array([0.0086365 , 0.03818126, 0.00372417, 0.02845115]), 1, 1.0, False],
 [array([ 0.00940012,  0.23324961,  0.00429319, -0.26305444]), 1, 1.0, False],
 [array([ 0.01406511,  0.42831002, -0.0009679 , -0.55438016]), 0, 1.0, False],
 [array([ 0.02263131,  0.23320167, -0.0120555 , -0.26200234]), 1, 1.0, False],
 [array([ 0.02729535,  0.42849361, -0.01729555, -0.5584632 ]), 1, 1.0, False],
 [array([ 0.03586522,  0.62385402, -0.02846481, -0.85654462]), 1, 1.0, False],
 [array([ 0.0483423 ,  0.81935202, -0.0455957 , -1.15804035]), 1, 1.0, False],
 [array([ 0.06472934,  1.01503759, -0.06875651, -1.46466413]), 1, 1.0, False],
 [array([ 0.08503009,  1.21093106, -0.09804979, -1.77800855]), 1, 1.0, False],
 [array([ 0.10924871,  1.40701079, -0.13360996, -2.09949594]), 1, 1.0, False],
 [array([ 0.13738893,  1.60319857, -0.17559988, -2.43031938]), 1, 1.0, True]]

In [262]:
# Sarsa
# Each epoch plays `episodes` episodes with the current epsilon and fits the
# network once per episode on that episode's Sarsa targets.
episodes = 100          # episodes played (and fitted) per epoch
max_epochs = 50         # upper bound per run of this cell so "Run All" terminates
solved_reward = 475.    # CartPole-v1 reward threshold in Gym's registry

for _run_epoch in range(max_epochs):
    total_reward = 0.
    # Exploration rate decays exponentially with the epoch count, floored at 0.01.
    epsilon = np.maximum(np.exp(-epochs/10), 0.01)

    for _ in range(episodes):

        observations = run_episode()

        states = np.array([obs[0] for obs in observations])
        # The weights only change in model.fit below, so one batched forward
        # pass gives the same Q-values as predicting state by state.
        q_values = model.predict(states)
        # Targets start as the current predictions; only the entry of the
        # action actually taken is overwritten, so the other output has zero error.
        target_qs = q_values.copy()

        for i, (state, action, reward, done) in enumerate(observations):
            total_reward += reward

            if done:
                # Last step of the episode: nothing to bootstrap from.
                target_qs[i, action] = reward
            else:
                # Sarsa target: bootstrap from the action actually taken next.
                next_action = observations[i+1][1]
                target_qs[i, action] = gamma * q_values[i+1, next_action] + reward

        model.fit(states, target_qs, verbose=0)

    epochs += 1
    average_reward = total_reward/episodes
    print(epochs, '- Average reward:', average_reward, ' (eps', epsilon, ')')

    # Stop early once the epsilon-greedy policy itself clears the threshold.
    if average_reward >= solved_reward:
        break

1 - Average reward: 23.21  (eps 1.0 )
2 - Average reward: 19.25  (eps 0.9048374180359595 )
3 - Average reward: 18.81  (eps 0.8187307530779818 )
4 - Average reward: 19.31  (eps 0.7408182206817179 )
5 - Average reward: 28.0  (eps 0.6703200460356393 )
6 - Average reward: 39.5  (eps 0.6065306597126334 )
7 - Average reward: 59.35  (eps 0.5488116360940265 )
8 - Average reward: 81.86  (eps 0.4965853037914095 )
9 - Average reward: 134.52  (eps 0.44932896411722156 )
10 - Average reward: 194.14  (eps 0.4065696597405991 )


KeyboardInterrupt: 

# Try it

In [265]:
# Roll out the greedy policy (no exploration) for one rendered episode and
# report how many steps it lasted.
env = gym.make('CartPole-v1')
steps = 0

try:
    state = env.reset()
    done = False
    while not done:
        env.render()
        action = np.argmax(predict_q(state))
        (state, reward, done, _) = env.step(action)
        steps += 1
finally:
    # Always release the render window, even if the rollout fails or is interrupted.
    env.close()

print('Finished in %s steps' % steps)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Finished in 500 steps
