In [1]:
import numpy as np
import random
import gym
from collections import deque
import tensorflow as tf
from tensorflow import keras

In [2]:
from tensorflow.keras.models import Sequential,clone_model
from tensorflow.keras.layers import Dense,Activation
from tensorflow.keras.optimizers import Adam

In [3]:
# Build the CartPole environment that every later cell shares through `env`.
env_name = 'CartPole-v1'
env = gym.make(env_name)
# One reset/close cycle right after construction (presumably a smoke test);
# the same `env` object is reset again by the training loop further down.
env.reset()
env.close()

In [4]:
# Size of the first axis of the observation space; used as the network's input width.
num_observation = env.observation_space.shape[0]

In [5]:
# Number of discrete actions reported by the environment; sizes the network's output layer.
num_actions = env.action_space.n

In [6]:
num_actions  # echo the value as the cell output

2

In [7]:
# Q-network: observation -> 16 -> 32 -> one linear output per action.
# Activations stay as separate layers so the layer list is unchanged.
model = Sequential([
    Dense(16, input_shape=(1, num_observation)),
    Activation('relu'),
    Dense(32),
    Activation('relu'),
    Dense(num_actions),
    Activation('linear'),
])



In [8]:
model.summary()  # print per-layer output shapes and parameter counts

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1, 16)             80        
_________________________________________________________________
activation (Activation)      (None, 1, 16)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1, 32)             544       
_________________________________________________________________
activation_1 (Activation)    (None, 1, 32)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1, 2)              66        
_________________________________________________________________
activation_2 (Activation)    (None, 1, 2)              0         
Total params: 690
Trainable params: 690
Non-trainable params: 0
__________________________________________________________

In [9]:
# clone_model() rebuilds the architecture with freshly initialised weights, so
# copy the online network's weights over explicitly. Without this the target
# network starts out unrelated to `model` until the first scheduled sync.
target_model = clone_model(model)
target_model.set_weights(model.get_weights())

In [17]:
EPOCHS = 100  # number of episodes run by the training loop
BATCH_SIZE = 32  # intended replay minibatch size (transitions per update)
epsilon = 1.0  # initial exploration rate: probability of taking a random action
EPSILON_REDUCE = 0.995  # epsilon is multiplied by this after every episode

LEARNING_RATE = 0.001  # Adam learning rate passed to model.compile()
GAMMA = 0.95  # discount applied to the bootstrapped next-state value in replay()

In [11]:
def epsilon_greedy_action_selection(model, epsilon, observation):
    """Choose an action with an epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random action is drawn;
    otherwise the action with the highest predicted Q-value is returned.

    Parameters
    ----------
    model : object with ``predict``
        Q-network mapping an observation batch to per-action values.
    epsilon : float
        Exploration rate; the greedy branch is taken when a uniform draw
        from [0, 1) is strictly greater than this value.
    observation : array-like
        A single observation, e.g. shape ``(n,)`` or ``(1, n)``.

    Returns
    -------
    int
        Index of the selected action.
    """
    if np.random.random() > epsilon:
        # The network is declared with input_shape=(1, num_observation), so
        # present exactly one sample of shape (1, n) regardless of whether the
        # caller passed (n,), (1, n) or (1, 1, n).
        batch = np.reshape(observation, (1, 1, -1))
        prediction = model.predict(batch, verbose=0)
        return int(np.argmax(prediction))
    # Exploration branch: uniform draw over the environment's action indices.
    return int(np.random.randint(0, env.action_space.n))

In [12]:
replay_buffer = deque(maxlen=20000)  # bounded experience store; oldest transitions are dropped once full
update_target_model = 10  # sync interval (in episodes) handed to update_model_handler

In [13]:
def replay(replay_buffer, batch_size, model, target_model, gamma=None):
    """Run one DQN update on a random minibatch of stored transitions.

    Parameters
    ----------
    replay_buffer : sequence of (state, action, reward, next_state, done)
        Stored transitions. Each state is assumed to have shape ``(1, n)``,
        as produced by the training loop, so a stacked batch is ``(B, 1, n)``.
    batch_size : int
        Number of transitions to sample. Nothing happens while the buffer
        holds fewer entries than this.
    model : object with ``predict`` and ``fit``
        Online Q-network; it is the only network that gets trained.
    target_model : object with ``predict``
        Periodically synced copy used to evaluate next states.
    gamma : float, optional
        Discount factor. Defaults to the module-level ``GAMMA``.

    Returns
    -------
    None
    """
    if len(replay_buffer) < batch_size:
        return

    if gamma is None:
        gamma = GAMMA

    samples = random.sample(replay_buffer, batch_size)
    states, actions, rewards, new_states, dones = zip(*samples)

    states = np.array(states)
    new_states = np.array(new_states)
    actions = np.array(actions, dtype=int)
    rewards = np.array(rewards, dtype=float)
    dones = np.array(dones, dtype=bool)

    # Start from the ONLINE network's own estimates so that the columns of
    # actions that were not taken produce zero error during fit().
    target_batch = np.array(model.predict(states, verbose=0))

    # Bootstrap from the TARGET network: its weights only change on the
    # scheduled sync, which keeps the regression target stable between syncs.
    next_q = np.asarray(target_model.predict(new_states, verbose=0))
    best_next_q = next_q[:, 0].max(axis=-1)

    # Terminal transitions have no future value, so their target is the reward.
    td_targets = np.where(dones, rewards, rewards + gamma * best_next_q)
    target_batch[np.arange(batch_size), 0, actions] = td_targets

    model.fit(states, target_batch, epochs=1, verbose=0)


In [14]:
def update_model_handler(epoch, update_target_model, model, target_model):
    """Copy the online network's weights into the target network on schedule.

    The copy happens only for positive epochs that are exact multiples of
    ``update_target_model``; every other call leaves ``target_model`` alone.
    """
    is_sync_epoch = epoch > 0 and epoch % update_target_model == 0
    if not is_sync_epoch:
        return
    online_weights = model.get_weights()
    target_model.set_weights(online_weights)

In [15]:
# Only `model` is ever fitted, so it is the only network given a loss/optimizer.
model.compile(optimizer=Adam(learning_rate=LEARNING_RATE), loss='mse')

In [18]:
# Training loop: one environment episode per iteration.
best_so_far = 0
for epoch in range(EPOCHS):
    # Observations are stored as (1, num_observation) so that a stacked replay
    # minibatch has the (batch, 1, num_observation) shape the network expects.
    observation = env.reset().reshape([1, num_observation])
    done = False

    points = 0  # steps survived in this episode
    while not done:
        action = epsilon_greedy_action_selection(model, epsilon, observation)

        next_observation, reward, done, info = env.step(action)
        next_observation = next_observation.reshape([1, num_observation])
        replay_buffer.append((observation, action, reward, next_observation, done))
        observation = next_observation
        points += 1

        # One gradient update per environment step, using the configured
        # minibatch size rather than a repeated literal.
        replay(replay_buffer, BATCH_SIZE, model, target_model)

    # Decay exploration once per episode.
    epsilon *= EPSILON_REDUCE

    update_model_handler(epoch, update_target_model, model, target_model)

    best_so_far = max(best_so_far, points)
    if epoch % 25 == 0:
        print(f"{epoch}: Points reached: {points} - epsilon: {epsilon} - Best: {best_so_far}")

0: Points reached: 34 - epsilon: 0.995 - Best: 34
25: Points reached: 78 - epsilon: 0.8778091417340573 - Best: 78
50: Points reached: 34 - epsilon: 0.7744209942832988 - Best: 98
75: Points reached: 59 - epsilon: 0.6832098777212641 - Best: 160


In [None]:
# model.save_weights('./checkpoint')
# model.load_weights('./checkpoint')

In [None]:
# model.save('my_model1')
# saved_model = tf.keras.models.load_model('my_model1')

In [1]:
# Roll out the greedy policy for up to 300 rendered steps.
# `model` is the network trained earlier in this notebook, so the cell runs on
# a fresh kernel. To evaluate a checkpoint instead, load it first and use it
# here in place of `model`.
observation = env.reset()
try:
    for counter in range(300):
        env.render()

        # (1, 1, num_observation): a batch of one sample in the network's declared input shape.
        q_values = model.predict(observation.reshape([1, 1, num_observation]), verbose=0)
        action = np.argmax(q_values)

        observation, reward, done, info = env.step(action)

        if done:
            print("done")
            break
finally:
    # Release the render window even if prediction or stepping raises.
    env.close()