In [1]:
from gym import envs
list(envs.registry.all())

[EnvSpec(Copy-v0),
 EnvSpec(RepeatCopy-v0),
 EnvSpec(ReversedAddition-v0),
 EnvSpec(ReversedAddition3-v0),
 EnvSpec(DuplicatedInput-v0),
 EnvSpec(Reverse-v0),
 EnvSpec(CartPole-v0),
 EnvSpec(CartPole-v1),
 EnvSpec(MountainCar-v0),
 EnvSpec(MountainCarContinuous-v0),
 EnvSpec(Pendulum-v0),
 EnvSpec(Acrobot-v1),
 EnvSpec(LunarLander-v2),
 EnvSpec(LunarLanderContinuous-v2),
 EnvSpec(BipedalWalker-v2),
 EnvSpec(BipedalWalkerHardcore-v2),
 EnvSpec(CarRacing-v0),
 EnvSpec(Blackjack-v0),
 EnvSpec(KellyCoinflip-v0),
 EnvSpec(KellyCoinflipGeneralized-v0),
 EnvSpec(FrozenLake-v0),
 EnvSpec(FrozenLake8x8-v0),
 EnvSpec(CliffWalking-v0),
 EnvSpec(NChain-v0),
 EnvSpec(Roulette-v0),
 EnvSpec(Taxi-v3),
 EnvSpec(GuessingGame-v0),
 EnvSpec(HotterColder-v0),
 EnvSpec(Reacher-v2),
 EnvSpec(Pusher-v2),
 EnvSpec(Thrower-v2),
 EnvSpec(Striker-v2),
 EnvSpec(InvertedPendulum-v2),
 EnvSpec(InvertedDoublePendulum-v2),
 EnvSpec(HalfCheetah-v2),
 EnvSpec(HalfCheetah-v3),
 EnvSpec(Hopper-v2),
 EnvSpec(Hopper-v3),
 

In [2]:
import gym
import numpy as np
from collections import deque
from keras import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random
import matplotlib.pyplot as plt

env = gym.make("MountainCar-v0")

print("Understanding the Environment")
print("\n")
print("Action Space: ",env.action_space)
print("State Space: ",env.observation_space)
print("State Example: ",env.reset())
print("Cycle tuple Example: ",env.step(0))

Using TensorFlow backend.


Understanding the Environment


Action Space:  Discrete(3)
State Space:  Box(2,)
State Example:  [-0.50800453  0.        ]
Cycle tuple Example:  (array([-0.50912144, -0.00111691]), -1.0, False, {})


In [12]:
class DQNAgent():
    
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        
        self.e = 1
        self.e_decay_rate = 0.99
        self.e_min = 0.1
        self.learning_rate = 0.0005
        self.discount_rate = 0.99
        
        self.memory = []
        self.model = self.build_model()
        
    def build_model(self):
        
        model = Sequential()
        model.add(Dense(50, activation="relu", input_shape = (self.state_size,)))
        model.add(Dense(self.action_size, activation="linear"))
        model.compile(optimizer=Adam(learning_rate = self.learning_rate), loss='mse')
        
        return model
    
    def calculate_reward(self, next_state):
        return next_state[1]**2
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
    def select_action(self, state):
        if np.random.rand() < self.e:
            return random.choice([a for a in range(self.action_size)])
        return np.argmax(self.model.predict(state)[0])
    
    
    def replay(self, mini_batch_size=32):
        
        mini_batch = random.sample(self.memory, mini_batch_size)
        
        for state, action, reward, next_state, done in mini_batch:
            target = reward
            if not done:
                target = reward + self.discount_rate * (np.max(self.model.predict(next_state)[0]))
            
            q_values = self.model.predict(state)
            q_values[0][action] = target
            self.model.fit(x = state, y = q_values, epochs = 1, verbose = 0)
        
  
    def load_weight(self, weights):
        self.model.load_weight(weights)
        
        
        
        
        
        
        

In [13]:
def train(agent = None, episode = 500, timelimit = 200, train_batch = 200):
    
    for ep in range(episode):
        
        done = False
        state = env.reset()
        total_reward = 0
        position_list = []
        for timestep in range(timelimit):
        
            state = state.reshape(1,agent.state_size)
            action = agent.select_action(state)

            next_state, _, done, _ = env.step(action)
            position_list.append(state[0])
            reward = agent.calculate_reward(next_state)
            next_state = next_state.reshape(1, agent.state_size)
            
            
            agent.remember(state, action, reward, next_state, done)
            total_reward += reward
            
            state = next_state
            
            if done:
                print("The current episode is {}, epsilon = {:0.2f}, Total Reward: {:0.2f}, Total Time Taken: {} , Max Position: {:0.4f}".format(ep, agent.e , total_reward, timestep, np.max(position_list)))
                break 
                
        if len(agent.memory) > train_batch:
            agent.replay(mini_batch_size = train_batch)
                
            if agent.e > agent.e_min:
                agent.e *= agent.e_decay_rate
                
    

In [14]:
state_size, action_size = env.observation_space.shape[0] , env.action_space.n
mountain_agent = DQNAgent(state_size, action_size)
train(agent = mountain_agent , episode = 1000, timelimit = 200, train_batch = 500)

The current episode is 0, epsilon = 1.00, Total Reward: 0.04, Total Time Taken: 199 , Max Position: 0.0258
The current episode is 1, epsilon = 1.00, Total Reward: 0.01, Total Time Taken: 199 , Max Position: 0.0123
The current episode is 2, epsilon = 1.00, Total Reward: 0.00, Total Time Taken: 199 , Max Position: 0.0071
The current episode is 3, epsilon = 0.99, Total Reward: 0.00, Total Time Taken: 199 , Max Position: 0.0090
The current episode is 4, epsilon = 0.98, Total Reward: 0.01, Total Time Taken: 199 , Max Position: 0.0182
The current episode is 5, epsilon = 0.97, Total Reward: 0.00, Total Time Taken: 199 , Max Position: 0.0095
The current episode is 6, epsilon = 0.96, Total Reward: 0.00, Total Time Taken: 199 , Max Position: 0.0097
The current episode is 7, epsilon = 0.95, Total Reward: 0.01, Total Time Taken: 199 , Max Position: 0.0153
The current episode is 8, epsilon = 0.94, Total Reward: 0.01, Total Time Taken: 199 , Max Position: 0.0138
The current episode is 9, epsilon = 0

The current episode is 76, epsilon = 0.48, Total Reward: 0.01, Total Time Taken: 199 , Max Position: 0.0099
The current episode is 77, epsilon = 0.47, Total Reward: 0.02, Total Time Taken: 199 , Max Position: 0.0172
The current episode is 78, epsilon = 0.47, Total Reward: 0.00, Total Time Taken: 199 , Max Position: 0.0069
The current episode is 79, epsilon = 0.46, Total Reward: 0.00, Total Time Taken: 199 , Max Position: 0.0099
The current episode is 80, epsilon = 0.46, Total Reward: 0.01, Total Time Taken: 199 , Max Position: 0.0108
The current episode is 81, epsilon = 0.45, Total Reward: 0.00, Total Time Taken: 199 , Max Position: 0.0048
The current episode is 82, epsilon = 0.45, Total Reward: 0.00, Total Time Taken: 199 , Max Position: 0.0105
The current episode is 83, epsilon = 0.44, Total Reward: 0.01, Total Time Taken: 199 , Max Position: 0.0106
The current episode is 84, epsilon = 0.44, Total Reward: 0.02, Total Time Taken: 199 , Max Position: 0.0211
The current episode is 85, e

The current episode is 152, epsilon = 0.22, Total Reward: 0.10, Total Time Taken: 199 , Max Position: 0.1752
The current episode is 153, epsilon = 0.22, Total Reward: 0.03, Total Time Taken: 199 , Max Position: 0.0239
The current episode is 154, epsilon = 0.22, Total Reward: 0.06, Total Time Taken: 199 , Max Position: 0.0858
The current episode is 155, epsilon = 0.21, Total Reward: 0.02, Total Time Taken: 199 , Max Position: 0.0223
The current episode is 156, epsilon = 0.21, Total Reward: 0.00, Total Time Taken: 199 , Max Position: 0.0051
The current episode is 157, epsilon = 0.21, Total Reward: 0.00, Total Time Taken: 199 , Max Position: 0.0076
The current episode is 158, epsilon = 0.21, Total Reward: 0.00, Total Time Taken: 199 , Max Position: 0.0071
The current episode is 159, epsilon = 0.21, Total Reward: 0.08, Total Time Taken: 199 , Max Position: 0.0355
The current episode is 160, epsilon = 0.20, Total Reward: 0.03, Total Time Taken: 199 , Max Position: 0.0267
The current episode

The current episode is 228, epsilon = 0.10, Total Reward: 0.01, Total Time Taken: 199 , Max Position: 0.0094
The current episode is 229, epsilon = 0.10, Total Reward: 0.00, Total Time Taken: 199 , Max Position: 0.0043
The current episode is 230, epsilon = 0.10, Total Reward: 0.02, Total Time Taken: 199 , Max Position: 0.0314
The current episode is 231, epsilon = 0.10, Total Reward: 0.14, Total Time Taken: 199 , Max Position: 0.0667
The current episode is 232, epsilon = 0.10, Total Reward: 0.00, Total Time Taken: 199 , Max Position: 0.0037
The current episode is 233, epsilon = 0.10, Total Reward: 0.01, Total Time Taken: 199 , Max Position: 0.0143
The current episode is 234, epsilon = 0.10, Total Reward: 0.13, Total Time Taken: 154 , Max Position: 0.4720
The current episode is 235, epsilon = 0.10, Total Reward: 0.00, Total Time Taken: 199 , Max Position: 0.0046
The current episode is 236, epsilon = 0.10, Total Reward: 0.14, Total Time Taken: 163 , Max Position: 0.4616
The current episode

KeyboardInterrupt: 

In [94]:
max(1,2)

2

In [18]:

def test(agent = mountain_agent, timelimit = 200):
    done = False
    state = env.reset()
    total_reward = 0
    position_list = []
    for timestep in range(timelimit):

        state = state.reshape(1,agent.state_size)
        action = agent.select_action(state)

        next_state, _, done, _ = env.step(action)
        position_list.append(state[0])
        reward = agent.calculate_reward(next_state)
        next_state = next_state.reshape(1, agent.state_size)

        total_reward += reward

        state = next_state

        if done:
            print("The current episode is {}, epsilon = {:0.2f}, Total Reward: {:0.2f}, Total Time Taken: {} , Max Position: {:0.4f}".format(1, agent.e , total_reward, timestep, np.max(position_list)))
            break 

In [19]:
test()

The current episode is 1, epsilon = 0.10, Total Reward: 0.14, Total Time Taken: 151 , Max Position: 0.4782
