# Module Five Assignment: Cartpole Problem
Review the code in this notebook and in the score_logger.py file in the *scores* folder (directory). Once you have reviewed the code, return to this notebook and select **Cell** and then **Run All** from the menu bar to run this code. The code takes several minutes to run.

In [1]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Baseline hyperparameters; the later cells in this notebook vary them one experiment at a time.

# Gym environment id, passed to gym.make() and to ScoreLogger.
ENV_NAME = "CartPole-v1"

# Discount factor: scales the next state's best predicted Q-value in the replay target.
GAMMA = 0.95
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001

# Maximum number of transitions kept in the replay deque.
MEMORY_SIZE = 1000000
# Transitions sampled per replay step; replay is skipped until the buffer holds this many.
BATCH_SIZE = 20

# Epsilon-greedy schedule: starting rate, floor, and the factor applied after each replay step.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Q-network agent with an experience-replay buffer.

    Keeps a Keras model that maps one observation to one Q-value per action,
    a bounded deque of past transitions, and the current exploration rate
    used for epsilon-greedy action selection.
    """

    def __init__(self, observation_space, action_space):
        """Create the replay buffer and compile the Q-network.

        Args:
            observation_space: Length of one observation vector (input width).
            action_space: Number of discrete actions (output width).
        """
        self.action_space = action_space
        self.exploration_rate = EXPLORATION_MAX
        self.memory = deque(maxlen=MEMORY_SIZE)

        # Two hidden ReLU layers of 24 units, linear output with one unit per action.
        network = Sequential([
            Dense(24, input_shape=(observation_space,), activation="relu"),
            Dense(24, activation="relu"),
            Dense(action_space, activation="linear"),
        ])
        network.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))
        self.model = network

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``exploration_rate`` a random action index is returned;
        otherwise the index of the largest Q-value predicted for ``state``.
        """
        explore = np.random.rand() < self.exploration_rate
        if not explore:
            predicted = self.model.predict(state)
            return np.argmax(predicted[0])
        return random.randrange(self.action_space)

    def experience_replay(self):
        """Fit the network on a random sample of stored transitions, then decay exploration.

        Nothing happens until the buffer holds at least BATCH_SIZE transitions.
        Each sampled transition is fitted individually: the target for the action
        taken is the reward alone when the transition is terminal, otherwise the
        reward plus GAMMA times the best Q-value predicted for the next state.
        """
        if len(self.memory) >= BATCH_SIZE:
            for state, action, reward, state_next, terminal in random.sample(self.memory, BATCH_SIZE):
                if terminal:
                    target = reward
                else:
                    target = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
                # Only the taken action's Q-value is overwritten; the rest keep the
                # network's own prediction.
                targets = self.model.predict(state)
                targets[0][action] = target
                self.model.fit(state, targets, verbose=0)
            decayed = self.exploration_rate * EXPLORATION_DECAY
            self.exploration_rate = max(EXPLORATION_MIN, decayed)
  
  
def cartpole():
    """Train a DQNSolver on ENV_NAME, printing and logging every episode's length.

    Each episode resets the environment and then repeats: choose an action,
    step the environment, store the transition, run one experience-replay
    update. When the environment reports the episode is done, the step count
    is printed and passed to ScoreLogger.add_score.

    The outer loop has no exit condition of its own, so the function only
    ends when something it calls raises (presumably ScoreLogger once the task
    counts as solved -- confirm in scores/score_logger.py). The environment
    is closed on the way out.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while True:
            run += 1
            # The model expects a batch dimension, hence the (1, observation_space) reshape.
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # Gym's TimeLimit wrapper also reports done=True when the step cap is
                # reached and flags that case in `info`. Hitting the cap is a success,
                # so it must not be penalised or stored as a terminal state. Gym
                # releases without the key fall back to the old behaviour.
                truncated = bool(info.get("TimeLimit.truncated", False))
                failed = terminal and not truncated
                reward = -reward if failed else reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, failed)
                state = state_next
                if terminal:
                    print("Run: {}, exploration: {}, score: {}".format(
                        run, dqn_solver.exploration_rate, step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment even when the loop is ended by an exception.
        env.close()



Using TensorFlow backend.


In [2]:
# Start training. cartpole() loops forever on its own, so this call only ends
# when something inside it raises or exits.
cartpole()

Run: 1, exploration: 1.0, score: 16
Scores: (min: 16, avg: 16, max: 16)

Run: 2, exploration: 0.8265651079747222, score: 42
Scores: (min: 16, avg: 29, max: 42)

Run: 3, exploration: 0.7901049725470279, score: 10
Scores: (min: 10, avg: 22.666666666666668, max: 42)

Run: 4, exploration: 0.7183288830986236, score: 20
Scores: (min: 10, avg: 22, max: 42)

Run: 5, exploration: 0.6763948591909945, score: 13
Scores: (min: 10, avg: 20.2, max: 42)

Run: 6, exploration: 0.5704072587541458, score: 35
Scores: (min: 10, avg: 22.666666666666668, max: 42)

Run: 7, exploration: 0.4858739637363176, score: 33
Scores: (min: 10, avg: 24.142857142857142, max: 42)

Run: 8, exploration: 0.46211964903917074, score: 11
Scores: (min: 10, avg: 22.5, max: 42)

Run: 9, exploration: 0.4417353564707963, score: 10
Scores: (min: 10, avg: 21.11111111111111, max: 42)

Run: 10, exploration: 0.4180382776616619, score: 12
Scores: (min: 10, avg: 20.2, max: 42)

Run: 11, exploration: 0.39166620452737816, score: 14
Scores: (mi

Run: 89, exploration: 0.01, score: 142
Scores: (min: 9, avg: 89.49438202247191, max: 272)

Run: 90, exploration: 0.01, score: 102
Scores: (min: 9, avg: 89.63333333333334, max: 272)

Run: 91, exploration: 0.01, score: 103
Scores: (min: 9, avg: 89.78021978021978, max: 272)

Run: 92, exploration: 0.01, score: 69
Scores: (min: 9, avg: 89.55434782608695, max: 272)

Run: 93, exploration: 0.01, score: 58
Scores: (min: 9, avg: 89.21505376344086, max: 272)

Run: 94, exploration: 0.01, score: 146
Scores: (min: 9, avg: 89.81914893617021, max: 272)

Run: 95, exploration: 0.01, score: 77
Scores: (min: 9, avg: 89.6842105263158, max: 272)

Run: 96, exploration: 0.01, score: 155
Scores: (min: 9, avg: 90.36458333333333, max: 272)

Run: 97, exploration: 0.01, score: 105
Scores: (min: 9, avg: 90.51546391752578, max: 272)

Run: 98, exploration: 0.01, score: 132
Scores: (min: 9, avg: 90.93877551020408, max: 272)

Run: 99, exploration: 0.01, score: 138
Scores: (min: 9, avg: 91.41414141414141, max: 272)

Run

Run: 189, exploration: 0.01, score: 220
Scores: (min: 9, avg: 126.69, max: 409)

Run: 190, exploration: 0.01, score: 236
Scores: (min: 9, avg: 128.03, max: 409)

Run: 191, exploration: 0.01, score: 103
Scores: (min: 9, avg: 128.03, max: 409)

Run: 192, exploration: 0.01, score: 187
Scores: (min: 9, avg: 129.21, max: 409)

Run: 193, exploration: 0.01, score: 159
Scores: (min: 9, avg: 130.22, max: 409)

Run: 194, exploration: 0.01, score: 39
Scores: (min: 9, avg: 129.15, max: 409)

Run: 195, exploration: 0.01, score: 53
Scores: (min: 9, avg: 128.91, max: 409)

Run: 196, exploration: 0.01, score: 23
Scores: (min: 9, avg: 127.59, max: 409)

Run: 197, exploration: 0.01, score: 253
Scores: (min: 9, avg: 129.07, max: 409)

Run: 198, exploration: 0.01, score: 500
Scores: (min: 9, avg: 132.75, max: 500)

Run: 199, exploration: 0.01, score: 212
Scores: (min: 9, avg: 133.49, max: 500)

Run: 200, exploration: 0.01, score: 500
Scores: (min: 9, avg: 137.21, max: 500)

Run: 201, exploration: 0.01, sc

Run: 291, exploration: 0.01, score: 134
Scores: (min: 23, avg: 188.42, max: 500)

Run: 292, exploration: 0.01, score: 156
Scores: (min: 23, avg: 188.11, max: 500)

Run: 293, exploration: 0.01, score: 249
Scores: (min: 23, avg: 189.01, max: 500)

Run: 294, exploration: 0.01, score: 110
Scores: (min: 23, avg: 189.72, max: 500)

Run: 295, exploration: 0.01, score: 88
Scores: (min: 23, avg: 190.07, max: 500)

Run: 296, exploration: 0.01, score: 399
Scores: (min: 29, avg: 193.83, max: 500)

Run: 297, exploration: 0.01, score: 174
Scores: (min: 29, avg: 193.04, max: 500)

Run: 298, exploration: 0.01, score: 145
Scores: (min: 29, avg: 189.49, max: 500)

Run: 299, exploration: 0.01, score: 123
Scores: (min: 29, avg: 188.6, max: 500)

Run: 300, exploration: 0.01, score: 165
Scores: (min: 29, avg: 185.25, max: 500)

Run: 301, exploration: 0.01, score: 316
Scores: (min: 29, avg: 186.96, max: 500)

Run: 302, exploration: 0.01, score: 143
Scores: (min: 29, avg: 183.39, max: 500)

Run: 303, explorat

Run: 392, exploration: 0.01, score: 224
Scores: (min: 11, avg: 173.94, max: 500)

Run: 393, exploration: 0.01, score: 95
Scores: (min: 11, avg: 172.4, max: 500)

Run: 394, exploration: 0.01, score: 500
Scores: (min: 11, avg: 176.3, max: 500)

Run: 395, exploration: 0.01, score: 138
Scores: (min: 11, avg: 176.8, max: 500)

Run: 396, exploration: 0.01, score: 88
Scores: (min: 11, avg: 173.69, max: 500)

Run: 397, exploration: 0.01, score: 147
Scores: (min: 11, avg: 173.42, max: 500)

Run: 398, exploration: 0.01, score: 169
Scores: (min: 11, avg: 173.66, max: 500)

Run: 399, exploration: 0.01, score: 206
Scores: (min: 11, avg: 174.49, max: 500)

Run: 400, exploration: 0.01, score: 500
Scores: (min: 11, avg: 177.84, max: 500)

Run: 401, exploration: 0.01, score: 110
Scores: (min: 11, avg: 175.78, max: 500)

Run: 402, exploration: 0.01, score: 339
Scores: (min: 11, avg: 177.74, max: 500)

Run: 403, exploration: 0.01, score: 139
Scores: (min: 11, avg: 175.88, max: 500)

Run: 404, exploration

NameError: name 'exit' is not defined

Note: If the code is running properly, you should begin to see output appearing above this code block. It will take several minutes, so it is recommended that you let this code run in the background while completing other work. When the code has finished, it will print output saying, "Solved in _ runs, _ total runs."

You may see an error about not having an exit command. This error does not affect the program's functionality; it is a side effect of the steps taken to convert the code from Python 2.x to Python 3. Please disregard this error.

In [13]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id, passed to gym.make() and to ScoreLogger.
ENV_NAME = "CartPole-v1"

# Experiment 1 - raise the discount factor; every other value matches the first cell.
GAMMA = 0.99  # discount factor on the next state's best Q-value; changed from 0.95 to 0.99
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001

# Maximum number of transitions kept in the replay deque.
MEMORY_SIZE = 1000000
# Transitions sampled per replay step; replay is skipped until the buffer holds this many.
BATCH_SIZE = 20

# Epsilon-greedy schedule: starting rate, floor, and the factor applied after each replay step.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Q-network agent with an experience-replay buffer.

    Keeps a Keras model that maps one observation to one Q-value per action,
    a bounded deque of past transitions, and the current exploration rate
    used for epsilon-greedy action selection.
    """

    def __init__(self, observation_space, action_space):
        """Create the replay buffer and compile the Q-network.

        Args:
            observation_space: Length of one observation vector (input width).
            action_space: Number of discrete actions (output width).
        """
        self.action_space = action_space
        self.exploration_rate = EXPLORATION_MAX
        self.memory = deque(maxlen=MEMORY_SIZE)

        # Two hidden ReLU layers of 24 units, linear output with one unit per action.
        network = Sequential([
            Dense(24, input_shape=(observation_space,), activation="relu"),
            Dense(24, activation="relu"),
            Dense(action_space, activation="linear"),
        ])
        network.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))
        self.model = network

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``exploration_rate`` a random action index is returned;
        otherwise the index of the largest Q-value predicted for ``state``.
        """
        explore = np.random.rand() < self.exploration_rate
        if not explore:
            predicted = self.model.predict(state)
            return np.argmax(predicted[0])
        return random.randrange(self.action_space)

    def experience_replay(self):
        """Fit the network on a random sample of stored transitions, then decay exploration.

        Nothing happens until the buffer holds at least BATCH_SIZE transitions.
        Each sampled transition is fitted individually: the target for the action
        taken is the reward alone when the transition is terminal, otherwise the
        reward plus GAMMA times the best Q-value predicted for the next state.
        """
        if len(self.memory) >= BATCH_SIZE:
            for state, action, reward, state_next, terminal in random.sample(self.memory, BATCH_SIZE):
                if terminal:
                    target = reward
                else:
                    target = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
                # Only the taken action's Q-value is overwritten; the rest keep the
                # network's own prediction.
                targets = self.model.predict(state)
                targets[0][action] = target
                self.model.fit(state, targets, verbose=0)
            decayed = self.exploration_rate * EXPLORATION_DECAY
            self.exploration_rate = max(EXPLORATION_MIN, decayed)
  
  
def cartpole():
    """Train a DQNSolver on ENV_NAME, printing and logging every episode's length.

    Each episode resets the environment and then repeats: choose an action,
    step the environment, store the transition, run one experience-replay
    update. When the environment reports the episode is done, the step count
    is printed and passed to ScoreLogger.add_score.

    The outer loop has no exit condition of its own, so the function only
    ends when something it calls raises (presumably ScoreLogger once the task
    counts as solved -- confirm in scores/score_logger.py). The environment
    is closed on the way out.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while True:
            run += 1
            # The model expects a batch dimension, hence the (1, observation_space) reshape.
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # Gym's TimeLimit wrapper also reports done=True when the step cap is
                # reached and flags that case in `info`. Hitting the cap is a success,
                # so it must not be penalised or stored as a terminal state. Gym
                # releases without the key fall back to the old behaviour.
                truncated = bool(info.get("TimeLimit.truncated", False))
                failed = terminal and not truncated
                reward = -reward if failed else reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, failed)
                state = state_next
                if terminal:
                    print("Run: {}, exploration: {}, score: {}".format(
                        run, dqn_solver.exploration_rate, step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment even when the loop is ended by an exception.
        env.close()

In [14]:
# Run Experiment 1. cartpole() loops forever on its own, so this call only ends
# when something inside it raises or exits.
cartpole()

Run: 1, exploration: 1.0, score: 9
Scores: (min: 9, avg: 9, max: 9)

Run: 2, exploration: 0.9655206468094844, score: 18
Scores: (min: 9, avg: 13.5, max: 18)

Run: 3, exploration: 0.9091562615825302, score: 13
Scores: (min: 9, avg: 13.333333333333334, max: 18)

Run: 4, exploration: 0.7822236754458713, score: 31
Scores: (min: 9, avg: 17.75, max: 31)

Run: 5, exploration: 0.736559652908221, score: 13
Scores: (min: 9, avg: 16.8, max: 31)

Run: 6, exploration: 0.6832098777212641, score: 16
Scores: (min: 9, avg: 16.666666666666668, max: 31)

Run: 7, exploration: 0.6401093727576664, score: 14
Scores: (min: 9, avg: 16.285714285714285, max: 31)

Run: 8, exploration: 0.6027415843082742, score: 13
Scores: (min: 9, avg: 15.875, max: 31)

Run: 9, exploration: 0.5590843898207511, score: 16
Scores: (min: 9, avg: 15.88888888888889, max: 31)

Run: 10, exploration: 0.5211953074858876, score: 15
Scores: (min: 9, avg: 15.8, max: 31)

Run: 11, exploration: 0.4932355662165453, score: 12
Scores: (min: 9, avg

Run: 83, exploration: 0.01724627885940145, score: 10
Scores: (min: 8, avg: 10.975903614457831, max: 31)

Run: 84, exploration: 0.0165683801277891, score: 9
Scores: (min: 8, avg: 10.952380952380953, max: 31)

Run: 85, exploration: 0.01575835418494799, score: 11
Scores: (min: 8, avg: 10.952941176470588, max: 31)

Run: 86, exploration: 0.015215016325303928, score: 8
Scores: (min: 8, avg: 10.918604651162791, max: 31)

Run: 87, exploration: 0.01461696034160619, score: 9
Scores: (min: 8, avg: 10.89655172413793, max: 31)

Run: 88, exploration: 0.013972200057807112, score: 10
Scores: (min: 8, avg: 10.886363636363637, max: 31)

Run: 89, exploration: 0.013422995398979608, score: 9
Scores: (min: 8, avg: 10.865168539325843, max: 31)

Run: 90, exploration: 0.01283090141222608, score: 10
Scores: (min: 8, avg: 10.855555555555556, max: 31)

Run: 91, exploration: 0.012264924940880204, score: 10
Scores: (min: 8, avg: 10.846153846153847, max: 31)

Run: 92, exploration: 0.010605575411209664, score: 30
Sco

Run: 184, exploration: 0.01, score: 37
Scores: (min: 8, avg: 28.95, max: 134)

Run: 185, exploration: 0.01, score: 41
Scores: (min: 8, avg: 29.25, max: 134)

Run: 186, exploration: 0.01, score: 36
Scores: (min: 9, avg: 29.53, max: 134)

Run: 187, exploration: 0.01, score: 36
Scores: (min: 9, avg: 29.8, max: 134)

Run: 188, exploration: 0.01, score: 35
Scores: (min: 9, avg: 30.05, max: 134)

Run: 189, exploration: 0.01, score: 52
Scores: (min: 10, avg: 30.48, max: 134)

Run: 190, exploration: 0.01, score: 27
Scores: (min: 10, avg: 30.65, max: 134)

Run: 191, exploration: 0.01, score: 45
Scores: (min: 10, avg: 31, max: 134)

Run: 192, exploration: 0.01, score: 28
Scores: (min: 10, avg: 30.98, max: 134)

Run: 193, exploration: 0.01, score: 35
Scores: (min: 10, avg: 31.16, max: 134)

Run: 194, exploration: 0.01, score: 43
Scores: (min: 10, avg: 31.2, max: 134)

Run: 195, exploration: 0.01, score: 51
Scores: (min: 10, avg: 30.93, max: 134)

Run: 196, exploration: 0.01, score: 30
Scores: (mi

Run: 286, exploration: 0.01, score: 492
Scores: (min: 27, avg: 180.86, max: 500)

Run: 287, exploration: 0.01, score: 455
Scores: (min: 27, avg: 185.05, max: 500)

Run: 288, exploration: 0.01, score: 142
Scores: (min: 27, avg: 186.12, max: 500)

Run: 289, exploration: 0.01, score: 251
Scores: (min: 27, avg: 188.11, max: 500)

Run: 290, exploration: 0.01, score: 171
Scores: (min: 28, avg: 189.55, max: 500)

Run: 291, exploration: 0.01, score: 203
Scores: (min: 28, avg: 191.13, max: 500)

Run: 292, exploration: 0.01, score: 179
Scores: (min: 30, avg: 192.64, max: 500)

Run: 293, exploration: 0.01, score: 148
Scores: (min: 30, avg: 193.77, max: 500)

Run: 294, exploration: 0.01, score: 409
Scores: (min: 30, avg: 197.43, max: 500)

Solved in 194 runs, 294 total runs.


NameError: name 'exit' is not defined

In [19]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id, passed to gym.make() and to ScoreLogger.
ENV_NAME = "CartPole-v1"

# Experiment 2 - keep the discount factor from Experiment 1 and lower the learning rate

GAMMA = 0.99  # discount factor on the next state's best Q-value; kept from Experiment 1
LEARNING_RATE = 0.0001  # learning rate handed to the Adam optimizer; changed from 0.001 to 0.0001

# Maximum number of transitions kept in the replay deque.
MEMORY_SIZE = 1000000
# Transitions sampled per replay step; replay is skipped until the buffer holds this many.
BATCH_SIZE = 20

# Epsilon-greedy schedule: starting rate, floor, and the factor applied after each replay step.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Q-network agent with an experience-replay buffer.

    Keeps a Keras model that maps one observation to one Q-value per action,
    a bounded deque of past transitions, and the current exploration rate
    used for epsilon-greedy action selection.
    """

    def __init__(self, observation_space, action_space):
        """Create the replay buffer and compile the Q-network.

        Args:
            observation_space: Length of one observation vector (input width).
            action_space: Number of discrete actions (output width).
        """
        self.action_space = action_space
        self.exploration_rate = EXPLORATION_MAX
        self.memory = deque(maxlen=MEMORY_SIZE)

        # Two hidden ReLU layers of 24 units, linear output with one unit per action.
        network = Sequential([
            Dense(24, input_shape=(observation_space,), activation="relu"),
            Dense(24, activation="relu"),
            Dense(action_space, activation="linear"),
        ])
        network.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))
        self.model = network

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``exploration_rate`` a random action index is returned;
        otherwise the index of the largest Q-value predicted for ``state``.
        """
        explore = np.random.rand() < self.exploration_rate
        if not explore:
            predicted = self.model.predict(state)
            return np.argmax(predicted[0])
        return random.randrange(self.action_space)

    def experience_replay(self):
        """Fit the network on a random sample of stored transitions, then decay exploration.

        Nothing happens until the buffer holds at least BATCH_SIZE transitions.
        Each sampled transition is fitted individually: the target for the action
        taken is the reward alone when the transition is terminal, otherwise the
        reward plus GAMMA times the best Q-value predicted for the next state.
        """
        if len(self.memory) >= BATCH_SIZE:
            for state, action, reward, state_next, terminal in random.sample(self.memory, BATCH_SIZE):
                if terminal:
                    target = reward
                else:
                    target = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
                # Only the taken action's Q-value is overwritten; the rest keep the
                # network's own prediction.
                targets = self.model.predict(state)
                targets[0][action] = target
                self.model.fit(state, targets, verbose=0)
            decayed = self.exploration_rate * EXPLORATION_DECAY
            self.exploration_rate = max(EXPLORATION_MIN, decayed)
  
  
def cartpole():
    """Train a DQNSolver on ENV_NAME, printing and logging every episode's length.

    Each episode resets the environment and then repeats: choose an action,
    step the environment, store the transition, run one experience-replay
    update. When the environment reports the episode is done, the step count
    is printed and passed to ScoreLogger.add_score.

    The outer loop has no exit condition of its own, so the function only
    ends when something it calls raises (presumably ScoreLogger once the task
    counts as solved -- confirm in scores/score_logger.py). The environment
    is closed on the way out.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while True:
            run += 1
            # The model expects a batch dimension, hence the (1, observation_space) reshape.
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # Gym's TimeLimit wrapper also reports done=True when the step cap is
                # reached and flags that case in `info`. Hitting the cap is a success,
                # so it must not be penalised or stored as a terminal state. Gym
                # releases without the key fall back to the old behaviour.
                truncated = bool(info.get("TimeLimit.truncated", False))
                failed = terminal and not truncated
                reward = -reward if failed else reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, failed)
                state = state_next
                if terminal:
                    print("Run: {}, exploration: {}, score: {}".format(
                        run, dqn_solver.exploration_rate, step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment even when the loop is ended by an exception.
        env.close()

In [20]:
# Run Experiment 2. cartpole() loops forever on its own, so this call only ends
# when something inside it raises or exits.
cartpole()

Run: 1, exploration: 0.9558895783575597, score: 29
Scores: (min: 29, avg: 29, max: 29)

Run: 2, exploration: 0.8647077305675338, score: 21
Scores: (min: 21, avg: 25, max: 29)

Run: 3, exploration: 0.8224322824348486, score: 11
Scores: (min: 11, avg: 20.333333333333332, max: 29)

Run: 4, exploration: 0.7744209942832988, score: 13
Scores: (min: 11, avg: 18.5, max: 29)

Run: 5, exploration: 0.6629680834613705, score: 32
Scores: (min: 11, avg: 21.2, max: 32)

Run: 6, exploration: 0.6149486215357263, score: 16
Scores: (min: 11, avg: 20.333333333333332, max: 32)

Run: 7, exploration: 0.567555222460375, score: 17
Scores: (min: 11, avg: 19.857142857142858, max: 32)

Run: 8, exploration: 0.5344229416520513, score: 13
Scores: (min: 11, avg: 19, max: 32)

Run: 9, exploration: 0.483444593917636, score: 21
Scores: (min: 11, avg: 19.22222222222222, max: 32)

Run: 10, exploration: 0.46444185833082485, score: 9
Scores: (min: 9, avg: 18.2, max: 32)

Run: 11, exploration: 0.4417353564707963, score: 11
S

Run: 84, exploration: 0.01, score: 27
Scores: (min: 8, avg: 16.571428571428573, max: 46)

Run: 85, exploration: 0.01, score: 27
Scores: (min: 8, avg: 16.694117647058825, max: 46)

Run: 86, exploration: 0.01, score: 60
Scores: (min: 8, avg: 17.197674418604652, max: 60)

Run: 87, exploration: 0.01, score: 30
Scores: (min: 8, avg: 17.344827586206897, max: 60)

Run: 88, exploration: 0.01, score: 26
Scores: (min: 8, avg: 17.443181818181817, max: 60)

Run: 89, exploration: 0.01, score: 25
Scores: (min: 8, avg: 17.528089887640448, max: 60)

Run: 90, exploration: 0.01, score: 34
Scores: (min: 8, avg: 17.711111111111112, max: 60)

Run: 91, exploration: 0.01, score: 18
Scores: (min: 8, avg: 17.714285714285715, max: 60)

Run: 92, exploration: 0.01, score: 20
Scores: (min: 8, avg: 17.73913043478261, max: 60)

Run: 93, exploration: 0.01, score: 17
Scores: (min: 8, avg: 17.731182795698924, max: 60)

Run: 94, exploration: 0.01, score: 24
Scores: (min: 8, avg: 17.79787234042553, max: 60)

Run: 95, exp

Run: 186, exploration: 0.01, score: 26
Scores: (min: 16, avg: 52.7, max: 189)

Run: 187, exploration: 0.01, score: 45
Scores: (min: 16, avg: 52.85, max: 189)

Run: 188, exploration: 0.01, score: 58
Scores: (min: 16, avg: 53.17, max: 189)

Run: 189, exploration: 0.01, score: 42
Scores: (min: 16, avg: 53.34, max: 189)

Run: 190, exploration: 0.01, score: 44
Scores: (min: 16, avg: 53.44, max: 189)

Run: 191, exploration: 0.01, score: 33
Scores: (min: 16, avg: 53.59, max: 189)

Run: 192, exploration: 0.01, score: 43
Scores: (min: 16, avg: 53.82, max: 189)

Run: 193, exploration: 0.01, score: 68
Scores: (min: 16, avg: 54.33, max: 189)

Run: 194, exploration: 0.01, score: 36
Scores: (min: 16, avg: 54.45, max: 189)

Run: 195, exploration: 0.01, score: 50
Scores: (min: 16, avg: 54.7, max: 189)

Run: 196, exploration: 0.01, score: 43
Scores: (min: 16, avg: 54.74, max: 189)

Run: 197, exploration: 0.01, score: 38
Scores: (min: 16, avg: 54.53, max: 189)

Run: 198, exploration: 0.01, score: 41
Sco

NameError: name 'exit' is not defined

In [23]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id, passed to gym.make() and to ScoreLogger.
ENV_NAME = "CartPole-v1"

# Experiment 3 - keep the discount factor and learning rate from Experiment 2 and modify the exploration schedule

GAMMA = 0.99  # discount factor on the next state's best Q-value; kept from Experiment 2
LEARNING_RATE = 0.0001  # learning rate handed to the Adam optimizer; kept from Experiment 2

# Maximum number of transitions kept in the replay deque.
MEMORY_SIZE = 1000000
# Transitions sampled per replay step; replay is skipped until the buffer holds this many.
BATCH_SIZE = 20

# Epsilon-greedy schedule; all three values differ from the earlier cells.
EXPLORATION_MAX = 0.75        # starting exploration rate; changed from 1.0 to 0.75
EXPLORATION_MIN = 0.2         # floor for the exploration rate; changed from 0.01 to 0.2
EXPLORATION_DECAY = 0.75      # factor applied after each replay step; changed from 0.995 to 0.75
  
  
class DQNSolver:
    """Q-network agent with an experience-replay buffer.

    Keeps a Keras model that maps one observation to one Q-value per action,
    a bounded deque of past transitions, and the current exploration rate
    used for epsilon-greedy action selection.
    """

    def __init__(self, observation_space, action_space):
        """Create the replay buffer and compile the Q-network.

        Args:
            observation_space: Length of one observation vector (input width).
            action_space: Number of discrete actions (output width).
        """
        self.action_space = action_space
        self.exploration_rate = EXPLORATION_MAX
        self.memory = deque(maxlen=MEMORY_SIZE)

        # Two hidden ReLU layers of 24 units, linear output with one unit per action.
        network = Sequential([
            Dense(24, input_shape=(observation_space,), activation="relu"),
            Dense(24, activation="relu"),
            Dense(action_space, activation="linear"),
        ])
        network.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))
        self.model = network

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``exploration_rate`` a random action index is returned;
        otherwise the index of the largest Q-value predicted for ``state``.
        """
        explore = np.random.rand() < self.exploration_rate
        if not explore:
            predicted = self.model.predict(state)
            return np.argmax(predicted[0])
        return random.randrange(self.action_space)

    def experience_replay(self):
        """Fit the network on a random sample of stored transitions, then decay exploration.

        Nothing happens until the buffer holds at least BATCH_SIZE transitions.
        Each sampled transition is fitted individually: the target for the action
        taken is the reward alone when the transition is terminal, otherwise the
        reward plus GAMMA times the best Q-value predicted for the next state.
        """
        if len(self.memory) >= BATCH_SIZE:
            for state, action, reward, state_next, terminal in random.sample(self.memory, BATCH_SIZE):
                if terminal:
                    target = reward
                else:
                    target = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
                # Only the taken action's Q-value is overwritten; the rest keep the
                # network's own prediction.
                targets = self.model.predict(state)
                targets[0][action] = target
                self.model.fit(state, targets, verbose=0)
            decayed = self.exploration_rate * EXPLORATION_DECAY
            self.exploration_rate = max(EXPLORATION_MIN, decayed)
  
  
def cartpole():
    """Train a DQNSolver on ENV_NAME, printing and logging every episode's length.

    Each episode resets the environment and then repeats: choose an action,
    step the environment, store the transition, run one experience-replay
    update. When the environment reports the episode is done, the step count
    is printed and passed to ScoreLogger.add_score.

    The outer loop has no exit condition of its own, so the function only
    ends when something it calls raises (presumably ScoreLogger once the task
    counts as solved -- confirm in scores/score_logger.py). The environment
    is closed on the way out.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while True:
            run += 1
            # The model expects a batch dimension, hence the (1, observation_space) reshape.
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # Gym's TimeLimit wrapper also reports done=True when the step cap is
                # reached and flags that case in `info`. Hitting the cap is a success,
                # so it must not be penalised or stored as a terminal state. Gym
                # releases without the key fall back to the old behaviour.
                truncated = bool(info.get("TimeLimit.truncated", False))
                failed = terminal and not truncated
                reward = -reward if failed else reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, failed)
                state = state_next
                if terminal:
                    print("Run: {}, exploration: {}, score: {}".format(
                        run, dqn_solver.exploration_rate, step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment even when the loop is ended by an exception.
        env.close()

In [24]:
# Run Experiment 3. cartpole() loops forever on its own, so this call only ends
# when something inside it raises or exits.
cartpole()

Run: 1, exploration: 0.75, score: 12
Scores: (min: 12, avg: 12, max: 12)

Run: 2, exploration: 0.31640625, score: 11
Scores: (min: 11, avg: 11.5, max: 12)

Run: 3, exploration: 0.2, score: 9
Scores: (min: 9, avg: 10.666666666666666, max: 12)

Run: 4, exploration: 0.2, score: 11
Scores: (min: 9, avg: 10.75, max: 12)

Run: 5, exploration: 0.2, score: 9
Scores: (min: 9, avg: 10.4, max: 12)

Run: 6, exploration: 0.2, score: 12
Scores: (min: 9, avg: 10.666666666666666, max: 12)

Run: 7, exploration: 0.2, score: 11
Scores: (min: 9, avg: 10.714285714285714, max: 12)

Run: 8, exploration: 0.2, score: 9
Scores: (min: 9, avg: 10.5, max: 12)

Run: 9, exploration: 0.2, score: 9
Scores: (min: 9, avg: 10.333333333333334, max: 12)

Run: 10, exploration: 0.2, score: 11
Scores: (min: 9, avg: 10.4, max: 12)

Run: 11, exploration: 0.2, score: 9
Scores: (min: 9, avg: 10.272727272727273, max: 12)

Run: 12, exploration: 0.2, score: 9
Scores: (min: 9, avg: 10.166666666666666, max: 12)

Run: 13, exploration: 

Run: 96, exploration: 0.2, score: 36
Scores: (min: 8, avg: 12.8125, max: 36)

Run: 97, exploration: 0.2, score: 20
Scores: (min: 8, avg: 12.88659793814433, max: 36)

Run: 98, exploration: 0.2, score: 20
Scores: (min: 8, avg: 12.959183673469388, max: 36)

Run: 99, exploration: 0.2, score: 31
Scores: (min: 8, avg: 13.141414141414142, max: 36)

Run: 100, exploration: 0.2, score: 34
Scores: (min: 8, avg: 13.35, max: 36)

Run: 101, exploration: 0.2, score: 31
Scores: (min: 8, avg: 13.54, max: 36)

Run: 102, exploration: 0.2, score: 45
Scores: (min: 8, avg: 13.88, max: 45)

Run: 103, exploration: 0.2, score: 38
Scores: (min: 8, avg: 14.17, max: 45)

Run: 104, exploration: 0.2, score: 18
Scores: (min: 8, avg: 14.24, max: 45)

Run: 105, exploration: 0.2, score: 35
Scores: (min: 8, avg: 14.5, max: 45)

Run: 106, exploration: 0.2, score: 14
Scores: (min: 8, avg: 14.52, max: 45)

Run: 107, exploration: 0.2, score: 14
Scores: (min: 8, avg: 14.55, max: 45)

Run: 108, exploration: 0.2, score: 18
Sco

Run: 201, exploration: 0.2, score: 149
Scores: (min: 14, avg: 66.33, max: 214)

Run: 202, exploration: 0.2, score: 221
Scores: (min: 14, avg: 68.09, max: 221)

Run: 203, exploration: 0.2, score: 151
Scores: (min: 14, avg: 69.22, max: 221)

Run: 204, exploration: 0.2, score: 183
Scores: (min: 14, avg: 70.87, max: 221)

Run: 205, exploration: 0.2, score: 153
Scores: (min: 14, avg: 72.05, max: 221)

Run: 206, exploration: 0.2, score: 213
Scores: (min: 14, avg: 74.04, max: 221)

Run: 207, exploration: 0.2, score: 500
Scores: (min: 15, avg: 78.9, max: 500)

Run: 208, exploration: 0.2, score: 324
Scores: (min: 15, avg: 81.96, max: 500)

Run: 209, exploration: 0.2, score: 204
Scores: (min: 15, avg: 83.76, max: 500)

Run: 210, exploration: 0.2, score: 198
Scores: (min: 15, avg: 85.57, max: 500)

Run: 211, exploration: 0.2, score: 221
Scores: (min: 15, avg: 87.35, max: 500)

Run: 212, exploration: 0.2, score: 275
Scores: (min: 18, avg: 89.95, max: 500)

Run: 213, exploration: 0.2, score: 205
Sc

NameError: name 'exit' is not defined

In [5]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id, passed to gym.make() and to ScoreLogger.
ENV_NAME = "CartPole-v1"

# Experiment 4 - back to the values of the first cell, changing only the learning rate to see its impact

# Discount factor: scales the next state's best predicted Q-value in the replay target.
GAMMA = 0.95
LEARNING_RATE = 0.0008  # learning rate handed to the Adam optimizer; changed from 0.001 to 0.0008

# Maximum number of transitions kept in the replay deque.
MEMORY_SIZE = 1000000
# Transitions sampled per replay step; replay is skipped until the buffer holds this many.
BATCH_SIZE = 20

# Epsilon-greedy schedule: starting rate, floor, and the factor applied after each replay step.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Q-network agent with an experience-replay buffer.

    Keeps a Keras model that maps one observation to one Q-value per action,
    a bounded deque of past transitions, and the current exploration rate
    used for epsilon-greedy action selection.
    """

    def __init__(self, observation_space, action_space):
        """Create the replay buffer and compile the Q-network.

        Args:
            observation_space: Length of one observation vector (input width).
            action_space: Number of discrete actions (output width).
        """
        self.action_space = action_space
        self.exploration_rate = EXPLORATION_MAX
        self.memory = deque(maxlen=MEMORY_SIZE)

        # Two hidden ReLU layers of 24 units, linear output with one unit per action.
        network = Sequential([
            Dense(24, input_shape=(observation_space,), activation="relu"),
            Dense(24, activation="relu"),
            Dense(action_space, activation="linear"),
        ])
        network.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))
        self.model = network

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``exploration_rate`` a random action index is returned;
        otherwise the index of the largest Q-value predicted for ``state``.
        """
        explore = np.random.rand() < self.exploration_rate
        if not explore:
            predicted = self.model.predict(state)
            return np.argmax(predicted[0])
        return random.randrange(self.action_space)

    def experience_replay(self):
        """Fit the network on a random sample of stored transitions, then decay exploration.

        Nothing happens until the buffer holds at least BATCH_SIZE transitions.
        Each sampled transition is fitted individually: the target for the action
        taken is the reward alone when the transition is terminal, otherwise the
        reward plus GAMMA times the best Q-value predicted for the next state.
        """
        if len(self.memory) >= BATCH_SIZE:
            for state, action, reward, state_next, terminal in random.sample(self.memory, BATCH_SIZE):
                if terminal:
                    target = reward
                else:
                    target = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
                # Only the taken action's Q-value is overwritten; the rest keep the
                # network's own prediction.
                targets = self.model.predict(state)
                targets[0][action] = target
                self.model.fit(state, targets, verbose=0)
            decayed = self.exploration_rate * EXPLORATION_DECAY
            self.exploration_rate = max(EXPLORATION_MIN, decayed)
  
  
def cartpole(max_runs=None):
    """Train a DQNSolver on the ``ENV_NAME`` gym environment.

    Each run is one episode: the agent acts, every transition is stored, and
    experience replay is performed after every non-terminal step. When an
    episode ends its step count is printed and passed to the ScoreLogger.

    Args:
        max_runs: Optional cap on the number of episodes. With the default
            ``None`` the loop has no exit condition of its own and only
            stops when something raises (presumably ScoreLogger ends the
            process once its solve criterion is met - confirm in
            scores/score_logger.py) or the caller interrupts it.

    Note:
        Assumes the classic gym API visible in the calls below: ``reset()``
        returns just the observation and ``step()`` returns a 4-tuple.
    """
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    try:
        while max_runs is None or run < max_runs:
            run += 1
            state = env.reset()
            # The model is fed one sample at a time, shaped (1, n_features).
            state = np.reshape(state, [1, observation_space])
            step = 0
            while True:
                step += 1
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # Flip the sign of the reward on the step that ends the episode.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment however the loop ends (cap reached,
        # exception, or interrupt).
        env.close()

In [6]:
# Start training. Called with no arguments, cartpole() keeps running episodes
# until something raises or the kernel is interrupted.
cartpole()

Run: 1, exploration: 0.9046104802746175, score: 40
Scores: (min: 40, avg: 40, max: 40)

Run: 2, exploration: 0.7861544476842928, score: 29
Scores: (min: 29, avg: 34.5, max: 40)

Run: 3, exploration: 0.7328768546436799, score: 15
Scores: (min: 15, avg: 28, max: 40)

Run: 4, exploration: 0.6596532430440636, score: 22
Scores: (min: 15, avg: 26.5, max: 40)

Run: 5, exploration: 0.6274028820538087, score: 11
Scores: (min: 11, avg: 23.4, max: 40)

Run: 6, exploration: 0.6027415843082742, score: 9
Scores: (min: 9, avg: 21, max: 40)

Run: 7, exploration: 0.5290920728090721, score: 27
Scores: (min: 9, avg: 21.857142857142858, max: 40)

Run: 8, exploration: 0.4907693883854626, score: 16
Scores: (min: 9, avg: 21.125, max: 40)

Run: 9, exploration: 0.47147873742168567, score: 9
Scores: (min: 9, avg: 19.77777777777778, max: 40)

Run: 10, exploration: 0.4529463432347434, score: 9
Scores: (min: 9, avg: 18.7, max: 40)

Run: 11, exploration: 0.42013897252428334, score: 16
Scores: (min: 9, avg: 18.45454

Run: 87, exploration: 0.01, score: 147
Scores: (min: 8, avg: 37.18390804597701, max: 147)

Run: 88, exploration: 0.01, score: 127
Scores: (min: 8, avg: 38.20454545454545, max: 147)

Run: 89, exploration: 0.01, score: 76
Scores: (min: 8, avg: 38.62921348314607, max: 147)

Run: 90, exploration: 0.01, score: 108
Scores: (min: 8, avg: 39.4, max: 147)

Run: 91, exploration: 0.01, score: 212
Scores: (min: 8, avg: 41.2967032967033, max: 212)

Run: 92, exploration: 0.01, score: 124
Scores: (min: 8, avg: 42.19565217391305, max: 212)

Run: 93, exploration: 0.01, score: 132
Scores: (min: 8, avg: 43.16129032258065, max: 212)

Run: 94, exploration: 0.01, score: 73
Scores: (min: 8, avg: 43.47872340425532, max: 212)

Run: 95, exploration: 0.01, score: 97
Scores: (min: 8, avg: 44.04210526315789, max: 212)

Run: 96, exploration: 0.01, score: 49
Scores: (min: 8, avg: 44.09375, max: 212)

Run: 97, exploration: 0.01, score: 74
Scores: (min: 8, avg: 44.402061855670105, max: 212)

Run: 98, exploration: 0.01

NameError: name 'exit' is not defined

The following is an analysis of how reinforcement learning concepts apply to the cartpole problem, how experience replay is applied to the cartpole problem, and how neural networks are used in deep Q-learning.
<br><br>**Explain how reinforcement learning concepts apply to the cartpole problem.**
<br><br>•	**What is the goal of the agent in this case?**
<br><br>The goal of the agent in this case is to keep the cartpole balanced and prevent it from falling over, by applying the appropriate forces to the cartpole’s pivot point (Surma, 2018). 
<br><br>•	**What are the various state values?**
<br><br>According to Surma (2018), the observation space consists of the possible state values, which include the cart position, cart velocity, pole angle, and pole velocity at the tip. The ranges for each are as follows: 
<br><br>Cart Position: (-4.8, 4.8)
<br>Cart Velocity: (-Infinity, Infinity)
<br>Pole Angle: (-24 degrees, 24 degrees)
<br>Pole Velocity At Tip: (-Infinity, Infinity)
<br><br>•	**What are the possible actions that can be performed?**
<br><br>There are only two possible actions that can occur to keep the cartpole balanced and that is to either push the cart to the right or to the left (Surma, 2018).
<br><br>•	**What reinforcement algorithm is used for this problem?**
<br><br>The reinforcement algorithm that is used for this problem is the Deep Q-Learning (DQN) algorithm, which is a technique where given a circumstance or observation, the goal is to choose the best action (Surma, 2018).
<br><br>**Analyze how experience replay is applied to the cartpole problem.**
<br><br>•	**How does experience replay work in this algorithm?**
<br><br>First, in Deep Q-learning, every possible action for observations has its own Q value, which is the quality of any given move (Surma, 2018). Inspired biologically, experience replay samples past experiences (from memory) and updates each entry’s Q value, which reduces the correlation between subsequent actions (Surma, 2018). Essentially, because of experience replay, instead of getting rid of past experiences, the experiences are stored in memory, which are then reused for training.
<br><br>•	**What is the effect of introducing a discount factor for calculating the future rewards?**
<br><br>A discount factor is introduced because it determines the importance of future rewards (Venkatachalam, 2019). If this value is set to zero, only immediate rewards are considered and future rewards are ignored. Increasing this value closer to one helps to balance short-term gains against future or long-term benefits; however, setting it too close to one can prevent convergence (Baeldung, 2023).
<br><br>**Analyze how neural networks are used in deep Q-learning.**
<br><br>•	**Explain the neural network architecture that is used in the cartpole problem.**
<br><br>In the cartpole problem, a Deep Q-learning algorithm uses a deep neural network to approximate values by feeding the initial state into the network, which then, based on the Q-table, calculates all possible actions, which are then generated as the output (Singh, 2024). Past experiences are stored in memory, and the future action is dependent on the maximum value output of the Q-network (Singh, 2024).
<br><br>•	**How does the neural network make the Q-Learning algorithm more efficient?**
<br><br>In the Q-learning algorithm, using the Q-value iteration, the Q-table is computed containing all Q-values of any state-action pair (Amine, 2020). Because every state-action pair is stored, this could work well for finite state and action spaces, but it would not work well for significantly larger spaces (Amine, 2020). This is where implementing a deep neural network enables us to make the Q-learning algorithm more efficient, because neural networks are efficient at approximating functions (Amine, 2020). Using the deep neural network to approximate the Q-value function allows us to deal with continuous spaces without problem (Amine, 2020).
<br><br>•	**What differences do you see in the algorithm performance when you increase or decrease the learning rate?** 
<br><br>In my last experiment, experiment 4, I chose to only modify the learning rate by decreasing it from 0.001 to 0.0008. In the first code block where no changes were made to the code, it solved the cartpole problem in 340 runs with a total of 440 runs. In experiment 4 where I decreased the learning rate by a small amount, it was able to solve the cartpole problem in 54 runs with a total of 154 runs. However, any time I tried making a significant change in the learning rate such as decreasing it to 0.0001 or increasing it to 0.01 or even 0.005, I observed drops in efficiency. The model would try to run for hundreds of epochs without solving the problem. 
<br><br>The learning rate is used in neural networks to determine how quickly a model is adapted to a problem and is often represented as a small value typically in the range of 0.0 to 1.0 (Brownlee, 2020). Decreasing the learning rate requires more training epochs since smaller changes are being made to the weights after each update, and increasing the learning rate results in more rapid changes to the weights, requiring a smaller number of training epochs (Brownlee, 2020). Having a learning rate that is too high can cause the network to converge too fast, which would lead to a solution that is suboptimal, and having a learning rate that is too small could cause the process to get stuck (Brownlee, 2020).

References:
<br><br>Amine, A. (2020, December 19). Deep Q-Networks: from theory to implementation. Medium. https://towardsdatascience.com/deep-q-networks-theory-and-implementation-37543f60dd67
<br><br>Baeldung. (2023, March 24). Epsilon-Greedy Q-learning. https://www.baeldung.com/cs/epsilon-greedy-q-learning
<br><br>Brownlee, J. (2020, September 12). Understand the Impact of Learning Rate on Neural Network Performance. Machine Learning Mastery. https://machinelearningmastery.com/understand-the-dynamics-of-learning-rate-on-deep-learning-neural-networks/
<br><br>Singh, S. (2024). How Are Neural Networks Used in Deep Q-Learning? Turing. https://www.turing.com/kb/how-are-neural-networks-used-in-deep-q-learning
<br><br>Surma, G. (2018, September 26). Cartpole - Introduction to Reinforcement Learning (DQN - Deep Q-Learning). Medium. https://gsurma.medium.com/cartpole-introduction-to-reinforcement-learning-ed0eb5b58288
<br><br>Venkatachalam, M. (2019, October 2). Q-Learning – An introduction through a simple table based implementation with learning rate, discount factor and exploration. gotensor. https://gotensor.com/2019/10/02/q-learning-an-introduction-through-a-simple-table-based-implementation-with-learning-rate-discount-factor-and-exploration/
