# Module Five Assignment: Cartpole Problem

In [1]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"

# Discount applied to the best predicted next-state Q-value in experience_replay().
GAMMA = 0.95
# Step size handed to the Adam optimizer when the model is compiled.
LEARNING_RATE = 0.001

# maxlen of the replay deque; once full, the oldest transitions are discarded.
MEMORY_SIZE = 1000000
# Number of transitions sampled per experience_replay() call; replay is
# skipped until at least this many transitions are stored.
BATCH_SIZE = 20

# Initial exploration rate (chance that act() picks a random action).
EXPLORATION_MAX = 1.0
# Lower bound the exploration rate is clamped to.
EXPLORATION_MIN = 0.01
# Factor the exploration rate is multiplied by on each replay that trains.
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Epsilon-greedy deep Q-learning agent backed by a small dense network.

    The agent keeps a bounded replay memory of transitions and a Keras model
    that maps a state to one predicted Q-value per action.
    """

    def __init__(self, observation_space, action_space):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: Number of values in one state vector.
            action_space: Number of discrete actions available.
        """
        self.exploration_rate = EXPLORATION_MAX
        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        # Two hidden ReLU layers of 24 units, linear output per action.
        self.model = Sequential([
            Dense(24, input_shape=(observation_space,), activation="relu"),
            Dense(24, activation="relu"),
            Dense(self.action_space, activation="linear"),
        ])
        self.model.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def act(self, state):
        """Pick an action: random with probability exploration_rate, else greedy."""
        explore = np.random.rand() < self.exploration_rate
        if explore:
            return random.randrange(self.action_space)
        predicted = self.model.predict(state)[0]
        return np.argmax(predicted)

    def experience_replay(self):
        """Train on a random batch of stored transitions, then decay exploration.

        Does nothing until the memory holds at least BATCH_SIZE transitions.
        Each sampled transition is fitted individually: the predicted Q-value
        of the taken action is replaced by the reward (terminal) or by
        reward + GAMMA * max next-state Q (non-terminal).
        """
        if len(self.memory) >= BATCH_SIZE:
            for state, action, reward, state_next, terminal in random.sample(self.memory, BATCH_SIZE):
                if terminal:
                    target = reward
                else:
                    best_next = np.amax(self.model.predict(state_next)[0])
                    target = reward + GAMMA * best_next
                targets = self.model.predict(state)
                targets[0][action] = target
                self.model.fit(state, targets, verbose=0)
            # Decay once per trained batch, never dropping below the floor.
            decayed = self.exploration_rate * EXPLORATION_DECAY
            self.exploration_rate = max(EXPLORATION_MIN, decayed)
  
  
def cartpole(max_runs=None):
    """Train a ``DQNSolver`` on the ``ENV_NAME`` gym environment.

    Each episode ("run") resets the environment and steps it with the
    solver's chosen actions until the environment reports ``terminal``.
    Every transition is stored in the solver's memory, and after every
    non-terminal step ``experience_replay`` is invoked. When an episode ends,
    its step count is printed and passed to ``ScoreLogger.add_score``.

    Args:
        max_runs: Optional upper bound on the number of episodes. With the
            default ``None`` the episode loop is unbounded and only ends
            when something inside it raises.
            NOTE(review): presumably ``ScoreLogger.add_score`` is what ends
            an unbounded run once it considers the task solved -- confirm
            against the score_logger module.

    The environment is closed on the way out, including when the loop is
    ended by an exception.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while max_runs is None or run < max_runs:
            run += 1
            # The model expects a batch axis, so states are kept as (1, n).
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                #env.render()
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # The reward of the step that ends the episode is negated.
                # NOTE(review): this also applies if the environment ends an
                # episode for reaching a step limit -- confirm that is intended.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print ("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment's resources however the loop ended.
        env.close()



Using TensorFlow backend.


In [2]:
cartpole()

Run: 1, exploration: 1.0, score: 15
Scores: (min: 15, avg: 15, max: 15)

Run: 2, exploration: 0.8778091417340573, score: 31
Scores: (min: 15, avg: 23, max: 31)

Run: 3, exploration: 0.8061065909263957, score: 18
Scores: (min: 15, avg: 21.333333333333332, max: 31)

Run: 4, exploration: 0.736559652908221, score: 19
Scores: (min: 15, avg: 20.75, max: 31)

Run: 5, exploration: 0.6900935609921609, score: 14
Scores: (min: 14, avg: 19.4, max: 31)

Run: 6, exploration: 0.6242658676435396, score: 21
Scores: (min: 14, avg: 19.666666666666668, max: 31)

Run: 7, exploration: 0.5732736268885887, score: 18
Scores: (min: 14, avg: 19.428571428571427, max: 31)

Run: 8, exploration: 0.547986285490042, score: 10
Scores: (min: 10, avg: 18.25, max: 31)

Run: 9, exploration: 0.5032248303978422, score: 18
Scores: (min: 10, avg: 18.22222222222222, max: 31)

Run: 10, exploration: 0.46912134373457726, score: 15
Scores: (min: 10, avg: 17.9, max: 31)

Run: 11, exploration: 0.446186062443672, score: 11
Scores: (mi

NameError: name 'exit' is not defined

Note: If the code is running properly, you should begin to see output appearing above this code block. It will take several minutes, so it is recommended that you let this code run in the background while completing other work. When the code has finished, it will print output saying, "Solved in _ runs, _ total runs."

You may see an error about not having an exit command. This error does not affect the program's functionality and results from the steps taken to convert the code from Python 2.x to Python 3. Please disregard this error.

In [1]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"

# Discount applied to the best predicted next-state Q-value in experience_replay().
GAMMA = 0.95
# Step size handed to the Adam optimizer when the model is compiled.
LEARNING_RATE = 0.001

# maxlen of the replay deque; once full, the oldest transitions are discarded.
MEMORY_SIZE = 1000000
# Number of transitions sampled per experience_replay() call; replay is
# skipped until at least this many transitions are stored.
BATCH_SIZE = 20

# Initial exploration rate (chance that act() picks a random action).
# This experiment starts at 0.5 instead of 1.0.
EXPLORATION_MAX = 0.5
# Lower bound the exploration rate is clamped to.
EXPLORATION_MIN = 0.01
# Factor the exploration rate is multiplied by on each replay that trains.
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Epsilon-greedy deep Q-learning agent backed by a small dense network.

    The agent keeps a bounded replay memory of transitions and a Keras model
    that maps a state to one predicted Q-value per action.
    """

    def __init__(self, observation_space, action_space):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: Number of values in one state vector.
            action_space: Number of discrete actions available.
        """
        self.exploration_rate = EXPLORATION_MAX
        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        # Two hidden ReLU layers of 24 units, linear output per action.
        self.model = Sequential([
            Dense(24, input_shape=(observation_space,), activation="relu"),
            Dense(24, activation="relu"),
            Dense(self.action_space, activation="linear"),
        ])
        self.model.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def act(self, state):
        """Pick an action: random with probability exploration_rate, else greedy."""
        explore = np.random.rand() < self.exploration_rate
        if explore:
            return random.randrange(self.action_space)
        predicted = self.model.predict(state)[0]
        return np.argmax(predicted)

    def experience_replay(self):
        """Train on a random batch of stored transitions, then decay exploration.

        Does nothing until the memory holds at least BATCH_SIZE transitions.
        Each sampled transition is fitted individually: the predicted Q-value
        of the taken action is replaced by the reward (terminal) or by
        reward + GAMMA * max next-state Q (non-terminal).
        """
        if len(self.memory) >= BATCH_SIZE:
            for state, action, reward, state_next, terminal in random.sample(self.memory, BATCH_SIZE):
                if terminal:
                    target = reward
                else:
                    best_next = np.amax(self.model.predict(state_next)[0])
                    target = reward + GAMMA * best_next
                targets = self.model.predict(state)
                targets[0][action] = target
                self.model.fit(state, targets, verbose=0)
            # Decay once per trained batch, never dropping below the floor.
            decayed = self.exploration_rate * EXPLORATION_DECAY
            self.exploration_rate = max(EXPLORATION_MIN, decayed)
  
  
def cartpole(max_runs=None):
    """Train a ``DQNSolver`` on the ``ENV_NAME`` gym environment.

    Each episode ("run") resets the environment and steps it with the
    solver's chosen actions until the environment reports ``terminal``.
    Every transition is stored in the solver's memory, and after every
    non-terminal step ``experience_replay`` is invoked. When an episode ends,
    its step count is printed and passed to ``ScoreLogger.add_score``.

    Args:
        max_runs: Optional upper bound on the number of episodes. With the
            default ``None`` the episode loop is unbounded and only ends
            when something inside it raises.
            NOTE(review): presumably ``ScoreLogger.add_score`` is what ends
            an unbounded run once it considers the task solved -- confirm
            against the score_logger module.

    The environment is closed on the way out, including when the loop is
    ended by an exception.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while max_runs is None or run < max_runs:
            run += 1
            # The model expects a batch axis, so states are kept as (1, n).
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                #env.render()
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # The reward of the step that ends the episode is negated.
                # NOTE(review): this also applies if the environment ends an
                # episode for reaching a step limit -- confirm that is intended.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print ("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment's resources however the loop ended.
        env.close()



Using TensorFlow backend.


In [2]:
cartpole()

Run: 1, exploration: 0.5, score: 14
Scores: (min: 14, avg: 14, max: 14)

Run: 2, exploration: 0.47081140345718786, score: 18
Scores: (min: 14, avg: 16, max: 18)

Run: 3, exploration: 0.4523052401373088, score: 9
Scores: (min: 9, avg: 13.666666666666666, max: 18)

Run: 4, exploration: 0.389156278534321, score: 31
Scores: (min: 9, avg: 18, max: 31)

Run: 5, exploration: 0.2753699927085638, score: 70
Scores: (min: 9, avg: 28.4, max: 70)

Run: 6, exploration: 0.1807404651835882, score: 85
Scores: (min: 9, avg: 37.833333333333336, max: 85)

Run: 7, exploration: 0.14352154802212663, score: 47
Scores: (min: 9, avg: 39.142857142857146, max: 85)

Run: 8, exploration: 0.1192260340076466, score: 38
Scores: (min: 9, avg: 39, max: 85)

Run: 9, exploration: 0.05136279668727559, score: 169
Scores: (min: 9, avg: 53.44444444444444, max: 169)

Run: 10, exploration: 0.024955845615029167, score: 145
Scores: (min: 9, avg: 62.6, max: 169)

Run: 11, exploration: 0.01884797999012663, score: 57
Scores: (min: 9

NameError: name 'exit' is not defined

In [3]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"

# Discount applied to the best predicted next-state Q-value in experience_replay().
# This experiment uses 0.99 instead of 0.95.
GAMMA = 0.99
# Step size handed to the Adam optimizer when the model is compiled.
LEARNING_RATE = 0.001

# maxlen of the replay deque; once full, the oldest transitions are discarded.
MEMORY_SIZE = 1000000
# Number of transitions sampled per experience_replay() call; replay is
# skipped until at least this many transitions are stored.
BATCH_SIZE = 20

# Initial exploration rate (chance that act() picks a random action).
EXPLORATION_MAX = 1.0
# Lower bound the exploration rate is clamped to.
EXPLORATION_MIN = 0.01
# Factor the exploration rate is multiplied by on each replay that trains.
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Epsilon-greedy deep Q-learning agent backed by a small dense network.

    The agent keeps a bounded replay memory of transitions and a Keras model
    that maps a state to one predicted Q-value per action.
    """

    def __init__(self, observation_space, action_space):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: Number of values in one state vector.
            action_space: Number of discrete actions available.
        """
        self.exploration_rate = EXPLORATION_MAX
        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        # Two hidden ReLU layers of 24 units, linear output per action.
        self.model = Sequential([
            Dense(24, input_shape=(observation_space,), activation="relu"),
            Dense(24, activation="relu"),
            Dense(self.action_space, activation="linear"),
        ])
        self.model.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def act(self, state):
        """Pick an action: random with probability exploration_rate, else greedy."""
        explore = np.random.rand() < self.exploration_rate
        if explore:
            return random.randrange(self.action_space)
        predicted = self.model.predict(state)[0]
        return np.argmax(predicted)

    def experience_replay(self):
        """Train on a random batch of stored transitions, then decay exploration.

        Does nothing until the memory holds at least BATCH_SIZE transitions.
        Each sampled transition is fitted individually: the predicted Q-value
        of the taken action is replaced by the reward (terminal) or by
        reward + GAMMA * max next-state Q (non-terminal).
        """
        if len(self.memory) >= BATCH_SIZE:
            for state, action, reward, state_next, terminal in random.sample(self.memory, BATCH_SIZE):
                if terminal:
                    target = reward
                else:
                    best_next = np.amax(self.model.predict(state_next)[0])
                    target = reward + GAMMA * best_next
                targets = self.model.predict(state)
                targets[0][action] = target
                self.model.fit(state, targets, verbose=0)
            # Decay once per trained batch, never dropping below the floor.
            decayed = self.exploration_rate * EXPLORATION_DECAY
            self.exploration_rate = max(EXPLORATION_MIN, decayed)
  
  
def cartpole(max_runs=None):
    """Train a ``DQNSolver`` on the ``ENV_NAME`` gym environment.

    Each episode ("run") resets the environment and steps it with the
    solver's chosen actions until the environment reports ``terminal``.
    Every transition is stored in the solver's memory, and after every
    non-terminal step ``experience_replay`` is invoked. When an episode ends,
    its step count is printed and passed to ``ScoreLogger.add_score``.

    Args:
        max_runs: Optional upper bound on the number of episodes. With the
            default ``None`` the episode loop is unbounded and only ends
            when something inside it raises.
            NOTE(review): presumably ``ScoreLogger.add_score`` is what ends
            an unbounded run once it considers the task solved -- confirm
            against the score_logger module.

    The environment is closed on the way out, including when the loop is
    ended by an exception.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while max_runs is None or run < max_runs:
            run += 1
            # The model expects a batch axis, so states are kept as (1, n).
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                #env.render()
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # The reward of the step that ends the episode is negated.
                # NOTE(review): this also applies if the environment ends an
                # episode for reaching a step limit -- confirm that is intended.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print ("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment's resources however the loop ended.
        env.close()



In [4]:
cartpole()

Run: 1, exploration: 0.8911090557802088, score: 43
Scores: (min: 43, avg: 43, max: 43)

Run: 2, exploration: 0.8142285204175609, score: 19
Scores: (min: 19, avg: 31, max: 43)

Run: 3, exploration: 0.7514768435208588, score: 17
Scores: (min: 17, avg: 26.333333333333332, max: 43)

Run: 4, exploration: 0.6763948591909945, score: 22
Scores: (min: 17, avg: 25.25, max: 43)

Run: 5, exploration: 0.6180388156137953, score: 19
Scores: (min: 17, avg: 24, max: 43)

Run: 6, exploration: 0.5937455908197752, score: 9
Scores: (min: 9, avg: 21.5, max: 43)

Run: 7, exploration: 0.5618938591163328, score: 12
Scores: (min: 9, avg: 20.142857142857142, max: 43)

Run: 8, exploration: 0.5057535983897912, score: 22
Scores: (min: 9, avg: 20.375, max: 43)

Run: 9, exploration: 0.46211964903917074, score: 19
Scores: (min: 9, avg: 20.22222222222222, max: 43)

Run: 10, exploration: 0.40769130904675194, score: 26
Scores: (min: 9, avg: 20.8, max: 43)

Run: 11, exploration: 0.3858205374665315, score: 12
Scores: (min:

NameError: name 'exit' is not defined

# Code Analysis
## Reinforcement learning experiments

Each experiment used the same deep Q-learning algorithm to solve the cartpole problem. The experiments differ only in the initial exploration factor (EXPLORATION_MAX) and the discount factor (GAMMA); the learning rate (LEARNING_RATE) is 0.001 in all three.

In the first experiment, the discount factor (GAMMA) is set to 0.95, the learning rate (LEARNING_RATE) is set to 0.001, the exploration factor (EXPLORATION_MAX) is set to 1.0, and the exploration decay (EXPLORATION_DECAY) is set to 0.995. The algorithm solved the problem in 122 runs out of 222 total runs.

In the second experiment, the discount factor (GAMMA) and learning rate (LEARNING_RATE) remain the same, but the exploration factor (EXPLORATION_MAX) is decreased to 0.5. The algorithm solved the problem in 4 runs out of 104 total runs, which is a significant improvement compared to the first experiment.

In the third experiment, the discount factor (GAMMA) is increased to 0.99, while the other parameters remain the same as the first experiment. The algorithm solved the problem in 52 runs out of 152 total runs.

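These results imply that changing the exploration factor has a significant impact on the performance of the algorithm, while the discount factor plays a less significant role in this particular problem. The learning rate was held at 0.001 in all three experiments, so these runs do not show its effect. Because each configuration was run only once and no random seed was set, part of the difference between runs may be due to chance. The main difference between the three experiments lies in the values of the exploration factor and the discount factor.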


## How reinforcement learning is applied to the cartpole problem
The goal of the agent in this case is to balance the pole on top of the cart for as long as possible. The state values include the position and velocity of the cart and the angle and angular velocity of the pole. These values are used as input to the neural network to make predictions about the best action to take. The possible actions that can be performed in the problem are either moving the cart to the left or to the right. The reinforcement learning algorithm used in the problem is Q-learning, a type of model-free reinforcement learning.  

## How experience replay is applied to the cartpole problem.
Experience replay is a technique used in reinforcement learning to store and reuse experiences to improve the stability and convergence of the learning process. In this cartpole problem, this means that after each step, the agent stores the experience it just had (i.e., the state, action, reward, next state, and whether the episode ended) and then trains the Q-network on a random batch of 20 stored experiences. The discount factor is used to weigh short-term against long-term rewards in reinforcement learning. In the cartpole problem, the discount factor determines how much weight is given to future rewards compared to immediate rewards. Comparatively, a high discount factor will give more importance to future rewards, whereas a low discount factor will give more importance to immediate rewards.

## How neural networks are used in deep Q-learning.
Neural networks are used in deep Q-learning to approximate the Q-table. The neural network takes in the state as input and outputs a prediction for each action. 
The neural network architecture used in the cartpole problem is a simple feedforward neural network with two hidden layers of 24 ReLU units each and a linear output layer with one value per action. 
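The neural network makes the Q-learning algorithm practical for this problem: the cartpole state is continuous, so an explicit Q-table would need an entry for every possible state, whereas the network generalizes across similar states and predicts a value for each action directly. 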
The learning rate determines the rate at which the neural network updates its weights during training. Increasing the learning rate will make the neural network update its weights more quickly, whereas decreasing the learning rate will make the neural network update its weights more slowly. A high learning rate can result in instability, whereas a low learning rate can result in slow convergence.

<h3 align="center">References</h3> 
 Brockman, G. (2020, September 1). OpenAI gym beta. OpenAI. Retrieved February 10, 2023, from https://openai.com/blog/openai-gym-beta/
 
 Surma, G. (2018). Cartpole. Github repository. Retrieved from https://github.com/gsurma/cartpole. 