# Assignment 3 - Reinforcement Learning with Deep Q-Networks

#### Kobee Raveendran
#### CAP 5610

### Imports (Gym environment and Deep Q-Network dependencies)

In [1]:
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random

Using TensorFlow backend.


### Agent and Environment (Explanation)

#### Agent
The agent in the CartPole environment is simply the cart itself; it is responsible for doing the "learning" in this problem. The agent interacts with the environment and learns optimal actions that correspond to higher rewards through experience over time. It gets a sense for which actions are optimal by first performing the action, then receiving a reward from the observation of the next state given by the environment (which repeats in a cyclic fashion).

#### Agent Action Space
In this environment, there is a defined action space that represents the set of actions an agent can perform at each time step. In the CartPole environment, the cart (agent) can only perform one of two actions: move left or right. These two actions constitute the action space, and can be confirmed by viewing the elements or size of the action space (as in `env.action_space.n`).

#### Observation Space
Since this is a relatively simple environment, the observation space, which is the simplest representation for the current state of the environment at each time step, is fairly small. It consists of only 4 elements (confirmed by viewing the shape of `env.observation_space`), which are the **cart position**, **cart velocity**, **pole angle**, and **velocity of the pole at the tip** (confirmed by the cartpole environment source code).

#### Rewards
In the CartPole environment, rewards are fairly straightforward. For every time step that the pole is aloft and in a "surviving" (non-terminal) state, the agent receives a reward of +1. A state is deemed terminal if the pole's angle surpasses a set threshold (+/- 12 degrees) or if the cart has passed a certain distance away from the center (2.4 units), thus reaching the edge of the display (according to the CartPole source code).

In [5]:
def cartpole(param_name:str, param_vals):
    
    # set the default parameters (of which only one will vary per function call)
    params = {
        'learning_rate': 0.001, 
        'gamma': 0.95, 
        'epsilon': 1.0, 
        'epsilon_min': 0.01, 
        'epsilon_decay': 0.995
    }
    
    print('\n\nPARAM: {}'.format(param_name))
    
    
    
    for param_val in param_vals:
        env = gym.make('CartPole-v0')
        
        obs_space = env.observation_space.shape[0]
        action_space = env.action_space.n    
        
        params[param_name] = param_val
        
        print('\nVAL: {}'.format(param_val))
        
        dqn = DQN(
            obs_space, 
            action_space, 
            params['learning_rate'], 
            params['gamma'], 
            params['epsilon'], 
            params['epsilon_min'], 
            params['epsilon_decay']
        )
        
        scores = []
        
        for episode in range(100):
            state = env.reset()
            state.reshape([1, obs_space])

            time_step = 0
            done = False

            while not done:
                env.render()

                action = dqn.act(state)

                observation, reward, done, info = env.step(action)

                if done:
                    observation = env.reset()
                    #print('Terminal observation: ', observation)

                time_step += 1

                dqn.experience_replay()

            scores.append(time_step)

        mean_survival_time = np.mean(scores)
        

        print('Mean survival time: ', mean_survival_time)
    
        env.close()

#### Set up neural network architecture

In [6]:
class DQN:
    
    def __init__(self, 
                 obs_space, 
                 action_space, 
                 learning_rate = 0.001, 
                 gamma = 0.95, 
                 epsilon = 1.0, 
                 epsilon_min = 0.01,
                 epsilon_decay = 0.995
                ):
        self.expl_rate = 1.0
        self.action_space = action_space
        
        self.memory = deque(maxlen = 2000)
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        
        self.model = Sequential()
        self.model.add(Dense(24, input_dim = obs_space, activation = 'relu'))
        self.model.add(Dense(24, activation = 'relu'))
        self.model.add(Dense(self.action_space, activation = 'linear'))
        
        self.model.compile(loss = 'mse', optimizer = Adam(lr = self.learning_rate))
        
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
    def act(self, state):
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_space)
        
        q_vals = self.model.predict(state)
        
        print(q_vals)
        
    def experience_replay(self):
        if len(self.memory) < 20:
            return
        
        batch = random.sample(self.memory, 20)
        
        for state, action, reward, state_next, done in batch:
            q_update = reward
            
            if not done:
                q_update = (reward + gamma * np.amax(self.model.predict(state_next)[0]))
                
            q_values = self.model.predict(state)
            
            q_values[0][action] = q_update
            
            self.model.fit(state, q_values, verbose = 0)
            
            self.epsilon *= self.epsilon_decay
            self.epsilon = np.argmax(self.epsilon, self.epsilon_min)
            

# test param configurations in standalone runs (some configs will be duplicated as they match the default vals)
cartpole('learning_rate', [0.001, 0.01, 0.1, 1.0, 10.0])
#cartpole('gamma', [0.25, 0.5, 0.75, 0.95])
#cartpole('epsilon', [1.0, 1.5, 10.0])
#cartpole('epsilon_min', [0.001, 0.01, 0.1, 1.0])
#cartpole('epsilon_decay', [0.75, 0.9, 0.995, 1.0])



PARAM: learning_rate

VAL: 0.001
Mean survival time:  21.47

VAL: 0.01
Mean survival time:  22.44

VAL: 0.1
Mean survival time:  22.44

VAL: 1.0
Mean survival time:  21.75

VAL: 10.0
Mean survival time:  20.77
