### Imports

In [3]:
import gym
import numpy as np
import random
import collections
import keras
from keras.layers import Dense
from collections import deque

#### Set up agent environment

In [6]:
def cartpole(episodes=20, render=True):
    """Run a DQN agent on CartPole-v0 and print the mean survival time.

    For every environment step the transition
    ``(state, action, reward, next_state, done)`` is stored in the agent's
    replay memory and one round of experience replay is performed.

    Parameters
    ----------
    episodes : int, optional
        Number of episodes to play (default 20, the original fixed value).
    render : bool, optional
        Whether to call ``env.render()`` on every step (default True, the
        original behaviour). Pass False for headless runs.

    Returns
    -------
    None
        The per-episode terminal observation and the final mean survival
        time are printed, not returned.
    """
    env = gym.make('CartPole-v0')
    obs_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn = DQN(obs_space, action_space)

    scores = []

    # NOTE(review): this uses the classic Gym API (reset() -> obs,
    # step() -> 4-tuple), as the original code did. Newer Gym/Gymnasium
    # releases changed both signatures -- confirm the installed version.
    try:
        for _ in range(episodes):
            # The network takes a (1, obs_space) batch; reshape returns a new
            # array, so the result must be kept rather than discarded.
            state = np.reshape(env.reset(), [1, obs_space])

            time_step = 0
            done = False

            while not done:
                if render:
                    env.render()

                action = dqn.act(state)

                observation, reward, done, _ = env.step(action)
                next_state = np.reshape(observation, [1, obs_space])

                if done:
                    print('Terminal observation: ', observation)

                # Store the transition so experience replay has data to
                # sample, then advance so the next action uses the new state.
                dqn.remember(state, action, reward, next_state, done)
                state = next_state

                time_step += 1

                dqn.experience_replay()

            scores.append(time_step)
    finally:
        # Always release the render window / environment resources.
        env.close()

    # Computed once, after all episodes; NaN when no episode was played.
    mean_survival_time = np.mean(scores) if scores else float('nan')

    print('Mean survival time: ', mean_survival_time)

#### Set up neural network architecture

In [7]:
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

class DQN:
    """Deep Q-Network agent with an epsilon-greedy policy and replay memory.

    Attributes
    ----------
    expl_rate : float
        Current exploration probability (epsilon). Starts at 1.0 and is
        decayed once per ``experience_replay`` call down to ``expl_min``.
    action_space : int
        Number of discrete actions.
    memory : collections.deque
        Bounded buffer of ``(state, action, reward, next_state, done)`` tuples.
    model : keras Sequential
        Q-network mapping a ``(1, obs_space)`` state batch to one Q-value
        per action.
    gamma, batch_size, expl_min, expl_decay
        Discount factor, replay batch size, epsilon floor and per-replay
        epsilon decay factor.
    """

    def __init__(self, obs_space, action_space, gamma=0.95, batch_size=20,
                 expl_min=0.01, expl_decay=0.995, memory_size=1000000,
                 learning_rate=0.001):
        """Build the Q-network and an empty replay memory.

        Parameters
        ----------
        obs_space : int
            Length of the observation vector.
        action_space : int
            Number of discrete actions.
        gamma : float, optional
            Discount factor for future rewards.
        batch_size : int, optional
            Number of transitions sampled per replay.
        expl_min : float, optional
            Lower bound for the exploration rate.
        expl_decay : float, optional
            Multiplicative decay applied to the exploration rate per replay.
        memory_size : int, optional
            Maximum number of stored transitions.
        learning_rate : float, optional
            Step size passed to the Adam optimizer.
        """
        self.expl_rate = 1.0
        self.action_space = action_space

        self.gamma = gamma
        self.batch_size = batch_size
        self.expl_min = expl_min
        self.expl_decay = expl_decay

        self.memory = deque(maxlen = memory_size)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape = (obs_space,), activation = 'relu'))
        self.model.add(Dense(24, activation = 'relu'))
        self.model.add(Dense(self.action_space, activation = 'linear'))

        # NOTE(review): newer Keras releases name this argument
        # `learning_rate`; `lr` is kept to match the version this notebook
        # was written against -- confirm before upgrading Keras.
        self.model.compile(loss = 'mse', optimizer = Adam(lr = learning_rate))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory as a single tuple."""
        # deque.append takes exactly one item, so the fields are packed.
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``expl_rate`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.

        Parameters
        ----------
        state : array-like
            State batch accepted by ``self.model.predict``.

        Returns
        -------
        int
            Index of the chosen action.
        """
        if np.random.rand() < self.expl_rate:
            return random.randrange(self.action_space)

        q_vals = self.model.predict(state)

        return int(np.argmax(q_vals[0]))

    def experience_replay(self):
        """Fit the Q-network on a random batch of remembered transitions.

        Does nothing until at least ``batch_size`` transitions are stored.
        For each sampled transition the target for the taken action is
        ``reward`` if the episode ended, else
        ``reward + gamma * max_a Q(next_state, a)``. After the batch, the
        exploration rate is decayed once and clamped to ``expl_min``.
        """
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)

        for state, action, reward, state_next, done in batch:
            q_update = reward

            if not done:
                # Bootstrap from the best next-state Q *value* (amax),
                # not the index of the best action (argmax).
                q_update = reward + self.gamma * np.amax(self.model.predict(state_next)[0])

            q_values = self.model.predict(state)

            q_values[0][action] = q_update

            self.model.fit(state, q_values, verbose = 0)

        # Decay once per replay call (not once per sample) and never go
        # below the configured floor.
        self.expl_rate = max(self.expl_min, self.expl_rate * self.expl_decay)
            
        
# Run the episode loop defined in `cartpole`; it prints each terminal
# observation and, at the end, the mean survival time.
cartpole()

Terminal observation:  [ 0.10106356  0.54746514 -0.22854109 -1.17852968]
Terminal observation:  [-0.12715561 -0.63176278  0.22012085  1.22809397]
Terminal observation:  [-0.29115897 -0.38980808  0.21511627  0.47275644]
Terminal observation:  [ 1.05375607  1.49469762 -0.21909243 -0.9542289 ]
Terminal observation:  [ 0.19121749  0.81944281 -0.21390629 -1.42301948]
Terminal observation:  [-0.0115392  -0.44228085  0.21215357  1.16452615]
Terminal observation:  [ 0.17693531  1.01687104 -0.21190343 -1.67106457]
Terminal observation:  [-0.15432944 -1.18811534  0.25120551  1.96506537]
Terminal observation:  [ 0.19531793  0.63532516 -0.21876429 -1.15718864]
Terminal observation:  [-0.38401793 -0.53138361 -0.21174535 -0.53685441]
Terminal observation:  [-0.31364004 -1.98272842  0.24765841  2.62378657]
Terminal observation:  [ 0.17991867  0.82994547 -0.21375143 -1.32183541]
Terminal observation:  [-0.05027574 -0.57364447  0.2096049   1.34532229]
Terminal observation:  [ 0.12945055  0.60244692 -0.