In [6]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Conv1D, MaxPooling1D, Flatten
from keras.optimizers import Adam
from keras import backend as K
print(K.tensorflow_backend._get_available_gpus())
from IPython.display import display, clear_output

from sklearn import preprocessing


Using TensorFlow backend.


['/job:localhost/replica:0/task:0/device:GPU:0']


In [22]:
# Deep Q-learning Agent
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen = 2000)
        self.gamma = 0.96    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.993
        self.model = self._build_model()
    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(200, input_dim=self.state_size, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(128, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(64, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(32, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(self.action_size, activation='tanh'))
        model.compile(loss='mse',
                      optimizer=Adam())
        return model
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return (np.random.rand(4) * 2) -1
        act_values = self.model.predict(state)
        act_values = [item for sublist in act_values for item in sublist]
        return act_values # returns action
    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                  target = reward + self.gamma * self.model.predict(next_state)[0]
            target_f = self.model.predict(state)
            
#             print("target_f", target_f, target)
            target_f[:] = target
#             print("target_f", target_f, target)
            self.model.fit(state, target_f, epochs= 1, verbose=0 )
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
    def save_model(self):
        self.model.save('./checkpoint.h5')
    def load_model(self):
        self.model.load_weights('./checkpoint.h5')

In [23]:
def count_consec_300(x):
    cnt = 0
    for i in range(len(x)):
        if x[i] >= 300:
            cnt += 1
        else:
            cnt = 0
    return cnt

In [None]:

env = gym.make('BipedalWalker-v2')
state_size = 24
action_size = 4
agent = DQNAgent(state_size, action_size)
# agent.load_model()
done = False
batch_size = 32
game_history = [0]
t_steps = 0
for e in range(2000):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(450):
        # env.render()
        if time % 5 == 0:
            print(time, end=', ')
        


        action = agent.act(state)
        
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -100
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        
        if done:
            break
        
        
        if len(agent.memory) > batch_size +5:

            agent.replay(batch_size + int(time / 10))
    print("\nscore: {}, count 300:{} max:{} min:{} mean:{}".format(time ,count_consec_300(game_history), max(game_history), min(game_history), np.mean(game_history)))
    game_history.append(time)
    done = False
    if e % 5 == 4:
        agent.save_model()
        batch_size += 4

0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150, 155, 160, 165, 170, 175, 180, 
score: 181, count 300:0 max:0 min:0 mean:0.0
0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150, 155, 160, 165, 170, 175, 180, 185, 190, 195, 200, 
score: 204, count 300:0 max:181 min:0 mean:90.5
0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150, 155, 160, 165, 170, 
score: 173, count 300:0 max:204 min:0 mean:128.33333333333334
0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 
score: 116, count 300:0 max:204 min:0 mean:139.5
0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 
score: 77, count 300:0 max:204 min:0 mean:134.8
0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 1

In [13]:
import matplotlib.pyplot as plt
# plt.plot(game_history)
len(game_history), len(agent.memory)

(5, 446)

In [106]:
np.mean(game_history)
np.save('checkpoint350_gameHist',game_history)

In [111]:

1.0005 ** 100

1.0512579599480434