In [10]:
# Imports
import numpy as np
import gym
import random
import matplotlib.pyplot as plt
import time
from IPython.display import clear_output
import pickle
from collections import deque
import os

# Import ML libraries
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

In [11]:
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [12]:
class Agent():
    def __init__(self, env, qnetwork_file=None, batch_size = 32, train = True):
        self.env = env
        self.epsilon = 1.0 if (train) else 0.
        self.gamma = 0.99
        self.batch_size = batch_size
        self.epsilon_min = 0.05
        self.epsilon_decay = 0.9995 if (train) else 1.
        self.alpha = 0.05
        self.memory = deque(maxlen=10000)

        self.train = train # Easy way switch train/test

        # Qnetwork and target network
        self.qnetwork = self.defineNetwork() if (qnetwork_file == None) else load_model(qnetwork_file)
        self.target = self.defineNetwork()
        self.alignTarget()

    def defineNetwork(self):
        model = Sequential()
        model.add(Dense(16, input_dim=np.array(1), activation="linear"))
        #model.add(Reshape((16,)))
        model.add(Dense(16, activation="relu"))  # hidden layer
        model.add(Dense(16, activation="relu")) # hidden layer
        model.add(Dense(self.env.action_space.n, activation='linear')) # output layer
        model.compile(loss='mse',optimizer=Adam(lr=self.alpha))
        return model

    def alignTarget(self):
        self.target.set_weights(self.qnetwork.get_weights())

    def setSimParameters(self, episodes=100, ntimesteps=200):
        self.episodes = episodes if (self.train) else 10
        self.ntimesteps = ntimesteps

    def selectAction(self, state):
        if(random.uniform(0, 1) < self.epsilon):
            return env.action_space.sample()
        return np.argmax(self.qnetwork.predict(np.array(state)))

    def addToMemory(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def optimize(self): # , state, action, reward, next_state, done
        if (self.train and (len(self.memory) > self.batch_size)):
            batch = np.array(random.sample(self.memory, self.batch_size))

            train_states = np.zeros((self.batch_size, 1))
            target_rewards = np.zeros((self.batch_size, self.env.action_space.n))

            for i, (state, action, reward, next_state, done) in enumerate(batch):
                target = self.qnetwork.predict(state)
                if (done):
                    target[0][action] = reward
                else:
                    t = self.target.predict(next_state)
                    target[0][action] = reward + self.gamma*np.amax(t)

                train_states[i] = state
                target_rewards[i, :] = target

            self.qnetwork.fit(np.squeeze(train_states), np.squeeze(target_rewards), epochs = 1, verbose = 0)

    def updateEpsilon(self):
        self.epsilon = np.maximum(self.epsilon_decay*self.epsilon, self.epsilon_min)

In [35]:
env = gym.make("Taxi-v3")

# Define agent, sim parameters
taxiAgent = Agent(env, qnetwork_file="taxi_qnetwork_5120.h5", batch_size = 64, train = False)
taxiAgent.setSimParameters(episodes = 10000, ntimesteps = 200)

taxiAgent.qnetwork.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                32        
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 102       
Total params: 678
Trainable params: 678
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Test sim run:
# taxiAgent.epsilon=0.5
for j in range(10): # 10 episodes
    state = env.reset()
    for i in range(200):
        action = taxiAgent.selectAction(np.array(state).reshape(1)) # env.action_space.sample()
        next_state, reward, done, info = env.step(action)

        taxiAgent.addToMemory(np.array(state).reshape(1), action, reward, \
                                    np.array(next_state).reshape(1), done)

        state = next_state

        env.render()

        print("Current episode, timestep, state, action taken: ", j, i, state, action)
        time.sleep(0.05)
        clear_output(wait=True)    

        if(done):
            print("success!")
            break

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+
  (South)
Current episode, timestep, state, action taken:  6 77 483 0


In [7]:
# Main loop
save_point = 10
current_reward = [0]
current_episode = [0]

for i in range(taxiAgent.episodes): # taxiAgent.episodes
    state = env.reset()
    clear_output(wait=True) # Works only inside jupyter notebooks
    print("Episode number: {} of {}. Current epsilon: {}".format(str(i),
                                                                 str(taxiAgent.episodes), str(taxiAgent.epsilon)))

    for j in range(taxiAgent.ntimesteps): # taxiAgent.ntimesteps
        if not taxiAgent.train:
            env.render()
            print("Current episode number, timesteps, epsilon: ", i, taxiAgent.ntimesteps, j, taxiAgent.epsilon)
            clear_output(wait=True)

        action = taxiAgent.selectAction(np.array(state).reshape(1))
        new_state, reward, done, _ = env.step(action)

        taxiAgent.addToMemory(np.array(state).reshape(1), action, reward, np.array(new_state).reshape(1), done)
        taxiAgent.optimize()

        state = new_state

        if (done):
            break

    taxiAgent.updateEpsilon()
    taxiAgent.alignTarget()

    if (taxiAgent.train):
        if(i == save_point):
            taxiAgent.qnetwork.save("taxi_qnetwork_"+str(i)+".h5")
            os.system("git add taxi_*.h5")
            os.system("git commit -m \"autoupdate saved taxi agent network \"")
            save_point *= 2
            current_reward.append(reward)
            current_episode.append(i)

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| :[43m [0m|[35mB[0m: |
+---------+
  (South)
Current episode number, timesteps, epsilon:  9 200 199 0.05


In [None]:
taxiAgent.qnetwork.save("taxi_qnetwork_"+str(i)+".h5")
os.system("git add taxi_qnetwork_*.h5")
os.system("git commit -m \"autoupdate saved model\"")
os.system("git push origin master")

In [None]:
plt.figure(figsize=(12,6))
plt.plot(current_episode, current_reward)
plt.title("Rewards vs Episode #", size=15)
plt.xlabel("Episode #", size=12)
plt.ylabel("Rewards", size=12)
plt.grid()
plt.savefig("taxi_rewards.png")
os.system("git add taxi_rewards.png")
os.system("git commit -m \"Taxi rewards plot\"")
os.system("git push origin master")