In [1]:
import numpy as np
import gym
import random
import matplotlib.pyplot as plt
import time
from IPython.display import clear_output
import pickle

In [2]:
class Agent:
    """Tabular Q-learning agent for an environment with discrete spaces.

    The agent keeps a Q-table with one row per observation and one column per
    action, and exposes the three steps of the training loop: choose an action
    (`selectAction`), update the table from the observed transition (`learn`),
    and decay exploration (`updateParameters`).
    """

    def __init__(self, env):
        """Set up the Q-table and hyper-parameters for `env`.

        Parameters
        ----------
        env : environment object
            Must expose `observation_space.n`, `action_space.n`,
            `action_space.sample()` and `reset()`. The value returned by
            `reset()` is stored as the initial `current_state` and is later
            used as a row index into the Q-table.
        """
        # Keep a handle on the environment so action sampling never depends
        # on a module-level/global variable named `env`.
        self.env = env

        # Initializing Agent/Sim parameters
        self.q_table = np.zeros((env.observation_space.n, env.action_space.n))

        self.alpha = 0.9 # learning rate
        self.gamma = 0.96 # discount factor
        self.epsilon = 0.85 # exploration rate
        self.action = None  # last action taken; set by the caller before learn()
        self.reward = None  # last reward seen by learn()

        self.current_state = env.reset()

        self.n_episodes = 100000 # Episodes to play
        self.n_steps = 200 # Max steps in an episode

    def learn(self, new_state, reward):
        """Apply one Q-learning update for the transition just observed.

        Uses `self.current_state` and `self.action` as the (state, action)
        pair that produced `reward` and led to `new_state`; the caller must
        set both before calling. Does not advance `self.current_state`.

        Parameters
        ----------
        new_state : int
            Row index of the state reached after taking `self.action`.
        reward : float
            Reward received for the transition.
        """
        self.reward = reward

        # Q-learning update derived from the Bellman equation:
        #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        old_value = self.q_table[self.current_state, self.action]
        td_target = reward + self.gamma * np.max(self.q_table[new_state, :])
        self.q_table[self.current_state, self.action] = old_value + self.alpha * (td_target - old_value)

    def selectAction(self):
        """Pick an action epsilon-greedily for `self.current_state`.

        With probability `self.epsilon` a random action is sampled from the
        agent's own environment; otherwise the action with the highest
        Q-value in the current state's row is returned.
        """
        if random.uniform(0, 1) < self.epsilon:
            return self.env.action_space.sample()
        else:
            return np.argmax(self.q_table[self.current_state, :])

    def updateParameters(self):
        """Decay the exploration rate by 0.05, never going below 0.1."""
        self.epsilon = np.maximum(self.epsilon - 0.05, 0.1)
        #self.alpha = np.maximum(self.alpha - 0.05, 0.6)

In [3]:
# Create the Taxi-v3 environment and an Agent whose Q-table is sized from the
# environment's observation and action spaces. Both names are notebook-level
# globals that the cells below rely on.
env = gym.make("Taxi-v3")
agent = Agent(env)

In [8]:
# Load a previously pickled Q-table from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles you created yourself.
# The loaded object is only bound to `example_dict` here; this cell does not
# install it on the agent.
with open("pickled_qtable.pkl", "rb") as pickle_in:
    # The context manager closes the file even if unpickling fails.
    example_dict = pickle.load(pickle_in)

In [5]:
# Random play: drive the environment with uniformly sampled actions for at
# most 100 steps, redrawing the board in place after every move.
env.reset()

for step in range(100):
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    env.render()
    # Stop on episode end *before* clearing, so the final frame stays visible.
    if done:
        break
    time.sleep(0.01)
    clear_output(wait=True)

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |B: |
+---------+
  (Dropoff)


In [None]:
# Train the agent with tabular Q-learning for 1000 episodes, rendering the
# board and printing the Q-table after every step.
# Uncomment to restart from an empty Q-table and/or act almost greedily:
#agent = Agent(env)
#agent.epsilon = 0.1
for i in range(1000):
    # Start every episode from the freshly reset state. The agent reads
    # `current_state` (not `state`) in selectAction() and learn(), so the
    # reset observation has to be stored there; otherwise the first update of
    # each episode is credited to the previous episode's final state.
    agent.current_state = env.reset()
    for j in range(agent.n_steps):

        agent.action = agent.selectAction()
        state, reward, done, info = env.step(agent.action)
        agent.learn(state, reward)
        agent.current_state = state

        env.render()

        # Leave the last frame on screen when the episode ends.
        if done:
            break
        print("Current episode number: ", i)
        print(agent.q_table)
        time.sleep(0.01)
        clear_output(wait=True)

    # Optional exploration decay (currently disabled):
    #if ((i%10000) == 0):
        #agent.updateParameters()


+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
Current episode number:  810
[[ 3.49460914  1.29194272  1.0947871   1.22908524 -4.77033896 -4.71750921]
 [ 6.84832644  8.93735337  7.38071726  8.94651609  8.8788194  -0.83488278]
 [12.6341959  13.14030607 11.18903019 12.13640355 16.8481672   6.21601037]
 ...
 [14.04518902 16.14681    14.04680771 14.99393785  8.03869119  7.04796645]
 [10.50564301 11.10552172 10.19333743 11.16176139  1.67322276  2.15838816]
 [21.9026554  18.20066834 18.45340314 23.58466807 14.77497221 15.16391057]]


In [16]:
# Persist the learned Q-table so a later session can reload it.
# The context manager flushes and closes the file even if pickling raises.
with open("pickled_qtable.pkl", "wb") as pickle_out:
    pickle.dump(agent.q_table, pickle_out)