In [2]:
import numpy as np
import gym
import random
import matplotlib.pyplot as plt
import time
from IPython.display import clear_output
import pickle

In [3]:
class Agent:
    """Tabular Q-learning agent for an environment with discrete spaces.

    The agent keeps one Q-value per (state, action) pair and picks actions
    epsilon-greedily. It assumes the classic gym API in which ``env.reset()``
    returns the initial observation directly (an integer state index).
    """

    def __init__(self, env):
        """Create an agent bound to ``env`` with an all-zero Q-table.

        Args:
            env: environment exposing ``observation_space.n``,
                ``action_space.n``, ``action_space.sample()`` and ``reset()``.
        """
        # Keep a reference to the environment so action sampling does not
        # depend on a notebook-level global that happens to be named `env`.
        self.env = env

        # One row per state, one column per action.
        self.q_table = np.zeros((env.observation_space.n, env.action_space.n))

        self.alpha = 0.9  # learning rate
        self.gamma = 0.96  # discount factor
        self.epsilon = 0.85  # exploration rate
        self.action = None  # last action taken; must be set before learn()

        self.current_state = env.reset()

        self.n_episodes = 100000  # Episodes to play
        self.n_steps = 200  # Max steps in an episode

    def learn(self, new_state, reward):
        """Apply one Q-learning update for the transition just observed.

        Uses ``self.current_state`` and ``self.action`` as the source
        (state, action) pair; the caller is responsible for advancing
        ``self.current_state`` to ``new_state`` afterwards.

        Args:
            new_state: state index reached after taking ``self.action``.
            reward: reward received for that transition.
        """
        self.reward = reward

        state, action = self.current_state, self.action
        old_value = self.q_table[state, action]

        # Temporal-difference target: immediate reward plus the discounted
        # value of the best action available from the new state.
        td_target = reward + self.gamma * np.max(self.q_table[new_state, :])
        self.q_table[state, action] = old_value + self.alpha * (td_target - old_value)

    def selectAction(self):
        """Return an action for ``self.current_state`` chosen epsilon-greedily.

        With probability ``self.epsilon`` a random action is sampled from the
        agent's own environment; otherwise the action with the highest
        Q-value in the current state is returned.
        """
        if random.uniform(0, 1) < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.q_table[self.current_state, :])

    def updateParameters(self):
        """Decay the exploration rate by 0.05, never going below 0.1."""
        self.epsilon = np.maximum(self.epsilon - 0.05, 0.1)
        # Optional learning-rate decay, currently disabled:
        # self.alpha = np.maximum(self.alpha - 0.05, 0.6)

In [4]:
# Build the Taxi-v3 environment and a fresh agent bound to it.
env = gym.make("Taxi-v3")
agent = Agent(env=env)

In [5]:
# Load a previously saved Q-table into the agent. The file name matches the
# one written by the save cell at the end of this notebook.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load a file you produced yourself.
with open("pickled_qtable.pkl", "rb") as pickle_in:
    # The context manager closes the handle even if unpickling fails.
    agent.q_table = pickle.load(pickle_in)

In [10]:
# Greedy evaluation: with epsilon = 0, selectAction() never explores and always
# takes the highest-valued action from the Q-table. (Replace the selectAction()
# call with env.action_space.sample() to get a random-play baseline instead.)
# NOTE: agent.epsilon stays at 0 after this cell; set it again before training.
rewards_list = []
agent.epsilon = 0
for j in range(10):
    cumulative_reward = 0
    # Each episode starts from a fresh reset, so no reset is needed before the loop.
    agent.current_state = env.reset()

    # Use the agent's own step cap rather than repeating the literal 200.
    for i in range(agent.n_steps):
        action = agent.selectAction()
        state, reward, done, info = env.step(action)
        agent.current_state = state
        cumulative_reward += reward
        env.render()
        if done:
            # Leave the final frame on screen and stop this episode.
            break
        time.sleep(0.05)
        clear_output(wait=True)
    rewards_list.append(cumulative_reward)

print(rewards_list)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)
[-200, -200, -200, 6, 4, 6, 8, 7, -200, 11]


In [8]:
# Training loop: run 1000 episodes with a fixed exploration rate of 0.1,
# rendering every step and updating the Q-table after each transition.
# Uncomment the next line to restart from an all-zero Q-table.
#agent = Agent(env)
agent.epsilon = 0.1
for i in range(1000):
    # The reset state must go into `current_state`, the attribute that
    # selectAction() and learn() read. Assigning it to `agent.state` left
    # current_state at the previous episode's final state, so the first
    # update of every episode was credited to the wrong state.
    agent.current_state = env.reset()
    for j in range(agent.n_steps):

        agent.action = agent.selectAction()
        state, reward, done, info = env.step(agent.action)
        # learn() uses the pre-step state, so update before advancing it.
        agent.learn(state, reward)
        agent.current_state = state

        env.render()

        if done:
            break
        print("Current episode number: ", i)
        print(agent.q_table)
        time.sleep(0.05)
        clear_output(wait=True)

    # Optional exploration decay, currently disabled:
    #if ((i%10000) == 0):
        #agent.updateParameters()


+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)


In [16]:
# Persist the agent's Q-table so a later session can reload it.
# The context manager flushes and closes the file even if pickling raises.
with open("pickled_qtable.pkl", "wb") as pickle_out:
    pickle.dump(agent.q_table, pickle_out)