In [None]:
import gym
import math
import time
import os
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T


env = gym.make('Taxi-v2').unwrapped

is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Transition = namedtuple('Transition',
                        ('parameter', 'state', 'action', 'next_state', 'reward', 'done'))

LEARNING_RATE = 0.01
NUM_EPISODES = 5000
MEMORY = 200000
BATCH_SIZE = 1024
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.01
MAX_EPISODE = 100
TARGET_UPDATE = 3
HIDDEN_DIM = 64
N_ACTIONS = env.action_space.n
N_STATES = env.observation_space.n

SOLVE_TAXI_MESSAGE = """Task : \n
1) The cab(YELLOW) should find the shortest path to BLUE(passenger) 
2) Perform a "pickup" action to board the passenger which turns the cab(GREEN)
3) Take the passenger to the PINK(drop location) using the shortest path
4) Perform a "dropoff" action
"""
writer = SummaryWriter()



class ReplayMemory(object):

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


class DQN(torch.nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim):
        super(DQN, self).__init__()        
        self.embedding = nn.Embedding(input_dim, hidden_dim)

        self.layer1 = torch.nn.Sequential(
            torch.nn.Linear(hidden_dim + 1, hidden_dim), # +1 for parameter
            torch.nn.BatchNorm1d(hidden_dim),
            torch.nn.PReLU()
        )

        self.layer2 = torch.nn.Sequential(
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.BatchNorm1d(hidden_dim),
            torch.nn.PReLU()
        )

        self.final = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, p, x):
        
        x = self.embedding(x)[:,0,:]
        x = torch.cat((p, x), dim=1)
        x = self.layer1(x)
        # x = self.layer2(x)
        x = self.final(x)
        return x

policy_net = DQN(N_STATES, N_ACTIONS,HIDDEN_DIM).to(device)
target_net = DQN(N_STATES, N_ACTIONS,HIDDEN_DIM).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

loss_fn = torch.nn.MSELoss()
optim = torch.optim.Adam(policy_net.parameters(), lr=LEARNING_RATE)

memory = ReplayMemory(MEMORY)


def get_Q(net, parameter, state):
    state = torch.tensor(np.array(state).reshape(-1, 1), dtype=torch.long).to(device) 
    parameter = torch.tensor(np.array(parameter), dtype=torch.float).reshape(-1, 1).to(device)
    
    net.train(mode=False)
    return net(parameter, state)


def get_action(parameter, state, eps):
    if np.random.rand() < eps:
        return np.random.choice(N_ACTIONS)
    else:
        policy_net.train(mode=False)
        scores = get_Q(policy_net, parameter, state)
        _, argmax = torch.max(scores.data, 1)
        return int(argmax.cpu().numpy())

def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)

    states = np.vstack([x.state for x in transitions])
    actions = np.array([x.action for x in transitions])
    rewards = np.array([x.reward for x in transitions])
    parameters = np.array([x.parameter for x in transitions])
    
    next_states = np.vstack([x.next_state for x in transitions])
    done = np.array([x.done for x in transitions])

    Q_predict = get_Q(policy_net, parameters, states)
    
    Q_target = Q_predict.clone().data.cpu().numpy()

    # For DQN
    # Q_target[np.arange(len(Q_target)), actions] = rewards + GAMMA * np.max(get_Q(target_net, next_states).data.cpu().numpy(), axis=1) * ~done

    # For Double DQN
    Q_next_state = np.argmax(get_Q(policy_net, parameters, next_states).data.cpu().numpy(), axis=1).reshape(-1)
    Q_target[np.arange(len(Q_target)), actions] = rewards + GAMMA * np.choose(Q_next_state, get_Q(target_net, parameters, next_states).data.cpu().numpy().T) * ~done

    Q_target = torch.tensor(Q_target, dtype=torch.float).to(device)


    policy_net.train(mode=True)
    optim.zero_grad()
    loss = loss_fn(Q_predict, Q_target)
    loss.backward()
    optim.step()

def epsilon_annealing(episode, max_episode, min_eps, max_eps):
    if max_episode == 0:
        return min_eps
    slope = (min_eps - max_eps) / max_episode
    return max(slope * episode + max_eps, min_eps)

def clear_screen(delay=1):
    time.sleep(delay)
    os.system('clear')

parameter = 0 # Parameter

perf = 0
score  = 0
for i_episode in range(NUM_EPISODES):
    clear_screen(0)
    state = env.reset()
    total_reward = 0
    eps = epsilon_annealing(i_episode, MAX_EPISODE, EPS_END, EPS_START)
    done = False
    t = 0
    writer.add_scalar('train/epsilon', eps, i_episode)
    
    while not done:
        action = get_action(parameter, state, eps)
        next_state, reward, done, _ = env.step(action)
        total_reward += reward

        memory.push(parameter, state, action, next_state, reward, done)

        state = next_state

        optimize_model()
        t += 1

        if done:
            writer.add_scalar('train/reward_total', total_reward, i_episode)
            writer.add_scalar('train/steps', t, i_episode)
            

    score += total_reward
    perf = score/(i_episode + 1)

    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())


In [5]:
def evaluate(param=0, n_episodes=10):
    env = gym.make('Taxi-v2').unwrapped
    perf = 0
    score  = 0

    for i_episode in range(n_episodes):
        clear_screen(0)
        state = env.reset()
        total_reward = 0

        done = False
        t = 0
        log_progress(env, delay=0.5, message=init_message(i_episode, perf), eps=0)
        while not done:
            
            action = get_action(param, state, 0)
            next_state, reward, done, _ = env.step(action)
            total_reward += reward

            if i_episode >= 250:
                log_progress(env, reward=reward, total_reward=total_reward, delay=0.5,message=perf_message(i_episode, perf), eps=eps)

            state = next_state
            if t % 1000 == 0:
                print(t)
            t += 1


        score += total_reward
        perf = score/(i_episode + 1)
        print(i_episode)

evaluate()

Initial State : Task : 

1) The cab(YELLOW) should find the shortest path to BLUE(passenger) 
2) Perform a "pickup" action to board the passenger which turns the cab(GREEN)
3) Take the passenger to the PINK(drop location) using the shortest path
4) Perform a "dropoff" action

Attempt: 1 | Average reward (until last episode): 0.00
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+

Reward: 0
Cumulative reward 0
ε-greedy probability 0
0
1000
2000
3000
4000


KeyboardInterrupt: 