In [2]:
import gym
import torch
import random
import numpy as np
import torch
from PIL import Image
import torch
import torch.nn as nn
from collections import namedtuple
from collections import deque
import torch.nn.functional as F
from tqdm import tqdm_notebook as tqdm
from matplotlib.pyplot import imshow
from PIL import Image
from wrappers import make_atari, wrap_deepmind, wrap_pytorch
import queue
from torch import optim
import matplotlib.pyplot as plt

# Model

In [89]:
class QNet(torch.nn.Module):
    def __init__(self,obs_shape,act_shape,atoms):
        super(QNet, self).__init__()
        self.atoms = atoms
        self.act_shape = act_shape
        self.conv1 = nn.Conv2d(1, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.relu = nn.ReLU()
        #Linear layers for dueling
        self.fc1 = nn.Linear(7*7*64,512)
        self.fc2 = nn.Linear(512,atoms)
        self.fc3 = nn.Linear(7*7*64,512)
        self.fc4 = nn.Linear(512,act_shape*atoms)
        self.softmax = nn.Softmax(dim=1)
    def forward(self, x):
        #Conv
        x = self.conv1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.conv3(x)
        x = self.relu(x)
        x = x.view(x.shape[0],-1)
        
        #Fc
        x1 = x
        x = self.fc1(x)
        x = self.relu(x)
        v = self.fc2(x)
        
        x1 = self.fc3(x1)
        x1 = self.relu(x1)
        adv = self.fc4(x1)
        
        #Reshaping value and advantage functions to add probabilities of each atom for each action
        value = v.view(v.shape[0],1,self.atoms)
        adv = adv.view(adv.shape[0],self.act_shape,self.atoms)
        
        q_s_a = value + adv - adv.mean(1,keepdim=True)
        
        #probability of each atom for all actions
        q_s_a = self.softmax(q_s_a)
        
        return q_s_a

In [90]:
def eps_greedy(epsilon,state,net,atoms):
    if(np.random.random()<epsilon):
        action = np.random.randint(ACT_SHAPE)
    else:
        #Finding the expected value of each action (sum(pi*zi))
        qvalues = net(state)
        expected_values = torch.matmul(qvalues,atoms)
        action = torch.argmax(expected_values).item()
    return action    

In [91]:
class ReplayBuffer(object):
    def __init__(self,maxsize):
        self.q = deque(maxlen = maxsize)
        self.maxsize = maxsize
    def add(self,x):
        self.q.append(x)
        if(len(self.q)==self.maxsize):
            self.q.popleft()
    def getSize(self):
        return len(self.q)
    def sample(self,size):
        batch = random.sample(list(self.q),size)
        state,action,reward,next_state,done = map(list, zip(*batch))
        return state,action,reward,next_state,done

# Loss function

In [294]:
def compute_loss(size,atoms):
    current_state,action,reward,next_state,done = buffer.sample(size)
    current_state = torch.stack(current_state)
    current_state = current_state.squeeze(1)
    next_state = torch.stack(next_state)
    next_state = next_state.squeeze(1)
    done = np.array(done)
    done = done.astype(int)
    reward = torch.Tensor(reward).cuda()
    action = torch.as_tensor(action).cuda()
    done = torch.Tensor(done).cuda()
    
    #qvalues -> (batch,actions,atoms)
    # z(xt)
    dist_current = net(current_state)
    
    # z(xt+1)
    dist_next = net(next_state)
    target_net.eval()
    
    # z'(xt+1)
    dist_target = target_net(next_state)
    
    #Selecting optimal action a*
    
    qvalues_next = torch.matmul(dist_next,atoms)
    optimal_action = qvalues_next.max(1)[1]
    
    #Distribution of target with optimal action z(xt+1,a*)
    dist_target_optimal = dist_target.gather(1,optimal_action.view(-1,1).unsqueeze(2).repeat(1,1,N_ATOMS))
    dist_target_optimal = dist_target_optimal.squeeze(1)
    #Finding target distribution values (Tzj = r + gamma*Z(x,a*)) (not aligned)
    Tz = reward.unsqueeze(1).repeat(1,N_ATOMS) + GAMMA*atoms.unsqueeze(0).repeat(size,1)
    
    #Clipping the values
    Tz = torch.clamp(Tz,min=VMIN,max=VMAX)
    
    #Aligning the values
    
    deltaz = (VMAX-VMIN)/N_ATOMS
    indices = (Tz - VMIN)/deltaz
    lower = indices.floor()
    upper = indices.ceil()
    
    print(indices.shape)
    #Finding target probabilities
    target_distribution = torch.zeros(size,N_ATOMS).cuda()
    for col in range(N_ATOMS):
        l = lower[:,col]
        u = upper[:,col]
        target_distribution[:,l] = target_distribution[:,l] + dist_target_optimal[:,col]*(indices[col]-l)
        target_distribution[:,u] = target_distribution[:,u] + dist_target_optimal[:,col]*(u-indices[col])
    print(dist_target_optimal.shape)
     
    
    
#     #Q(s,a)
#     q_a = qvalues.gather(1, action.unsqueeze(1)).squeeze(1)
    
#     #Selecting action for target network
#     selected = qvalues_next.max(1)[1]
#     #Q'(s',argmax Q(s',a))
#     q_a_target = qvalues_target.gather(1,selected.unsqueeze(1)).squeeze(1)
    
#     #Computing target value
#     target = reward + GAMMA * q_a_target * (1 - done)
#     L = (target - q_a).pow(2).mean()
#     optimizer.zero_grad()
#     L.backward()
#     optimizer.step()
    return L

In [295]:
def update_target():
    target_net.load_state_dict(net.state_dict())

In [296]:
def epsilon_decay(ep):
    e = .01 + .99*np.exp(-ep/30000)
    return e

In [297]:
def addreward(id,item,filename):
    f=open(filename,'a+')
    f.write(str(id)+' '+str(item)+' '+'\n')
    f.close()

In [298]:
def addloss(id,loss,filename):
    f=open(filename,'a+')
    f.write(str(id)+' '+str(loss.item())+' '+'\n')
    f.close()

# Training

In [299]:
# env = gym.make('PongNoFrameskip-v4')
# env    = make_atari('PongNoFrameskip-v4')
env    = make_atari('BankHeist-v0')
env    = wrap_deepmind(env)
env    = wrap_pytorch(env)

print(env.observation_space)
print(env.action_space)


Box(1, 84, 84)
Discrete(18)


In [300]:
ITERATIONS = 1000000
epsilon = .99
OBS_SHAPE = env.observation_space.shape
ACT_SHAPE = env.action_space.n
REPLAY_SAMPLE = 5000
BATCH_SIZE = 32
GAMMA = .99
T_upd = 1000
N_ATOMS = 51
VMAX = 10
VMIN = -10
atoms = torch.linspace(VMIN,VMAX,N_ATOMS).cuda()

In [301]:
net = QNet(env.observation_space.shape,env.action_space.n,N_ATOMS)
net = net.cuda()
target_net = QNet(env.observation_space.shape,env.action_space.n,N_ATOMS)
target_net = target_net.cuda()
# update_target()
optimizer = optim.Adam(net.parameters(), lr=0.00001)

In [302]:
# net.load_state_dict(torch.load('dqn-model.pth'))
# target_net.load_state_dict(torch.load('dqn-model-target.pth'))
# optimizer = optim.Adam(net.parameters(), lr=0.00001)
update_target()

In [303]:
buffer = ReplayBuffer(10000)

episode_reward = 0
state = env.reset()
state = torch.Tensor(state).cuda()
state = state.unsqueeze(0)
count = 0
lossfile = './dueling-logs/losses.txt'
rewardsfile = './dueling-logs/rewards.txt'
losses = []
rewards = [] 
for i in tqdm(range(ITERATIONS)):
    epsilon = epsilon_decay(i)
    action = eps_greedy(epsilon,state,net,atoms)
    next_state, reward, done, info = env.step(action)
    next_state = torch.Tensor(next_state).cuda().unsqueeze(0)
    buffer.add((state,action,reward,next_state,done))
    episode_reward+=reward
    state=next_state
    if(buffer.getSize()>REPLAY_SAMPLE):
        loss = compute_loss(BATCH_SIZE,atoms)
#         losses.append(loss)
#         if(i%500==0):
#             addloss(i,loss,lossfile)
    if done:
        count+=1
        rewards.append(episode_reward)
#         if(count%5==0):
#             addreward(count,episode_reward,rewardsfile)
#             print(episode_reward)
        episode_reward = 0 
        state = env.reset()
        state = torch.Tensor(state).cuda()
        state = state.unsqueeze(0)
    if(i%T_upd==0):
#         torch.save(net.state_dict(),'./dueling-logs/dqn-model.pth')
#         torch.save(target_net.state_dict(),'./dueling-logs/dqn-model-target.pth')
        update_target()


HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))

torch.Size([32, 51])



RuntimeError: tensors used as indices must be long or byte tensors