In [1]:
import os
import queue
import random
from collections import deque
from collections import namedtuple

import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from matplotlib.pyplot import imshow
from PIL import Image
from torch import optim
from tqdm import tqdm_notebook as tqdm

from wrappers import make_atari, wrap_deepmind, wrap_pytorch

# Model

In [2]:
class QNet(torch.nn.Module):
    """Dueling Q-network: a shared convolutional trunk feeding two heads.

    One head produces a scalar state value V(s), the other one advantage
    A(s, a) per action; they are combined as Q = V + A - mean_a(A).

    Args:
        obs_shape: Observation shape. Accepted for interface compatibility but
            not read: the first convolution is fixed at 4 input channels and
            both heads expect a flattened 7*7*64 feature map (which is what
            the three convolutions produce for an 84x84 input).
        act_shape: Number of discrete actions (width of the advantage head).
    """
    def __init__(self,obs_shape,act_shape):
        super(QNet, self).__init__()
        # Attribute names are kept stable so existing state dicts still load.
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.relu = nn.ReLU()
        # Value stream: features -> 512 -> 1
        self.fc1 = nn.Linear(7*7*64,512)
        self.fc2 = nn.Linear(512,1)
        # Advantage stream: features -> 512 -> act_shape
        self.fc3 = nn.Linear(7*7*64,512)
        self.fc4 = nn.Linear(512,act_shape)

    def forward(self, x):
        """Return Q-values of shape (batch, act_shape) for a batch of observations.

        Args:
            x: Tensor whose first dimension is the batch dimension and whose
                remaining dimensions are accepted by ``conv1``.
        """
        # Shared convolutional trunk
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.relu(self.conv3(x))
        features = x.view(x.shape[0],-1)

        # Value stream
        v = self.fc2(self.relu(self.fc1(features)))

        # Advantage stream
        adv = self.fc4(self.relu(self.fc3(features)))

        # The mean is taken over the action axis only. Averaging over the
        # whole tensor would make every sample's Q-values depend on the other
        # samples that happen to share its minibatch.
        q_s_a = v + adv - adv.mean(dim=1, keepdim=True)

        return q_s_a

In [3]:
def eps_greedy(epsilon,state,net):
    """Pick an action with an epsilon-greedy policy.

    Args:
        epsilon: Probability of taking a uniformly random action.
        state: Input passed unchanged to ``net``.
        net: Callable returning a tensor of Q-values for ``state``.

    Returns:
        int: A random action in ``[0, ACT_SHAPE)`` with probability
        ``epsilon``; otherwise the index of the largest entry of
        ``net(state)``. The argmax is taken over the flattened output, so
        ``state`` should hold a single observation for the index to be an
        action id.
    """
    if np.random.random() < epsilon:
        return np.random.randint(ACT_SHAPE)
    # Action selection never needs gradients; skip building an autograd graph.
    with torch.no_grad():
        qvalues = net(state)
    return torch.argmax(qvalues).item()

In [4]:
class ReplayBuffer(object):
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Args:
        maxsize: Maximum number of transitions kept; once full, adding a new
            transition discards the oldest one.
    """
    def __init__(self,maxsize):
        self.q = deque(maxlen = maxsize)
        self.maxsize = maxsize

    def add(self,x):
        """Append transition ``x``; the deque's ``maxlen`` evicts the oldest entry."""
        # No manual popleft: the bounded deque already drops the oldest item,
        # and popping again would waste one slot of capacity.
        self.q.append(x)

    def getSize(self):
        """Return the number of transitions currently stored."""
        return len(self.q)

    def sample(self,size):
        """Draw ``size`` distinct transitions uniformly at random.

        Returns:
            Five lists ``(state, action, reward, next_state, done)``, each of
            length ``size``, holding the corresponding field of every sampled
            transition.

        Raises:
            ValueError: If ``size`` exceeds the number of stored transitions
                (raised by ``random.sample``).
        """
        # Sample positions rather than copying the whole deque into a list.
        picks = random.sample(range(len(self.q)), size)
        batch = [self.q[idx] for idx in picks]
        state,action,reward,next_state,done = map(list, zip(*batch))
        return state,action,reward,next_state,done
           

# Loss function

In [5]:
def compute_loss(size):
    """Sample a minibatch and take one Double-DQN gradient step on ``net``.

    Reads the module-level ``buffer``, ``net``, ``target_net``, ``optimizer``
    and ``GAMMA``.

    Args:
        size: Number of transitions to sample from ``buffer``.

    Returns:
        The scalar mean-squared TD-error tensor for the sampled batch.
    """
    current_state,action,reward,next_state,done = buffer.sample(size)
    # Stacking adds a new leading batch dim; squeeze(1) then drops dim 1 when
    # it has size one (presumably the per-transition singleton batch dim the
    # training loop stores -- confirm against the caller).
    current_state = torch.stack(current_state).squeeze(1)
    next_state = torch.stack(next_state).squeeze(1)

    # Put the scalar fields on whatever device the states already live on
    # instead of assuming CUDA.
    device = current_state.device
    reward = torch.as_tensor(np.asarray(reward, dtype=np.float32), device=device)
    action = torch.as_tensor(np.asarray(action, dtype=np.int64), device=device)
    done = torch.as_tensor(np.asarray(done, dtype=np.float32), device=device)

    # Q(s, a) for the actions actually taken -- the only term that needs gradients.
    qvalues = net(current_state)
    q_a = qvalues.gather(1, action.unsqueeze(1)).squeeze(1)

    # The bootstrap target is a constant with respect to the optimisation, so
    # compute it without autograd: no graph is built and no gradients pile up
    # on target_net's parameters.
    with torch.no_grad():
        # Online network selects the next action ...
        selected = net(next_state).argmax(dim=1)
        # ... and the target network evaluates it: Q'(s', argmax_a Q(s', a)).
        target_net.eval()
        q_a_target = target_net(next_state).gather(1,selected.unsqueeze(1)).squeeze(1)
        # (1 - done) removes the bootstrap term on terminal transitions.
        target = reward + GAMMA * q_a_target * (1 - done)

    L = (target - q_a).pow(2).mean()
    optimizer.zero_grad()
    L.backward()
    optimizer.step()
    return L

In [6]:
def update_target():
    """Overwrite ``target_net``'s state with a copy of ``net``'s current state dict."""
    online_weights = net.state_dict()
    target_net.load_state_dict(online_weights)

In [7]:
def epsilon_decay(ep, eps_min=.01, eps_span=.99, decay_steps=30000):
    """Exponentially decaying exploration rate.

    Computes ``eps_min + eps_span * exp(-ep / decay_steps)``.

    Args:
        ep: Current step index the schedule is evaluated at.
        eps_min: Value the schedule approaches as ``ep`` grows.
        eps_span: Amount added on top of ``eps_min`` at ``ep == 0``.
        decay_steps: Time constant of the decay; after this many steps the
            decaying part has shrunk by a factor of ``e``.

    Returns:
        The exploration rate for step ``ep``.
    """
    e = eps_min + eps_span*np.exp(-ep/decay_steps)
    return e

In [8]:
def addreward(id,item,filename):
    """Append one ``"<id> <item> "`` line to ``filename``.

    Args:
        id: Row identifier; written via ``str()``.
        item: Value to record; written via ``str()``.
        filename: Path of the log file; created if it does not exist.
    """
    # The context manager closes the handle even if the write raises.
    with open(filename,'a+') as f:
        f.write(str(id)+' '+str(item)+' '+'\n')

In [9]:
def addloss(id,loss,filename):
    """Append one ``"<id> <loss value> "`` line to ``filename``.

    Args:
        id: Row identifier; written via ``str()``.
        loss: Object exposing ``.item()`` (e.g. a scalar tensor); the value
            returned by ``.item()`` is what gets written.
        filename: Path of the log file; created if it does not exist.
    """
    # The context manager closes the handle even if .item() or the write raises.
    with open(filename,'a+') as f:
        f.write(str(id)+' '+str(loss.item())+' '+'\n')

# Training

In [10]:
# Build the Pong environment by passing the raw Atari env through the
# wrappers imported from the local `wrappers` module, innermost first:
# make_atari -> wrap_deepmind -> wrap_pytorch.
env = wrap_pytorch(wrap_deepmind(make_atari('PongNoFrameskip-v4')))

# Show the spaces the agent will see; the recorded run printed
# Box(4, 84, 84) and Discrete(6).
print(env.observation_space)
print(env.action_space)


Box(4, 84, 84)
Discrete(6)


In [12]:
# Online network: the one being optimised and used to select actions.
net = QNet(env.observation_space.shape,env.action_space.n)
net = net.cuda()
# Target network: same architecture, used to evaluate bootstrap targets.
target_net = QNet(env.observation_space.shape,env.action_space.n)
target_net = target_net.cuda()
# Start the target network as an exact copy of the online network; two
# independently initialised networks would make the first targets unrelated
# to the network being trained.
target_net.load_state_dict(net.state_dict())
# Only the online network's parameters are handed to the optimiser.
optimizer = optim.Adam(net.parameters(), lr=0.00001)

In [12]:
# Total number of environment steps. Must be an int: it is used as a range()
# bound by the training loop, and range() rejects floats such as 5e6.
ITERATIONS = int(5e6)
# Initial exploration rate; the training loop overwrites it every step with
# epsilon_decay(i).
epsilon = .99
OBS_SHAPE = env.observation_space.shape
# Number of discrete actions; read by eps_greedy for random action draws.
ACT_SHAPE = env.action_space.n
# Learning starts once the replay buffer holds more than this many transitions.
REPLAY_SAMPLE = 20000
# Transitions sampled per gradient step.
BATCH_SIZE = 32
# Discount factor used in the TD target.
GAMMA = .99
# Steps between copies of the online weights into the target network.
T_upd = 10000

In [13]:
# Restore both networks from their saved state dicts, online network first.
for _model, _checkpoint in (
    (net, './dueling-logs/dueling-model2000000.pth'),
    (target_net, './dueling-logs/dueling-model-target2000000.pth'),
):
    _model.load_state_dict(torch.load(_checkpoint))

# Fresh Adam optimiser over the online network's parameters.
optimizer = optim.Adam(net.parameters(), lr=0.00001)

# CSV files the training loop appends to. They are not cleared here, so a new
# run appends to whatever an earlier run already wrote.
lossfile = './dueling-logs/losses.csv'
rewardsfile = './dueling-logs/rewards.csv'

In [None]:
# Cadence of logging and checkpointing.
# NOTE(review): the loop reads these three names, but no cell of the notebook
# defined them. The values below are placeholders -- confirm before a long run.
loss_log_interval = 500          # steps between recorded loss values
episode_reward_interval = 10     # episodes between recorded episode returns
save_model_interval = 100000     # steps between saved checkpoints


def _append_csv(path, records):
    """Append ``records`` ({row_key: {column: value}}) as rows of the CSV at ``path``.

    The header row is written only when the file does not exist yet.
    """
    frame = pd.DataFrame.from_dict(data = records, orient = 'index').reset_index()
    frame.to_csv(path, mode='a', index=False, header=not os.path.exists(path))


def _reset_env():
    """Reset ``env`` and return its first observation as a CUDA tensor with a leading batch dim."""
    return torch.Tensor(env.reset()).cuda().unsqueeze(0)


# NOTE(review): transitions are stored as CUDA tensors, so the whole replay
# buffer lives in GPU memory -- confirm this fits on the target device.
buffer = ReplayBuffer(100000)
episode_reward = 0
state = _reset_env()
count = 0        # finished episodes
loss_count = 0   # loss rows collected since the last flush to lossfile
rewards_dict = {}
loss_dict = {}
losses = []
rewards = []
# NOTE(review): `i` always starts at 1, so epsilon restarts near 1.0 and the
# checkpoint names restart from the first interval even when weights were
# restored from a later checkpoint -- confirm this is intended.
# int(): range() requires an integer bound even if ITERATIONS was written as 5e6.
for i in tqdm(range(1,int(ITERATIONS)+1)):
    # Act epsilon-greedily and store the transition.
    epsilon = epsilon_decay(i)
    action = eps_greedy(epsilon,state,net)
    next_state, reward, done, info = env.step(action)
    next_state = torch.Tensor(next_state).cuda().unsqueeze(0)
    buffer.add((state,action,reward,next_state,done))
    episode_reward+=reward
    state=next_state

    # Learn once the buffer has been warmed up.
    if(buffer.getSize()>REPLAY_SAMPLE):
        loss = compute_loss(BATCH_SIZE)
        if i % loss_log_interval == 0:
            loss_dict[i] = {'loss': loss.item()}
            loss_count += 1
            # Flush to disk in groups of 10 rows.
            if loss_count == 10:
                _append_csv(lossfile, loss_dict)
                loss_dict = {}
                loss_count = 0

    if done:
        count += 1
        rewards.append(episode_reward)

        if count%10 == 0:
            print(i,count,episode_reward)
        if count%episode_reward_interval == 0:
            rewards_dict[i] = {'episode': count, 'reward': episode_reward}
            _append_csv(rewardsfile, rewards_dict)
            rewards_dict = {}
        # Start the next episode.
        episode_reward = 0
        state = _reset_env()

    # Periodically sync the target network with the online network.
    if(i%T_upd==0):
        update_target()

    # Periodically checkpoint both networks.
    if(i%save_model_interval == 0):
        torch.save(net.state_dict(),'./dueling-logs/dueling-model' + str(i) + '.pth')
        torch.save(target_net.state_dict(),'./dueling-logs/dueling-model-target' + str(i) + '.pth')

HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))

-21.0
-20.0
-21.0
-21.0
-21.0
-21.0
-20.0
-21.0
-19.0
-21.0
-20.0
-21.0
-19.0
-21.0
-21.0
-20.0
-21.0
-21.0
-21.0
-19.0
-20.0
-20.0
-21.0
-18.0
-21.0
-21.0
-21.0
-21.0
-20.0
-20.0
-21.0
-21.0
-20.0
-21.0
-21.0
-20.0
-21.0
-21.0
-19.0
-21.0
-21.0
-21.0
-20.0
-20.0
-21.0
-19.0
-19.0
-21.0
-20.0
-20.0
-21.0
-21.0
-21.0
-21.0
-19.0
-19.0
-20.0
-21.0
-20.0
-20.0
-20.0
-19.0
-19.0
-21.0
-21.0
-20.0
-19.0
-21.0
-16.0
-17.0
-21.0
-20.0
-20.0
-19.0
-19.0
-19.0
-17.0
-21.0
-19.0
-17.0
-21.0
-17.0
-19.0
-14.0
-17.0
-20.0
-18.0
-19.0
-18.0
-18.0
-17.0
-15.0
-16.0
-15.0
-18.0
-11.0
-13.0
-16.0
-19.0
-19.0
-16.0
-17.0
-20.0
-18.0
-16.0
-18.0
-19.0
-18.0
-18.0
-18.0
-14.0
-13.0
-15.0
-15.0
-16.0
-17.0
-17.0
-15.0
