In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torch.optim as optim

from env_CatchPigs import EnvCatchPigs

In [2]:
class ReplayBuffer():
    """Fixed-capacity circular store of transitions kept in preallocated tensors.

    Once ``size`` transitions have been pushed, each new push overwrites the
    oldest slot.
    """

    def __init__(self, size, obs_dims):
        """
        size :: int, number of transitions the buffer can hold
        obs_dims :: iterable of ints, shape of a single observation
        """
        obs_shape = (size, *obs_dims)

        self.mem_size = size
        self.obs_mem = torch.zeros(obs_shape)
        self.act_mem = torch.zeros(size, dtype=torch.int64)
        self.rew_mem = torch.zeros(size, dtype=torch.float32)
        self.next_obs_mem = torch.zeros(obs_shape)
        self.done_mem = torch.zeros(size, dtype=torch.bool)
        # Counts every push ever made; it is NOT capped at mem_size.
        self.cntr = 0

    def push(self, obs, act, rew, next_obs, done):
        """
        Write one transition into the next slot (wrapping around when full).

        obs :: torch tensor shape==(channels, height, width)
        act :: int
        rew :: int
        next_obs :: torch tensor shape==(channels, height, width)
        done :: bool
        """
        slot = self.cntr % self.mem_size
        writes = (
            (self.obs_mem, obs),
            (self.act_mem, act),
            (self.rew_mem, rew),
            (self.next_obs_mem, next_obs),
            (self.done_mem, done),
        )
        for store, value in writes:
            store[slot] = value
        self.cntr += 1

    def sample(self, batch_size):
        """
        Draw batch_size distinct transitions uniformly from the filled slots.

        Returns the tuple (obs, act, rew, next_obs, done) of batched tensors.
        numpy raises ValueError if batch_size exceeds the number of filled slots.
        """
        filled = self.cntr if self.cntr < self.mem_size else self.mem_size
        picks = np.random.choice(filled, batch_size, replace=False)
        stores = (self.obs_mem, self.act_mem, self.rew_mem,
                  self.next_obs_mem, self.done_mem)
        return tuple(store[picks] for store in stores)

In [3]:
class DeepQNetwork(nn.Module):
    """Three stride-1 conv layers followed by two linear layers, one output per action.

    The module also owns its RMSprop optimizer (``optimizer``) and its MSE
    criterion (``loss``).
    """

    def __init__(self, obs_dims, num_acts, lr=1e-3):
        """
        obs_dims :: observation shape; obs_dims[0] is the number of input channels
        num_acts :: int, width of the output layer
        lr :: float, learning rate handed to RMSprop
        """
        super().__init__()

        in_channels = obs_dims[0]
        self.conv1 = nn.Conv2d(in_channels, 32, 4, stride=1)
        self.conv2 = nn.Conv2d(32, 64, 3, stride=1)
        self.conv3 = nn.Conv2d(64, 64, 2, stride=1)

        # The linear layer's width depends on the observation size, so measure it.
        flat_features = self._calc_conv_output_dims(obs_dims)
        self.linear1 = nn.Linear(flat_features, 512)
        self.linear2 = nn.Linear(512, num_acts)

        self.optimizer = optim.RMSprop(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()

    def _calc_conv_output_dims(self, input_dims):
        """Return the element count of the conv stack's output for one zero observation."""
        probe = torch.zeros((1, *input_dims))
        for conv in (self.conv1, self.conv2, self.conv3):
            probe = conv(probe)
        return int(probe.numel())

    def forward(self, obs):
        """
        obs :: batched observations, shape (batch, *obs_dims)
        returns :: tensor of shape (batch, num_acts), one value per action
        """
        feats = obs
        for conv in (self.conv1, self.conv2, self.conv3):
            feats = F.relu(conv(feats))
        # Collapse everything except the batch axis before the linear layers.
        feats = feats.view(feats.size(0), -1)
        hidden = F.relu(self.linear1(feats))
        return self.linear2(hidden)
        
        

In [4]:
class DQNAgent(object):
    """Epsilon-greedy DQN learner with a replay buffer and a periodically synced target network.

    ``q_eval`` is the network being trained. ``q_next`` is a second network whose
    weights are overwritten from ``q_eval`` every ``replace`` learning steps; it
    is only used to compute bootstrap targets and is never optimized directly.
    """

    def __init__(self, obs_dims, num_acts, gamma=0.99, epsilon=1, lr=0.01,
                 mem_size=10000, batch_size=32, eps_min=0.01, eps_dec=7e-5,
                 replace=100, chkpt_dir='tmp/dqn'):
        """
        obs_dims :: shape of one observation, forwarded to the buffer and both networks
        num_acts :: int, number of discrete actions
        gamma :: float, discount factor applied to the bootstrap value
        epsilon :: float, initial probability of taking a random action
        lr :: float, learning rate forwarded to both networks
        mem_size :: int, replay buffer capacity
        batch_size :: int, minibatch size used by learn()
        eps_min :: float, floor for epsilon
        eps_dec :: float, amount subtracted from epsilon after every learning step
        replace :: int, number of learning steps between target-network syncs
        chkpt_dir :: str, stored on the instance; no method of this class uses it
        """
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.num_acts = num_acts
        self.obs_dims = obs_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.chkpt_dir = chkpt_dir
        self.act_space = list(range(num_acts))
        self.learn_cnt = 0

        self.memory = ReplayBuffer(mem_size, obs_dims)
        self.q_eval = DeepQNetwork(self.obs_dims, self.num_acts, self.lr)
        self.q_next = DeepQNetwork(self.obs_dims, self.num_acts, self.lr)

    def choose_act(self, obs):
        """
        Pick an action epsilon-greedily.

        obs :: torch tensor for a single observation (no batch axis); a batch
               axis of size 1 is added before the forward pass
        returns :: int, index of the chosen action
        """
        if np.random.random() > self.epsilon:
            # Pure inference: no autograd graph is needed to pick an argmax.
            with torch.no_grad():
                acts = self.q_eval(obs.unsqueeze(0))
            act = torch.argmax(acts).item()
        else:
            act = np.random.choice(self.act_space)

        return int(act)

    def store_transition(self, obs, act, rew, next_obs, done):
        """Append one transition to the replay buffer."""
        self.memory.push(obs, act, rew, next_obs, done)

    def sample_memory(self):
        """Return one minibatch (obs, act, rew, next_obs, done) of size batch_size."""
        obs, act, rew, next_obs, done = self.memory.sample(self.batch_size)
        return obs, act, rew, next_obs, done

    def replace_target_network(self):
        """Copy q_eval's weights into q_next every replace_target_cnt learning steps."""
        if self.learn_cnt % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        """Lower epsilon by eps_dec, never letting it drop below eps_min."""
        # Clamp in the same step; subtracting first and clamping on the next
        # call would let epsilon sit below eps_min for one step.
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

    def learn(self):
        """
        Take one gradient step on q_eval using a sampled minibatch.

        Does nothing until at least batch_size transitions have been pushed.
        Afterwards increments learn_cnt and decays epsilon.
        """
        if self.memory.cntr < self.batch_size:
            return

        self.replace_target_network()

        obs, act, rew, next_obs, done = self.sample_memory()

        # Value predicted for the action that was actually taken in each sample.
        q_pred = self.q_eval(obs).gather(1, act.unsqueeze(1)).squeeze(1)

        # The target is a constant for this update. Computing it without autograd
        # stops backward() from also running through q_next and piling up
        # gradients on parameters that are never stepped or zeroed.
        with torch.no_grad():
            q_next = self.q_next(next_obs).max(dim=1)[0]
            q_next[done] = 0.0  # no bootstrap value past a terminal transition
            q_target = rew + self.gamma * q_next

        self.q_eval.optimizer.zero_grad()
        loss = self.q_eval.loss(q_pred, q_target)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_cnt += 1

        self.decrement_epsilon()

In [5]:
def train(max_iter, env, agent1, agent2):
    """
    Step the environment max_iter times while two agents act and learn in lockstep.

    max_iter :: int, number of environment steps to run
    env :: object providing get_obs() -> per-agent observation arrays and
           step([act1, act2]) -> (per-agent rewards, done)
    agent1, agent2 :: objects providing choose_act, store_transition, learn
                      and an epsilon attribute

    Every 100 steps it prints the summed reward of both agents over the last
    100 steps, and it announces each step whose combined reward is positive.
    """
    def to_channels_first(frame):
        # Move the last axis to the front; the buffer expects channels first.
        return torch.tensor(frame, dtype=torch.float).permute(2, 0, 1)

    first_frames = env.get_obs()
    obs1 = to_channels_first(first_frames[0])
    obs2 = to_channels_first(first_frames[1])

    # Ring of the 100 most recent combined rewards.
    recent_rewards = [0] * 100

    for step in range(max_iter):
        act1 = agent1.choose_act(obs1)
        act2 = agent2.choose_act(obs2)

        rewards, done = env.step([act1, act2])
        rew1, rew2 = rewards[0], rewards[1]

        frames = env.get_obs()
        next_obs1 = to_channels_first(frames[0])
        next_obs2 = to_channels_first(frames[1])

        # NOTE(review): `done` is recorded but never triggers a reset here;
        # presumably the environment restarts itself. Confirm against EnvCatchPigs.
        agent1.store_transition(obs1, act1, rew1, next_obs1, done)
        agent2.store_transition(obs2, act2, rew2, next_obs2, done)
        obs1, obs2 = next_obs1, next_obs2

        agent1.learn()
        agent2.learn()

        combined = rew1 + rew2
        recent_rewards[step % 100] = combined

        if combined > 0:
            print("iter= ", step)
            print("Goal found!")

        if step % 100 == 0:
            print(f"Iter: {step}, Epsilon:{agent1.epsilon}, Reward: {sum(recent_rewards)}")

In [6]:
# One shared environment for both learners.
# NOTE(review): the first argument is presumably the map size — confirm against EnvCatchPigs.
env = EnvCatchPigs(7, False)
max_iter = 10000

# Two independent agents, each with obs_dims=(3, 21, 21) and 5 actions.
dqn1, dqn2 = (DQNAgent((3, 21, 21), 5) for _ in range(2))

train(max_iter, env, dqn1, dqn2)

size of map should be an odd integer no smaller than 7
Iter: 0, Epsilon:1, Reward: -20
Iter: 100, Epsilon:0.995099999999999, Reward: -580
Iter: 200, Epsilon:0.9880999999999975, Reward: -460
Iter: 300, Epsilon:0.9810999999999961, Reward: -640
Iter: 400, Epsilon:0.9740999999999946, Reward: -660
Iter: 500, Epsilon:0.9670999999999932, Reward: -640
Iter: 600, Epsilon:0.9600999999999917, Reward: -520
Iter: 700, Epsilon:0.9530999999999903, Reward: -420
Iter: 800, Epsilon:0.9460999999999888, Reward: -600
Iter: 900, Epsilon:0.9390999999999874, Reward: -820
Iter: 1000, Epsilon:0.9320999999999859, Reward: -660
Iter: 1100, Epsilon:0.9250999999999845, Reward: -540
Iter: 1200, Epsilon:0.918099999999983, Reward: -400
Iter: 1300, Epsilon:0.9110999999999816, Reward: -560
Iter: 1400, Epsilon:0.9040999999999801, Reward: -460
Iter: 1500, Epsilon:0.8970999999999787, Reward: -680
Iter: 1600, Epsilon:0.8900999999999772, Reward: -480
Iter: 1700, Epsilon:0.8830999999999758, Reward: -600
Iter: 1800, Epsilon:0.8

In [7]:
# Reset exploration to 0.6 on both agents and keep training them for 10000 more steps.
dqn1.epsilon = dqn2.epsilon = 0.6
train(10000, env, dqn1, dqn2)

Iter: 0, Epsilon:0.59993, Reward: 0
Iter: 100, Epsilon:0.5929299999999985, Reward: -240
Iter: 200, Epsilon:0.5859299999999971, Reward: -220
Iter: 300, Epsilon:0.5789299999999956, Reward: -400
Iter: 400, Epsilon:0.5719299999999942, Reward: -360
Iter: 500, Epsilon:0.5649299999999927, Reward: -260
Iter: 600, Epsilon:0.5579299999999913, Reward: -380
Iter: 700, Epsilon:0.5509299999999898, Reward: -240
Iter: 800, Epsilon:0.5439299999999884, Reward: -380
Iter: 900, Epsilon:0.5369299999999869, Reward: -280
Iter: 1000, Epsilon:0.5299299999999855, Reward: -220
Iter: 1100, Epsilon:0.522929999999984, Reward: -540
Iter: 1200, Epsilon:0.5159299999999826, Reward: -360
Iter: 1300, Epsilon:0.5089299999999811, Reward: -220
Iter: 1400, Epsilon:0.5019299999999797, Reward: -260
Iter: 1500, Epsilon:0.4949299999999782, Reward: -200
Iter: 1600, Epsilon:0.48792999999997677, Reward: -360
Iter: 1700, Epsilon:0.4809299999999753, Reward: -220
Iter: 1800, Epsilon:0.47392999999997387, Reward: -300
Iter: 1900, Epsilo

In [8]:
# Reset exploration to 0.6 on both agents and keep training them for 5000 more steps.
dqn1.epsilon = dqn2.epsilon = 0.6
train(5000, env, dqn1, dqn2)

Iter: 0, Epsilon:0.59993, Reward: 0
Iter: 100, Epsilon:0.5929299999999985, Reward: -280
Iter: 200, Epsilon:0.5859299999999971, Reward: -380
Iter: 300, Epsilon:0.5789299999999956, Reward: -280
Iter: 400, Epsilon:0.5719299999999942, Reward: -240
Iter: 500, Epsilon:0.5649299999999927, Reward: -260
Iter: 600, Epsilon:0.5579299999999913, Reward: -300
Iter: 700, Epsilon:0.5509299999999898, Reward: -160
Iter: 800, Epsilon:0.5439299999999884, Reward: -200
Iter: 900, Epsilon:0.5369299999999869, Reward: -260
Iter: 1000, Epsilon:0.5299299999999855, Reward: -280
Iter: 1100, Epsilon:0.522929999999984, Reward: -240
Iter: 1200, Epsilon:0.5159299999999826, Reward: -220
Iter: 1300, Epsilon:0.5089299999999811, Reward: -280
Iter: 1400, Epsilon:0.5019299999999797, Reward: -220
Iter: 1500, Epsilon:0.4949299999999782, Reward: -340
Iter: 1600, Epsilon:0.48792999999997677, Reward: -220
Iter: 1700, Epsilon:0.4809299999999753, Reward: -340
Iter: 1800, Epsilon:0.47392999999997387, Reward: -160
Iter: 1900, Epsilo

In [9]:
# Reset exploration to 0.6 on both agents and keep training them for 5000 more steps.
dqn1.epsilon = dqn2.epsilon = 0.6
train(5000, env, dqn1, dqn2)

Iter: 0, Epsilon:0.59993, Reward: 0
Iter: 100, Epsilon:0.5929299999999985, Reward: -180
Iter: 200, Epsilon:0.5859299999999971, Reward: -260
Iter: 300, Epsilon:0.5789299999999956, Reward: -480
Iter: 400, Epsilon:0.5719299999999942, Reward: -380
Iter: 500, Epsilon:0.5649299999999927, Reward: -360
Iter: 600, Epsilon:0.5579299999999913, Reward: -240
Iter: 700, Epsilon:0.5509299999999898, Reward: -240
Iter: 800, Epsilon:0.5439299999999884, Reward: -280
Iter: 900, Epsilon:0.5369299999999869, Reward: -320
Iter: 1000, Epsilon:0.5299299999999855, Reward: -360
Iter: 1100, Epsilon:0.522929999999984, Reward: -140
Iter: 1200, Epsilon:0.5159299999999826, Reward: -240
Iter: 1300, Epsilon:0.5089299999999811, Reward: -320
Iter: 1400, Epsilon:0.5019299999999797, Reward: -200
Iter: 1500, Epsilon:0.4949299999999782, Reward: -460
Iter: 1600, Epsilon:0.48792999999997677, Reward: -120
Iter: 1700, Epsilon:0.4809299999999753, Reward: -340
Iter: 1800, Epsilon:0.47392999999997387, Reward: -300
Iter: 1900, Epsilo

In [10]:
# Reset exploration to 0.6 on both agents and keep training them for 10000 more steps.
dqn1.epsilon = dqn2.epsilon = 0.6
train(10000, env, dqn1, dqn2)

Iter: 0, Epsilon:0.59993, Reward: -20
Iter: 100, Epsilon:0.5929299999999985, Reward: -240
Iter: 200, Epsilon:0.5859299999999971, Reward: -260
Iter: 300, Epsilon:0.5789299999999956, Reward: -320
Iter: 400, Epsilon:0.5719299999999942, Reward: -380
Iter: 500, Epsilon:0.5649299999999927, Reward: -460
Iter: 600, Epsilon:0.5579299999999913, Reward: -280
Iter: 700, Epsilon:0.5509299999999898, Reward: -300
Iter: 800, Epsilon:0.5439299999999884, Reward: -360
Iter: 900, Epsilon:0.5369299999999869, Reward: -260
Iter: 1000, Epsilon:0.5299299999999855, Reward: -480
Iter: 1100, Epsilon:0.522929999999984, Reward: -420
Iter: 1200, Epsilon:0.5159299999999826, Reward: -480
Iter: 1300, Epsilon:0.5089299999999811, Reward: -200
Iter: 1400, Epsilon:0.5019299999999797, Reward: -340
Iter: 1500, Epsilon:0.4949299999999782, Reward: -280
Iter: 1600, Epsilon:0.48792999999997677, Reward: -220
Iter: 1700, Epsilon:0.4809299999999753, Reward: -260
Iter: 1800, Epsilon:0.47392999999997387, Reward: -320
Iter: 1900, Epsi

In [11]:
# Slow the per-step epsilon decay to 7e-6, restart exploration at 0.6,
# and keep training the same agents for 100000 more steps.
dqn1.eps_dec = dqn2.eps_dec = 7e-6
dqn1.epsilon = dqn2.epsilon = 0.6
train(100000, env, dqn1, dqn2)

Iter: 0, Epsilon:0.599993, Reward: -20
Iter: 100, Epsilon:0.5992930000000021, Reward: -240
Iter: 200, Epsilon:0.5985930000000041, Reward: -220
Iter: 300, Epsilon:0.5978930000000062, Reward: -240
iter=  377
Goal found!
Iter: 400, Epsilon:0.5971930000000083, Reward: 620
Iter: 500, Epsilon:0.5964930000000104, Reward: -360
Iter: 600, Epsilon:0.5957930000000125, Reward: -380
Iter: 700, Epsilon:0.5950930000000145, Reward: -240
Iter: 800, Epsilon:0.5943930000000166, Reward: -380
Iter: 900, Epsilon:0.5936930000000187, Reward: -400
Iter: 1000, Epsilon:0.5929930000000208, Reward: -400
Iter: 1100, Epsilon:0.5922930000000228, Reward: -220
Iter: 1200, Epsilon:0.5915930000000249, Reward: -260
Iter: 1300, Epsilon:0.590893000000027, Reward: -400
Iter: 1400, Epsilon:0.590193000000029, Reward: -300
Iter: 1500, Epsilon:0.5894930000000311, Reward: -320
Iter: 1600, Epsilon:0.5887930000000332, Reward: -260
Iter: 1700, Epsilon:0.5880930000000353, Reward: -420
Iter: 1800, Epsilon:0.5873930000000374, Reward: -

Iter: 15000, Epsilon:0.4949930000003113, Reward: -220
Iter: 15100, Epsilon:0.4942930000003134, Reward: -240
Iter: 15200, Epsilon:0.4935930000003155, Reward: -360
Iter: 15300, Epsilon:0.49289300000031755, Reward: -280
Iter: 15400, Epsilon:0.4921930000003196, Reward: -380
Iter: 15500, Epsilon:0.4914930000003217, Reward: -280
Iter: 15600, Epsilon:0.4907930000003238, Reward: -360
Iter: 15700, Epsilon:0.49009300000032585, Reward: -220
Iter: 15800, Epsilon:0.4893930000003279, Reward: -360
Iter: 15900, Epsilon:0.48869300000033, Reward: -380
Iter: 16000, Epsilon:0.4879930000003321, Reward: -260
Iter: 16100, Epsilon:0.48729300000033415, Reward: -260
Iter: 16200, Epsilon:0.48659300000033623, Reward: -220
Iter: 16300, Epsilon:0.4858930000003383, Reward: -280
Iter: 16400, Epsilon:0.4851930000003404, Reward: -380
Iter: 16500, Epsilon:0.48449300000034246, Reward: -320
Iter: 16600, Epsilon:0.48379300000034453, Reward: -260
Iter: 16700, Epsilon:0.4830930000003466, Reward: -320
Iter: 16800, Epsilon:0.4

Iter: 30000, Epsilon:0.38999300000062265, Reward: -240
Iter: 30100, Epsilon:0.3892930000006247, Reward: -220
Iter: 30200, Epsilon:0.3885930000006268, Reward: -200
Iter: 30300, Epsilon:0.3878930000006289, Reward: -200
Iter: 30400, Epsilon:0.38719300000063095, Reward: -200
Iter: 30500, Epsilon:0.386493000000633, Reward: -180
Iter: 30600, Epsilon:0.3857930000006351, Reward: -300
Iter: 30700, Epsilon:0.3850930000006372, Reward: -280
Iter: 30800, Epsilon:0.38439300000063925, Reward: -220
Iter: 30900, Epsilon:0.3836930000006413, Reward: -180
Iter: 31000, Epsilon:0.3829930000006434, Reward: -60
Iter: 31100, Epsilon:0.3822930000006455, Reward: -220
Iter: 31200, Epsilon:0.38159300000064755, Reward: -180
Iter: 31300, Epsilon:0.38089300000064963, Reward: -200
Iter: 31400, Epsilon:0.3801930000006517, Reward: -160
Iter: 31500, Epsilon:0.3794930000006538, Reward: -140
Iter: 31600, Epsilon:0.37879300000065586, Reward: -60
iter=  31624
Goal found!
Iter: 31700, Epsilon:0.37809300000065793, Reward: 780


Iter: 45000, Epsilon:0.28499300000093397, Reward: -180
Iter: 45100, Epsilon:0.28429300000093605, Reward: -100
Iter: 45200, Epsilon:0.2835930000009381, Reward: -280
Iter: 45300, Epsilon:0.2828930000009402, Reward: -40
Iter: 45400, Epsilon:0.2821930000009423, Reward: -260
Iter: 45500, Epsilon:0.28149300000094435, Reward: -260
Iter: 45600, Epsilon:0.2807930000009464, Reward: -100
Iter: 45700, Epsilon:0.2800930000009485, Reward: -180
Iter: 45800, Epsilon:0.2793930000009506, Reward: -120
Iter: 45900, Epsilon:0.27869300000095265, Reward: -160
Iter: 46000, Epsilon:0.2779930000009547, Reward: -100
Iter: 46100, Epsilon:0.2772930000009568, Reward: -120
Iter: 46200, Epsilon:0.2765930000009589, Reward: -220
Iter: 46300, Epsilon:0.27589300000096095, Reward: -180
Iter: 46400, Epsilon:0.27519300000096303, Reward: -100
Iter: 46500, Epsilon:0.2744930000009651, Reward: -100
Iter: 46600, Epsilon:0.2737930000009672, Reward: -80
Iter: 46700, Epsilon:0.27309300000096925, Reward: -200
iter=  46736
Goal found

Iter: 59900, Epsilon:0.1806930000009684, Reward: -20
Iter: 60000, Epsilon:0.1799930000009677, Reward: -40
Iter: 60100, Epsilon:0.179293000000967, Reward: -140
Iter: 60200, Epsilon:0.1785930000009663, Reward: -40
Iter: 60300, Epsilon:0.1778930000009656, Reward: -120
Iter: 60400, Epsilon:0.1771930000009649, Reward: -60
Iter: 60500, Epsilon:0.1764930000009642, Reward: -60
Iter: 60600, Epsilon:0.1757930000009635, Reward: -160
Iter: 60700, Epsilon:0.1750930000009628, Reward: -180
Iter: 60800, Epsilon:0.1743930000009621, Reward: -80
Iter: 60900, Epsilon:0.1736930000009614, Reward: -160
Iter: 61000, Epsilon:0.1729930000009607, Reward: -140
Iter: 61100, Epsilon:0.17229300000096, Reward: -20
Iter: 61200, Epsilon:0.1715930000009593, Reward: -80
Iter: 61300, Epsilon:0.1708930000009586, Reward: -160
Iter: 61400, Epsilon:0.1701930000009579, Reward: -140
Iter: 61500, Epsilon:0.1694930000009572, Reward: -80
Iter: 61600, Epsilon:0.1687930000009565, Reward: -100
Iter: 61700, Epsilon:0.1680930000009558,

Iter: 75200, Epsilon:0.07359300000096322, Reward: -80
Iter: 75300, Epsilon:0.07289300000096391, Reward: -20
Iter: 75400, Epsilon:0.0721930000009646, Reward: -60
Iter: 75500, Epsilon:0.07149300000096528, Reward: -20
Iter: 75600, Epsilon:0.07079300000096597, Reward: 0
Iter: 75700, Epsilon:0.07009300000096666, Reward: -40
Iter: 75800, Epsilon:0.06939300000096735, Reward: 0
Iter: 75900, Epsilon:0.06869300000096804, Reward: -40
Iter: 76000, Epsilon:0.06799300000096872, Reward: -80
Iter: 76100, Epsilon:0.06729300000096941, Reward: 0
Iter: 76200, Epsilon:0.0665930000009701, Reward: -20
Iter: 76300, Epsilon:0.06589300000097079, Reward: -20
Iter: 76400, Epsilon:0.06519300000097147, Reward: -60
Iter: 76500, Epsilon:0.06449300000097216, Reward: -60
Iter: 76600, Epsilon:0.06379300000097285, Reward: -60
Iter: 76700, Epsilon:0.06309300000097354, Reward: -40
Iter: 76800, Epsilon:0.062393000000974114, Reward: 0
iter=  76896
Goal found!
Iter: 76900, Epsilon:0.06169300000097411, Reward: 1000
Iter: 77000

Iter: 93100, Epsilon:0.01, Reward: -40
Iter: 93200, Epsilon:0.01, Reward: 0
Iter: 93300, Epsilon:0.01, Reward: 0
Iter: 93400, Epsilon:0.01, Reward: -40
Iter: 93500, Epsilon:0.01, Reward: -20
Iter: 93600, Epsilon:0.01, Reward: 0
Iter: 93700, Epsilon:0.01, Reward: -60
Iter: 93800, Epsilon:0.01, Reward: 0
Iter: 93900, Epsilon:0.01, Reward: 0
Iter: 94000, Epsilon:0.01, Reward: 0
Iter: 94100, Epsilon:0.01, Reward: -20
Iter: 94200, Epsilon:0.01, Reward: 0
Iter: 94300, Epsilon:0.01, Reward: 0
Iter: 94400, Epsilon:0.01, Reward: 0
Iter: 94500, Epsilon:0.01, Reward: 0
Iter: 94600, Epsilon:0.01, Reward: 0
Iter: 94700, Epsilon:0.01, Reward: 0
Iter: 94800, Epsilon:0.01, Reward: 0
Iter: 94900, Epsilon:0.01, Reward: 0
Iter: 95000, Epsilon:0.01, Reward: 0
Iter: 95100, Epsilon:0.01, Reward: 0
Iter: 95200, Epsilon:0.01, Reward: 0
Iter: 95300, Epsilon:0.01, Reward: 0
Iter: 95400, Epsilon:0.01, Reward: 0
Iter: 95500, Epsilon:0.01, Reward: 0
Iter: 95600, Epsilon:0.01, Reward: 0
Iter: 95700, Epsilon:0.01, R

In [12]:
# Same schedule again: epsilon decay of 7e-6 per learning step, exploration
# restarted at 0.6, 100000 more training steps on the same agents.
dqn1.eps_dec = dqn2.eps_dec = 7e-6
dqn1.epsilon = dqn2.epsilon = 0.6
train(100000, env, dqn1, dqn2)

Iter: 0, Epsilon:0.599993, Reward: 0
Iter: 100, Epsilon:0.5992930000000021, Reward: -240
Iter: 200, Epsilon:0.5985930000000041, Reward: -260
Iter: 300, Epsilon:0.5978930000000062, Reward: -560
Iter: 400, Epsilon:0.5971930000000083, Reward: -260
Iter: 500, Epsilon:0.5964930000000104, Reward: -260
Iter: 600, Epsilon:0.5957930000000125, Reward: -440
Iter: 700, Epsilon:0.5950930000000145, Reward: -360
Iter: 800, Epsilon:0.5943930000000166, Reward: -280
Iter: 900, Epsilon:0.5936930000000187, Reward: -340
Iter: 1000, Epsilon:0.5929930000000208, Reward: -220
Iter: 1100, Epsilon:0.5922930000000228, Reward: -300
Iter: 1200, Epsilon:0.5915930000000249, Reward: -280
Iter: 1300, Epsilon:0.590893000000027, Reward: -220
Iter: 1400, Epsilon:0.590193000000029, Reward: -360
Iter: 1500, Epsilon:0.5894930000000311, Reward: -400
Iter: 1600, Epsilon:0.5887930000000332, Reward: -480
Iter: 1700, Epsilon:0.5880930000000353, Reward: -260
Iter: 1800, Epsilon:0.5873930000000374, Reward: -480
Iter: 1900, Epsilon:

Iter: 15300, Epsilon:0.49289300000031755, Reward: -300
Iter: 15400, Epsilon:0.4921930000003196, Reward: -320
Iter: 15500, Epsilon:0.4914930000003217, Reward: -240
Iter: 15600, Epsilon:0.4907930000003238, Reward: -320
Iter: 15700, Epsilon:0.49009300000032585, Reward: -280
Iter: 15800, Epsilon:0.4893930000003279, Reward: -220
Iter: 15900, Epsilon:0.48869300000033, Reward: -180
Iter: 16000, Epsilon:0.4879930000003321, Reward: -340
Iter: 16100, Epsilon:0.48729300000033415, Reward: -300
Iter: 16200, Epsilon:0.48659300000033623, Reward: -300
Iter: 16300, Epsilon:0.4858930000003383, Reward: -460
Iter: 16400, Epsilon:0.4851930000003404, Reward: -360
Iter: 16500, Epsilon:0.48449300000034246, Reward: -420
Iter: 16600, Epsilon:0.48379300000034453, Reward: -220
Iter: 16700, Epsilon:0.4830930000003466, Reward: -180
Iter: 16800, Epsilon:0.4823930000003487, Reward: -360
Iter: 16900, Epsilon:0.48169300000035076, Reward: -140
Iter: 17000, Epsilon:0.48099300000035283, Reward: -200
Iter: 17100, Epsilon:0

Iter: 30300, Epsilon:0.3878930000006289, Reward: -180
Iter: 30400, Epsilon:0.38719300000063095, Reward: -140
Iter: 30500, Epsilon:0.386493000000633, Reward: -180
Iter: 30600, Epsilon:0.3857930000006351, Reward: -160
Iter: 30700, Epsilon:0.3850930000006372, Reward: -300
Iter: 30800, Epsilon:0.38439300000063925, Reward: -200
Iter: 30900, Epsilon:0.3836930000006413, Reward: -180
Iter: 31000, Epsilon:0.3829930000006434, Reward: -220
Iter: 31100, Epsilon:0.3822930000006455, Reward: -260
Iter: 31200, Epsilon:0.38159300000064755, Reward: -260
Iter: 31300, Epsilon:0.38089300000064963, Reward: -80
Iter: 31400, Epsilon:0.3801930000006517, Reward: -280
Iter: 31500, Epsilon:0.3794930000006538, Reward: -80
Iter: 31600, Epsilon:0.37879300000065586, Reward: -180
Iter: 31700, Epsilon:0.37809300000065793, Reward: -100
Iter: 31800, Epsilon:0.37739300000066, Reward: -160
Iter: 31900, Epsilon:0.3766930000006621, Reward: -160
Iter: 32000, Epsilon:0.37599300000066416, Reward: -180
Iter: 32100, Epsilon:0.375

Iter: 45200, Epsilon:0.2835930000009381, Reward: -160
Iter: 45300, Epsilon:0.2828930000009402, Reward: -240
Iter: 45400, Epsilon:0.2821930000009423, Reward: -260
Iter: 45500, Epsilon:0.28149300000094435, Reward: -240
Iter: 45600, Epsilon:0.2807930000009464, Reward: -60
Iter: 45700, Epsilon:0.2800930000009485, Reward: -60
Iter: 45800, Epsilon:0.2793930000009506, Reward: -120
Iter: 45900, Epsilon:0.27869300000095265, Reward: -160
Iter: 46000, Epsilon:0.2779930000009547, Reward: -240
Iter: 46100, Epsilon:0.2772930000009568, Reward: -240
Iter: 46200, Epsilon:0.2765930000009589, Reward: -180
Iter: 46300, Epsilon:0.27589300000096095, Reward: -200
iter=  46310
Goal found!
Iter: 46400, Epsilon:0.27519300000096303, Reward: 780
iter=  46441
Goal found!
iter=  46488
Goal found!
Iter: 46500, Epsilon:0.2744930000009651, Reward: 1960
Iter: 46600, Epsilon:0.2737930000009672, Reward: -140
Iter: 46700, Epsilon:0.27309300000096925, Reward: -160
Iter: 46800, Epsilon:0.27239300000097133, Reward: -40
Iter:

KeyboardInterrupt: 