In [None]:
import gym
import numpy as np
import torch as th
import torch.nn as nn
import copy
from random import sample
import matplotlib.pyplot as plt


def rollout(e, q, eps=0, T=2000):
    """Run a single episode in environment ``e`` using the policy of ``q``.

    Args:
        e: Environment exposing ``reset()`` and a ``step(u)`` that returns the
            4-tuple ``(next_state, reward, done, info)``.
        q: Object exposing ``control(state_tensor, eps=...)`` which returns the
            chosen action as a tensor.
        eps: Exploration rate forwarded unchanged to ``q.control``.
        T: Maximum number of environment steps before the episode is cut off.

    Returns:
        ``(traj, reward)`` where ``traj`` is a list with one dict per step
        (keys ``x``, ``xp``, ``r``, ``u``, ``d``, ``info``) and ``reward`` is
        the plain (undiscounted) sum of the step rewards.
    """
    state = e.reset()
    transitions = []
    total = 0
    for _ in range(T):
        # The policy expects a float tensor with a leading batch axis.
        obs = th.from_numpy(state).float().unsqueeze(0)
        u = q.control(obs, eps=eps).int().numpy().squeeze()

        # Advance the environment and record the transition.
        nxt, r, done, info = e.step(u)
        transitions.append(dict(x=state, xp=nxt, r=r, u=u, d=done, info=info))
        total = total + r

        state = nxt
        if done:
            break
    return transitions, total


class q_t(nn.Module):
    """Convolutional Q-network mapping an image observation to one Q-value per action.

    The input is reshaped to ``(batch, 3, 160, 210)``, passed through three
    3x3 convolutions with batch normalisation (two of them followed by 2x2
    max-pooling) and flattened. The flattened size is 1 channel x 40 x 52 =
    2080 features, which a single linear layer maps to ``udim`` Q-values.

    NOTE(review): ``forward`` only reinterprets the input memory as
    ``(3, 160, 210)``; it does not transpose axes. If observations arrive as
    height x width x channel frames (presumably 210 x 160 x 3 -- confirm), the
    channel and spatial axes are interleaved rather than permuted. This is kept
    as-is so that existing checkpoints keep computing the same function.
    """

    def __init__(s, xdim=3, udim=1):
        """Build the network.

        Args:
            xdim: Accepted for call-site compatibility; not used, because the
                convolution stack hard-codes 3 input channels.
            udim: Number of discrete actions (size of the output layer).
        """
        super().__init__()
        s.m = nn.Sequential(
            nn.Conv2d(3, 6, kernel_size=(3, 3), padding=1),
            nn.BatchNorm2d(6),
            nn.ReLU(True),
            nn.Conv2d(6, 12, kernel_size=(3, 3), padding=1),
            # Must match the 12 channels produced by the convolution above;
            # a 6-feature batch norm here makes every forward pass raise.
            nn.BatchNorm2d(12),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(12, 1, kernel_size=(3, 3), padding=1),
            nn.BatchNorm2d(1),
            nn.MaxPool2d(kernel_size=2),
        )

        s.linear_layers = nn.Sequential(
            nn.Linear(2080, udim)
        )

    def forward(s, x):
        """Return Q-values of shape ``(batch, udim)``.

        Args:
            x: Float tensor holding one or more observations of
                3 * 160 * 210 elements each; a missing batch axis is added.
        """
        x = x.reshape(-1, 3, 160, 210)
        x = s.m(x)
        x = x.reshape(x.size(0), -1)
        return s.linear_layers(x)

    def control(s, x, eps=0):
        """Choose an action for a single observation with an eps-greedy rule.

        Args:
            x: Float tensor holding exactly one observation.
            eps: Probability of picking a uniformly random action. ``eps=0``
                always returns the greedy (arg-max) action.

        Returns:
            0-dim integer tensor with the selected action index.
        """
        # Action selection never needs gradients.
        with th.no_grad():
            q = s(x)

        # Index 1 (probability 1 - eps) means "act greedily". The explicit
        # float dtype keeps integer eps values such as 0 or 1 valid.
        coin = th.distributions.categorical.Categorical(
            th.tensor([eps, 1 - eps], dtype=th.float32)
        )
        if coin.sample():
            return th.argmax(q)

        n_actions = q.size(-1)
        uniform = th.distributions.categorical.Categorical(
            th.ones(n_actions) / n_actions
        )
        return uniform.sample()


def loss(q, qc, ds, batch=25, gamma=0.9):
    """Compute the double-DQN temporal-difference loss on a random mini-batch.

    One transition is drawn at random from each of up to ``batch`` randomly
    chosen trajectories. For every transition the next action is selected with
    the online network ``q`` and evaluated with the target network ``qc``
    (double-Q trick). The target is treated as a constant, so gradients flow
    only through ``q(x)[u]``.

    Args:
        q: Online Q-network; ``q(state)`` returns a ``(1, n_actions)`` tensor.
        qc: Target (delayed) Q-network with the same call convention.
        ds: List of trajectories, each a non-empty list of dicts with keys
            ``x``, ``xp``, ``r``, ``u`` and ``d`` as produced by ``rollout``.
        batch: Number of trajectories to sample; clipped to ``len(ds)``.
        gamma: Discount factor applied to the bootstrapped value.

    Returns:
        Scalar tensor: the mean squared TD error over the mini-batch.

    Raises:
        ValueError: If ``ds`` is empty.
    """
    if not ds:
        raise ValueError("cannot sample a mini-batch from an empty dataset")

    # Sampling without replacement needs at most len(ds) items.
    n = min(batch, len(ds))
    f = 0
    for traj in sample(ds, n):
        step = traj[sample(range(len(traj)), 1)[0]]

        x = th.from_numpy(step['x']).float()
        xp = th.from_numpy(step['xp']).float()
        r = float(step['r'])
        done = float(step['d'])
        u = int(step['u'])

        with th.no_grad():
            # Select the next action with the online network ...
            up = th.argmax(q(xp))
            # ... but evaluate it with the delayed target network.
            q_targ = qc(xp)[0][up]
            # Terminal transitions do not bootstrap.
            target = r + (1.0 - done) * gamma * q_targ

        # Q-value of the state-action pair that was actually taken.
        q_now = q(x)[0][u]

        # Squared error; a Huber penalty would be a more robust alternative.
        f = f + (q_now - target) ** 2 / n
    return f


def evaluate(q, eps, n_episodes=10):
    """Estimate the average episode return of ``q`` on a fresh environment.

    A new ``Boxing-v0`` environment is created, ``n_episodes`` episodes of at
    most 2000 steps are rolled out with ``rollout``, and the environment is
    closed again even if a rollout raises.

    Args:
        q: Q-network whose ``control`` method selects the actions.
        eps: Exploration rate forwarded to ``rollout``; pass ``0`` to evaluate
            the greedy policy without epsilon-greedy exploration.
        n_episodes: Number of evaluation episodes to average over.

    Returns:
        Mean of the undiscounted episode returns.

    Raises:
        ValueError: If ``n_episodes`` is smaller than 1.
    """
    if n_episodes < 1:
        raise ValueError("n_episodes must be at least 1")

    env = gym.make('Boxing-v0')
    try:
        total = 0
        for _ in range(n_episodes):
            _, episode_return = rollout(env, q, eps=eps, T=2000)
            total = total + episode_return
    finally:
        # This function is called repeatedly during training; release the
        # emulator instead of leaking one environment per call.
        env.close()

    return total / n_episodes

In [None]:
# Colab-only cell: mount Google Drive at /content/drive in the notebook runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# IPython shell escapes: install pinned gym / atari_py releases.
# NOTE(review): rollout() unpacks four values from e.step(); confirm any
# version bump keeps that step API.
!pip install gym==0.19.0
!pip install atari_py==0.2.6

In [None]:
if __name__ == '__main__':
    # Training script: warm up a replay dataset, then alternate between
    # collecting one trajectory, taking one gradient step on the DQN loss and
    # soft-updating the target network. Evaluation scores are logged, and the
    # networks checkpointed, after every block of `num_iteration` updates.
    import os  # local import: only this script section needs it

    # Create environment
    e = gym.make('Boxing-v0')
    xdim, udim = e.observation_space.shape[0], e.action_space.n

    # Create q network: resume from a checkpoint when one exists, otherwise
    # start from a freshly initialised network.
    # NOTE: th.load unpickles whole objects -- only load files you created.
    q_path = '/content/dqnS1.pth'
    qc_path = '/content/dqnSc1.pth'
    if os.path.exists(q_path):
        q = th.load(q_path)
    else:
        print('No checkpoint at ' + q_path + ', starting from scratch')
        q = q_t(xdim, udim)
    optim = th.optim.Adam(q.parameters(), lr=1e-3, weight_decay=1e-4)

    # Dataset of trajectories
    ds = []

    num_iteration = 120
    cumulative_reward = 0
    # Warm-up: collect trajectories with a decreasing amount of exploration,
    # given as (number of trajectories, eps).
    for n_traj, warmup_eps in ((20, 1), (20, 0.9), (50, 0.7)):
        for _ in range(n_traj):
            traj0, rwd0 = rollout(e, q, eps=warmup_eps, T=2000)
            ds.append(traj0)
            cumulative_reward += rwd0

    # Target network: resume it as well, or start as a copy of q.
    if os.path.exists(qc_path):
        qc = th.load(qc_path)
    else:
        qc = copy.deepcopy(q)

    alpha = 0.05

    num_rec = 20
    evaluation_t = np.zeros(num_rec)
    evaluation = np.zeros(num_rec)
    eps_value = 0.7

    try:
        for mm in range(num_rec - 1):
            for i in range(num_iteration):

                q.train()
                t, rwd = rollout(e, q, eps=eps_value)
                if mm < 8:
                    # Grow the dataset during the first 8 logging rounds ...
                    ds.append(t)
                else:
                    # ... then overwrite a random stored trajectory so the
                    # dataset size stays fixed.
                    ds[sample(range(len(ds)), 1)[0]] = t
                cumulative_reward += rwd

                # Perform weights updates on the q network
                # need to call zero grad on q function to clear the gradient buffer
                q.zero_grad()
                f = loss(q, qc, ds)
                f.backward()
                optim.step()

                # Exponential averaging for the target
                param_1 = q.state_dict()
                param_2 = qc.state_dict()
                for k in param_2:
                    param_2[k] = (1 - alpha) * param_2[k] + (alpha * param_1[k])
                qc.load_state_dict(param_2)
                if i == 1:
                    # NOTE(review): this result is never read afterwards;
                    # the call looks like a leftover smoke test.
                    test = evaluate(q, 0)

            evaluation[mm + 1] = evaluate(q, 0)
            evaluation_t[mm + 1] = evaluate(q, eps_value)
            eps_value = max(eps_value * 0.95, 0.1)
            print('Logging data to plot')
            print(evaluation)
            print(evaluation_t)
            th.save(q, './dqnS1.pth')
            th.save(qc, './dqnSc1.pth')
    finally:
        # Release the training environment even if training is interrupted.
        e.close()

    # One logged point per `num_iteration` updates.
    xaxis = np.arange(num_rec) * num_iteration
    plt.plot(xaxis, evaluation)
    plt.ylabel('evaluation value')
    plt.xlabel('iteration')
    plt.savefig('./test2.jpg')
    plt.show()
    plt.plot(xaxis, evaluation_t)
    plt.ylabel('train value')
    plt.xlabel('iteration')
    plt.savefig('./test3.jpg')
    plt.show()
    th.save(q, './dqnS.pth')
    th.save(qc, './dqnSc.pth')