In [None]:
# Install PettingZoo, the multi-agent ALE bindings, and AutoROM
# (AutoROM presumably provides the Atari ROMs the environments need — confirm).
!pip install pettingzoo
!pip install multi-agent-ale-py
!pip install autorom
# NOTE(review): this first AutoROM call runs without --accept-license; confirm it does
# not wait on an interactive prompt and that "/." is the intended install directory.
!AutoROM --install-dir /.
!AutoROM --accept-license

In [None]:
import torch.nn as nn
import torch
import copy
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# %tensorflow_version 1.x
# import tensorflow as tf

In [None]:
# Colab setup: select TensorFlow 1.x, install rendering tools and stable-baselines.
# NOTE(review): nothing else in the visible notebook uses TensorFlow or
# stable-baselines — confirm this cell is still required.
%tensorflow_version 1.x
!apt-get install ffmpeg freeglut3-dev xvfb  # For visualization
!pip install stable-baselines[mpi]==2.10.2

import tensorflow as tf


# Stop immediately when TensorFlow does not report a GPU as '/device:GPU:0'.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [None]:
!pip install pygame
import os
# The SDL video driver is set to 'dummy' before pygame is imported
# (presumably so pygame can run without a physical display — confirm).
os.environ['SDL_VIDEODRIVER']='dummy'
import pygame
# Create a 640x480 display surface under the driver selected above.
pygame.display.set_mode((640,480))

In [None]:
import random
import numpy as np
from collections import defaultdict
import dill

from pettingzoo.atari import boxing_v2

In [None]:
# Build the Boxing environment through boxing_v2.env(), reset it,
# and display its list of agent names.
env = boxing_v2.env()

env.reset()

env.agents

In [None]:
# Display the action space reported for agent 'first_0'.
env.action_space('first_0')

In [None]:
# Display the observation space reported for agent 'first_0'.
env.observation_space('first_0')

In [None]:
# def policy(observation, agent):
#     action = env.action_space(agent).sample()
#     return action

In [None]:
# env.reset()
# for agent in env.agent_iter(max_iter = 1000):
#     # print(agent)
#     observation, reward, done, info = env.last()
#     action = policy(observation, agent) if not done else None
#     env.step(action)
#     env.render() # this visualizes a single game

In [None]:
# Build the same game through boxing_v2.parallel_env(), reset it,
# and display its list of agent names.
parallel_env = boxing_v2.parallel_env()

parallel_env.reset()

parallel_env.agents

In [None]:
# observations = parallel_env.reset()
# max_cycles = 5000
# total_rew1, total_rew2 = 0, 0
# for step in range(max_cycles):
#     actions = {agent: policy(observations[agent], agent) for agent in parallel_env.agents}
#     print(actions)
#     observations, rewards, dones, infos = parallel_env.step(actions)
#     total_rew1 += rewards[0]
#     total_rew2 += rewards[1]

# print(total_rew1, "   ", total_rew2)

In [None]:
# Fetch an observation explicitly so this cell also works on a fresh kernel
# (no other active cell assigns `observation`).
observation = env.observe('first_0')
observation.shape

In [None]:
# Pick the agent explicitly instead of relying on a variable left over from
# another (now commented-out) cell.
agent = env.agents[0]
env.action_space(agent)

In [None]:
# Number of discrete actions sampled from during exploration; it matches the
# 18 outputs of BoxingAgent's final linear layer.
action_space_size = 18

In [None]:
def preprocess_observation(obs):
    """Turn one raw frame into a uint8 batch of shape (1, 3, 210, 160).

    Args:
        obs: NumPy array holding exactly 3 * 210 * 160 values.

    Returns:
        A uint8 array of shape (1, 3, 210, 160) with the same values in the
        same memory order as ``obs``.
    """
    # NOTE(review): reshape only reinterprets the buffer, it does not move axes.
    # If the environment delivers (210, 160, 3) frames, a transpose is probably
    # what was intended — confirm (loss() performs the same reshape).
    as_bytes = obs.astype(np.uint8)
    batched = as_bytes.reshape(1, 3, 210, 160)
    return batched

In [None]:

class BoxingAgent(nn.Module):
    """Small convolutional Q-network with an epsilon-greedy action selector.

    Args:
        name: Agent identifier, stored on ``name``.
        epsilon: Default exploration probability used by ``control``.
    """

    def __init__(self, name, epsilon=0.9):
        super().__init__()
        self.eps = epsilon

        # A 3x3 kernel with stride 1 and padding 1 keeps the 210x160 spatial
        # size, which is why the first linear layer takes 210 * 160 * 16 inputs.
        self.conv_1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.linear_1 = nn.Linear(210 * 160 * 16, 128)
        self.linear_2 = nn.Linear(128, 18)
        self.relu = nn.ReLU()

        self.name = name

    def forward(self, x):
        """Return one Q-value per action for every sample in the batch.

        Args:
            x: Float tensor of shape (N, 3, 210, 160).

        Returns:
            Tensor of shape (N, 18).
        """
        x = self.relu(self.conv_1(x))
        x = x.reshape((x.shape[0], -1))  # Flatten the image
        x = self.relu(self.linear_1(x))
        # The output layer has no activation: a ReLU here would clamp every
        # Q-value to be non-negative, so negative returns could not be learned.
        return self.linear_2(x)

    def control(self, x, eps=None):
        """Choose an action for a single observation with an eps-greedy rule.

        Args:
            x: Tensor holding one observation (batch of size 1).
            eps: Exploration probability for this call. ``None`` (the default)
                falls back to ``self.eps``; ``0`` always returns the greedy action.

        Returns:
            The chosen action index as a Python int.
        """
        if eps is None:
            eps = self.eps

        # Action selection never needs gradients.
        with torch.no_grad():
            q = self.forward(x.float())

        # .cpu() keeps this working if the module is moved to another device.
        q_values = q.cpu().numpy().squeeze()
        greedy_action = int(np.argmax(q_values))

        # ">=" makes eps=0 strictly greedy and eps=1 strictly random.
        if random.random() >= eps:
            return greedy_action
        # Sample from the network's own output width rather than a notebook global.
        return random.randrange(self.linear_2.out_features)

In [None]:
def rollout(env, agents, eps=0, T=200, prin = False):
    """Play one episode with two agents and record each agent's transitions.

    Args:
        env: Environment whose ``step(actions)`` returns four dictionaries
            (observations, rewards, dones, infos) keyed by agent name.
        agents: Sequence of two agents exposing ``name`` and ``control(x)``.
        eps: Not used by this function; kept for call compatibility.
            Exploration is governed by each agent's own ``eps`` attribute.
        T: Maximum number of recorded steps.
        prin: Not used by this function; kept for call compatibility.

    Returns:
        ``(traj_1, traj_2)``: one list per agent of dicts with keys ``x``
        (observation), ``xp`` (next observation), ``r`` (reward), ``u``
        (action) and ``d`` (done flag).
    """
    first, second = agents[0], agents[1]
    traj_1, traj_2 = [], []

    # Reset environment and get initial state: one fixed step (action 1 for
    # every agent) is taken to obtain the first observations.
    env.reset()
    x, _, _, _ = env.step({agent.name: 1 for agent in agents})

    for _ in range(T):
        actions = {
            agent.name: agent.control(torch.from_numpy(preprocess_observation(x[agent.name])))
            for agent in agents
        }
        obs, rwds, dones, infos = env.step(actions)

        for agent, traj in ((first, traj_1), (second, traj_2)):
            key = agent.name
            traj.append(dict(x=x[key], xp=obs[key], r=rwds[key], u=actions[key], d=dones[key]))

        x = obs

        # `dones` is a per-agent dict, so its truthiness only says "non-empty";
        # the flags themselves must be inspected to detect the end of the episode.
        if any(dones.values()):
            break

    return traj_1, traj_2

In [None]:
def loss(agent, target_agent, ds, gamma=0.999, batch_size=1):
    """Mean squared TD error over all steps of randomly sampled trajectories.

    The next action is chosen by ``agent`` and evaluated by ``target_agent``.

    Args:
        agent: Callable mapping a float tensor (N, 3, 210, 160) to Q-values (N, A).
        target_agent: Callable with the same contract, used for the bootstrap value.
        ds: Sequence of trajectories; each trajectory is a sequence of dicts with
            keys ``x``, ``xp``, ``r``, ``u`` and ``d``.
        gamma: Discount factor.
        batch_size: Number of trajectories sampled (with replacement) from ``ds``.

    Returns:
        Scalar float32 tensor.

    Raises:
        ValueError: If ``ds`` is empty or the sampled trajectories hold no steps.
    """
    sampled = np.random.choice(len(ds), batch_size)
    steps = [step for i in sampled for step in ds[i]]
    if not steps:
        # An empty batch would otherwise produce a NaN loss.
        raise ValueError("sampled trajectories contain no transitions")

    def frames(key):
        # Build the batch once instead of growing an array step by step.
        # NOTE(review): reshape does not move axes; if frames are stored as
        # (210, 160, 3) a transpose is probably intended — confirm.
        batch = np.stack([np.asarray(step[key]).reshape((3, 210, 160)) for step in steps])
        return torch.from_numpy(batch.astype(np.float32))

    s = frames('x')
    s_prime = frames('xp')
    a = torch.tensor([int(step['u']) for step in steps], dtype=torch.int64).view(-1, 1)
    # Explicit float32 keeps the target (and the loss) in the network's dtype.
    r = torch.tensor([float(step['r']) for step in steps], dtype=torch.float32).view(-1, 1)
    # 1.0 while the episode continues, 0.0 on terminal steps (no bootstrapping there).
    not_done = torch.tensor(
        [0.0 if step['d'] else 1.0 for step in steps], dtype=torch.float32
    ).view(-1, 1)

    q_pred = agent(s).gather(1, a)

    with torch.no_grad():
        best_next = agent(s_prime).argmax(dim=1, keepdim=True)
        q_next = target_agent(s_prime).gather(1, best_next)
        q_target = r + gamma * q_next * not_done

    return (q_pred - q_target).pow(2).mean()

In [None]:
def run(num_iterations=1000, horizon=40, eval_rollouts=3):
    """Train two BoxingAgents against each other and plot their rewards.

    Every iteration collects one rollout per agent, takes one gradient step per
    agent on ``loss``, softly updates the target networks every 10th iteration,
    and records the mean total reward over ``eval_rollouts`` extra rollouts.

    Args:
        num_iterations: Number of training iterations.
        horizon: Maximum number of steps per rollout.
        eval_rollouts: Number of rollouts averaged for each recorded reward.
    """

    def plot_rewards(title, rewards):
        """Plot one mean reward per training iteration."""
        plt.figure()
        plt.xlabel('training iteration')
        plt.ylabel('reward')
        plt.title(title)
        plt.plot(np.arange(len(rewards)), rewards)
        plt.show()

    # Create environment and agents
    env = boxing_v2.parallel_env()
    env.reset()
    agents = [BoxingAgent(name) for name in env.agents]

    agent_targets = [copy.deepcopy(agent) for agent in agents]
    optimizers = [torch.optim.Adam(agent.parameters(), lr=1e-3, weight_decay=1e-4) for agent in agents]

    # One list of trajectories per agent, in the same order as `agents`.
    datasets = [[], []]

    def collect(eps):
        """Roll out once and store each agent's trajectory in its dataset."""
        traj_1, traj_2 = rollout(env, agents, eps=eps, T=horizon)
        datasets[0].append(traj_1)
        datasets[1].append(traj_2)

    # Collect few random trajectories with eps=1. The agents read their own
    # `eps` attribute when acting, so the schedule is applied there.
    eps = 1.0
    for agent in agents:
        agent.eps = eps
    for _ in range(3):
        collect(eps)

    train_rewards = [[], []]  # mean reward per iteration, one list per agent

    for i in tqdm(range(num_iterations)):
        for agent in agents:
            agent.train()
            agent.eps = eps
        collect(eps)

        eps = max(eps * 0.9995, 0.01)

        for idx, (curr_agent, curr_targt, optimizer) in enumerate(zip(agents, agent_targets, optimizers)):
            # Clear the gradients of the agent that is about to be updated.
            optimizer.zero_grad()
            f = loss(curr_agent, curr_targt, datasets[idx])
            f.backward()
            optimizer.step()

            if i % 10 == 0:
                # Soft target update; no autograd graph is needed for it.
                with torch.no_grad():
                    for q_target_param, q_param in zip(curr_targt.parameters(), curr_agent.parameters()):
                        q_target_param.copy_(q_target_param * 0.95 + q_param * 0.05)

        totals = [0, 0]
        for _ in range(eval_rollouts):
            for idx, trajectory in enumerate(rollout(env, agents, eps=eps, T=horizon)):
                totals[idx] += sum(step['r'] for step in trajectory)

        for idx in range(2):
            mean_reward = totals[idx] / eval_rollouts
            train_rewards[idx].append(mean_reward)
            print(mean_reward)

    plot_rewards("Train_1", train_rewards[0])
    plot_rewards("Train_2", train_rewards[1])

In [None]:
# Start training; run() shows the reward plots once the loop has finished.
run()

# Adversarial Rollout
In this part, we can plug in agents defined in the other files of our project and pit them against each other adversarially.

In [None]:
class ImportedAgent(nn.Module):
    """Epsilon-greedy wrapper for an agent whose network is defined elsewhere.

    This class defines no layers and no ``forward``; ``control`` calls
    ``self.forward``, so a network must be supplied (presumably by a subclass
    or by editing this class — confirm) before ``control`` can be used.

    Args:
        name: Agent identifier, stored on ``name``.
        epsilon: Default exploration probability used by ``control``.
    """

    def __init__(self, name, epsilon=0.9):
        super().__init__()
        self.eps = epsilon
        self.name = name

    def control(self, x, eps=None):
        """Choose an action for a single observation with an eps-greedy rule.

        Args:
            x: Tensor holding one observation (batch of size 1).
            eps: Exploration probability for this call. ``None`` (the default)
                falls back to ``self.eps``; ``0`` always returns the greedy action.

        Returns:
            The chosen action index as a Python int.
        """
        if eps is None:
            eps = self.eps

        # Action selection never needs gradients.
        with torch.no_grad():
            q = self.forward(x.float())

        # .cpu() keeps this working if the module is moved to another device.
        q_values = q.cpu().numpy().squeeze()
        greedy_action = int(np.argmax(q_values))

        # ">=" makes eps=0 strictly greedy and eps=1 strictly random.
        if random.random() >= eps:
            return greedy_action
        return random.randrange(action_space_size)

In [None]:
# Plays two agents against each other without any training step and plots
# the total reward each of them collects per episode.

def adversarial_rollout(agents=None, num_episodes=100, horizon=40):
    """Roll two agents out against each other and plot per-episode rewards.

    Args:
        agents: Sequence of two agents exposing ``name`` and ``control(x)``.
            When omitted, two freshly initialised ``BoxingAgent`` instances are
            created so that the function runs on its own.
        num_episodes: Number of rollouts to play.
        horizon: Maximum number of steps per rollout.
    """

    def plot_rewards(title, rewards):
        """Plot one total reward per episode."""
        plt.figure()
        plt.xlabel('episode')
        plt.ylabel('reward')
        plt.title(title)
        plt.plot(np.arange(len(rewards)), rewards)
        plt.show()

    # Create environment and agents
    env = boxing_v2.parallel_env()
    env.reset()

    if agents is None:
        # To evaluate trained agents instead, pass them in, e.g.:
        # agent_1 = ImportedAgent('first_0', 0.9)
        # agent_1.load_state_dict(torch.load('/content/drive/MyDrive/ESE650 Final Project 4.29/dqnS.pth'))
        # agent_2 = ImportedAgent('second_0', 0.9)
        # agent_2.load_state_dict(torch.load('/content/drive/MyDrive/ESE650 Final Project 4.29/dqnS.pth'))
        # adversarial_rollout([agent_1, agent_2])
        agents = [BoxingAgent(name) for name in env.agents]

    train_rewards_1 = []
    train_rewards_2 = []

    for _ in range(num_episodes):
        trajectory_1, trajectory_2 = rollout(env, agents, eps=0.9, T=horizon)
        # Record exactly one total per episode and agent.
        train_rewards_1.append(sum(step['r'] for step in trajectory_1))
        train_rewards_2.append(sum(step['r'] for step in trajectory_2))

    plot_rewards("Train_1", train_rewards_1)
    plot_rewards("Train_2", train_rewards_2)

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Play the agents against each other and show the reward plots.
adversarial_rollout()