# Actor Critic Method

## Set up the environment

In [None]:
# GFootball environment.
!pip install kaggle_environments
!apt-get update -y
!apt-get install -y libsdl2-gfx-dev libsdl2-ttf-dev
!git clone -b v2.3 https://github.com/google-research/football.git
!mkdir -p football/third_party/gfootball_engine/lib
!wget https://storage.googleapis.com/gfootball/prebuilt_gameplayfootball_v2.3.so -O football/third_party/gfootball_engine/lib/prebuilt_gameplayfootball.so
!cd football && GFOOTBALL_USE_PREBUILT_SO=1 pip3 install .

# Some helper code
!git clone https://github.com/garethjns/kaggle-football.git
!pip install reinforcement_learning_keras==0.6.0

In [None]:
from __future__ import division
from __future__ import print_function
import collections
from typing import Union, Callable, List, Tuple, Iterable, Any, Dict
from dataclasses import dataclass
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
from tensorflow import keras
import tensorflow as tf
import seaborn as sns
import gym
import gfootball
import glob 
import imageio
import pathlib
import zlib
import pickle
import tempfile
import os
import sys
from IPython.display import Image, display
from gfootball.env import observation_preprocessing
sns.set()

# In TF > 2, training keras models in a loop with eager execution on causes memory leaks and terrible performance.
tf.compat.v1.disable_eager_execution()

sys.path.append("/kaggle/working/kaggle-football/")

import itertools as it
from random import sample, randint, random
from time import time, sleep
import numpy as np
import skimage.color, skimage.transform
import tensorflow as tf
from tqdm import trange
from argparse import ArgumentParser

In [None]:
import random                # Handling random number generation
import time                  # Handling time calculation
from skimage import transform# Help us to preprocess the frames

from collections import deque# Ordered collection with ends
from collections import namedtuple
import torch
import torch.optim as optim
import numpy as np
stacked_size = 3

class DataPreprocess():
    """Normalise observation frames and keep a rolling stack of the latest ones.

    Every frame is rescaled and resized to ``[4, 72, 96]``. The most recent
    ``stack_size`` frames are concatenated along axis 0, so the stacked state
    has ``4 * stack_size`` planes.
    """

    def __init__(self , stack_size ):
        """Create the processor.

        Params
        ======
            stack_size (int): number of consecutive frames kept in the stack
        """
        self.stack_size = stack_size
        self.stacked_frames = self._blank_stack()

    def _blank_stack(self):
        """Return a deque of ``stack_size`` all-zero frames, bounded to ``stack_size``."""
        # Size the deque from the instance, not from a module-level global, so
        # the class works wherever it is instantiated.
        return deque(
            [np.zeros([4, 72, 96], dtype=int) for _ in range(self.stack_size)],
            maxlen=self.stack_size,
        )

    def preprocess(self , frame):
        """Scale a raw frame by 1/255 and resize it to ``[4, 72, 96]``.

        NOTE(review): ``transform.resize`` interpolates to the target shape; it
        does not permute axes. If frames arrive channels-last (72, 96, 4) a
        transpose may be what is intended -- confirm against the environment.
        """
        frame = frame / 255.0
        frame = transform.resize(frame ,[4,72,96])

        return frame

    def reset(self ):
        """Replace the frame stack with all-zero frames."""
        self.stacked_frames = self._blank_stack()

    def stack_frames(self , frame , new_episode=False):
        """Preprocess ``frame``, push it onto the stack and return the stacked state.

        Params
        ======
            frame: raw observation frame
            new_episode (bool): when True the stack is cleared and filled
                entirely with the new frame

        Returns the frames in the stack concatenated along axis 0.
        """
        processed_frame = self.preprocess(frame)
        if new_episode:
            # First step of an episode: every slot holds the same frame.
            self.reset()
            for _ in range(self.stack_size):
                self.stacked_frames.append(processed_frame)
        else:
            # The bounded deque drops the oldest frame automatically.
            self.stacked_frames.append(processed_frame)

        return np.concatenate(self.stacked_frames , axis=0)

In [None]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

def hidden_init(layer):
    """Return uniform-initialisation bounds ``(-1/sqrt(fan_in), 1/sqrt(fan_in))``.

    Params
    ======
        layer: a module exposing a ``weight`` tensor (e.g. ``nn.Linear``, ``nn.Conv2d``)

    Returns a ``(low, high)`` tuple.
    """
    # PyTorch stores weights as (out, in, ...), so the fan-in is the number of
    # weights feeding a single output unit -- not size()[0], which is fan-out.
    fan_in = layer.weight.data[0].numel()
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

class Actor(nn.Module):
    """Actor (Policy) Model.

    Four stride-2 convolutions followed by two fully connected layers; the
    output is one raw score per action.
    """

    def __init__(self,stack_size , action_size , seed):
        """Build the network.

        Params
        ======
            stack_size (int): number of input planes of the stacked state
            action_size (int): number of output scores
            seed (int): random seed passed to ``torch.manual_seed``
        """
        super(Actor, self).__init__()
        self.in_size = stack_size
        self.out_size = action_size
        self.seed = torch.manual_seed(seed)
        self.conv_block1 = nn.Conv2d(self.in_size , 32 ,    kernel_size=3 , stride=2 , padding=1 , bias=False)
        self.conv_block2 = nn.Conv2d(32 , 128 ,  kernel_size=3 , stride=2 , padding=1 , bias=False)
        self.conv_block3 = nn.Conv2d(128 , 512 , kernel_size=3 , stride=2 , padding=1 , bias=False)
        self.conv_block4 = nn.Conv2d(512 , 1024 ,kernel_size=3 , stride=2 , padding=1 , bias=False)
        # A 72x96 input is halved four times: 72->36->18->9->5 and 96->48->24->12->6.
        self.flatten_size = 1024*5*6
        self.fc1 = nn.Linear(self.flatten_size , 256)
        self.fc_bn = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256 , self.out_size)

    def forward(self, x):
        """Map a batch of stacked states to a batch of per-action scores."""
        # The convolutions are applied back to back, without activations in between.
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = self.conv_block4(x)
        x = x.view(-1,self.flatten_size)
        # F.dropout defaults to training=True; tie it to the module mode so that
        # .eval() really disables dropout and inference is deterministic.
        x = F.dropout(F.relu(self.fc_bn(self.fc1(x))), p=0.4, training=self.training)
        # model output
        x = self.fc2(x)

        return x

class Critic(nn.Module):
    """Critic (Value) Model."""

    def __init__(self, state_size, action_size, seed, fcs1_units=400, fc2_units=300):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): number of input planes of the stacked state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fcs1_units (int): Number of nodes in the first hidden layer
            fc2_units (int): Number of nodes in the second hidden layer
        """
        super(Critic, self).__init__()

        self.in_size = state_size
        self.out_size = action_size
        self.seed = torch.manual_seed(seed)

        self.conv_block1 = nn.Conv2d(self.in_size ,  32 ,  kernel_size=3 , stride=2 , padding=1 , bias=False)
        self.conv_block2 = nn.Conv2d(32 , 64 ,  kernel_size=3 , stride=2 , padding=1 , bias=False)
        self.conv_block3 = nn.Conv2d(64 , 128 , kernel_size=3 , stride=2 , padding=1 , bias=False)

        # A 72x96 input is halved three times: 72->36->18->9 and 96->48->24->12.
        self.flatten_size = 128*9*12
        self.fc1 = nn.Linear(self.flatten_size , fcs1_units)

        # The action vector is concatenated to the state features before fc2.
        self.fc2 = nn.Linear(fcs1_units+action_size, fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)

    def forward(self, state, action):
        """Build a critic (value) network that maps (state, action) pairs -> Q-values.

        Params
        ======
            state: batch of stacked states
            action: batch of action vectors with ``action_size`` columns

        Returns a tensor with one Q-value per row.
        """
        # Start from the ``state`` argument (the original read an undefined local).
        x = self.conv_block1(state)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = x.view(-1,self.flatten_size)
        # Respect train/eval mode; F.dropout is always active by default.
        xs = F.dropout(F.relu(self.fc1(x)), p=0.4, training=self.training)

        x = torch.cat((xs, action), dim=1)
        x = F.relu(self.fc2(x))

        return self.fc3(x)


In [None]:
import numpy as np
import random
import copy
from collections import namedtuple, deque

import torch
import torch.nn.functional as F
import torch.optim as optim

BUFFER_SIZE = int(1e5)  # replay buffer size
BATCH_SIZE = 128        # minibatch size
GAMMA = 0.99            # discount factor
TAU = 1e-3              # for soft update of target parameters
LR_ACTOR = 1e-4         # learning rate of the actor 
LR_CRITIC = 1e-3        # learning rate of the critic
WEIGHT_DECAY = 0        # L2 weight decay

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")



class Agent():
    """Interacts with and learns from the environment.

    Holds local and target copies of the actor and critic networks, an
    exploration-noise process and a replay buffer.
    """

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so keep the seed value itself.
        self.seed = random_seed
        random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory. Keywords are used because ReplayBuffer takes
        # batch_size before buffer_size; passing them positionally in the
        # other order caps the buffer at BATCH_SIZE entries, so the
        # "len > BATCH_SIZE" check in step() could never succeed.
        self.memory = ReplayBuffer(buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE, seed=random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Return the index of the highest-scoring action for ``state``.

        Params
        ======
            state (np.ndarray): a single stacked state (no batch dimension)
            add_noise (bool): add exploration noise to the scores before the argmax
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate in eval mode, then restore train mode for learning.
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()

        return np.argmax(action)

    def reset(self):
        """Reset the exploration-noise process."""
        self.noise.reset()

    def _actions_for_critic(self, actions):
        """Return ``actions`` as a float tensor with ``action_size`` columns.

        Batches that already have ``action_size`` columns are only cast to
        float; anything else is treated as discrete action indices and
        one-hot encoded.
        """
        if actions.dim() == 2 and actions.size(1) == self.action_size:
            return actions.float()
        return F.one_hot(actions.long().view(-1), num_classes=self.action_size).float()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # act() returns action indices, but the critic consumes an
        # action_size-wide vector, so expand the stored indices first.
        actions = self._actions_for_critic(actions)

        # ---------------------------- update critic ---------------------------- #
        # The targets are constants for the critic loss; no graph is needed
        # through the target networks.
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

class OUNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """Initialize parameters and noise process.

        Params
        ======
            size (int): number of noise components
            seed (int): seed applied to the ``random`` module
            mu (float): long-run mean of every component
            theta (float): strength of the pull towards ``mu``
            sigma (float): scale of the random increment
        """
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        # random.seed() returns None, so keep the seed value itself.
        self.seed = seed
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        # Zero-mean Gaussian increments: uniform [0, 1) draws have mean 0.5 and
        # would push the state steadily above mu.
        increments = np.array([random.gauss(0.0, 1.0) for _ in range(len(x))])
        dx = self.theta * (self.mu - x) + self.sigma * increments
        self.state = x + dx
        return self.state


class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random mini-batch sampling.

    Note the positional order of the constructor: ``batch_size`` comes before
    ``buffer_size``.
    """

    def __init__(self , batch_size , buffer_size , seed):
        """Create an empty buffer.

        Params
        ======
            batch_size (int): number of transitions returned by ``sample``
            buffer_size (int): maximum number of stored transitions
            seed (int): seed applied to the ``random`` module
        """
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.seed = seed
        random.seed(self.seed)
        # Once full, the deque discards its oldest entry on every append.
        self.memory = deque(maxlen=self.buffer_size)
        self.experience = namedtuple('Experience' , field_names=['state','action','reward','next_state','done'])

    def add(self , state , action , reward , next_state , done):
        """Append one transition to the buffer."""
        self.memory.append(self.experience(state , action , reward , next_state , done))

    def sample(self):
        """Draw ``batch_size`` transitions and return them as batched tensors.

        Returns ``(states, action, reward, next_state, done)``. States are
        reshaped to (8, 72, 96) per entry; actions are long tensors and the
        remaining fields are float tensors with one column.
        """
        drawn = random.sample(self.memory , k=self.batch_size)
        batch = [entry for entry in drawn if entry is not None]

        def to_tensor(rows):
            # Stack per-transition arrays along a new leading batch axis.
            return torch.from_numpy(np.stack(rows , axis=0))

        states = to_tensor([entry.state.reshape(8,72,96) for entry in batch]).float().to(device)
        action = to_tensor([(entry.action).reshape(1,) for entry in batch]).long().to(device)
        reward = to_tensor([np.array(entry.reward).reshape(1,) for entry in batch]).float().to(device)
        next_state = to_tensor([entry.next_state.reshape(8,72,96) for entry in batch]).float().to(device)

        done_flags = np.stack([np.array(entry.done).reshape(1,) for entry in batch], axis=0).astype(np.uint8)
        done = torch.from_numpy(done_flags).float().to(device)

        return ( states , action , reward , next_state , done )

    def __len__(self):
        """Return the number of stored transitions."""
        return len(self.memory)

In [None]:
# Gym environment used by the training loop; the id is resolved by gym.make
# (presumably registered when gfootball is imported -- confirm).
football_env = gym.make("GFootball-11_vs_11_kaggle-SMM-v0")

In [None]:
# state_size=8 matches two stacked frames of 4 planes each (stack_size=2 below);
# action_size=19 is the width of the actor output.
agent = Agent(state_size=8, action_size=19, random_seed=10)
# Frame stacker that turns raw observations into the 8-plane network input.
data_processor = DataPreprocess(stack_size=2)

In [None]:
def ddpg(n_episodes=200, max_t=1000):
    """Train the module-level ``agent`` on ``football_env`` and return the episode scores.

    Params
    ======
        n_episodes (int): number of episodes to run
        max_t (int): maximum number of environment steps per episode

    Returns a list with the summed reward of every episode. Actor and critic
    weights are written to ``checkpoint_actor.pth`` / ``checkpoint_critic.pth``
    every 100 episodes.
    """
    scores_deque = deque(maxlen=100)  # rolling window for the average score
    scores = []
    for i_episode in range(1, n_episodes+1):
        state = football_env.reset()
        # Remember that stack frame function also call our preprocess function.
        state  = data_processor.stack_frames(state , new_episode=True)
        agent.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state)
            next_state, reward, done, _ = football_env.step(action)
            # prepare the next stacked state
            next_state = data_processor.stack_frames( next_state, False)
            # store the transition and let the agent learn from it
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_deque.append(score)
        scores.append(score)
        average_score = np.mean(scores_deque)
        # '\r' rewrites the same console line on every episode.
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(i_episode, average_score, score), end="")
        if i_episode % 100 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score))
    return scores

# Run training with the default settings and keep the per-episode scores.
scores = ddpg()

# Plot the score obtained in each episode.
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

In [None]:
# Persist the final weights. Each network gets its own file: writing the critic
# to the actor's path would overwrite the actor checkpoint that main.py loads.
torch.save(agent.actor_local.state_dict() , "checkpoint_actor.pth")
torch.save(agent.critic_local.state_dict() , "checkpoint_critic.pth")

In [None]:
%%writefile main.py

#from kaggle_environments.envs.football.helpers import *

# @human_readable_agent wrapper modifies raw observations 
# provided by the environment:
# https://github.com/google-research/football/blob/master/gfootball/doc/observation.md#raw-observations
# into a form easier to work with by humans.
# Following modifications are applied:
# - Action, PlayerRole and GameMode enums are introduced.
# - 'sticky_actions' are turned into a set of active actions (Action enum)
#    see usage example below.
# - 'game_mode' is turned into GameMode enum.
# - 'designated' field is removed, as it always equals to 'active'
#    when a single player is controlled on the team.
# - 'left_team_roles'/'right_team_roles' are turned into PlayerRole enums.
# - Action enum is to be returned by the agent function.


import collections
import pickle
import zlib
from typing import Tuple, Dict, Any, Union, Callable, List

import torch.nn as nn
import torch.nn.functional as F
import torch
import torch.optim as optim

import gym
import numpy as np
from gfootball.env import observation_preprocessing

import random                # Handling random number generation
import time                  # Handling time calculation
from skimage import transform# Help us to preprocess the frames

from collections import deque# Ordered collection with ends
from collections import namedtuple


class Data_Processing():
    """Normalise observation frames and keep a rolling stack of the latest ones.

    Every frame is rescaled and resized to ``[4, 72, 96]``. The most recent
    ``stack_size`` frames are concatenated along axis 0, so the stacked state
    has ``4 * stack_size`` planes.
    """

    def __init__(self , stack_size ):
        """Create the processor.

        Params
        ======
            stack_size (int): number of consecutive frames kept in the stack
        """
        self.stack_size = stack_size
        self.stacked_frames = self._blank_stack()

    def _blank_stack(self):
        """Return a deque of ``stack_size`` all-zero frames, bounded to ``stack_size``."""
        # Size the deque from the instance: this file defines no module-level
        # ``stacked_size``, so reading one raises NameError.
        return deque(
            [np.zeros([4, 72, 96], dtype=int) for _ in range(self.stack_size)],
            maxlen=self.stack_size,
        )

    def preprocess(self , frame):
        """Scale a raw frame by 1/255 and resize it to ``[4, 72, 96]``.

        NOTE(review): ``transform.resize`` interpolates to the target shape; it
        does not permute axes. If frames arrive channels-last (72, 96, 4) a
        transpose may be what is intended -- confirm against the environment.
        """
        frame = frame / 255.0
        frame = transform.resize(frame ,[4,72,96])

        return frame

    def reset(self ):
        """Replace the frame stack with all-zero frames."""
        self.stacked_frames = self._blank_stack()

    def stack_frames(self , frame , new_episode=False):
        """Preprocess ``frame``, push it onto the stack and return the stacked state.

        Params
        ======
            frame: raw observation frame
            new_episode (bool): when True the stack is cleared and filled
                entirely with the new frame

        Returns the frames in the stack concatenated along axis 0.
        """
        processed_frame = self.preprocess(frame)
        if new_episode:
            # First step of an episode: every slot holds the same frame.
            self.reset()
            for _ in range(self.stack_size):
                self.stacked_frames.append(processed_frame)
        else:
            # The bounded deque drops the oldest frame automatically.
            self.stacked_frames.append(processed_frame)

        return np.concatenate(self.stacked_frames , axis=0)
    

class Actor(nn.Module):
    """Actor (Policy) Model.

    Four stride-2 convolutions followed by two fully connected layers; the
    output is one raw score per action.
    """

    def __init__(self,stack_size , action_size , seed):
        """Build the network.

        Params
        ======
            stack_size (int): number of input planes of the stacked state
            action_size (int): number of output scores
            seed (int): random seed passed to ``torch.manual_seed``
        """
        super(Actor, self).__init__()
        self.in_size = stack_size
        self.out_size = action_size
        self.seed = torch.manual_seed(seed)
        self.conv_block1 = nn.Conv2d(self.in_size , 32 ,    kernel_size=3 , stride=2 , padding=1 , bias=False)
        self.conv_block2 = nn.Conv2d(32 , 128 ,  kernel_size=3 , stride=2 , padding=1 , bias=False)
        self.conv_block3 = nn.Conv2d(128 , 512 , kernel_size=3 , stride=2 , padding=1 , bias=False)
        self.conv_block4 = nn.Conv2d(512 , 1024 ,kernel_size=3 , stride=2 , padding=1 , bias=False)
        # A 72x96 input is halved four times: 72->36->18->9->5 and 96->48->24->12->6.
        self.flatten_size = 1024*5*6
        self.fc1 = nn.Linear(self.flatten_size , 256)
        self.fc_bn = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256 , self.out_size)

    def forward(self, x):
        """Map a batch of stacked states to a batch of per-action scores."""
        # The convolutions are applied back to back, without activations in between.
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = self.conv_block4(x)
        x = x.view(-1,self.flatten_size)
        # F.dropout defaults to training=True; tie it to the module mode so the
        # eval-mode model used for submission picks actions deterministically.
        x = F.dropout(F.relu(self.fc_bn(self.fc1(x))), p=0.4, training=self.training)
        # model output
        x = self.fc2(x)

        return x
    
    
# Build the policy network with the same input/output sizes used in training
# (8 stacked planes, 19 actions) and load the trained weights on CPU.
actor = Actor(8,19,2333)
try:
    # Absolute path tried first (presumably where the Kaggle simulation
    # unpacks the submission -- confirm).
    actor.load_state_dict(torch.load("/kaggle_simulations/agent/checkpoint_actor.pth" , map_location="cpu"))

except (FileNotFoundError, ValueError):
    # Fall back to a checkpoint in the current working directory.
    actor.load_state_dict(torch.load("checkpoint_actor.pth" , map_location="cpu"))

# Inference only: switch BatchNorm (and any mode-aware layers) to eval behaviour.
actor.eval()


# Rolling two-frame stack shared across calls to agent().
data_buffer = Data_Processing(stack_size=2)

#@human_readable_agent
def agent(obs):
    """Pick an action for the current observation.

    Params
    ======
        obs: environment observation; ``obs['players_raw'][0]`` is the raw
            observation that gets converted

    Returns a one-element list holding the chosen action index as an int.
    """
    # Get the raw observations return by the environment
    obs = obs['players_raw'][0]
    # Convert these to the same output as the SMMWrapper we used in training
    obs = observation_preprocessing.generate_smm([obs]).squeeze()

    # Push the frame onto the module-level stack and get the stacked state.
    state = data_buffer.stack_frames(obs ,False)

    actor_state = torch.from_numpy(state).float().unsqueeze(0)
    # Inference only: skip autograd bookkeeping and go through Module.__call__.
    with torch.no_grad():
        actor_action = actor(actor_state)

    # Greedy choice over the per-action scores.
    action = np.argmax(actor_action.to('cpu').numpy())

    return [int(action)]

In [None]:
from typing import Tuple, Dict, List, Any

from kaggle_environments import make

# Create the kaggle_environments "football" environment for the 11_vs_11_kaggle
# scenario with debug output and video saving switched on.
env = make("football", debug=True,configuration={"save_video": True,
                                      "scenario_name": "11_vs_11_kaggle"})

# Define players
left_player = "/kaggle/working/main.py"  # A custom agent, eg. random_agent.py or example_agent.py
right_player = "run_right"  # eg. A built in 'AI' agent or the agent again


# Play one match between the two players and keep what env.run returns.
output: List[Tuple[Dict[str, Any], Dict[str, Any]]] = env.run([left_player, right_player])

#print(f"Final score: {sum([r['reward'] for r in output[0]])} : {sum([r['reward'] for r in output[1]])}")
# Render the finished match.
env.render(mode="human", width=800, height=600)

In [None]:
!tar -czvf submission.tar.gz ./main.py*  ./checkpoint_actor.pth*