# Proximal Policy Optimization

### Import the libraries

In [167]:
# Calculations
import numpy as np
from numpy import random, sqrt

# Copying
from copy import deepcopy

# Network libraries
import torch
import torch.nn.functional as F, torch.nn as nn, torch.optim as optim
from torch.distributions import Categorical as Categorical

# Game library
from TicTacToe import initialState, randomState, printState, drawState, move, possibleMoves, possibleMovesMask, isOver, gameScore

### Generalized Advantage Estimate (GAE)

The GAE is calculated using the formula:

$ \begin{equation*}
\hat{A_t}^{GAE(\gamma, \lambda)} = (1-\lambda)\left( \hat{A}^{(1)}_t + \lambda\hat{A}^{(2)}_t + \lambda^2\hat{A}^{(3)}_t + ...\right) = \sum_{l=0}^{\infty}(\gamma\lambda)^l\delta^V_{t+l}
\end{equation*}$

where $\delta^V_{t}$ is the TD residual defined by:

$ \begin{equation*}
\delta^V_{t} = r_t + \gamma V^\pi(s_{t+1})-V^\pi(s_t)
\end{equation*}
$



Note that by the definition, we can easily evaulate $ \hat{A_t}^{GAE(\gamma, \lambda)} $ based on $ \hat{A_{t+1}}^{GAE(\gamma, \lambda)} $:

\begin{equation*}
\hat{A_t}^{GAE(\gamma, \lambda)} = \delta^V_t + \gamma\lambda \hat{A}_{t+1}^{GAE(\gamma, \lambda)} 
\end{equation*}


In [168]:
def gae(values, rewards, masks, lmbda = 0.95, gamma = 0.99):
	# Initialize the advangate
	adv = 0
	# Initialize the results
	results = []

	# Starting with the last action, iterate over actions and calculate the advantages
	for t in reversed(range(len(rewards))):
		# Calculate the current value of delta
		delta = rewards[t] + gamma * values[t+1] * masks[t] - values[t]
		# Update the current advantage
		adv = delta + gamma * lmbda * adv * masks[t]
		# Add the estimates advantage to the value, to obtain a predicition of the real value
		results.append(adv+values[t])
	
	# Reverse the results to obtain the right order
	results.reverse()

	# Return the results
	return results

### Actor-Critic Network

In [169]:
class ActorCritic(nn.Module):

	# Initialize the actor and critic
	def __init__(self):
		
		super(ActorCritic, self).__init__()

		# Joint
		self.joint = nn.Sequential(
			nn.Conv2d(1, 16, 3, padding = 1),
			nn.ReLU(),
			nn.Conv2d(16, 32, 3, padding = 1),
			nn.ReLU(),
			nn.Conv2d(32, 32, 2),
			nn.Flatten()
		)

		# Actor
		self.actor = nn.Sequential(
			nn.Linear(32*4, 9)
		)

		# Critic
		self.critic = nn.Sequential(
			nn.Linear(32*4, 1)
		)

		self.softmax = nn.Softmax(dim=-1)

	
	# Instead of using a forward function, we will split it into 
	# one action for the actor, and one for the critic
	def forward(self):
		raise NotImplementedError
	

	def get_probabilities(self, state):
		# Change the state into suitable format
		tensor_state = torch.reshape(state, (-1,1,3,3))

		# Calculate the initial output of the actor
		probs = self.actor(self.joint(tensor_state)).float()
		
		# Apply the invalid action masking
		probs = torch.where(possibleMovesMask(state, 1), probs, torch.tensor(-100000.0))

		# Apply the softmax function to obtain probabilities
		probs = self.softmax(probs)

		return probs


	def get_evaluation(self, state):
		# Change the state into suitable format
		tensor_state = torch.reshape(state, (-1,1,3,3))

		# Calculate the evaluation of the position
		state_eval = self.critic(self.joint(tensor_state))

		return state_eval


	# For a given state, return an action according to a current policy and the log_prob
	# of performing this action
	def act(self, state):
		# Change the state into suitable format
		tensor_state = torch.reshape(state, (-1,1,3,3))

		# Calculate the initial output of the actor
		probs = self.actor(self.joint(tensor_state)).float()
		
		# Apply the invalid action masking
		probs = torch.where(possibleMovesMask(state, 1), probs, torch.tensor(-10000000.0))

		# Apply the softmax function to obtain probabilities
		probs = self.softmax(probs)

		# Create a distribution
		dist = Categorical(probs)

		# Pick an action using the generated distribution
		action = dist.sample()

		# Evaluate the log_prob of the chosen action
		log_prob = dist.log_prob(action)

		# Return the action and the log_probabilties
		return action.detach(), log_prob.detach()


	# For a given state and action, return:
	# 	- 	the log_probabilities of the action, according to the current policy, 
	# 	-	the estimated value of the state, according to the critic,
	#	-	the entropy of the distribution
	def evaluate(self, state, action):
		# Change the state into suitable format
		tensor_state = self.joint(torch.reshape(state, (-1,1,3,3)))

		# Calculate the initial output of the actor
		probs = self.actor(tensor_state).float()
		
		# Apply the invalid action masking
		probs = torch.where(possibleMovesMask(state, 1), probs, torch.tensor(-10000000.0))

		# Apply the softmax function to obtain probabilities
		probs = self.softmax(probs)

		# Create a distribution
		dist = Categorical(probs)

		# Evaluate the log_prob of the chosen action
		log_prob = dist.log_prob(action)

		# Calculate the entropy of the distribution
		dist_entropy = dist.entropy()

		# Calculate the evaluation of the position
		state_eval = self.critic(tensor_state)

		# Return the action and the log_probabilties
		return log_prob, state_eval, dist_entropy

### Hyperparameters

In [170]:
# Learning rate
learning_rate = 0.001

# Objectiive function parameters
c_1 = 0.5
c_2 = 0.001

# PPO epochs
epochs = 5

# Lambda
lmbda = 0.95

# Discount factor
gamma = 0.99


### Model declaration

In [171]:
# Network model
model = ActorCritic()
# Optimizer
optimizer = optim.Adam(model.parameters(), lr = learning_rate)
# MSE Loss
mse_loss = nn.MSELoss()

### Data preparation

In [172]:
# For given states, actions, log_probabilities of actions, advantages and predicted values,
# prepare batches of a given size
def generate_batches(states, actions, log_probs, advantages, pred_values, batch_size):
	# Evaluate the number of batches that we can create
	num_batches = (states.size(0))//batch_size
	# Create a permutation of indices
	perm = np.array(range(len(states)))
	np.random.shuffle(perm)

	# Based on the permutation, prepare and yield batches
	for i in range(num_batches):
		_range = perm[i*batch_size:(i+1)*batch_size]
		yield states[_range], actions[_range], log_probs[_range], advantages[_range], pred_values[_range]

### PPO Step

PPO aims to maximize the following objective function:

\begin{equation*}
L_t^{CLIP+VF+S}(\theta) = \hat{\mathbb{E}}_t \left[ L_t^{CLIP}(\theta) - c_1 L_t^{VF}(\theta) + c_2 S[\pi_\theta](s_t) \right]
\end{equation*}

where:
\begin{equation*}
L_t^{CLIP}(\theta) = \hat{\mathbb{E}}_t \left[ \text{min}\left(r_t(\theta)\hat{A}_t\text{, clip}\left( r_t(\theta), 1-\epsilon, 1+\epsilon \right)\hat{A}_t\right) \right]
\end{equation*}

\begin{equation*}
L_t^{VF}(\theta) = \left( V_\theta(s_t) - V_t^{targ} \right)^2
\end{equation*}


$c_1$ and $c_2$ are hyperparameters, $S[\pi](s_t)$ is an entropy bonus and the ratio $r_t(\theta)$ is defined as:

\begin{equation*}
r_t(\theta) = \frac{\pi_\theta(a_t | s_t)}{\pi_{\theta_{old}}(a_t|s_t)} = \text{exp}\left(\text{log }\pi_\theta(a_t | s_t) - \text{ log }\pi_{\theta_{old}}(a_t|s_t)\right)
\end{equation*}

In [230]:
# Perform the PPO step for the given number of epochs and data
def ppo_step(epochs, batch_size, states, actions, old_log_probs, advantages, pred_values, epsilon = 0.2, c_1 = 0.5, c_2 = 0.003):
	# In every epoch
	for _ in range(epochs):
		# Iterate over generated batches:
		for _states, _actions, _log_probs, _advantages, _values in generate_batches(states, actions, old_log_probs, advantages, pred_values, batch_size):
			# Calculate the new log_probabilities, evaluation and entropy
			new_log_prob, state_eval, dist_entropy = model.evaluate(_states, _actions)

			# Evaluate the value function loss
			state_eval = torch.flatten(state_eval)
			l_vf = mse_loss(state_eval, _values).mean()

			# Evaluate the clipped actor loss
			ratio = (new_log_prob-_log_probs).exp()
			l_clip = torch.min(ratio*_advantages, torch.clamp(ratio, 1-epsilon, 1+epsilon)*_advantages).mean()
			
			# Evaluate the entropy factor
			entropy = dist_entropy.mean()

			# Evaluate the objective function (note that we need to swap signs, as the algorithm minimizes the objective)
			loss = -l_clip + c_1*l_vf - c_2*entropy


			# Perform the optimization step
			optimizer.zero_grad()
			loss.backward()
			optimizer.step()

### Training

In [231]:
def training_iter(num_games, num_epochs, batch_size):
	# Initialize arrays
	states		= [[], []]
	actions		= [[], []]
	rewards 	= [[], []]
	log_probs	= [[], []]
	values		= [[], []]
	advantages	= [[], []]
	masks		= [[], []]
	pred_values	= [[], []]

	
	# For each game
	for game_id in range(num_games):
		# Initialize the gamestate
		state = torch.tensor(initialState()).float()
		isFinished = False
		player = 1

		# For each move, generate and save the relevant data
		while not isFinished:
			# Choose an action using the old policy
			with torch.no_grad():
				action, log_prob = model.act(state)


			# Perform the chosen action
			next_state, reward, next_player, isFinished = move(state, player, action)
			
			# Evaluate the current position and probability of the action
			log_prob2, value, _ = model.evaluate(state, action)

			# Change player from 1/-1 to 0/1
			player = (1-player)//2

			states[player].append(state)
			actions[player].append(action)
			rewards[player].append(reward)
			log_probs[player].append(log_prob)
			values[player].append(value)
			masks[player].append(1)

			# Update the state of the game
			state = deepcopy(next_state)
			player = next_player
		
		# Upadate the rewards
		if rewards[0][-1] != 0: 	rewards[1][-1] = -rewards[0][-1]
		elif rewards[1][-1] != 0: 	rewards[0][-1] = -rewards[1][-1]

		# Update the masks
		masks[0][-1] = 0
		masks[1][-1] = 0

	# Calculate the predicted values obtained by using GAE
	pred_values[0] = torch.tensor(gae(values[0]+[0], rewards[0], masks[0]))
	pred_values[1] = torch.tensor(gae(values[1]+[0], rewards[1], masks[1]))

	# Prepare states as a tensor
	states[0] = torch.stack(states[0])
	states[1] = torch.stack(states[1])

	# Prepare actions as a tensor
	actions[0] = torch.cat(actions[0])
	actions[1] = torch.cat(actions[1])
	
	# Prepare log_probabilities as a tensor
	log_probs[0] = torch.flatten(torch.tensor(log_probs[0]).detach())
	log_probs[1] = torch.flatten(torch.tensor(log_probs[1]).detach())


	# Prepare pred_values
	values[0] = torch.flatten(torch.tensor(values[0]).detach())
	values[1] = torch.flatten(torch.tensor(values[1]).detach())
	
	# Prepare advantages
	advantages[0] = pred_values[0] - values[0]
	advantages[1] = pred_values[1] - values[1]

	ppo_step(num_epochs, batch_size, states[0], actions[0], log_probs[0], advantages[0], values[0])
	ppo_step(num_epochs, batch_size, states[1], actions[1], log_probs[1], advantages[1], values[1])

	with torch.no_grad():
		print("-------------------------------------------------------------------")
		print(model.get_probabilities(torch.tensor(initialState()).float()))
		print(model.get_probabilities(torch.tensor([0,0,0,0,1,0,0,0,0]).float()))
			

In [232]:
for _ in range(600):
	training_iter(100, 3, 4)

-------------------------------------------------------------------
tensor([[0.0243, 0.0064, 0.6915, 0.0025, 0.1147, 0.0061, 0.0974, 0.0023, 0.0547]])
tensor([[0.1498, 0.0661, 0.1861, 0.0490, 0.0000, 0.1423, 0.1395, 0.0695, 0.1977]])
-------------------------------------------------------------------
tensor([[0.0204, 0.0057, 0.7063, 0.0027, 0.0936, 0.0054, 0.0771, 0.0021, 0.0866]])
tensor([[0.1633, 0.0718, 0.1438, 0.0535, 0.0000, 0.1250, 0.1396, 0.0683, 0.2346]])
-------------------------------------------------------------------
tensor([[0.0289, 0.0088, 0.6859, 0.0041, 0.0920, 0.0082, 0.0696, 0.0029, 0.0996]])
tensor([[0.1728, 0.0661, 0.1160, 0.0480, 0.0000, 0.1495, 0.1638, 0.0614, 0.2222]])
-------------------------------------------------------------------
tensor([[0.0192, 0.0048, 0.7484, 0.0020, 0.0695, 0.0047, 0.0432, 0.0019, 0.1064]])
tensor([[0.1463, 0.0538, 0.1114, 0.0444, 0.0000, 0.1571, 0.1437, 0.0586, 0.2847]])
----------------------------------------------------------------

In [176]:
probs = nn.Softmax(dim=-1)(torch.tensor([0,0,0,1,0,0]).float())
print(probs)
dist = Categorical(probs)
print(dist.entropy())

tensor([0.1296, 0.1296, 0.1296, 0.3522, 0.1296, 0.1296])
tensor(1.6914)


In [229]:
st = torch.tensor(randomState()).float()
print(torch.reshape(st, (3,3)))
with torch.no_grad():
	print(model.get_probabilities(st))
	print(model.act(st))
	print(model.get_evaluation(st))
	

tensor([[ 0., -1.,  0.],
        [ 0.,  1., -1.],
        [ 0.,  0.,  1.]])
tensor([[0.0211, 0.0000, 0.1805, 0.0462, 0.0000, 0.0000, 0.7086, 0.0436, 0.0000]])
(tensor([6]), tensor([-0.3445]))
tensor([[0.0517]])
