# Proximal Policy Optimization

### Import the libraries

In [3]:
# Calculations
import numpy as np
from numpy import random, sqrt

# Network libraries
import torch
import torch.nn.functional as F, torch.nn as nn, torch.optim as optim
from torch.distributions import Categorical as Categorical

# Game library
from TicTacToe import initialState, randomState, printState, drawState, move, possibleMoves, possibleMovesMask, isOver, gameScore

### Neural Network

In [18]:
class ActorCritic(nn.Module):
	"""Actor and critic networks for a 3x3 board.

	Both heads share the same layout: two 2x2 convolutions that shrink a
	(N, 1, 3, 3) input to (N, 32, 1, 1), a flatten to (N, 32), and one
	linear layer. The actor emits 9 action logits, the critic a single
	state value.
	"""

	# Initialize the actor and critic
	def __init__(self):
		super().__init__()

		# Actor: board -> 9 logits (one per cell)
		self.actor = nn.Sequential(
			nn.Conv2d(1, 8, 2),
			nn.ReLU(),
			nn.Conv2d(8, 32, 2),
			nn.ReLU(),
			nn.Flatten(),
			nn.Linear(32, 9)
		)

		# Critic: board -> scalar value
		self.critic = nn.Sequential(
			nn.Conv2d(1, 8, 2),
			nn.ReLU(),
			nn.Conv2d(8, 32, 2),
			nn.ReLU(),
			# Without this flatten the Linear layer receives (N, 32, 1, 1)
			# and fails, because it expects 32 features on the last axis.
			nn.Flatten(),
			nn.Linear(32, 1)
		)

	# Instead of using a forward function, we will split it into
	# one action for the actor, and one for the critic
	def forward(self):
		"""Not supported; call ``act`` (or the sub-networks) directly."""
		raise NotImplementedError

	# Obtain the probabilities of actions and a chosen action
	def act(self, state):
		"""Sample a legal move for ``state`` from the actor's policy.

		Args:
			state: NumPy array holding the board; it is reshaped to
				(-1, 1, 3, 3) before being fed to the actor.

		Returns:
			Tuple ``(action, log_prob)`` of detached tensors: the sampled
			action index and its log-probability under the masked policy.
		"""
		# Change the state into the (batch, channel, 3, 3) float format
		board = torch.from_numpy(state.reshape(-1, 1, 3, 3)).float()

		# Raw (unnormalised) scores of the actor
		logits = self.actor(board)

		# Invalid action masking: illegal moves get a hugely negative logit
		# so that the softmax assigns them zero probability.
		# NOTE(review): assumes possibleMovesMask returns a boolean tensor
		# broadcastable against the (batch, 9) logits -- confirm in TicTacToe.
		legal = possibleMovesMask(state, 1)
		masked_logits = torch.where(legal, logits, torch.full_like(logits, -1e8))

		# Apply the softmax function to obtain probabilities
		probs = torch.softmax(masked_logits, dim=-1)

		# Create a distribution and pick an action from it
		dist = Categorical(probs)
		action = dist.sample()

		# Evaluate the log-probability of the chosen action
		log_prob = dist.log_prob(action)

		# Return the action and its log-probability, cut off from the graph
		return action.detach(), log_prob.detach()


	# 

In [19]:
# Quick smoke test of the actor: build a network with freshly
# initialised weights
myNet = ActorCritic()

# Get a board from the game library's randomState() helper
state = randomState()

# Sample a move; the cell displays the (action, log_prob) tuple returned by act()
myNet.act(state)

(tensor([4]), tensor([-2.1596]))