**A2C Method**

**Name:** Marcos Augusto Burgos Saavedra

**Student ID:** S4740705

In [1]:
import sys
import os

# Specify the absolute path to the game_models directory
absolute_path_to_game_models = r'G:\Mi unidad\[00 GENERAL\04 Proyectos personales\04 Rubiks-cube - Vaz, Glassenbury, Hendriawan, Fauzan, Burgos\rubiks-cube'
sys.path.insert(0, absolute_path_to_game_models)

In [2]:
import sys
import torch  
from game_models.rc_entropy_v01 import *
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import pandas as pd

# hyperparameters
hidden_size = 512
learning_rate = 3e-4

# Constants
GAMMA = 0.99
num_steps = 1000 # It does not mind since it will truncate before always
max_episodes = 1000*1000 # I want an status each 1000 games

In [3]:
class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_size):
        super(ActorCritic, self).__init__()

        # Number of elements in the action space
        self.num_actions = num_actions
        # Build a critic model
        self.critic_linear1 = nn.Linear(num_inputs, hidden_size)
        self.critic_linear2 = nn.Linear(hidden_size, hidden_size)
        self.critic_linear3 = nn.Linear(hidden_size, hidden_size)
        self.critic_linear4 = nn.Linear(hidden_size, hidden_size)
        self.critic_linear5 = nn.Linear(hidden_size, hidden_size)
        self.critic_linear6 = nn.Linear(hidden_size, hidden_size)
        self.critic_linear7 = nn.Linear(hidden_size, hidden_size)
        self.critic_linear8 = nn.Linear(hidden_size, hidden_size)
        self.critic_linear9 = nn.Linear(hidden_size, hidden_size)
        self.critic_linear10 = nn.Linear(hidden_size, 1)
        # Build an actor model
        self.actor_linear1 = nn.Linear(num_inputs, hidden_size)
        self.actor_linear2 = nn.Linear(hidden_size, hidden_size)
        self.actor_linear3 = nn.Linear(hidden_size, hidden_size)
        self.actor_linear4 = nn.Linear(hidden_size, hidden_size)
        self.actor_linear5 = nn.Linear(hidden_size, hidden_size)
        self.actor_linear6 = nn.Linear(hidden_size, hidden_size)
        self.actor_linear7 = nn.Linear(hidden_size, hidden_size)
        self.actor_linear8 = nn.Linear(hidden_size, hidden_size)
        self.actor_linear9 = nn.Linear(hidden_size, hidden_size)
        self.actor_linear10 = nn.Linear(hidden_size, num_actions)
    
    def forward(self, state):
        # Implementation of the Critic Neural Network
        state = Variable(torch.from_numpy(state).float().unsqueeze(0))
        value = F.relu(self.critic_linear1(state))
        value = F.relu(self.critic_linear2(value))
        value = F.relu(self.critic_linear3(value))
        value = F.relu(self.critic_linear4(value))
        value = F.relu(self.critic_linear5(value))
        value = F.relu(self.critic_linear6(value))
        value = F.relu(self.critic_linear7(value))
        value = F.relu(self.critic_linear8(value))
        value = F.relu(self.critic_linear9(value))
        value = self.critic_linear10(value)
        # Implementation of the Actor Neural Network
        policy_dist = F.relu(self.actor_linear1(state))
        policy_dist = F.relu(self.actor_linear2(policy_dist))
        policy_dist = F.relu(self.actor_linear3(policy_dist))
        policy_dist = F.relu(self.actor_linear4(policy_dist))
        policy_dist = F.relu(self.actor_linear5(policy_dist))
        policy_dist = F.relu(self.actor_linear6(policy_dist))
        policy_dist = F.relu(self.actor_linear7(policy_dist))
        policy_dist = F.relu(self.actor_linear8(policy_dist))
        policy_dist = F.relu(self.actor_linear9(policy_dist))
        policy_dist = F.softmax(self.actor_linear10(policy_dist), dim=1)

        print("Probabilities:", policy_dist)  # Debugging line to check for NaNs

        return value, policy_dist

In [4]:
def a2c(env):
    num_inputs = env.environment_space
    num_outputs = env.action_space
    
    actor_critic = ActorCritic(num_inputs, num_outputs, hidden_size)
    ac_optimizer = optim.Adam(actor_critic.parameters(), lr=learning_rate)

    all_lengths = []
    average_lengths = []
    all_rewards = []
    completed_games = 0 # Count the number of episodes that conclude the Rubiks cube

    print("Start of training\n")

    for episode in range(max_episodes):
        log_probs = []
        values = []
        rewards = []

        # The Initial State
        state = env.reset()
        state = np.array(state)

        for steps in range(num_steps):
            # Compute the value from the critic and the policy distribution from the actor Neural Network
            value, policy_dist = actor_critic.forward(state)
            value = value.detach().numpy()[0,0]
            dist = policy_dist.detach().numpy() 
            # Obtain the action that yields the higher rewards
            action = np.random.choice(num_outputs, p=np.squeeze(dist))
            # Compute the log for the cost function of the actor neural network
            log_prob = torch.log(policy_dist.squeeze(0)[action])
            # The actor performs its action
            new_state, reward, terminated, truncated, completed = env.step(action)

            # Storage the important features
            rewards.append(reward)
            values.append(value)
            log_probs.append(log_prob)

            state = new_state
            
            # When finish the episode
            if terminated or truncated:
                # Compute the values from the critic neural network
                Qval, _ = actor_critic.forward(new_state)
                Qval = Qval.detach().numpy()[0,0]

                # Storage information to highlight the performance
                all_rewards.append(np.sum(rewards))
                all_lengths.append(steps)
                average_lengths.append(np.mean(all_lengths[-10:]))
                if completed: completed_games+=1
                break
        
        # Compute Q values starting from the last step in the episode since it needs to use the St+1
        Qvals = np.zeros_like(values)
        for t in reversed(range(len(rewards))):
            Qval = rewards[t] + GAMMA * Qval
            Qvals[t] = Qval
  
        # The following elements are part of the combined cost function
        values = torch.FloatTensor(values)
        Qvals = torch.FloatTensor(Qvals)
        log_probs = torch.stack(log_probs)
        
        advantage = Qvals - values
        actor_loss = (-log_probs * advantage).mean()
        critic_loss = 0.5 * advantage.pow(2).mean()
        ac_loss = actor_loss + critic_loss

        # Update the actor critic
        ac_optimizer.zero_grad()
        ac_loss.backward()
        ac_optimizer.step()        

        if (episode+1) % 10 == 0:
            torch.save(actor_critic, 'policy_a2c_net.pth')
    
        if (episode+1) % 1000 == 0:
            print(f"\nBunch of episodes number: {episode//1000}")
            # Data in a dictionary format where the keys are column names
            data = {
                'Min Reward:': [np.min(all_rewards)],
                'Mean Reward': [np.mean(all_rewards)],
                'Std Reward': [np.std(all_rewards)],
                'Max Reward:': [np.max(all_rewards)],
                'Min length': [np.min(all_lengths)],
                'Mean length': [np.mean(all_lengths)],
                'Std length': [np.std(all_lengths)],
                'Max length': [np.max(all_lengths)],
                'Completed_games': [completed_games]
                }

            # Creating DataFrame from the dictionary
            df = pd.DataFrame(data)
            print(df)

            # Append the DataFrame to a text file
            with open('progress_A2C.txt', 'a') as file:  # 'a' is for append mode
                file.write(df.to_string(index=False))
                file.write("\n")  # Add extra newline for separation between entries

            all_lengths = []
            average_lengths = []
            all_rewards = []
            completed_games = 0
    
    print("\nEnd of training")

In [5]:
env = RC_entropy(max_number_scrambles=1, number_moves_allowed=30)
a2c(env)

Start of training

Probabilities: tensor([[0.0846, 0.0858, 0.0801, 0.0814, 0.0842, 0.0834, 0.0786, 0.0850, 0.0866,
         0.0849, 0.0826, 0.0828]], grad_fn=<SoftmaxBackward0>)
Probabilities: tensor([[0.0845, 0.0858, 0.0801, 0.0814, 0.0843, 0.0835, 0.0786, 0.0850, 0.0866,
         0.0849, 0.0825, 0.0828]], grad_fn=<SoftmaxBackward0>)
Probabilities: tensor([[0.0843, 0.0855, 0.0818, 0.0811, 0.0843, 0.0833, 0.0785, 0.0849, 0.0867,
         0.0847, 0.0822, 0.0826]], grad_fn=<SoftmaxBackward0>)
Probabilities: tensor([[0.0844, 0.0855, 0.0819, 0.0811, 0.0843, 0.0833, 0.0785, 0.0849, 0.0867,
         0.0847, 0.0822, 0.0826]], grad_fn=<SoftmaxBackward0>)
Probabilities: tensor([[0.0844, 0.0855, 0.0819, 0.0811, 0.0843, 0.0833, 0.0785, 0.0849, 0.0867,
         0.0847, 0.0822, 0.0826]], grad_fn=<SoftmaxBackward0>)
Probabilities: tensor([[0.0844, 0.0855, 0.0818, 0.0811, 0.0843, 0.0833, 0.0785, 0.0849, 0.0867,
         0.0847, 0.0822, 0.0827]], grad_fn=<SoftmaxBackward0>)
Probabilities: tensor([[0.0

KeyboardInterrupt: 