In [11]:
import ale_py
# if using gymnasium
import shimmy
import gymnasium as gym
import torch
from torch import nn
from torch.utils.data import DataLoader, random_split, Dataset
from torch.nn import functional as F
from torchvision.datasets import MNIST
from torchvision import datasets, transforms
import torchvision.transforms.functional as TF
import os
import random
import copy
import time
import pickle
import numpy as np
from torch.distributions import Categorical
from torchviz import make_dot, make_dot_from_trace

1. Initilize model

In [12]:
class NeuralNetwork(nn.Module):

    def __init__(self):
        super(NeuralNetwork,self).__init__()
        self.common = nn.Linear(4, 128)
        self.activation = nn.ReLU()
        self.critic = nn.Linear(128,1)
        self.actor = nn.Linear(128,2)
        self.softmax = nn.Softmax(dim=-1)
            
    def forward(self, x):
        x = torch.tensor(x)
        x = torch.unsqueeze(x,dim=0)
        x = self.common(x)
        x = self.activation(x)
        return self.softmax(self.actor(x)), self.critic(x)
    
    def calculate_loss(self, G, V, A ):
        advantage = G - V
        vector = -torch.mul(A,advantage)
        loss_actor = vector.sum()
        loss_critic = F.smooth_l1_loss(V, G, reduction='sum')
        return loss_actor, loss_critic
        
        
        

In [13]:
class ActorCritic:
    """
    A class representing an Actor-Critic model.
    """

    def __init__(self, gamma):
        """
        Initializes the ActorCritic object.

        Args:
            gamma (float): The discount factor for calculating expected returns.
        """
        self.nn_model = NeuralNetwork()
        self.optimizer = torch.optim.Adam(self.nn_model.parameters(), lr=0.005)
        self.gamma = gamma
        self.values = []
        self.actions = []
        self.rewards = []

    def act(self, state):
        """
        Selects an action based on the current state.

        Args:
            state: The current state of the environment.

        Returns:
            tuple: A tuple containing the selected action, its log probability, and the predicted value for the state.
        """
        action_probs, value = self.nn_model(state)
        m = Categorical(action_probs)
        action = m.sample()
        action_prob = m.log_prob(action)
        return action.item(), action_prob, value.squeeze(dim=1)

    def train_step(self):
        """
        Performs a single training step on the neural network.
        """
        V = torch.stack(self.values)
        A = torch.stack(self.actions)
        G = self.expected_return()
        self.optimizer.zero_grad()
        actor_loss, critic_loss = self.nn_model.calculate_loss(G, V, A)
        loss = actor_loss + critic_loss
        loss.backward()
        self.optimizer.step()

    def save_data(self, action_prob, value, reward):
        """
        Saves the data from a single step in the training process.

        Args:
            action_prob: The log probability of the selected action.
            value: The predicted value for the state.
            reward: The reward received for the action.
        """
        self.actions.append(action_prob)
        self.values.append(value)
        self.rewards.append(reward)

    def reset(self):
        """
        Resets the values, actions, and rewards lists.
        """
        self.values = []
        self.actions = []
        self.rewards = []

    def expected_return(self):
        """
        Calculates the expected return (discounted rewards) for the current episode.

        Returns:
            torch.Tensor: The expected return values, normalized by subtracting the mean and dividing by the standard deviation.
        """
        dis_reward = 0
        g_array = []
        for reward in reversed(self.rewards):
            dis_reward = reward + self.gamma * dis_reward
            g_array.insert(0, dis_reward)
        g = torch.tensor(g_array).unsqueeze(dim=1)
        eps = np.finfo(np.float32).eps.item()
        g = (g - g.mean()) / (g.std()+eps)
        return g 

In [14]:
env = gym.make("CartPole-v1")
torch.manual_seed(543)
max_episodes = 10000
max_steps = 9999
model = ActorCritic(gamma=0.99)
for episode in range(max_episodes): 
    state, _ = env.reset()
    episode_reward = 0
    for step in range(1, max_steps):
        action, action_prob, value = model.act(state)
        state, reward, done, _, _ = env.step(action) 
        model.save_data(action_prob,value,reward)
        episode_reward += reward
        if done:
            observation, info = env.reset()
            print(f"Episode {episode} ended. Episode reward: {episode_reward} ")
            break
        if step == max_steps - 1:
            print("Max steps reached.")
    model.train_step()
    model.reset()
env.close()

Episode 0 ended. Episode reward: 13.0 
Episode 1 ended. Episode reward: 22.0 
Episode 2 ended. Episode reward: 25.0 
Episode 3 ended. Episode reward: 46.0 
Episode 4 ended. Episode reward: 16.0 
Episode 5 ended. Episode reward: 12.0 
Episode 6 ended. Episode reward: 14.0 
Episode 7 ended. Episode reward: 27.0 
Episode 8 ended. Episode reward: 18.0 
Episode 9 ended. Episode reward: 16.0 
Episode 10 ended. Episode reward: 41.0 
Episode 11 ended. Episode reward: 21.0 
Episode 12 ended. Episode reward: 41.0 
Episode 13 ended. Episode reward: 12.0 
Episode 14 ended. Episode reward: 16.0 
Episode 15 ended. Episode reward: 16.0 
Episode 16 ended. Episode reward: 77.0 
Episode 17 ended. Episode reward: 38.0 
Episode 18 ended. Episode reward: 25.0 
Episode 19 ended. Episode reward: 44.0 
Episode 20 ended. Episode reward: 26.0 
Episode 21 ended. Episode reward: 19.0 
Episode 22 ended. Episode reward: 32.0 
Episode 23 ended. Episode reward: 34.0 
Episode 24 ended. Episode reward: 40.0 
Episode 25

KeyboardInterrupt: 