In [13]:
! mkdir logs
! mkdir videos

In [14]:
#import ale_py
#import shimmy
import gymnasium as gym
import torch
from torch import nn
from torch.utils.data import DataLoader, random_split, Dataset
from torch.nn import functional as F
from torchvision import datasets, transforms
import torchvision.transforms.functional as TF
from torch.distributions import Categorical
import random
import copy
import time
import pickle
import torchvision
import matplotlib.pyplot as plt
import math
import json
import numpy as np

1. Initilize model

In [11]:
class NeuralNetwork(nn.Module):

    def __init__(self):
        super(NeuralNetwork,self).__init__()
        self.common = nn.Linear(4, 128)
        self.activation = nn.ReLU()
        self.critic = nn.Linear(128,1)
        self.actor = nn.Linear(128,2)
        self.softmax = nn.Softmax(dim=-1)
            
    def forward(self, x):
        x = torch.tensor(x)
        x = torch.unsqueeze(x,dim=0)
        x = self.common(x)
        x = self.activation(x)
        return self.softmax(self.actor(x)), self.critic(x)
    
    def calculate_loss(self, G, V, A ):
        advantage = G - V
        vector = -torch.mul(A,advantage)
        loss_actor = vector.sum()
        loss_critic = F.smooth_l1_loss(V, G, reduction='sum')
        return loss_actor, loss_critic
        
        
        

In [12]:
class ActorCritic:
    """
    A class representing an Actor-Critic model.
    """

    def __init__(self, gamma):
        """
        Initializes the ActorCritic object.

        Args:
            gamma (float): The discount factor for calculating expected returns.
        """
        self.nn_model = NeuralNetwork()
        self.optimizer = torch.optim.Adam(self.nn_model.parameters(), lr=0.005)
        self.gamma = gamma
        self.values = []
        self.actions = []
        self.rewards = []

    def act(self, state):
        """
        Selects an action based on the current state.

        Args:
            state: The current state of the environment.

        Returns:
            tuple: A tuple containing the selected action, its log probability, and the predicted value for the state.
        """
        action_probs, value = self.nn_model(state)
        m = Categorical(action_probs)
        action = m.sample()
        action_prob = m.log_prob(action)
        return action.item(), action_prob, value.squeeze(dim=1)

    def train_step(self):
        """
        Performs a single training step on the neural network.
        """
        V = torch.stack(self.values)
        A = torch.stack(self.actions)
        G = self.expected_return()
        self.optimizer.zero_grad()
        actor_loss, critic_loss = self.nn_model.calculate_loss(G, V, A)
        loss = actor_loss + critic_loss
        loss.backward()
        self.optimizer.step()
        return loss

    def save_data(self, action_prob, value, reward):
        """
        Saves the data from a single step in the training process.

        Args:
            action_prob: The log probability of the selected action.
            value: The predicted value for the state.
            reward: The reward received for the action.
        """
        self.actions.append(action_prob)
        self.values.append(value)
        self.rewards.append(reward)

    def reset(self):
        """
        Resets the values, actions, and rewards lists.
        """
        self.values = []
        self.actions = []
        self.rewards = []

    def expected_return(self):
        """
        Calculates the expected return (discounted rewards) for the current episode.

        Returns:
            torch.Tensor: The expected return values, normalized by subtracting the mean and dividing by the standard deviation.
        """
        dis_reward = 0
        g_array = []
        for reward in reversed(self.rewards):
            dis_reward = reward + self.gamma * dis_reward
            g_array.insert(0, dis_reward)
        g = torch.tensor(g_array).unsqueeze(dim=1)
        eps = np.finfo(np.float32).eps.item()
        g = (g - g.mean()) / (g.std()+eps)
        return g 

In [15]:
env = gym.make("CartPole-v1",render_mode="rgb_array")
env = gym.wrappers.RecordVideo(env, './videos', episode_trigger = lambda x: (x+1) % 100 == 0)
torch.manual_seed(543)
max_episodes = 500
max_steps = 9999
model = ActorCritic(gamma=0.99)
episodes = []
losses = []
logger = {"episodes":[],"losses":[]}
for episode in range(max_episodes): 
    state, _ = env.reset()
    episode_reward = 0
    steps = 0
    for step in range(1, max_steps):
        action, action_prob, value = model.act(state)
        state, reward, done, _, _ = env.step(action) 
        model.save_data(action_prob,value,reward)
        episode_reward += reward
        if done:
            observation, info = env.reset()
            print(f"Episode {episode} ended. Episode reward: {episode_reward} ")
            break
        if step == max_steps - 1:
            print("Max steps reached.")
        steps = step
    episode_loss = model.train_step()
    model.reset()
    if episode % 10 == 0:
            logger["episodes"].append(episodes)
            logger["losses"].append(losses)
            episodes = []
            losses = []
    episodes.append(episode_reward)
    losses.append(episode_loss.item())
env.close()
with open('logs/logger_dqn.json', 'w') as fp:
        json.dump(logger, fp)

  logger.warn(


Episode 0 ended. Episode reward: 15.0 
Episode 1 ended. Episode reward: 67.0 
Episode 2 ended. Episode reward: 31.0 
Episode 3 ended. Episode reward: 11.0 
Episode 4 ended. Episode reward: 15.0 
Episode 5 ended. Episode reward: 21.0 
Episode 6 ended. Episode reward: 24.0 
Episode 7 ended. Episode reward: 31.0 
Episode 8 ended. Episode reward: 30.0 
Episode 9 ended. Episode reward: 11.0 
Episode 10 ended. Episode reward: 27.0 
Episode 11 ended. Episode reward: 31.0 
Episode 12 ended. Episode reward: 11.0 
Episode 13 ended. Episode reward: 17.0 
Episode 14 ended. Episode reward: 11.0 
Episode 15 ended. Episode reward: 15.0 
Episode 16 ended. Episode reward: 21.0 
Episode 17 ended. Episode reward: 25.0 
Episode 18 ended. Episode reward: 16.0 
Episode 19 ended. Episode reward: 12.0 
Episode 20 ended. Episode reward: 11.0 
Episode 21 ended. Episode reward: 40.0 
Episode 22 ended. Episode reward: 34.0 
Episode 23 ended. Episode reward: 25.0 
Episode 24 ended. Episode reward: 25.0 
Episode 25

                                                   

Moviepy - Done !
Moviepy - video ready /home/konrad/Repos/NCML_project/videos/rl-video-episode-99.mp4
Episode 99 ended. Episode reward: 13.0 
Episode 100 ended. Episode reward: 23.0 




Episode 101 ended. Episode reward: 21.0 
Episode 102 ended. Episode reward: 18.0 
Episode 103 ended. Episode reward: 19.0 
Episode 104 ended. Episode reward: 13.0 
Episode 105 ended. Episode reward: 22.0 
Episode 106 ended. Episode reward: 13.0 
Episode 107 ended. Episode reward: 28.0 
Episode 108 ended. Episode reward: 40.0 
Episode 109 ended. Episode reward: 22.0 
Episode 110 ended. Episode reward: 21.0 
Episode 111 ended. Episode reward: 18.0 
Episode 112 ended. Episode reward: 32.0 
Episode 113 ended. Episode reward: 17.0 
Episode 114 ended. Episode reward: 13.0 
Episode 115 ended. Episode reward: 21.0 
Episode 116 ended. Episode reward: 19.0 
Episode 117 ended. Episode reward: 15.0 
Episode 118 ended. Episode reward: 60.0 
Episode 119 ended. Episode reward: 26.0 
Episode 120 ended. Episode reward: 53.0 
Episode 121 ended. Episode reward: 38.0 
Episode 122 ended. Episode reward: 15.0 
Episode 123 ended. Episode reward: 13.0 
Episode 124 ended. Episode reward: 22.0 
Episode 125 ende

                                                   

Moviepy - Done !
Moviepy - video ready /home/konrad/Repos/NCML_project/videos/rl-video-episode-199.mp4
Episode 199 ended. Episode reward: 21.0 




Episode 200 ended. Episode reward: 36.0 
Episode 201 ended. Episode reward: 31.0 
Episode 202 ended. Episode reward: 35.0 
Episode 203 ended. Episode reward: 45.0 
Episode 204 ended. Episode reward: 81.0 
Episode 205 ended. Episode reward: 48.0 
Episode 206 ended. Episode reward: 60.0 
Episode 207 ended. Episode reward: 39.0 
Episode 208 ended. Episode reward: 78.0 
Episode 209 ended. Episode reward: 70.0 
Episode 210 ended. Episode reward: 55.0 
Episode 211 ended. Episode reward: 86.0 
Episode 212 ended. Episode reward: 85.0 
Episode 213 ended. Episode reward: 68.0 
Episode 214 ended. Episode reward: 37.0 
Episode 215 ended. Episode reward: 48.0 
Episode 216 ended. Episode reward: 124.0 
Episode 217 ended. Episode reward: 102.0 
Episode 218 ended. Episode reward: 34.0 
Episode 219 ended. Episode reward: 41.0 
Episode 220 ended. Episode reward: 68.0 
Episode 221 ended. Episode reward: 79.0 
Episode 222 ended. Episode reward: 54.0 
Episode 223 ended. Episode reward: 72.0 
Episode 224 en

                                                             

Moviepy - Done !
Moviepy - video ready /home/konrad/Repos/NCML_project/videos/rl-video-episode-299.mp4
Episode 299 ended. Episode reward: 58.0 




Episode 300 ended. Episode reward: 99.0 
Episode 301 ended. Episode reward: 84.0 
Episode 302 ended. Episode reward: 85.0 
Episode 303 ended. Episode reward: 53.0 
Episode 304 ended. Episode reward: 65.0 
Episode 305 ended. Episode reward: 79.0 
Episode 306 ended. Episode reward: 56.0 
Episode 307 ended. Episode reward: 49.0 
Episode 308 ended. Episode reward: 63.0 
Episode 309 ended. Episode reward: 38.0 
Episode 310 ended. Episode reward: 27.0 
Episode 311 ended. Episode reward: 35.0 
Episode 312 ended. Episode reward: 55.0 
Episode 313 ended. Episode reward: 51.0 
Episode 314 ended. Episode reward: 61.0 
Episode 315 ended. Episode reward: 37.0 
Episode 316 ended. Episode reward: 79.0 
Episode 317 ended. Episode reward: 74.0 
Episode 318 ended. Episode reward: 96.0 
Episode 319 ended. Episode reward: 73.0 
Episode 320 ended. Episode reward: 166.0 
Episode 321 ended. Episode reward: 64.0 
Episode 322 ended. Episode reward: 110.0 
Episode 323 ended. Episode reward: 120.0 
Episode 324 e

                                                               

Moviepy - Done !
Moviepy - video ready /home/konrad/Repos/NCML_project/videos/rl-video-episode-399.mp4
Episode 399 ended. Episode reward: 471.0 
Episode 400 ended. Episode reward: 302.0 
Episode 401 ended. Episode reward: 358.0 
Episode 402 ended. Episode reward: 398.0 
Episode 403 ended. Episode reward: 357.0 
Episode 404 ended. Episode reward: 264.0 
Episode 405 ended. Episode reward: 234.0 
Episode 406 ended. Episode reward: 293.0 
Episode 407 ended. Episode reward: 245.0 
Episode 408 ended. Episode reward: 329.0 
Episode 409 ended. Episode reward: 306.0 
Episode 410 ended. Episode reward: 482.0 
Episode 411 ended. Episode reward: 521.0 
Episode 412 ended. Episode reward: 837.0 
Episode 413 ended. Episode reward: 814.0 
Episode 414 ended. Episode reward: 907.0 
Episode 415 ended. Episode reward: 1779.0 
Episode 416 ended. Episode reward: 1786.0 
Episode 417 ended. Episode reward: 1663.0 
Episode 418 ended. Episode reward: 991.0 
Episode 419 ended. Episode reward: 764.0 
Episode 420 

                                                              

Moviepy - Done !
Moviepy - video ready /home/konrad/Repos/NCML_project/videos/rl-video-episode-499.mp4
Episode 499 ended. Episode reward: 96.0 


