In [2]:
! pip install gymnasium
! pip install "gymnasium[atari, accept-rom-license]"
! pip install torch
! pip install torchvision
! pip install numpy
! pip install matplotlib



In [13]:
! mkdir logs
! mkdir videos

mkdir: cannot create directory ‘videos’: File exists


In [3]:
#import ale_py
#import shimmy
import gymnasium as gym
import torch
from torch import nn
from torch.utils.data import DataLoader, random_split, Dataset
from torch.nn import functional as F
from torchvision import datasets, transforms
import torchvision.transforms.functional as TF
import random
import copy
import time
import pickle
import torchvision
import matplotlib.pyplot as plt
import math
import json


In [4]:
class CNN(nn.Module):

    def __init__(self):
        super().__init__()   
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU()
        )
        
        self.fc = nn.Sequential(
            nn.Linear(in_features=64*7*7 , out_features=512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=5)
        )

    def forward(self, x):
        conv_out = self.conv(x)
        x = torch.flatten(conv_out, start_dim=1)
        return self.fc(x)
        

In [5]:
class Memory():
    
    def __init__(self,size):
        self.size = size
        self.experiences = []
    
    def sample(self,batch_size):
        return random.choices(self.experiences, k=batch_size)
        
    def add(self, experience):
        self.experiences.append(experience)
        if len(self.experiences) > self.size:
            self.experiences.pop(0)
                    
    def __len__(self):
        return len(self.experiences)

In [14]:
class DQN_agent:


    def __init__(self, lr=0.0001 ,gamma=0.99, epsilon_params=(0.9,0.05,1000)):
        # Get cpu, gpu or mps device for training.
        self.device = (
            "cuda"
            if torch.cuda.is_available()
            else "mps"
            if torch.backends.mps.is_available()
            else "cpu"
        )
        print(f"Using {self.device} device")
        self.pred_NN = CNN().to(self.device)
        self.target_NN = copy.deepcopy(self.pred_NN)
        self.target_NN.eval()
        self.gamma = gamma
        self.epsilon_start = epsilon_params[0]
        self.epsilon_end = epsilon_params[1]
        self.epsilon_decay = epsilon_params[2]
        self.optimizer = torch.optim.RMSprop(self.pred_NN.parameters(), lr=lr)
        self.steps_done = 0
        
    def predict(self, x):
        self.steps_done += 1
        return self.pred_NN.forward(x)
    
    def action(self, pred):
        eps = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.steps_done / self.epsilon_decay)
        return (
            random.randint(0, pred.size(dim=0) - 1)
            if random.random() < eps
            else torch.argmax(pred).item()
        )
    
    def train(self, experience_batch):
        loss_fn = nn.SmoothL1Loss()
        epoch_loss = 0
        states = torch.stack([experience_batch[i][0].squeeze(0) for i in range(len(experience_batch))]).to(self.device)
        actions = torch.tensor([experience_batch[i][1] for i in range(len(experience_batch))])
        rewards = torch.tensor([experience_batch[i][2] for i in range(len(experience_batch))]).to(self.device)#torch.tensor(experience_batch[:][2])
        next_states = torch.stack([experience_batch[i][3].squeeze(0)  for i in range(len(experience_batch))]).to(self.device)
        terminated = torch.tensor([not experience_batch[i][4] for i in range(len(experience_batch))]).to(self.device)
        y = self.estimated_value( rewards, next_states, terminated)
        self.optimizer.zero_grad()
        pred = self.pred_NN(states)
        indicies = torch.LongTensor(actions)
        indicies =indicies.unsqueeze(dim=0).T
        pred = pred.gather(1,indicies.to(self.device))
        loss = loss_fn(y, pred)
        loss.backward()
        self.optimizer.step()
        epoch_loss += loss.item()
        return epoch_loss
        
    def copy(self):
        self.target_NN.load_state_dict(self.pred_NN.state_dict())  
        
    def estimated_value(self, reward, next_state, done):
        with torch.no_grad():# vectorize it
                target_pred = self.target_NN.forward(next_state.to(self.device))
                max_pred = torch.max(target_pred,1)[0].unsqueeze(1)
                done = done.unsqueeze(1)
                target = reward.unsqueeze(1) + self.gamma * torch.mul(max_pred,done)
        return target
        

In [7]:
def adjust_to_torch(t):
    t = t.unsqueeze(dim=0)
    return torch.movedim(t, 3, 1)

In [15]:
#initialize environment
env = gym.make("ALE/Pacman-v5", render_mode="rgb_array")
env.seed(543)
env = gym.wrappers.AtariPreprocessing(env, screen_size=84, grayscale_obs=False, frame_skip=1, noop_max=30)
env = gym.wrappers.RecordVideo(env, './videos', episode_trigger = lambda x: (x+1) % 5 == 0)# PATH
# env = NoopResetEnv(env, noop_max=30)
replay_buffer = Memory(5000)
torch.manual_seed(53407)
actions = range(env.action_space.n)
device = ("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
#hyperparams
max_steps = 9999
training_freq = 1
copying_freq = 10
batch_size = 64

#initialize agent
agent = DQN_agent(lr=2e-4,gamma=0.99)

training_session = 0
max_episode = 5000

#loops until max_time is reached
memory = Memory(10000)

total_steps = 0
last_rewards = []

episodes = []
losses = []
logger = {"episodes":[],"losses":[]}
for episode  in range(1,max_episode):
    #get first states
    state = env.reset()
    state = state[0]/255
    state = torch.Tensor(state)
    state = adjust_to_torch(state)
    #loops until experience_capacity is reached
    episode_reward = 0 
    episode_loss = 0
    steps = 0
    for i in range(1, max_steps):
        #predict q-values and choose action
        with torch.no_grad():
            pred = agent.predict(state.to(device))
        action = agent.action(pred)
        #get next states
        next_state, reward, terminated, truncated, info = env.step(action)
        episode_reward += reward
        if i == max_steps - 1:
            print("Max steps reached.")
        next_state = adjust_to_torch(torch.tensor(next_state)/255)
        experience = [state, action, reward, next_state, terminated] 
        memory.add(experience)
        if terminated or truncated:
            break
        state = next_state      
        if len(memory) > 1000:
            experiences_train = memory.sample(batch_size)
            episode_loss += agent.train(experiences_train)
            training_session += 1
            steps += 1
        total_steps += 1
        steps = i
    #if episode % 100 == 0:
    if episode % copying_freq == 0:
        agent.copy()
    if episode % 2 == 0:
        logger["episodes"].append(episodes)
        logger["losses"].append(losses)
        episodes = []
        losses = []
    if episode % 2 == 0:
        with open('./logs/logger_ddqn.json', 'w') as fp:# PATH
            json.dump(logger, fp)
    last_rewards.append(episode_reward)
    episodes.append(episode_reward)
    losses.append(episode_loss/steps)
    if steps > 0:
      print(f"Episode: {episode} Reward: {episode_reward} loss: {episode_loss/steps} last rewards: {sum(last_rewards)/len(last_rewards)}")
      
torch.save(agent.pred_NN.state_dict(), "./pred.pt")
torch.save(agent.target_NN.state_dict(), "./target.pt")

  logger.warn(


Using cpu device
Episode: 1 Reward: 1.0 loss: 0.0 last rewards: 1.0
Episode: 2 Reward: 1.0 loss: 0.0 last rewards: 1.0
Episode: 3 Reward: 8.0 loss: 0.0021332684939934224 last rewards: 3.3333333333333335


  logger.warn(


Episode: 4 Reward: 2.0 loss: 0.004169668286619468 last rewards: 3.0
Moviepy - Building video /home/konrad/Repos/NCML_project/videos/rl-video-episode-4.mp4.
Moviepy - Writing video /home/konrad/Repos/NCML_project/videos/rl-video-episode-4.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /home/konrad/Repos/NCML_project/videos/rl-video-episode-4.mp4
Episode: 5 Reward: 3.0 loss: 0.0035990033330712485 last rewards: 3.0




Episode: 6 Reward: 3.0 loss: 0.0031959493024910986 last rewards: 3.0
Episode: 7 Reward: 8.0 loss: 0.002635872571649458 last rewards: 3.7142857142857144
Episode: 8 Reward: 4.0 loss: 0.0030655450593332634 last rewards: 3.75
Episode: 9 Reward: 12.0 loss: 0.002775955599896579 last rewards: 4.666666666666667
Moviepy - Building video /home/konrad/Repos/NCML_project/videos/rl-video-episode-9.mp4.
Moviepy - Writing video /home/konrad/Repos/NCML_project/videos/rl-video-episode-9.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /home/konrad/Repos/NCML_project/videos/rl-video-episode-9.mp4
Episode: 10 Reward: 9.0 loss: 0.003384070122228496 last rewards: 5.1
Episode: 11 Reward: 8.0 loss: 0.005966637090273176 last rewards: 5.363636363636363


KeyboardInterrupt: 