In [236]:
#import ale_py
#import shimmy
import gymnasium as gym
import torch
from torch import nn
from torch.utils.data import DataLoader, random_split, Dataset
from torch.nn import functional as F
from torchvision import datasets, transforms
import torchvision.transforms.functional as TF
import random
import copy
import time
import pickle
import torchvision
import matplotlib.pyplot as plt
import math


In [237]:

class CNN(nn.Module):

    def __init__(self):
        super().__init__()   
        self.linear1 = nn.Linear(4, 128)
        self.activation = nn.ReLU()
        self.linear2 = nn.Linear(128,128)
        self.activation = nn.ReLU()
        self.linear3 = nn.Linear(128,2)

    def forward(self, x):
        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)
        x = self.activation(x)
        x = self.linear3(x)
        
        return x


In [238]:
class Memory():
    
    def __init__(self,size):
        self.size = size
        self.experiences = []
    
    def sample(self,batch_size):
        return random.choices(self.experiences, k=batch_size)
        
    def add(self, experience):
        self.experiences.append(experience)
        if len(self.experiences) > self.size:
            self.experiences.pop(0)
                    
    def __len__(self):
        return len(self.experiences)

In [239]:
class DQN_agent:


    def __init__(self, lr=0.0001 ,gamma=0.99, epsilon_params=(0.9,0.05,1000)):
        # Get cpu, gpu or mps device for training.
        self.device = (
            "cuda"
            if torch.cuda.is_available()
            else "mps"
            if torch.backends.mps.is_available()
            else "cpu"
        )
        print(f"Using {self.device} device")
        self.pred_NN = CNN().to(self.device)
        self.target_NN = copy.deepcopy(self.pred_NN)
        self.target_NN.eval()
        self.gamma = gamma
        self.epsilon_start = epsilon_params[0]
        self.epsilon_end = epsilon_params[1]
        self.epsilon_decay = epsilon_params[2]
        self.optimizer = torch.optim.AdamW(self.pred_NN.parameters(), lr=lr,amsgrad=True)
        self.steps_done = 0
        
    def predict(self, x):
        self.steps_done += 1
        return self.pred_NN.forward(x)
    
    def action(self, pred):
        eps = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.steps_done / self.epsilon_decay)
        return (
            random.randint(0, pred.size(dim=0) - 1)
            if random.random() < eps
            else torch.argmax(pred).item()
        )
    
    def train(self, experience_batch):
        loss_fn = nn.SmoothL1Loss()
        epoch_loss = 0
        states = torch.stack([experience_batch[i][0] for i in range(len(experience_batch))])
        actions = torch.tensor([experience_batch[i][1] for i in range(len(experience_batch))])
        rewards = torch.tensor([experience_batch[i][2] for i in range(len(experience_batch))])#torch.tensor(experience_batch[:][2])
        next_states = torch.stack([experience_batch[i][3] for i in range(len(experience_batch))])
        terminated = torch.tensor([not experience_batch[i][4] for i in range(len(experience_batch))])
        y = self.estimated_value( rewards, next_states, terminated)
        self.optimizer.zero_grad()
        pred = self.pred_NN(states)
        indicies = torch.LongTensor(actions)
        indicies =indicies.unsqueeze(dim=0).T
        pred = pred.gather(1,indicies)
        loss = loss_fn(y, pred)
        loss.backward()
        self.optimizer.step()
        epoch_loss += loss.item()
        return epoch_loss
        
    def copy(self):
        self.target_NN.load_state_dict(self.pred_NN.state_dict())  
        
    def estimated_value(self, reward, next_state, done):
        with torch.no_grad():# vectorize it
                target_pred = self.target_NN.forward(next_state)
                max_pred = torch.max(target_pred,1)[0].unsqueeze(1)
                done = done.unsqueeze(1)
                target = reward + self.gamma * torch.mul(max_pred,done)
        return target
        

In [240]:
#initialize environment
env = gym.make("CartPole-v1",render_mode="rgb_array")
actions = range(env.action_space.n)

#hyperparams
max_steps = 9999
training_freq = 1
copying_freq = 200
batch_size = 128

#initialize agent
agent = DQN_agent()

training_session = 0
max_episode = 5000

#loops until max_time is reached
memory = Memory(10000)

total_steps = 0
for episode  in range(1,max_episode):
    #get first states
    state, info = env.reset()
    #loops until experience_capacity is reached
    episode_reward = 0 
    episode_loss = 0
    for i in range(1, max_steps):
        #predict q-values and choose action
        with torch.no_grad():
            pred = agent.predict(torch.tensor(state))
        action = agent.action(pred)
        #get next states
        next_state, reward, terminated, truncated, info = env.step(action)
        episode_reward += reward
        if i == max_steps - 1:
            print("Max steps reached.")
        next_state = torch.from_numpy(next_state)
        experience = [torch.tensor(state), action, reward, torch.tensor(next_state), terminated] 
        memory.add(experience)
        if terminated or truncated:
            break
        state = next_state      
        if len(memory) > batch_size:
            experiences_train = memory.sample(batch_size)
            episode_loss += agent.train(experiences_train)
            training_session += 1
    #if episode % 100 == 0:
    if episode % copying_freq == 0:
        agent.copy()
    print(f"Episode: {episode} Reward: {episode_reward} loss: {episode_loss/steps}")

Using cpu device
Episode: 1 Reward: 21.0 loss: 0.0
Episode: 2 Reward: 19.0 loss: 0.0
Episode: 3 Reward: 23.0 loss: 0.0
Episode: 4 Reward: 45.0 loss: 0.0


  experience = [torch.tensor(state), action, reward, torch.tensor(next_state), terminated]
  pred = agent.predict(torch.tensor(state))
  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)


Episode: 5 Reward: 24.0 loss: 0.3849935084581375
Episode: 6 Reward: 22.0 loss: 2.3949854224920273
Episode: 7 Reward: 15.0 loss: 1.308267280459404
Episode: 8 Reward: 25.0 loss: 1.7614447996020317
Episode: 9 Reward: 16.0 loss: 0.8083614595234394
Episode: 10 Reward: 28.0 loss: 1.0289516486227512
Episode: 11 Reward: 22.0 loss: 0.47228330560028553
Episode: 12 Reward: 37.0 loss: 0.41656651766970754
Episode: 13 Reward: 9.0 loss: 0.0514238765463233
Episode: 14 Reward: 12.0 loss: 0.058012235909700394
Episode: 15 Reward: 16.0 loss: 0.06746982643380761
Episode: 16 Reward: 13.0 loss: 0.04484356287866831
Episode: 17 Reward: 38.0 loss: 0.10949288681149483
Episode: 18 Reward: 17.0 loss: 0.03647244430612773
Episode: 19 Reward: 12.0 loss: 0.022146382834762335
Episode: 20 Reward: 12.0 loss: 0.01889724109787494
Episode: 21 Reward: 13.0 loss: 0.017699699848890305
Episode: 22 Reward: 14.0 loss: 0.01861540472600609
Episode: 23 Reward: 11.0 loss: 0.011669886764138937
Episode: 24 Reward: 14.0 loss: 0.01336200

KeyboardInterrupt: 

In [None]:

print(f'*****stopped training after {elapsed_time} seconds*****\n')

print('*****Plot filters in first layer*****\n')
kernels = agent.pred_NN.stack[0].weight.detach().clone()
kernels = kernels - kernels.min()
kernels = kernels / kernels.max()
filter_img = torchvision.utils.make_grid(kernels, nrow = 8)
plt.imshow(filter_img.permute(1, 2, 0))
plt.show() 

print('*****Pickle Dumping model*****\n')

file = open('safetyPickleDump', 'wb')
pickle.dump(agent, file)
file.close()

print("*****Saving model*****")
torch.save(agent.pred_NN.state_dict(), "model")

env.close()

In [192]:
A = [[1,2,3,4],[1,2,3,4],[1,2,3,4]]
A[0][:]

[1, 2, 3, 4]

In [72]:
A = torch.tensor([[1,2],[3,4],[5,6],[7,8]])
indicies = torch.LongTensor([0,1,0,1])
indicies =indicies.unsqueeze(dim=0).T

In [73]:
indicies

tensor([[0],
        [1],
        [0],
        [1]])

In [74]:
A.gather(1,indicies)

tensor([[1],
        [4],
        [5],
        [8]])