# Resources
Pytorch tutorial: https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

In [49]:
import gymnasium as gym
import matplotlib.pyplot as plt
import time
import pygame
import seaborn as sns
from collections import deque, namedtuple
import random
from torch import nn
from torch.nn import functional as F
import torch
import math
from itertools import count

### CartPole environment

In [2]:
env = gym.make('CartPole-v1', render_mode = 'human')

#Action
#0: move left
#1: move right

#Observation       
#Cart Position, Cart Velocity, Pole Angle, Pole Angular Velocity 

#Goal: is to keep the pole upright as long as possible. There's a no terminal state for CartPole

#Rewards
#+1 for every step the pole is upright position
obs = env.reset()
MAX_TIMESTEP = 1000
timestep = 0
terminated = False
history = []

while timestep < MAX_TIMESTEP and not terminated:
    random_action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(random_action)
    env.render()
    timestep += 1
    history.append(reward)

env.close()
pygame.quit()

### Replay Memory


stores the (s,a,r,s') values into a deque with a fixed length (drop furthest element when capacity is reached)
Why?

- breaking correlated sequential data. E.g: actions take earlier will greatly influence the current actions, result in biased learning
- By randomly sampling transitions from this memory during training, the agent learns from a diverse and uncorrelated dataset. 
- allows the agent to revisit and learn from past experiences multiple times. This is particularly beneficial when dealing with sparse rewards, where rewards might be infrequent or delayed.

In [8]:
Tuple = namedtuple("Tuple", ('x', 'y', 'z'))
a = Tuple(1,2,3)
print(a)

Tuple(x=1, y=2, z=3)


In [31]:
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):
    def __init__(self, capacity) -> None:
        self.memory = deque(maxlen=capacity)
    
    def push(self, *args):
        self.memory.append(Transition(*args))
    
    def sample(self, batchsize):
        return random.sample(self.memory, batchsize)
    
    def clear(self):
        self.memory.clear()
    
    def __len__(self):
        return len(self.memory)

#test it
rep_mem = ReplayMemory(capacity=10)

rep_mem.push('s1', 'action 1', 's2', 35)

print('memory len = ', len(rep_mem))


memory len =  1


# DQN

### Objective
train a policy that tries to maximize the discounted, cumulative reward

### Gamma, discounted factor
lower γ makes rewards from the uncertain far future less important for our agent than the ones in the near future that it can be fairly confident about.

### minimize the TD equation using Huber Loss. 
The Huber loss acts like the mean squared error when the error is small, but like the mean absolute error when the error is large - this makes it more robust to outliers when the estimates of Q are very noisy.

In [4]:
class DQN(nn.Module):

    def __init__(self, n_observations, n_actions):
        #what's the purpose of this?
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.LazyLinear(128)
        self.layer3 = nn.LazyLinear(n_actions)
    
    def forward(self, X):
        X = F.relu(self.layer1(X))
        X = F.relu(self.layer2(X))
        return self.layer3(X)

In [6]:
dqn = DQN(4, 4)
X = torch.rand(4,4)

y = dqn(X)
print("y shape = ", y.shape)

y shape =  torch.Size([4, 4])


In [23]:
a = torch.randint(1,100, (4,4))
print(a)
maxaction = a.max(1).indices
print(maxaction)

maxaction = maxaction.view(1,1)

print(maxaction)


tensor([[99,  6, 92, 88],
        [70, 65, 20, 26],
        [53, 45, 16, 98],
        [28, 29, 59, 16]])
tensor([0, 0, 3, 2])


RuntimeError: shape '[1, 1]' is invalid for input of size 4

In [86]:
a = torch.rand(10,4)
print(a)
print(a.shape)
actions_idx = torch.randint(0,4,(10,4))
a = a.gather(actions_idx)

print(a)

tensor([[0.0381, 0.9885, 0.9034, 0.9378],
        [0.9217, 0.7292, 0.4444, 0.5068],
        [0.1608, 0.1414, 0.3183, 0.4310],
        [0.6017, 0.6649, 0.8968, 0.8986],
        [0.9200, 0.8781, 0.7053, 0.6742],
        [0.5678, 0.3205, 0.0467, 0.3574],
        [0.2037, 0.2012, 0.8730, 0.9202],
        [0.2821, 0.9785, 0.4867, 0.5124],
        [0.5163, 0.3850, 0.4200, 0.7734],
        [0.0230, 0.2200, 0.0613, 0.9619]])
torch.Size([10, 4])


TypeError: gather() received an invalid combination of arguments - got (Tensor), but expected one of:
 * (int dim, Tensor index, *, bool sparse_grad)
 * (name dim, Tensor index, *, bool sparse_grad)


In [90]:
BATCH_SIZE = 128
GAMMA = 0.99
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1000
TAU = 0.005
LR = 1e-4
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [95]:
#Optimization
def optimize_model(memory, policy_net, target_net, \
    batch_size, gamma, device, debug = False):
    #can't optimize yet
    if len(memory) < batch_size: return

    transitions = memory.sample(batch_size)
    #this converts an array of transitions into transitions of action array, state arrays, ... 
    #Transition(state = (1,2,3...), action=(4,5,6,...), next_state=(7,8,9,...))
    batch = Transition(*zip(*transitions))

    #NOTE: Make a mask of non-final states. If non-final: True, if final state: True
    #return a tensor of ([True, True, ..., False, True,...])
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), device = device)
    test = [s for s in batch.next_state if s is not None]

    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])

    #get state_batch, action_batch, reward_batch, 
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    if debug:
        print('non final mask shape = ', non_final_mask.shape)
        print('non final next states shape = ', non_final_next_states.shape)
        print('state batch shape = ', state_batch.shape)
        print('action batch shape = ', action_batch.shape)
        print('reward state batch shape = ', reward_batch.shape)
        print()

    #Select actions that would've been taken by the model
    state_action_values = policy_net(state_batch).gather(1, action_batch)
    print('state action values shape = ', state_action_values.shape)

    #we want to mask the terminal states to 0s
    next_state_values = torch.zeros(batch_size, device = device)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1).values
    
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    print('next state values shape = ', next_state_values.shape)
    print('expected state actions values shape = ', expected_state_action_values.shape)

    #Compute Huber Loss
    # criterion = nn.SmoothL1Loss()
    # loss =criterion(state_action_values, expected_state_action_values)

#test optimze model function

policy_net = DQN(4,4).to(DEVICE)
target_net = DQN(4,4).to(DEVICE)
target_net.load_state_dict(policy_net.state_dict())

#test
memory = ReplayMemory(capacity = 10000)
memory.clear()

for _ in range(1000):
    state = torch.tensor([random.randint(0,3) for _ in range(4)], dtype = torch.float32).unsqueeze(0)
    action = torch.tensor([random.randint(0,3)]).unsqueeze(0)
    next_state = torch.tensor([random.randint(0,3) for _ in range(4)], dtype = torch.float32).unsqueeze(0)
    reward = torch.tensor([random.randint(0,3)], dtype = torch.float32).unsqueeze(0)
    memory.push(state, action, next_state, reward)

optimize_model(memory, policy_net, target_net, BATCH_SIZE, GAMMA, device = DEVICE, debug = True)

non final mask shape =  torch.Size([128])
non final next states shape =  torch.Size([128, 4])
state batch shape =  torch.Size([128, 4])
action batch shape =  torch.Size([128, 1])
reward state batch shape =  torch.Size([128, 1])

state action values shape =  torch.Size([128, 1])
next state values shape =  torch.Size([128])
expected state actions values shape =  torch.Size([128, 128])


In [None]:

env = gym.make('CartPole-v1', render_mode = 'human')

n_actions = env.action_space.n
state, info = env.reset()
n_observations = len(state)

#why use policy net?
policy_net = DQN(n_observations, n_actions).to(DEVICE)
#why use target net?
target_net = DQN(n_observations, n_actions).to(DEVICE)

#copy policy net weights into target net
target_net.load_state_dict(policy_net.state_dict())

optimzer = torch.optim.Adam(policy_net.parameters(), lr = LR, amsgrad = True)
memory = ReplayMemory(capacity = 10000)

steps_done = 0



In [59]:
#explore and exploit epsilon
def select_action(state):
    global steps_done
    sample = random.random()
    #epsilon decay rate
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1.0 * steps_done / EPS_DECAY)

    if sample > eps_threshold:
        #best action  
        with torch.no_grad():
            return policy_net(state).max(1).indicies.view(1,1)
    else:
        #random action
        return torch.tensor([[env.action_space.sample()]], device = DEVICE, dtype = torch.long) 

memory = ReplayMemory(capacity = 10000)
memory.clear()
 
#Training loop
if DEVICE == 'cuda':
    num_episodes = 10000
else:
    num_episodes = 50

for i_episode in range(num_episodes):

    state, info = env.reset()
    state = torch.tensor(state, dtype = torch.float32, device = DEVICE).unsqueeze(0)
    print('init state = ' , state)

    for t in count():

        action = select_action(state)
        obs, reward, terminated, truncated, _ = env.step(action.item())
        # print(obs)
        reward = torch.tensor([reward])
        print('reward ', reward)
        done = terminated or truncated

        if terminated:
            next_state = None
        else:
            next_state = torch.tensor(obs, dtype = torch.float32, device = DEVICE).unsqueeze(0)
        
        memory.push(state, action, next_state, reward)

        state = next_state

        optimize_model()

        if t == 5:
            #debug
            print(memory.memory)
            break

    break



init state =  tensor([[-0.0024, -0.0013, -0.0259,  0.0245]])
reward  tensor([1.])
reward  tensor([1.])
reward  tensor([1.])
reward  tensor([1.])
reward  tensor([1.])
reward  tensor([1.])
deque([Transition(state=tensor([[-0.0024, -0.0013, -0.0259,  0.0245]]), action=tensor([[1]]), next_state=tensor([[-0.0024,  0.1942, -0.0255, -0.2762]]), reward=tensor([1.])), Transition(state=tensor([[-0.0024,  0.1942, -0.0255, -0.2762]]), action=tensor([[0]]), next_state=tensor([[ 0.0015, -0.0006, -0.0310,  0.0083]]), reward=tensor([1.])), Transition(state=tensor([[ 0.0015, -0.0006, -0.0310,  0.0083]]), action=tensor([[0]]), next_state=tensor([[ 0.0014, -0.1952, -0.0308,  0.2911]]), reward=tensor([1.])), Transition(state=tensor([[ 0.0014, -0.1952, -0.0308,  0.2911]]), action=tensor([[1]]), next_state=tensor([[-0.0025,  0.0003, -0.0250, -0.0112]]), reward=tensor([1.])), Transition(state=tensor([[-0.0025,  0.0003, -0.0250, -0.0112]]), action=tensor([[0]]), next_state=tensor([[-0.0025, -0.1945, -0.0252, 

### Apply DQN on 3 other different environments:

1. Mountain Car
2. Snake Game
3. Breakout