In [None]:
import torch
from torch import nn
import gym

import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
from collections import deque

In [None]:
# Build the environment once; every later cell reuses `env`.
env_id = 'CartPole-v1'
env = gym.make(env_id)

# Sizes the Q-network needs: one input per observation component and one
# output per discrete action.
len_obs_space = env.observation_space.shape[0]
n_actions = env.action_space.n

print('n_actions = {}'.format(n_actions))
print('len_obs_space = {}'.format(len_obs_space))

# Run on the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

References:

- Cartpole introduction article: https://gsurma.medium.com/cartpole-introduction-to-reinforcement-learning-ed0eb5b58288
- Code repository: https://github.com/gsurma/cartpole

In [None]:
class DQN(nn.Module):
    """Fully connected Q-network mapping an observation to one value per action.

    The module also owns its Adam optimizer and MSE loss so that the agent can
    reach them through ``model.optimizer`` and ``model.loss``.

    Parameters
    ----------
    lr : float
        Learning rate handed to the Adam optimizer.
    device : torch.device
        Stored on the instance as ``self.device``; the module is NOT moved
        here, the caller is expected to call ``.to(device)``.
    in_features : int, optional
        Number of observation components. Defaults to the notebook-level
        ``len_obs_space``.
    out_features : int, optional
        Number of actions (network outputs). Defaults to the notebook-level
        ``n_actions``.
    hidden_layer_size : int, optional
        Width of the single hidden layer (default 64).
    """

    def __init__(self, lr, device, in_features=None, out_features=None,
                 hidden_layer_size=64) -> None:
        super().__init__()
        # Fall back to the sizes computed from the environment in the setup
        # cell, so existing ``DQN(lr=..., device=...)`` calls keep working.
        if in_features is None:
            in_features = len_obs_space
        if out_features is None:
            out_features = n_actions

        self.flatten = nn.Flatten()
        # NOTE: the attribute name ``seq_relu`` is part of the state_dict keys
        # of saved checkpoints; renaming it would break loading them.
        self.seq_relu = nn.Sequential(
            nn.Linear(in_features, hidden_layer_size),
            nn.ReLU(),
            nn.Linear(hidden_layer_size, out_features),
        )
        self.learning_rate = lr
        # The optimizer is created after the layers so that
        # ``self.parameters()`` already contains them.
        self.optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        self.loss = nn.MSELoss()
        self.device = device

    def forward(self, x):
        """Return the Q-values for a batch of observations.

        ``x`` is flattened to ``(batch, features)`` before the linear layers;
        the result has one column per action.
        """
        x = self.flatten(x)
        return self.seq_relu(x)

# learning_rate = 0.05
# q_approx = DQN(lr=learning_rate, device=device).to(device)
# q_target = DQN(lr=learning_rate, device=device).to(device)
# # load same weights as approx
# q_target.load_state_dict(q_approx.state_dict())
# q_target.eval()
# print(q_approx)
# print(q_target)


In [None]:
import random

class Agent:
    """Epsilon-greedy DQN agent that trains ``q_model`` from replayed transitions.

    Parameters
    ----------
    q_model : torch.nn.Module
        Network being trained. It must expose ``device``, ``loss`` and
        ``optimizer`` attributes, all of which are used here.
    q_target : torch.nn.Module
        Network used to compute the bootstrap targets. It is only evaluated,
        never optimized, by this class; the caller synchronizes its weights.
    env
        Gym-style environment with a discrete action space (``action_space.n``).
    epsilon : float
        Probability of picking a random action in ``get_action``.
    gamma : float
        Discount factor applied to the bootstrapped next-state value.
    batch_size : int
        Minibatch size; learning starts once the replay memory holds at least
        this many transitions.
    memory : optional
        Replay buffer supporting ``append``, ``len`` and integer indexing
        (e.g. ``collections.deque``). When omitted, the notebook-level buffer
        ``D`` is looked up at the start of every episode.

    Attributes
    ----------
    train_data : list
        Total reward of every episode run through ``train_episode``.
    """

    def __init__(self, q_model, q_target, env, epsilon, gamma, batch_size, memory=None) -> None:
        self.q_model = q_model
        self.q_target = q_target
        self.env = env
        self.n_actions = self.env.action_space.n
        self.epsilon = epsilon
        self.device = self.q_model.device
        # A plain float keeps the target computation in the tensors' dtype
        # even when gamma arrives as a numpy scalar.
        self.gamma = float(gamma)
        # Kept for backward compatibility; not updated by this class.
        self.steps_done = 0
        self.train_data = []
        self.BATCH_SIZE = batch_size
        self.memory = memory

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------
    def _replay_memory(self):
        """Return the replay buffer: the injected one, else the global ``D``."""
        if self.memory is not None:
            return self.memory
        return D  # notebook-level buffer created by the training cell

    def _to_obs_tensor(self, obs):
        """Convert one raw observation to a float32 tensor with a batch axis."""
        return torch.tensor(np.asarray(obs), device=self.device, dtype=torch.float32).unsqueeze(0)

    def _reset_env(self):
        """Reset the environment and return the initial observation.

        Accepts both reset signatures: a bare observation, or an
        ``(observation, info)`` tuple.
        """
        result = self.env.reset()
        if isinstance(result, tuple):
            return result[0]
        return result

    def _step_env(self, action):
        """Step the environment and return ``(obs_next, reward, done)``.

        Accepts both the 4-tuple ``(obs, reward, done, info)`` and the 5-tuple
        ``(obs, reward, terminated, truncated, info)`` step signatures.
        """
        result = self.env.step(action)
        if len(result) == 5:
            obs_next, reward, terminated, truncated, _info = result
            return obs_next, reward, bool(terminated or truncated)
        obs_next, reward, done, _info = result
        return obs_next, reward, bool(done)

    def _td_targets(self, reward_batch, obs_next_batch, done_batch):
        """Compute ``r + gamma * max_a Q_target(s', a)``, masked at episode ends.

        All arguments have one row per transition; ``reward_batch`` and
        ``done_batch`` have a single column. The result is detached from the
        autograd graph because the target network is not trained here.
        """
        with torch.no_grad():
            # Per-transition maximum over actions of the NEXT state's values.
            next_q_max = self.q_target(obs_next_batch).max(dim=1, keepdim=True)[0]
        # Arithmetic on bool tensors is not supported, so build a float mask.
        not_done = (~done_batch.bool()).to(next_q_max.dtype)
        return reward_batch + self.gamma * next_q_max * not_done

    def _learn(self, memory):
        """Sample one minibatch from ``memory`` and take one optimizer step."""
        mini_batch = random.choices(memory, k=self.BATCH_SIZE)

        # torch.cat keeps every batch on the device of the stored tensors;
        # the reshapes make the code agnostic to how scalars were stored.
        obs_batch = torch.cat([t[0] for t in mini_batch], dim=0).float()
        action_batch = torch.cat([t[1].reshape(1) for t in mini_batch], dim=0).reshape(-1, 1)
        reward_batch = torch.cat([t[2].reshape(1, 1) for t in mini_batch], dim=0).float()
        obs_next_batch = torch.cat([t[3] for t in mini_batch], dim=0).float()
        done_batch = torch.cat([t[4].reshape(1, 1) for t in mini_batch], dim=0)

        # Q-value of the action that was actually taken in each transition.
        q_values = self.q_model(obs_batch).gather(1, action_batch)
        y = self._td_targets(reward_batch, obs_next_batch, done_batch)
        loss = self.q_model.loss(q_values, y)

        # Backpropagation
        self.q_model.optimizer.zero_grad()
        loss.backward()
        self.q_model.optimizer.step()

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def train_episode(self):
        """Run one episode, storing transitions and learning after every step.

        Each step appends ``(obs, action, reward, obs_next, done)`` (all
        tensors) to the replay memory. Once the memory holds at least
        ``BATCH_SIZE`` transitions, one minibatch update is performed per
        step. The episode's total reward is appended to ``train_data``.
        """
        memory = self._replay_memory()
        obs = self._to_obs_tensor(self._reset_env())
        rewards = []
        done = False

        while not done:
            # get action to execute based on the CURRENT state
            action = self.get_action(obs)
            # take action, go to next time step
            obs_next, reward, done = self._step_env(action.item())
            rewards.append(reward)

            # convert to tensors
            obs_next = self._to_obs_tensor(obs_next)
            reward_t = torch.tensor([[reward]], device=self.device, dtype=torch.float32)
            done_t = torch.tensor([[done]], device=self.device, dtype=torch.bool)

            # store transition
            memory.append((obs, action, reward_t, obs_next, done_t))

            # advance the state; without this every action and transition
            # would be based on the episode's first observation
            obs = obs_next

            if len(memory) >= self.BATCH_SIZE:
                self._learn(memory)

        self.train_data.append(np.sum(np.array(rewards)))

    def get_action(self, obs):
        """Pick an action for a single observation with an epsilon-greedy policy.

        ``obs`` is a tensor holding one observation with a leading batch axis.
        Returns an int64 tensor of shape ``(1,)`` on ``self.device``.
        """
        if random.random() < self.epsilon:
            # explore: uniformly random action
            choice = random.randrange(int(self.n_actions))
            return torch.tensor([choice], device=self.device, dtype=torch.long)
        # exploit: action with the highest predicted Q-value
        with torch.no_grad():
            q_vals = self.q_model(obs.float())
        return torch.argmax(q_vals).reshape(1).to(self.device)


In [None]:
# -------------------------------
# hyperparameters

# exploration: epsilon starts at EPSILON_START and decays exponentially
# (rate EPS_DECAY per episode) towards EPSILON_MIN
EPSILON_START = 1
EPSILON_MIN = 0.05
EPS_DECAY = 0.001
epsilon = EPSILON_START

# learning: discount factor, episodes per run, and the number of episodes
# between copies of the online weights into the target network
gamma = 0.999
EPISODES = 10000
TARGET_UPDATE = 10

# replay memory: capacity of the buffer and size of each sampled minibatch
MEMORY_SIZE = 256
MINI_BATCH_SIZE = 16
# -------------------------------

We need to tune the following hyperparameters:
- epsilon (epsilon_min)
- gamma (discount rate)
- learning rate
- target update interval
- mini-batch size

In [None]:
# create range for hyperparams
# np.linspace with an explicit point count is used instead of np.arange with a
# float step: with non-integer steps the length of an arange result depends on
# floating-point rounding.
learning_rates = np.linspace(0.001, 0.0015, num=2)
# 21 evenly spaced discount factors from 0.98 to 1.0 (inclusive). Rounding to
# three decimals keeps the values, and the checkpoint file names built from
# str(gamma), free of float noise such as 0.9810000000000001.
gammas = np.round(np.linspace(0.98, 1.0, num=21), 3)



In [None]:
# Per-episode reward for every configuration of the sweep:
# axis 0 = learning rate, axis 1 = gamma, axis 2 = episode.
rewards = np.zeros(
    shape=(len(learning_rates), len(gammas), EPISODES),
    dtype=float,
)

In [None]:
# Grid search over (learning rate, gamma): trains a fresh agent for every
# pair, records its per-episode rewards in `rewards`, and saves a checkpoint.
BASE_PATH = "./dqn_models/"
REWARDS_DIR = "./dqn_rewards/"
# Create both output folders before the (long) sweep starts, so that a missing
# directory cannot make torch.save / open fail after training has finished.
os.makedirs(BASE_PATH, exist_ok=True)
os.makedirs(REWARDS_DIR, exist_ok=True)

for index_lr, learning_rate in enumerate(learning_rates):
    print('-'*5)
    print('lr = {}'.format(learning_rate))
    # The loop variable is deliberately not called `gamma`, so the
    # notebook-level `gamma` hyperparameter is not overwritten by the sweep.
    for index_gamma, gamma_value in enumerate(gammas):
        q_approx = DQN(lr=learning_rate, device=device).to(device)
        q_target = DQN(lr=learning_rate, device=device).to(device)
        # load same weights as approx
        q_target.load_state_dict(q_approx.state_dict())
        q_target.eval()

        # reset replay memory (the agent reads this notebook-level buffer)
        D = deque(maxlen=MEMORY_SIZE)
        agent = Agent(q_approx, q_target, env, epsilon, gamma_value, MINI_BATCH_SIZE)
        for episode in range(EPISODES):
            # train one episode
            agent.train_episode()

            # update epsilon value: exponential decay towards EPSILON_MIN
            agent.epsilon = EPSILON_MIN + (EPSILON_START - EPSILON_MIN)*np.exp(-EPS_DECAY*episode)

            # update target parameters
            if episode % TARGET_UPDATE == 0:
                q_target.load_state_dict(q_approx.state_dict())
                q_target.eval()

        # agent.train_data holds one total reward per episode
        rewards_per_episode = agent.train_data
        rewards[index_lr, index_gamma] = rewards_per_episode

        total_rewards = np.sum(rewards[index_lr, index_gamma,:])
        average_reward = total_rewards/EPISODES
        print("Score over time: {}, gamma={}".format(average_reward, gamma_value))

        # save models: online-network weights plus optimizer state, keyed by
        # the configuration they belong to
        model_info = str(learning_rate)+ '_' + str(gamma_value)
        path = os.path.join(BASE_PATH, 'model' + '_' + model_info + '.pt')
        q_name = 'q_approx_state_dict_' + model_info
        optim_name = 'optim_state_dict_' + model_info
        model = {
            q_name: agent.q_model.state_dict(),
            optim_name: agent.q_model.optimizer.state_dict()
        }
        torch.save(model, path)
    print('-'*5)
# save rewards to disk
with open(os.path.join(REWARDS_DIR, "dqn_rewards.pkl"),'wb') as f:
    pickle.dump(rewards, f)


In [None]:
# plot average reward data
# print(rewards[:,:,:].shape)
import pandas as pd

# Rewards of the first (learning rate, gamma) configuration, smoothed with a
# 100-episode rolling mean (the first 99 points are NaN and are not drawn).
d = pd.Series(rewards[0,0,:])
rolling_mean = d.rolling(100).mean()
episode_axis = np.arange(EPISODES)
plt.plot(episode_axis, rolling_mean)
plt.xlabel('episodes')
plt.ylabel('average reward')
# plt.plot(rewards[:,:])

In [None]:
# plt.plot(agent.train_data)
# plt.show()

In [None]:
# for param in q_approx.parameters():
#     print(param.size())

In [None]:
# RENDER = False
# total_rewards = []
# for episode in range(EPISODES):
#     done = False
#     obs = agent.env.reset()
#     obs = torch.tensor([obs], device=agent.device, dtype=torch.float64)
#     rewards = []
#     while not done:
#         # get action to execute based on state
#         action = agent.get_action(obs.float())
#         #  take action, go to next time step
#         obs_next, reward, done, info = agent.env.step(action.item())
#         obs_next = torch.tensor([obs_next], device=agent.device, dtype=torch.float64)
#         obs = obs_next
#         rewards.append(reward)
#         if RENDER:
#             env.render()

#     rewards = np.array(rewards)
#     total_rewards.append(np.sum(rewards))

# plt.plot(total_rewards)
# plt.show()

In [None]:
'''

learning_rate = 0.0001
EPSILON_MIN = 0.05
EPS_DECAY = 0.001
gamma= 0.99
TARGET_UPDATE = 10
EPISODES = 10000

MEMORY_SIZE = 256
D = deque(maxlen=MEMORY_SIZE)
MINI_BATCH_SIZE = 16
epsilon=1



q_approx = DQN(lr=learning_rate, device=device).to(device)
q_target = DQN(lr=learning_rate, device=device).to(device)
# load same weights as approx
q_target.load_state_dict(q_approx.state_dict())
q_target.eval()
# reset replay memory
D = deque(maxlen=MEMORY_SIZE)
agent = Agent(q_approx, q_target, env, epsilon, gamma, MINI_BATCH_SIZE)
for episode in range(EPISODES):

    if episode % 1000 == 0:
        print('-'*5)
        print('Episode = {}'.format(episode))
        print('Epsilon = {}'.format(agent.epsilon))
        print('-'*5)
    # train one episode
    agent.train_episode()

    # update epsilon value
    # agent.epsilon = EPSILON_MIN + (EPSILON_START - EPSILON_MIN)*np.exp(-EPS_DECAY*episode)
    if agent.epsilon > 0.05 :
        agent.epsilon -= (1 / 5000)

    # # update target paramters
    if episode % TARGET_UPDATE == 0:
        q_target.load_state_dict(q_approx.state_dict())
        q_target.eval()

# shallow copy
# rewards_per_episode = agent.train_data.copy()
# alias
rewards_per_episode = agent.train_data

total_rewards = np.sum(rewards_per_episode)
print("Score over time: {}, gamma={}".format( total_rewards/EPISODES, gamma))
average_reward = total_rewards/EPISODES

plt.plot(rewards_per_episode)


'''