In [1]:
import gym
import torch
from torch.autograd import Variable
import matplotlib.pyplot as plt
import math, time, random, copy
import numpy as np
from IPython.display import clear_output

In [2]:
env = gym.make('CartPole-v1', render_mode='human')

In [3]:
class DQL():
    def __init__(self, state_dim, action_dim, hidden_dim=64, lr=0.05):
        self.criterion = torch.nn.MSELoss()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(state_dim, hidden_dim),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim*2),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(hidden_dim*2, action_dim)
        )
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr)

    def update(self, state, y):
        y_pred = self.model(torch.Tensor(state[0]))
        loss = self.criterion(y_pred, Variable(torch.Tensor(y)))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step

    def predict(self, state):
        return self.model(torch.Tensor(state[0]))

In [4]:
def q_learning(env, model, episodes, gamma=0.9,
            epsilon=0.3, eps_decay=0.99, replay=False,
            replay_size=20, title='DQL', double=False,
            n_update=10, soft=False, verbose=True):
    final = []
    memory = []
    episode_i = 0
    sum_total_replay_time = 0
    for episode in range(episodes):
        episode_i += 1
        if double and not soft:
            if episode % n_update == 0:
                model.target_update()
        if double and soft:
            model.target_update()

        state = env.reset()
        done = False
        total = 0

        while not done:
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                q_values = model.predict(state)
                action = torch.argmax(q_values).item()
            
            # Take action and add reward to total
            next_state, reward, done, _ = env.step(action)[0]
            
            # Update total and memory
            total += reward
            memory.append((state, action, next_state, reward, done))
            q_values = model.predict(state).tolist()
             
            if done:
                break

            # Update network weights using the last step only
            q_values_next = model.predict(next_state)
            q_values[action] = reward + gamma * torch.max(q_values_next).item()
            model.update(state, q_values)

            state = next_state
        
        # Update epsilon
        epsilon = max(epsilon * eps_decay, 0.01)
        final.append(total)
        
        if verbose:
            print("episode: {}, total reward: {}".format(episode_i, total))
        
    return final

In [5]:
# Number of states
n_state = env.observation_space.shape[0]
# Number of actions
n_action = env.action_space.n
# Number of episodes
episodes = 500
# Number of hidden nodes
n_hidden = 50
# Learning rate
lr = 0.001

In [6]:
# Get DQL results
simple_dqn = DQL(n_state, n_action, n_hidden, lr)
simple = q_learning(env, simple_dqn, episodes, gamma=.9, epsilon=0.3)

episode: 1, total reward: 0.14834623038768768
episode: 2, total reward: 0.17842577397823334
episode: 3, total reward: 0.1950480341911316
episode: 4, total reward: 0.2237825244665146
episode: 5, total reward: 0.18406514823436737
episode: 6, total reward: 0.19504636526107788
episode: 7, total reward: 0.1490022838115692
episode: 8, total reward: 0.20139974355697632
episode: 9, total reward: 0.17696118354797363
episode: 10, total reward: 0.19098158180713654
episode: 11, total reward: 0.18655622005462646
episode: 12, total reward: 0.23297907412052155
episode: 13, total reward: 0.19310952723026276
episode: 14, total reward: 0.22422964870929718
episode: 15, total reward: 0.16884903609752655
episode: 16, total reward: 0.15550369024276733
episode: 17, total reward: 0.18909287452697754
episode: 18, total reward: -0.1518300622701645
episode: 19, total reward: 0.23897406458854675
episode: 20, total reward: 0.14733988046646118
episode: 21, total reward: 0.23687560856342316
episode: 22, total reward

: 