basics of DQN taken from: https://towardsdatascience.com/automating-pac-man-with-deep-q-learning-an-implementation-in-tensorflow-ca08e9891d9c

In [1]:
import numpy as np
import gym

In [6]:
import time
import IPython.display as ipd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from tqdm import tqdm
import gc
import copy
import pandas as pd
from time import time

%matplotlib inline

In [7]:
import torch
from torch import nn

from torch.utils.data import Dataset, DataLoader
from torchvision.models import vgg16

from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau


### Functions

In [8]:
def preprocess_observation(obs):
    """Convert a raw colour frame into a normalised single-channel tensor.

    The frame is cropped/subsampled (rows 1, 3, ..., 175 and every second
    column), averaged over its last (colour) axis, and rescaled with
    ``(x - 128) / 128``. Before rescaling, every pixel whose floored grey
    level equals 54 is set to 0.

    Args:
        obs: array indexed as ``obs[row, col, channel]``. After cropping it
            must contain exactly 88 * 80 pixels for the final reshape to
            succeed (presumably a 210x160x3 Atari frame -- confirm against
            the environment).

    Returns:
        ``torch.Tensor`` of shape ``(1, 1, 88, 80)``.
    """
    # NOTE(review): 54 is presumably the grey level of a colour that should
    # be blanked out to improve contrast -- confirm which one.
    masked_level = 54.

    # Crop + downsample in one slice, then collapse the colour axis.
    grey = obs[1:176:2, ::2].mean(axis=2)

    # Blank out the selected grey level.
    hit = np.floor(grey) == masked_level
    grey[hit] = 0

    # Rescale so that 128 maps to 0 and 0 maps to -1.
    normalised = (grey - 128) / 128
    return torch.tensor(normalised.reshape(1, 1, 88, 80))

In [9]:
def choose_epsilon_greedy_action(state, dqn, epsilon=0.1, greedy=False, action_space_size=9):
    """Pick one action for ``state`` with an epsilon-greedy policy.

    Args:
        state: input passed unchanged to ``dqn``.
        dqn: callable returning a tensor of Q-values for ``state``.
        epsilon: a uniform draw from ``[0, 1)`` that is NOT greater than
            ``epsilon`` triggers a random action.
        greedy: when true, always take the arg-max action (no random draw
            is consumed).
        action_space_size: random actions are drawn from
            ``[0, action_space_size)``.

    Returns:
        The chosen action index: the arg-max over all Q-values returned by
        ``dqn`` (flattened), or a uniformly random index when exploring.
    """
    # Decide first, so the network is only evaluated when its output is used.
    exploit = greedy or np.random.rand() > epsilon
    if not exploit:
        return np.random.randint(0, action_space_size)

    # Action selection never needs gradients; skip autograd bookkeeping.
    with torch.no_grad():
        q_values = dqn(state).detach().numpy()
    return np.argmax(q_values)

In [10]:
def choose_epsilon_greedy_multiple_actions(states, dqn, epsilon=0.1, greedy=False, action_space_size=9):
    """Pick one epsilon-greedy action per state in a batch.

    Args:
        states: batch of inputs passed unchanged to ``dqn``.
        dqn: callable returning a ``(n_states, n_actions)`` tensor of
            Q-values (a 1-D result is treated as a batch of one).
        epsilon: per state, a uniform draw from ``[0, 1)`` that is NOT
            greater than ``epsilon`` triggers a random action.
        greedy: when true, every state gets its arg-max action.
        action_space_size: random actions are drawn from
            ``[0, action_space_size)``.

    Returns:
        1-D integer ``numpy`` array with one action index per state.
    """
    with torch.no_grad():
        q_values = np.atleast_2d(dqn(states).detach().numpy())

    # Arg-max along the action axis: one greedy action per row. Without the
    # axis, argmax would return a single index into the flattened batch.
    greedy_actions = np.argmax(q_values, axis=1)
    if greedy:
        return greedy_actions

    n_states = len(greedy_actions)
    # Independent explore/exploit decision for every state.
    exploit = np.random.rand(n_states) > epsilon
    random_actions = np.random.randint(0, action_space_size, size=n_states)
    return np.where(exploit, greedy_actions, random_actions)

In [11]:
def display_only_generated(generated, fps=30):
    """Build a matplotlib animation that plays ``generated`` frame by frame.

    Args:
        generated: iterable of images accepted by ``plt.imshow``.
        fps: playback rate; each frame is shown for ``1000 / fps`` ms.

    Returns:
        The ``ArtistAnimation``; the backing figure is closed before
        returning.
    """
    figure = plt.figure()

    frame_artists = []
    for picture in tqdm(generated):
        artist = plt.imshow(picture, animated=True)
        plt.axis('off')
        # ArtistAnimation expects, per frame, a list of artists to draw.
        frame_artists.append([artist])

    clip = animation.ArtistAnimation(
        figure,
        frame_artists,
        interval=1000/fps,
        repeat_delay=1000,
    )
    plt.close()
    return clip

In [12]:
def create_episode_video(dqn, greedy=True, max_steps=1000, fps=10):
    """Play one episode in the module-level ``env`` and return it as an animation.

    Args:
        dqn: Q-network used to choose every action.
        greedy: forwarded to ``choose_epsilon_greedy_action``; when true the
            arg-max action is always taken.
        max_steps: upper bound on the number of environment steps recorded
            (the episode also stops as soon as the env reports it is done).
        fps: playback rate of the returned animation.

    Returns:
        The animation produced by ``display_only_generated``. Only the
        frames returned by ``env.step`` are recorded; the frame returned by
        ``env.reset`` is not.
    """
    # Assumes the classic gym API: reset() -> observation and
    # step() -> (observation, reward, done, info).
    observation = env.reset()

    frames = []
    for _ in range(max_steps):
        action = choose_epsilon_greedy_action(
            preprocess_observation(observation), dqn, greedy=greedy
        )
        observation, _reward, is_done, _info = env.step(action)

        frames.append(observation)
        if is_done:
            break

    return display_only_generated(frames, fps=fps)

In [13]:
class BufferDataset(Dataset):
    """Replay buffer exposed as a ``Dataset`` of DQN regression samples.

    On construction the dataset plays ``n_episodes`` episodes in the
    module-level ``env`` with a frozen copy of ``dqn`` (epsilon-greedy) and
    precomputes one target per recorded transition::

        y = r_t1                                  if s_t1 is terminal
        y = r_t1 + discount * max_a Q(s_t1, a)    otherwise

    Each item is ``(state_t, action_t, y)``.
    """

    def __init__(self, dqn, n_episodes, discount, verbose=True, epsilon=0.1):
        """
        Args:
            dqn: Q-network; it is deep-copied, so later training of the
                original does not affect the stored targets.
            n_episodes (int): number of episodes to play when filling the buffer.
            discount (float): discount factor applied to the bootstrapped value.
            verbose (bool): print progress and tensor shapes.
            epsilon (float): exploration rate used while playing.

        The internal buffer is a list of tuples
        ``(state_t, action_t, reward_t1, state_t1, is_t1_terminal)``.
        """
        self.n_episodes = n_episodes
        self.dqn = copy.deepcopy(dqn)
        self.discount = discount
        self.verbose = verbose
        self.epsilon = epsilon

        self.buffer = self._create_buffer()

        self.S_ts = [sample[0] for sample in self.buffer]
        self.As = np.array([sample[1] for sample in self.buffer])
        self.Qs_t1 = self._get_Qs_t1()
        # Best achievable value in every successor state.
        self.Qs_t1_max = self.Qs_t1.max(dim=1)[0]
        self.Rs_t1 = torch.tensor(np.array([sample[2] for sample in self.buffer]))
        self.are_terminal_t1s = torch.tensor(np.array([sample[4] for sample in self.buffer]))

        self.ys = self._compute_targets(
            self.Rs_t1, self.are_terminal_t1s, self.Qs_t1_max, self.discount
        )

        if verbose:
            print(f"len(buffer): {len(self.buffer)}")
            print(f"Qs_t1.shape: {self.Qs_t1.shape}")
            print(f"Qs_t1_max.shape: {self.Qs_t1_max.shape}")
            print(f"Rs_t1.shape: {self.Rs_t1.shape}")
            print(f"are_terminal_t1s.shape: {self.are_terminal_t1s.shape}")

            print(f"ys.shape: {self.ys.shape}")

    @staticmethod
    def _compute_targets(rewards, terminals, q_next_max, discount):
        """Return ``r + discount * max_a Q(s', a)``, with no bootstrap on terminal steps.

        Args:
            rewards: 1-D tensor of rewards ``r_t1``.
            terminals: 1-D tensor, truthy where ``s_t1`` ended the episode.
            q_next_max: 1-D tensor of ``max_a Q(s_t1, a)``.
            discount: scalar discount factor.
        """
        # The mask must be 0 on terminal transitions: nothing follows a
        # terminal state, so only the immediate reward counts there.
        not_terminal = (~terminals.bool()).to(q_next_max.dtype)
        return rewards + discount * not_terminal * q_next_max

    def _create_buffer(self):
        """Play ``self.n_episodes`` episodes in ``env`` and record every transition.

        Returns:
            list of ``(state_t, action_t, reward_t1, state_t1, is_t1_terminal)``
            tuples, episodes concatenated in play order.
        """
        buffer = []

        for episode in range(self.n_episodes):
            if self.verbose:
                print(f"start building buffer's episode # {episode+1}...")

            cumulative_reward = 0.

            # Assumes the classic gym API: reset() -> observation and
            # step() -> (observation, reward, done, info).
            state_t = preprocess_observation(env.reset())
            is_t1_terminal = False

            while not is_t1_terminal:
                action_t = choose_epsilon_greedy_action(state_t, self.dqn, epsilon=self.epsilon)

                state_t1, reward_t1, is_t1_terminal, info = env.step(action_t)
                cumulative_reward += reward_t1
                state_t1 = preprocess_observation(state_t1)

                # [0] drops the leading batch axis of the preprocessed state so
                # samples can be re-batched later.
                buffer.append((state_t[0], action_t, reward_t1, state_t1[0], is_t1_terminal))
                state_t = state_t1

            if self.verbose:
                print(f"finished building buffer's episode # {episode+1}. buffer length: {len(buffer)} steps. cumulative_reward={cumulative_reward}")
            gc.collect()

        return buffer

    def _get_Qs_t1(self, batch_size=64):
        """Evaluate the frozen network on every successor state ``s_t1``.

        Args:
            batch_size (int): number of states per forward pass.

        Returns:
            Tensor with one row of Q-values per buffer entry, in buffer order.
        """
        state_t1_index = 3  # position of state_t1 inside a buffer tuple
        q_batches = []
        # Targets are constants for training, so no autograd graph is needed.
        with torch.no_grad():
            for start in range(0, len(self.buffer), batch_size):
                # Only fires when a batch happens to start on an exact multiple of 1000.
                if self.verbose and start % 1000 == 0:
                    print(f"preprocess Qs_t1: passed buffer[{start}]")
                states_t1 = torch.stack(
                    [sample[state_t1_index] for sample in self.buffer[start:start + batch_size]]
                )
                q_batches.append(self.dqn(states_t1).detach())

        return torch.cat(q_batches)

    def __len__(self):
        """Number of recorded transitions."""
        return len(self.buffer)

    def __getitem__(self, idx):
        """Return ``(state_t, action_t, target)`` for transition ``idx``."""
        X = self.S_ts[idx]
        A = self.As[idx]
        Y = self.ys[idx]

        return X, A, Y

### Initialization and Exploration of env

In [15]:
# Single shared environment. The helper functions and BufferDataset in this
# notebook read it as the module-level global `env`.
env = gym.make("MsPacman-v0")

### define model and run episode with the untrained model

In [16]:
# Build the Q-network by trimming a VGG16 created without pretrained weights.
vgg = vgg16(pretrained=False)
# Keep only the first five modules of the convolutional stack.
vgg.features = vgg.features[:5]
# Replace the first convolution with one that takes a single input channel.
vgg.features[0] = nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
# Shrink the first classifier layer to 3136 inputs -- presumably
# 64 channels x 7 x 7 after VGG's pooling; TODO confirm against torchvision.
vgg.classifier[0] = nn.Linear(in_features=3136, out_features=4096, bias=True)
# Output layer: 9 values, matching the default action_space_size=9 of the
# action-selection helpers.
vgg.classifier[6] = nn.Linear(in_features=4096, out_features=9, bias=True)
# Cast the weights to float64 -- NOTE(review): presumably to match the dtype
# of the tensors produced by preprocess_observation; confirm.
dqn = vgg.double()

In [17]:
# NOTE(review): this optimizer is replaced by a new AdamW (with
# lr=LEARNING_RATE) in the training cell further down.
optimizer = AdamW(dqn.parameters(), lr=0.0001)
# NOTE(review): `lr_reduce.step(...)` is never called anywhere in the visible
# code, so this scheduler currently has no effect.
lr_reduce =  ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08, verbose=False)

In [None]:
# Record one greedy episode with the (still untrained) network and embed it
# in the notebook as an HTML5 video.
vid = create_episode_video(dqn)
ipd.HTML(vid.to_html5_video())

100%|██████████| 1000/1000 [00:01<00:00, 935.44it/s]


### training

- play games with current policy
- for each step store the following supervised data for the batch training: (s,a,r,s’,d)
- (s,a,r,s’,d) = (state_t, action_t, reward_t1, s_t1, is_done)

### dataset
  
#### Elements:
- Create the buffer by playing the current dqn for n episodes.
- Running the dqn on states_t yields an (N, 9) tensor of Q-values.
- From each row we select the entry of the action that was actually taken while building the buffer. For this we create a tensor of As (actions).
- This gives us Q(s_t, a_t), the prediction to be trained.
- Next we create the training targets: r_t1 + Discount\*max_a Q(s_t1, a). If t1 is terminal, the target is only r_t1.
- For this we need to run the dqn on all states S_t1 (giving an (N, 9) tensor) and take the max value for each new state.

- reminder: buffer entry is (state_t, action_t, reward_t1, state_t1, is_terminal_t1)

In [None]:
# Hyper-parameters consumed by the training loop below.
EPSILON = 0.05  # exploration rate passed to BufferDataset
DISCOUNT_FACTOR = 0.95  # discount passed to BufferDataset
LEARNING_RATE = 0.001  # AdamW learning rate used by the training loop

N_EPOCHS = 30  # outer iterations; each one builds a fresh BufferDataset
N_EPISODES = 30  # episodes played per BufferDataset
BATCH_SIZE = 64  # DataLoader mini-batch size

In [16]:
def print_headline(mystr):
    """Print ``mystr`` preceded by a blank line and framed by two ``=`` rules."""
    rule = "============================================================"
    print(f"\n{rule}")
    print(mystr)
    print(rule)

In [17]:
# Fresh optimizer for this run, using the learning rate configured above.
optimizer = AdamW(dqn.parameters(), lr=LEARNING_RATE)

for epoch in range(N_EPOCHS):
    # --- Phase 1: play episodes with the current network to (re)build the buffer.
    print_headline(f"EPOCH {epoch + 1}; start building replay buffer.")
    tic = time()
    dataset = BufferDataset(dqn, n_episodes=N_EPISODES, discount=DISCOUNT_FACTOR, epsilon=EPSILON)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    toc = time()
    print(f"time: {toc-tic:.1f}sec.")

    # --- Phase 2: one pass of supervised regression over the buffer.
    running_loss = 0.0
    n_batches = len(dataloader)
    print_headline(f"EPOCH {epoch + 1}; start running training.")
    # Start the clock once; it is reset after every report so each printed
    # time covers the 10 batches since the previous report.
    tic = time()
    for i_batch, (X, A, Y) in enumerate(dataloader):
        optimizer.zero_grad()

        # Select Q(s_t, a_t): for every row, the Q-value of the action that
        # was actually taken when the transition was recorded.
        q_taken = dqn(X).gather(1, A.long().unsqueeze(1)).squeeze(1)
        loss = ((q_taken - Y)**2).mean()

        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i_batch % 10 == 9:    # report every 10 mini-batches
            toc = time()
            print(f"batch: {i_batch + 1}/{n_batches}; loss: {running_loss / 10}; time: {toc-tic:.1f}sec.")
            running_loss = 0.0
            tic = toc

print('Finished Training')


EPOCH 1; start building replay buffer.
start building buffer's episode # 1...
finished building buffer's episode # 1. buffer length: 587 steps. cumulative_reward=270.0
start building buffer's episode # 2...
finished building buffer's episode # 2. buffer length: 1221 steps. cumulative_reward=160.0
start building buffer's episode # 3...
finished building buffer's episode # 3. buffer length: 1789 steps. cumulative_reward=140.0
start building buffer's episode # 4...
finished building buffer's episode # 4. buffer length: 2212 steps. cumulative_reward=150.0
start building buffer's episode # 5...
finished building buffer's episode # 5. buffer length: 2764 steps. cumulative_reward=200.0
start building buffer's episode # 6...
finished building buffer's episode # 6. buffer length: 3403 steps. cumulative_reward=170.0
start building buffer's episode # 7...
finished building buffer's episode # 7. buffer length: 3906 steps. cumulative_reward=160.0
start building buffer's episode # 8...
finished bui

KeyboardInterrupt: 

In [18]:
# Record one greedy episode with the network in its current state (after the
# training cell above) and embed it as an HTML5 video.
vid = create_episode_video(dqn)
ipd.HTML(vid.to_html5_video())

100%|██████████| 437/437 [00:00<00:00, 666.79it/s]


In [None]:
# Scratch: two independent environments for experimenting with batched play.
envs = [gym.make("MsPacman-v0") for _ in range(2)]

In [None]:
# Reset every environment and keep its preprocessed first frame.
states_t = [preprocess_observation(one_env.reset()) for one_env in envs]

# One "episode finished" flag per environment, all initially False.
are_t1_terminals = [False] * len(states_t)

In [None]:
# Stack the per-environment states along the first axis into one batch and
# evaluate the network on it; Qs then holds one row per environment.
states_t = torch.tensor(np.vstack(states_t))
Qs = dqn(states_t).detach().numpy()

In [None]:
# Inspect the raw Q-values (notebook cell output).
Qs

In [None]:
# Greedy action per row of Qs.
np.argmax(Qs, axis=1)

In [None]:
# One uniformly random action in [0, 9) per row of Qs.
actions = np.random.randint(0, 9, size=(len(Qs)))
actions

In [None]:

# NOTE(review): scratch draft -- this cell cannot run as written:
#   * `states`, `greedy`, `epsilon` and `action_space_size` are not defined
#     at module level in the visible code;
#   * `return` is used outside of a function;
#   * it assigns `actions` but returns `action`.
# It looks like an unfinished body for a batched action-selection function;
# confirm the intent before relying on it.
Qs = dqn(states).detach().numpy()
if greedy or np.random.rand()>epsilon:
    # One greedy action per row of Qs.
    actions = np.argmax(Qs, axis=1)
else:
    # A single random action (a scalar, not one per row).
    actions = np.random.randint(0, action_space_size)

return action

In [None]:
# NOTE(review): scratch draft -- `self` and `state_t` are not defined at
# module level, and nothing inside the loop updates `are_t1_terminals`, so
# the loop condition would never change.
while not all(are_t1_terminals):
    for i in range(len(envs)):
        action_t = choose_epsilon_greedy_action(state_t, self.dqn, epsilon=self.epsilon)

In [None]:
# NOTE(review): scratch copy of the body of BufferDataset._create_buffer.
# It references `self` at module level, so it cannot run outside the class.
buffer = []
episodes = 0

for k in range(self.n_episodes):
    if self.verbose:
        print(f"start building buffer's episode # {episodes+1}...")

    cumulative_reward = 0.

    state_t = preprocess_observation(env.reset())
    is_t1_terminal = False

    # Play until the environment reports the episode is over.
    while is_t1_terminal==False:
        action_t = choose_epsilon_greedy_action(state_t, self.dqn, epsilon=self.epsilon)

        state_t1, reward_t1, is_t1_terminal, info = env.step(action_t)
        cumulative_reward += reward_t1
        state_t1 = preprocess_observation(state_t1)

        # Store (state_t, action_t, reward_t1, state_t1, is_t1_terminal);
        # [0] drops the leading axis of the preprocessed states.
        buffer.append((state_t[0], action_t, reward_t1, state_t1[0], is_t1_terminal))
        state_t = state_t1

        if is_t1_terminal:
            if self.verbose:
                print(f"finished building buffer's episode # {episodes+1}. buffer length: {len(buffer)} steps. cumulative_reward={cumulative_reward}")
            episodes += 1
            gc.collect()

 

In [None]:
class BufferDataset(Dataset):
    """Replay buffer exposed as a ``Dataset`` of DQN regression samples.

    On construction the dataset plays ``n_episodes`` episodes in the
    module-level ``env`` with a frozen copy of ``dqn`` (epsilon-greedy) and
    precomputes one target per recorded transition::

        y = r_t1                                  if s_t1 is terminal
        y = r_t1 + discount * max_a Q(s_t1, a)    otherwise

    Each item is ``(state_t, action_t, y)``.
    """

    def __init__(self, dqn, n_episodes, discount, verbose=True, epsilon=0.1):
        """
        Args:
            dqn: Q-network; it is deep-copied, so later training of the
                original does not affect the stored targets.
            n_episodes (int): number of episodes to play when filling the buffer.
            discount (float): discount factor applied to the bootstrapped value.
            verbose (bool): print progress and tensor shapes.
            epsilon (float): exploration rate used while playing.

        The internal buffer is a list of tuples
        ``(state_t, action_t, reward_t1, state_t1, is_t1_terminal)``.
        """
        self.n_episodes = n_episodes
        self.dqn = copy.deepcopy(dqn)
        self.discount = discount
        self.verbose = verbose
        self.epsilon = epsilon

        self.buffer = self._create_buffer()

        self.S_ts = [sample[0] for sample in self.buffer]
        self.As = np.array([sample[1] for sample in self.buffer])
        self.Qs_t1 = self._get_Qs_t1()
        # Best achievable value in every successor state.
        self.Qs_t1_max = self.Qs_t1.max(dim=1)[0]
        self.Rs_t1 = torch.tensor(np.array([sample[2] for sample in self.buffer]))
        self.are_terminal_t1s = torch.tensor(np.array([sample[4] for sample in self.buffer]))

        self.ys = self._compute_targets(
            self.Rs_t1, self.are_terminal_t1s, self.Qs_t1_max, self.discount
        )

        if verbose:
            print(f"len(buffer): {len(self.buffer)}")
            print(f"Qs_t1.shape: {self.Qs_t1.shape}")
            print(f"Qs_t1_max.shape: {self.Qs_t1_max.shape}")
            print(f"Rs_t1.shape: {self.Rs_t1.shape}")
            print(f"are_terminal_t1s.shape: {self.are_terminal_t1s.shape}")

            print(f"ys.shape: {self.ys.shape}")

    @staticmethod
    def _compute_targets(rewards, terminals, q_next_max, discount):
        """Return ``r + discount * max_a Q(s', a)``, with no bootstrap on terminal steps.

        Args:
            rewards: 1-D tensor of rewards ``r_t1``.
            terminals: 1-D tensor, truthy where ``s_t1`` ended the episode.
            q_next_max: 1-D tensor of ``max_a Q(s_t1, a)``.
            discount: scalar discount factor.
        """
        # The mask must be 0 on terminal transitions: nothing follows a
        # terminal state, so only the immediate reward counts there.
        not_terminal = (~terminals.bool()).to(q_next_max.dtype)
        return rewards + discount * not_terminal * q_next_max

    def _create_buffer(self):
        """Play ``self.n_episodes`` episodes in ``env`` and record every transition.

        Returns:
            list of ``(state_t, action_t, reward_t1, state_t1, is_t1_terminal)``
            tuples, episodes concatenated in play order.
        """
        buffer = []

        for episode in range(self.n_episodes):
            if self.verbose:
                print(f"start building buffer's episode # {episode+1}...")

            cumulative_reward = 0.

            # Assumes the classic gym API: reset() -> observation and
            # step() -> (observation, reward, done, info).
            state_t = preprocess_observation(env.reset())
            is_t1_terminal = False

            while not is_t1_terminal:
                action_t = choose_epsilon_greedy_action(state_t, self.dqn, epsilon=self.epsilon)

                state_t1, reward_t1, is_t1_terminal, info = env.step(action_t)
                cumulative_reward += reward_t1
                state_t1 = preprocess_observation(state_t1)

                # [0] drops the leading batch axis of the preprocessed state so
                # samples can be re-batched later.
                buffer.append((state_t[0], action_t, reward_t1, state_t1[0], is_t1_terminal))
                state_t = state_t1

            if self.verbose:
                print(f"finished building buffer's episode # {episode+1}. buffer length: {len(buffer)} steps. cumulative_reward={cumulative_reward}")
            gc.collect()

        return buffer

    def _get_Qs_t1(self, batch_size=64):
        """Evaluate the frozen network on every successor state ``s_t1``.

        Args:
            batch_size (int): number of states per forward pass.

        Returns:
            Tensor with one row of Q-values per buffer entry, in buffer order.
        """
        state_t1_index = 3  # position of state_t1 inside a buffer tuple
        q_batches = []
        # Targets are constants for training, so no autograd graph is needed.
        with torch.no_grad():
            for start in range(0, len(self.buffer), batch_size):
                # Only fires when a batch happens to start on an exact multiple of 1000.
                if self.verbose and start % 1000 == 0:
                    print(f"preprocess Qs_t1: passed buffer[{start}]")
                states_t1 = torch.stack(
                    [sample[state_t1_index] for sample in self.buffer[start:start + batch_size]]
                )
                q_batches.append(self.dqn(states_t1).detach())

        return torch.cat(q_batches)

    def __len__(self):
        """Number of recorded transitions."""
        return len(self.buffer)

    def __getitem__(self, idx):
        """Return ``(state_t, action_t, target)`` for transition ``idx``."""
        X = self.S_ts[idx]
        A = self.As[idx]
        Y = self.ys[idx]

        return X, A, Y