In [1]:
import pandas as pd
import datetime

In [2]:
# Raise pandas' display limits: up to 500 columns, no cap on column width or row count.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)

In [3]:
# State shared with main(): it checks `render` on every environment step and
# appends one record per finished episode to `result`.
render = False
result = []

In [4]:
import time
import threading

import pandas as pd

### CODE

In [5]:
import gym

import random
import collections

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchvision import transforms

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# Immutable bundle of the training settings consumed by DQNAgent.
HyperParameter = collections.namedtuple(
    'HyperParameter',
    'batch_size gamma learning_rate buffer_limit',
)


class Qnet(nn.Module):
    """Convolutional Q-network: two conv + batch-norm stages, then two linear layers.

    The first convolution takes 3 input channels and the head emits 9 values.
    The 2592 input features of the first linear layer equal 32 channels x 9 x 9,
    which is what the two convolutions produce for an 84x84 input.
    """

    def __init__(self):
        super().__init__()
        on_device = {'device': device}

        # Layers are created in the same order in which forward() applies them.
        self._conv1 = nn.Conv2d(3, 16, kernel_size=8, stride=4, **on_device)
        self._bn1 = nn.BatchNorm2d(16, **on_device)
        self._conv2 = nn.Conv2d(16, 32, kernel_size=4, stride=2, **on_device)
        self._bn2 = nn.BatchNorm2d(32, **on_device)
        self._ln1 = nn.Linear(2592, 256, **on_device)
        self._ln2 = nn.Linear(256, 9, **on_device)

    def forward(self, x):
        """Move ``x`` to the module-level ``device`` and return the network output."""
        hidden = x.to(device)
        for conv, norm in ((self._conv1, self._bn1), (self._conv2, self._bn2)):
            hidden = F.relu(norm(conv(hidden)))

        # A 3-D tensor is flattened completely; otherwise the leading axis is kept.
        if hidden.dim() == 3:
            hidden = hidden.view(-1)
        else:
            hidden = hidden.view(hidden.shape[0], -1)

        return self._ln2(F.relu(self._ln1(hidden)))


class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions used for experience replay."""

    def __init__(self, buffer_limit):
        # deque(maxlen=...) discards the oldest transition once the limit is reached.
        self._buffer = collections.deque(maxlen=buffer_limit)

    @property
    def size(self):
        """Number of transitions currently stored."""
        return len(self._buffer)

    def put(self, state, state_prime, action, reward, done):
        """Append one transition; ``done`` is the raw episode-finished flag."""
        self._buffer.append((state, state_prime, action, reward, done))

    def sample(self, n):
        """Draw ``n`` distinct transitions uniformly at random.

        Returns:
            A 5-tuple ``(states, actions, rewards, next_states, done_mask)``:
            ``states`` / ``next_states`` are the stored tensors stacked along a
            new leading axis, ``actions`` is an int64 ``(n, 1)`` tensor,
            ``rewards`` is a float32 ``(n, 1)`` tensor and ``done_mask`` is a
            float32 ``(n, 1)`` tensor holding 0.0 for transitions that ended
            the episode and 1.0 otherwise.

        Raises:
            ValueError: if ``n`` exceeds the number of stored transitions
                (raised by ``random.sample``).
        """
        mini_batch = random.sample(self._buffer, n)
        state_list, action_list, reward_list, state_prime_list, done_mask_list = [], [], [], [], []

        for state, state_prime, action, reward, done in mini_batch:
            state_list.append(state)
            action_list.append([action])
            reward_list.append([reward])
            state_prime_list.append(state_prime)
            # The stored value is the raw `done` flag. Returning it unchanged
            # made consumers that multiply the bootstrapped value by this mask
            # keep it only on terminal steps; the mask must be the complement.
            done_mask_list.append([0.0 if done else 1.0])

        return torch.stack(state_list), \
               torch.tensor(action_list, dtype=torch.int64), \
               torch.tensor(reward_list, dtype=torch.float32), \
               torch.stack(state_prime_list), \
               torch.tensor(done_mask_list, dtype=torch.float32)

    def reset(self):
        """Drop every stored transition."""
        self._buffer.clear()


class DQNAgent:
    """DQN agent made of a policy network, a target network and a replay buffer."""

    def __init__(self, param: HyperParameter, path=None):
        """Build the agent.

        Args:
            param: batch size, discount factor, learning rate and buffer capacity.
            path: optional checkpoint written by ``save()``; when given, its
                weights are loaded into the policy network.
        """
        self._PARAMETER = param

        self._memory = ReplayBuffer(param.buffer_limit)

        # Both networks must exist before load() / update_network() run.
        # Previously the target network was only created on the path-less
        # branch, so passing `path` failed with AttributeError.
        self._policy_network = Qnet()
        self._target_network = Qnet()

        if path:
            self.load(path)
        else:
            self.update_network()

        self._optimizer = optim.Adam(self._policy_network.parameters(), lr=param.learning_rate)

    def update_network(self):
        """Copy the policy network's weights into the target network."""
        self._target_network.load_state_dict(self._policy_network.state_dict())

    def predict(self, state, epsilon):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action in 0..8 is
        returned; otherwise the index of the largest policy-network output for
        ``state``. The network is only evaluated on the greedy branch.
        """
        # epsilon greedy
        if random.random() < epsilon:
            return random.randint(0, 8)

        # Action selection needs no gradients, so skip building the autograd graph.
        with torch.no_grad():
            out = self._policy_network(state.unsqueeze(0))
        return out.argmax().item()

    def step(self, env, state, action):
        """Apply ``action`` in ``env``, store the transition, return env's step result."""
        state_prime, reward, done, info = env.step(action)

        self._memory.put(
            state=state,
            state_prime=state_prime,
            action=action,
            reward=reward,
            done=done
        )

        return state_prime, reward, done, info

    def train(self):
        """Run one optimisation step on a mini-batch sampled from the replay buffer.

        Does nothing while the buffer holds fewer than ``batch_size``
        transitions (sampling would otherwise raise ValueError).
        """
        batch_size = self._PARAMETER.batch_size
        if self._memory.size < batch_size:
            return

        state_list, action_list, reward_list, state_prime_list, \
        done_mask_list = self._memory.sample(batch_size)

        output = self._policy_network(state_list)

        # The replay buffer builds its tensors without a device argument while
        # the network output lives on the model's device; align them before
        # mixing, otherwise gather()/arithmetic fail when the model is on CUDA.
        action_list = action_list.to(output.device)
        reward_list = reward_list.to(output.device)
        done_mask_list = done_mask_list.to(output.device)

        q_action = output.gather(1, action_list)

        # The target is a constant for this update: no gradients are needed
        # through the target network.
        with torch.no_grad():
            max_q_prime = self._target_network(state_prime_list).max(1)[0].unsqueeze(1)

        # The mask multiplies the bootstrapped term, so it has to be 0 for
        # transitions that ended the episode and 1 otherwise.
        # NOTE(review): confirm ReplayBuffer.sample returns it in that form.
        target = reward_list + self._PARAMETER.gamma * max_q_prime * done_mask_list

        loss = F.smooth_l1_loss(q_action, target)

        self._optimizer.zero_grad()
        loss.backward()
        self._optimizer.step()

    def save(self, path):
        """Write the policy network's state_dict to ``path``."""
        torch.save(self._policy_network.state_dict(), path)

    def load(self, path):
        """Restore policy weights from ``path``, clear the buffer, sync the target.

        The weights are loaded into the existing policy network rather than a
        replacement, so an optimizer already bound to its parameters stays valid.
        """
        # map_location lets a checkpoint saved on another device be restored here.
        current_device = next(self._policy_network.parameters()).device
        self._policy_network.load_state_dict(torch.load(path, map_location=current_device))
        # NOTE(review): eval mode makes batch-norm use its stored statistics;
        # confirm this is intended if training continues after loading.
        self._policy_network.eval()

        self._memory.reset()

        self.update_network()


class Environment(gym.Wrapper):
    """MsPacman wrapper that reshapes rewards and converts frames to 84x84 tensors."""

    # Default shaped rewards; also used as the default arguments of reset().
    move = 0
    eat = 50
    death = -1000

    def __init__(self):
        super(Environment, self).__init__(gym.make('MsPacman-v0'))

        self._move_reward = Environment.move
        self._eat_reward = Environment.eat
        self._death_reward = Environment.death

        # `info` dict of the previous step, used to detect a lost life.
        # Deliberately not called `_metadata`: gym.Wrapper keeps a private
        # attribute of that name behind its `metadata` property in the
        # releases checked, and overwriting it would change `env.metadata`.
        self._last_info = None

        # Built once here instead of on every observation() call.
        self._transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((84, 84)),
            transforms.ToTensor()
        ])

    def reset(self,
              reward_move: int = move,
              reward_eat: int = eat,
              reward_death: int = death,
              **kwargs):
        """Start a new episode and return its first processed observation.

        Args:
            reward_move: reward used when the raw reward is 0.
            reward_eat: reward used when the raw reward is 10.
            reward_death: amount added to the reward when a life is lost.
            **kwargs: forwarded to the wrapped environment's ``reset``.
        """
        self._move_reward = reward_move
        self._eat_reward = reward_eat
        self._death_reward = reward_death

        # Forget the previous episode's info so life counts are never compared
        # across an episode boundary.
        self._last_info = None

        state = super(Environment, self).reset(**kwargs)
        return self.observation(state)

    def step(self, action):
        """Step the wrapped environment; return the processed observation and shaped reward."""
        state_prime, reward, done, info = super(Environment, self).step(action)

        state_prime = self.observation(state_prime)
        # reward() compares against the previous step's info, so compute it
        # before remembering the current one.
        reward = self.reward(reward, info)

        self._last_info = info

        return state_prime, reward, done, info

    def reward(self, reward, info):
        """Map the raw reward to the shaped reward.

        A raw reward of 0 yields the move reward and 10 the eat reward; any
        other raw value contributes 0. When ``info['lives']`` is lower than on
        the previous step the death reward is added on top.
        """
        new_reward = 0

        # move
        if reward == 0:
            new_reward = self._move_reward
        # eat
        elif reward == 10:
            new_reward = self._eat_reward

        if self._last_info and self._last_info['lives'] > info['lives']:
            # Use the configured penalty: the previous hard-coded `-= 1000`
            # ignored the `reward_death` value passed to reset().
            new_reward += self._death_reward

        return new_reward

    def observation(self, observation):
        """Crop the frame to rows 1..171 / columns 1..159 and return it as an 84x84 tensor."""
        observation = observation[1:172, 1:160]

        return self._transform(observation)


# Module-level handles used by main(): it publishes the agent it creates through
# `agent` and reads `target_epsilon` as an optional override of the exploration rate.
agent = target_epsilon = None

def main():
    """Play MsPacman episodes with an epsilon-greedy DQN agent, training as it goes.

    Side effects:
        * publishes the created agent through the global ``agent`` and the
          exploration rate in use through the global ``epsilon``;
        * appends one dict per finished episode (actions taken, episode
          reward, epsilon) to the global ``result`` list;
        * every ``print_interval`` episodes syncs the target network and prints
          the average score since the last report.

    The globals ``render`` and ``target_epsilon`` are re-read while the loop
    runs, so changing them from outside takes effect without restarting.
    """
    import time

    global agent
    global epsilon

    # NOTE(review): a learning rate of 0.1 is unusually large for Adam — confirm.
    parameter = HyperParameter(
        batch_size=32,
        buffer_limit=50000,
        gamma=0.98,
        learning_rate=0.1
    )

    env = Environment()
    agent = DQNAgent(param=parameter)

    print_interval = 20
    score = 0.0

    for n_epi in range(100000):
        # Decreases by 1e-6 per episode: from 0.5 to about 0.4 over the whole
        # run, so the 0.01 floor is never reached.
        # NOTE(review): confirm this very slow decay is intended.
        epsilon = max(0.01, 0.5 - 0.01 * (n_epi / 10000))

        # `is not None` so that an override of 0 (fully greedy) is honoured;
        # a plain truthiness test silently ignored it.
        if target_epsilon is not None:
            epsilon = target_epsilon

        state = env.reset()
        done = False

        action_list = []
        episode_reward = 0

        while not done:
            action = agent.predict(state, epsilon)
            state_prime, reward, done, info = agent.step(env, state, action)

            state = state_prime

            score += reward
            episode_reward += reward
            action_list.append(action)

            # The loop used to only collect transitions and never optimise the
            # network. Train once per step, but only when the buffer can
            # provide a full mini-batch (sampling more than is stored raises).
            if agent._memory.size >= parameter.batch_size:
                agent.train()

            if render:
                env.render()
                time.sleep(0.01)

        result.append({
            'action_list': action_list,
            'reward': episode_reward,
            'epsilon': epsilon
        })

        if n_epi % print_interval == 0 and n_epi != 0:
            agent.update_network()

            print("n_episode :{}, score : {:.1f}, eps : {:.1f}%".format(
                n_epi, score / print_interval, epsilon * 100))
            score = 0.0

# Run main() on a background thread; other cells can then read `result` / `agent`
# and change `render` / `target_epsilon`, which main() re-reads as it loops.
threading.Thread(target=main).start()

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# Snapshot of the per-episode records main() has appended to `result` so far.
df = pd.DataFrame(result)
df

Unnamed: 0,action_list,reward,epsilon
0,"[3, 3, 3, 3, 3, 3, 6, 8, 8, 3, 2, 3, 4, 3, 6, 3, 5, 5, 3, 1, 3, 3, 3, 1, 3, 1, 3, 3, 2, 3, 3, 3, 3, 3, 5, 3, 3, 3, 3, 1, 0, 1, 3, 3, 3, 3, 8, 2, 5, 7, 1, 3, 0, 2, 6, 3, 3, 4, 2, 3, 3, 3, 2, 3, 5, 3, 3, 0, 3, 5, 6, 3, 3, 3, 6, 3, 1, 3, 6, 5, 3, 7, 3, 3, 3, 7, 3, 2, 4, 5, 1, 3, 3, 3, 3, 8, 3, 5, 1, 3, ...]",-1250,0.5
1,"[6, 6, 0, 4, 1, 3, 7, 3, 3, 3, 8, 7, 3, 7, 7, 7, 0, 3, 3, 0, 3, 1, 3, 0, 3, 3, 3, 2, 3, 1, 0, 6, 4, 1, 3, 1, 6, 4, 3, 3, 0, 3, 2, 3, 3, 3, 3, 1, 3, 3, 3, 5, 3, 3, 3, 1, 3, 3, 3, 4, 3, 1, 5, 7, 3, 4, 7, 4, 3, 3, 8, 8, 0, 3, 0, 5, 3, 3, 0, 3, 3, 6, 3, 8, 3, 1, 3, 5, 6, 4, 3, 0, 6, 3, 3, 2, 4, 3, 7, 3, ...]",-400,0.499999
2,"[3, 3, 3, 3, 5, 3, 3, 3, 7, 5, 3, 7, 5, 2, 8, 0, 3, 3, 8, 3, 3, 3, 0, 3, 4, 3, 2, 3, 3, 1, 3, 3, 3, 8, 3, 2, 0, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 1, 1, 3, 3, 3, 7, 3, 3, 3, 8, 6, 4, 3, 0, 8, 8, 6, 3, 7, 8, 1, 3, 1, 3, 3, 5, 3, 1, 0, 4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 7, 2, 2, 0, 2, 3, 2, 6, 3, 4, 0, 2, 3, 6, ...]",-2000,0.499998
3,"[3, 3, 3, 4, 7, 3, 3, 3, 6, 7, 2, 1, 0, 3, 3, 3, 2, 8, 0, 3, 3, 3, 3, 7, 4, 3, 3, 3, 3, 7, 0, 3, 1, 3, 3, 3, 3, 3, 3, 2, 8, 3, 3, 3, 7, 6, 3, 3, 3, 3, 5, 3, 8, 3, 0, 0, 6, 3, 8, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 1, 3, 5, 2, 7, 3, 1, 7, 4, 3, 3, 3, 1, 0, 6, 3, 3, 3, 3, 3, 6, 3, 5, 4, 3, 3, 3, ...]",-700,0.499997
4,"[1, 2, 2, 5, 8, 3, 3, 3, 3, 5, 3, 3, 3, 4, 3, 7, 1, 8, 4, 3, 3, 0, 4, 3, 2, 0, 3, 4, 3, 3, 3, 0, 6, 1, 3, 3, 3, 8, 0, 3, 3, 0, 3, 3, 2, 3, 0, 3, 0, 0, 4, 0, 7, 0, 7, 3, 3, 8, 2, 4, 3, 0, 4, 0, 0, 3, 4, 3, 2, 3, 6, 3, 3, 5, 1, 3, 5, 3, 3, 1, 4, 3, 3, 3, 1, 5, 3, 3, 0, 8, 3, 3, 3, 0, 5, 3, 3, 6, 3, 8, ...]",-1500,0.499996
5,"[0, 3, 3, 3, 1, 7, 5, 8, 2, 3, 3, 3, 3, 3, 3, 4, 8, 6, 3, 3, 3, 1, 3, 3, 3, 6, 4, 0, 3, 3, 3, 8, 5, 3, 3, 0, 3, 7, 1, 3, 3, 3, 3, 5, 3, 3, 7, 2, 6, 7, 3, 3, 8, 3, 3, 3, 3, 3, 3, 3, 2, 1, 3, 3, 3, 3, 3, 7, 5, 3, 3, 8, 8, 1, 1, 3, 5, 0, 3, 0, 5, 3, 3, 3, 3, 3, 3, 3, 3, 8, 3, 2, 3, 3, 3, 6, 7, 3, 3, 3, ...]",-1500,0.499995
6,"[3, 3, 4, 0, 7, 3, 2, 3, 3, 2, 3, 5, 3, 4, 0, 7, 3, 3, 3, 3, 1, 3, 3, 6, 8, 3, 6, 3, 6, 2, 3, 3, 2, 3, 6, 5, 3, 2, 3, 7, 2, 3, 5, 8, 5, 3, 3, 3, 8, 3, 3, 5, 3, 8, 3, 3, 6, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 5, 0, 8, 3, 8, 3, 7, 3, 7, 0, 3, 3, 0, 5, 8, 7, 7, 3, 7, 5, 1, 3, 3, 5, 6, 3, 3, 3, 1, 3, 3, 2, ...]",-600,0.499994
7,"[6, 3, 7, 1, 5, 0, 3, 3, 4, 8, 1, 3, 8, 6, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 4, 3, 3, 2, 3, 3, 0, 1, 3, 3, 3, 3, 7, 3, 5, 3, 5, 3, 3, 3, 3, 3, 2, 0, 5, 7, 3, 6, 6, 4, 3, 3, 3, 3, 2, 3, 3, 3, 3, 8, 3, 3, 4, 3, 2, 0, 2, 3, 3, 3, 8, 3, 0, 5, 3, 3, 5, 3, 3, 3, 3, 6, 3, 3, 4, 0, 3, 6, 3, ...]",-400,0.499993
8,"[5, 3, 3, 3, 1, 3, 3, 6, 2, 3, 6, 1, 3, 3, 5, 3, 3, 3, 3, 5, 7, 3, 5, 4, 7, 3, 6, 3, 3, 8, 7, 6, 3, 6, 8, 1, 8, 3, 3, 3, 3, 3, 3, 3, 3, 3, 8, 3, 4, 3, 3, 8, 3, 8, 3, 3, 3, 3, 3, 8, 7, 3, 3, 3, 2, 3, 3, 6, 3, 3, 7, 6, 4, 3, 5, 3, 3, 3, 5, 6, 0, 3, 4, 3, 8, 8, 3, 3, 3, 3, 3, 3, 3, 8, 3, 7, 5, 3, 3, 3, ...]",-2100,0.499992
9,"[3, 5, 3, 3, 3, 3, 7, 1, 3, 3, 4, 3, 3, 7, 6, 3, 3, 5, 8, 3, 3, 6, 3, 4, 3, 0, 3, 2, 2, 0, 0, 3, 3, 1, 3, 3, 3, 1, 4, 3, 3, 3, 3, 3, 0, 5, 3, 4, 8, 0, 2, 3, 1, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 0, 5, 1, 0, 6, 0, 4, 3, 3, 3, 3, 7, 3, 7, 3, 3, 2, 1, 2, 3, 3, 5, 1, 3, 3, 8, 6, 1, 6, 5, 3, 3, 3, 0, 5, 4, ...]",-1700,0.499991


n_episode :520, score : -1557.5, eps : 49.9%
n_episode :540, score : -1312.5, eps : 49.9%


In [22]:
# Write the agent's current policy-network weights (its state_dict) to disk.
agent.save('20220515215200.pt')

In [20]:
# Turn rendering off; main() checks this flag after every environment step.
render = False

n_episode :200, score : -1330.0, eps : 50.0%
n_episode :220, score : -1290.0, eps : 50.0%
n_episode :240, score : -1560.0, eps : 50.0%
n_episode :260, score : -1530.0, eps : 50.0%
n_episode :280, score : -1472.5, eps : 50.0%
n_episode :300, score : -1465.0, eps : 50.0%
n_episode :320, score : -1210.0, eps : 50.0%
n_episode :340, score : -1602.5, eps : 50.0%
n_episode :360, score : -1320.0, eps : 50.0%
n_episode :380, score : -1407.5, eps : 50.0%
n_episode :400, score : -1292.5, eps : 50.0%
n_episode :420, score : -1520.0, eps : 50.0%
n_episode :440, score : -1427.5, eps : 50.0%
n_episode :460, score : -1527.5, eps : 50.0%
n_episode :480, score : -1100.0, eps : 50.0%
n_episode :500, score : -1502.5, eps : 50.0%


In [8]:
# NOTE(review): presumably a deliberate barrier so that "Run All" stops here and
# the cells below are only executed by hand — confirm.
raise Exception

Exception: 

In [None]:
# Turn rendering on; main() then calls env.render() and sleeps 10 ms after each step.
render = True

In [None]:
# Save the episode records to a CSV named after the current timestamp, without the index column.
df.to_csv(f"result-{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}.csv", index=False)

2770    [4, 1, 7, 0, 2, 5, 5, 0, 3, 5, 7, 8, 5, 7, 3, 3, 5, 1, 5, 5, 5, 7, 4, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 3, 6, 7, 5, 5, 5, 4, 0, 5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 4, 5, 0, 5, 2, 5, 5, 4, 5, 5, 5, 5, 1, 0, 3, 5, 5, 0, 4, 6, 8, 4, 8, 5, 0, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 3, 5, 5, 2, 5, 4, 5, 5, ...]
Name: action_list, dtype: object

### VIEW MAX SCORE

In [None]:
# Load a previously exported results file.
# NOTE(review): the file name is hard-coded; it presumably comes from an earlier run of the export cell — confirm.
log = pd.read_csv('result-20220510175834.csv')
log

In [None]:
# Keep every row whose reward equals the highest reward in the log.
row = log.loc[log['reward'] == log['reward'].max()]
row

In [None]:
import ast

# The first selected row's `action_list` value is parsed from its textual form
# back into a Python object (assumes the CSV cell holds a list literal — TODO confirm).
action_list = ast.literal_eval(row['action_list'].values[0])
action_list

In [None]:
import time

import gym

# Replay the recorded actions in a fresh, unwrapped environment.
# NOTE(review): nothing seeds the environment here, so the replay only matches
# the logged episode if the environment behaves deterministically — confirm.
env = gym.make('MsPacman-v0')
env.reset()

try:
    for action in action_list:
        # Element 2 of the step result is the episode-finished flag.
        done = env.step(action)[2]
        env.render()
        time.sleep(0.01)

        # Stop once the episode is over instead of stepping a finished environment.
        if done:
            break
finally:
    # Always release the emulator / render window, even if the replay fails.
    env.close()

  "We strongly suggest supplying `render_mode` when "
