In [1]:
import pandas as pd
import datetime

In [73]:
# Preview the timestamp format (YYYYmmddHHMMSS) that is used for the results filename.
datetime.datetime.now().strftime('%Y%m%d%H%M%S')

'20220510175712'

In [41]:
# Relax pandas display limits: up to 500 columns, untruncated cell contents,
# and no cap on the number of rows shown.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)

In [2]:
# Per-episode records appended by main(): {'action_list': [...], 'reward': episode total}.
result = []
# Checked by main() after every environment step; when True the env is rendered.
render = False

### CODE

In [3]:
import gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np

import threading

# Hyperparameters
learning_rate = 0.0005  # step size passed to the Adam optimizer in main()
gamma = 0.98  # discount factor applied to the target network's max Q-value in train()
buffer_limit = 50000  # maximum number of transitions kept by ReplayBuffer
batch_size = 32  # number of transitions sampled per train() call


class ReplayBuffer:
    """Fixed-capacity FIFO store of experience transitions.

    Each transition is a tuple
    ``(state, action, reward, state_prime, done_mask)`` where ``state`` and
    ``state_prime`` are tensors of identical shape. Once the buffer is full the
    oldest transition is discarded on every ``put``.
    """

    def __init__(self, capacity=None):
        """Create an empty buffer.

        Args:
            capacity: Maximum number of transitions to keep. When omitted the
                module-level ``buffer_limit`` is used.
        """
        if capacity is None:
            capacity = buffer_limit
        self.buffer = collections.deque(maxlen=capacity)

    def put(self, transition):
        """Append one ``(state, action, reward, state_prime, done_mask)`` tuple."""
        self.buffer.append(transition)

    def sample(self, n):
        """Draw ``n`` distinct transitions uniformly at random and batch them.

        Returns:
            Tuple ``(states, actions, rewards, next_states, done_masks)``:
            states / next_states are the stacked state tensors, actions is an
            int64 tensor of shape (n, 1), rewards and done_masks are float32
            tensors of shape (n, 1).

        Raises:
            ValueError: if ``n`` is larger than the number of stored transitions.
        """
        mini_batch = random.sample(self.buffer, n)
        states, actions, rewards, next_states, done_masks = zip(*mini_batch)

        # Explicit dtypes: without them a batch whose rewards are all Python
        # ints (e.g. every reward is -1) would become an int64 tensor, while a
        # batch containing floats would become float32.
        return (
            torch.stack(states),
            torch.tensor(actions, dtype=torch.int64).unsqueeze(1),
            torch.tensor(rewards, dtype=torch.float32).unsqueeze(1),
            torch.stack(next_states),
            torch.tensor(done_masks, dtype=torch.float32).unsqueeze(1),
        )

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


class Qnet(nn.Module):
    """Convolutional Q-network: 3-channel frame in, 9 action values out.

    Three conv + ReLU + max-pool stages are followed by four fully connected
    layers. The first linear layer expects 2304 features, which is what the
    conv/pool stack produces (32 x 4 x 18) for a (3, 171, 159) input.
    """

    def __init__(self):
        super().__init__()
        self._conv1 = nn.Conv2d(3, 16, (5, 3))
        self._max_pool1 = nn.MaxPool2d(kernel_size=(3, 2))

        self._conv2 = nn.Conv2d(16, 32, (5, 3))
        self._max_pool2 = nn.MaxPool2d(kernel_size=(3, 2))

        self._conv3 = nn.Conv2d(32, 32, (5, 3))
        self._max_pool3 = nn.MaxPool2d(kernel_size=(3, 2))

        self._ln1 = nn.Linear(2304, 512)
        self._ln2 = nn.Linear(512, 256)
        self._ln3 = nn.Linear(256, 64)
        self._ln4 = nn.Linear(64, 9)

    def forward(self, x):
        """Compute Q-values.

        Args:
            x: Either a single frame (3-D tensor, channels first) or a batch of
                frames (4-D tensor) of any batch size.

        Returns:
            A 1-D tensor of action values for a single frame, or a
            (batch, actions) tensor for a batch.
        """
        x = self._max_pool1(F.relu(self._conv1(x)))
        x = self._max_pool2(F.relu(self._conv2(x)))
        x = self._max_pool3(F.relu(self._conv3(x)))

        # Flatten per sample using the tensor's own batch dimension rather
        # than a global batch size, so any batch size is accepted.
        if x.dim() == 3:
            x = x.flatten()
        else:
            x = x.flatten(start_dim=1)

        x = F.relu(self._ln1(x))
        x = F.relu(self._ln2(x))
        x = F.relu(self._ln3(x))

        return self._ln4(x)

    def sample_action(self, obs, epsilon):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest Q-value for ``obs``.
        """
        if random.random() < epsilon:
            # Exploring: no forward pass is needed for a random action.
            return random.randint(0, self._ln4.out_features - 1)

        # Acting only; gradients are not needed here.
        with torch.no_grad():
            return self(obs).argmax().item()


def train(q_network, target_network, memory, optimizer):
    """Run one DQN gradient step on a mini-batch sampled from ``memory``.

    Args:
        q_network: Network being optimised; its Q-values for the taken actions
            are regressed onto the bootstrapped target.
        target_network: Network used to evaluate the next states.
        memory: Replay buffer whose ``sample(batch_size)`` returns
            ``(states, actions, rewards, next_states, done_masks)``.
        optimizer: Optimizer that is zeroed and stepped for this update.
    """
    states, actions, rewards, next_states, done_masks = memory.sample(batch_size)

    # Q-value of the action that was actually taken in each sampled state.
    q_action = q_network(states).gather(1, actions)

    # The target is a constant for this update: computing it without autograd
    # avoids building a graph through the target network and accumulating
    # gradients on its parameters, which optimizer.zero_grad() never clears.
    with torch.no_grad():
        max_q_prime = target_network(next_states).max(1)[0].unsqueeze(1)
        # done_masks is 0 for terminal transitions, which removes the bootstrap term.
        target = rewards + gamma * max_q_prime * done_masks

    loss = F.smooth_l1_loss(q_action, target)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


def preproess_state(state):
    """Crop a (height, width, channel) frame and return it channels-first.

    Rows 1..171 and columns 1..159 are kept, then the first three channels are
    stacked along a new leading axis, giving an array of shape
    (3, rows, cols).
    """
    cropped = state[1:172, 1:160]
    planes = [cropped[:, :, channel] for channel in range(3)]
    return np.asarray(planes)


def main():
    """Train a DQN agent on ``MsPacman-v0`` and log every episode to ``result``.

    Each episode is played epsilon-greedily with ``q_network``. Transitions are
    stored in a ``ReplayBuffer``; once it holds more than 2000 transitions, one
    ``train`` step is run at the end of every episode. Every ``print_interval``
    episodes the target network is synchronised with ``q_network`` and the
    average score is printed.

    The module-level ``render`` flag is re-read after every step, so it can be
    toggled from another cell while this function runs. The environment is
    closed even if training stops with an exception.
    """
    import time  # only needed for the optional render delay

    env = gym.make('MsPacman-v0')

    q_network = Qnet()
    target_network = Qnet()
    target_network.load_state_dict(q_network.state_dict())

    memory = ReplayBuffer()

    print_interval = 20
    score = 0.0
    optimizer = optim.Adam(q_network.parameters(), lr=learning_rate)

    try:
        for n_epi in range(10000):
            # NOTE(review): over these 10000 episodes epsilon only decays from
            # 0.50 to about 0.45, so the 0.01 floor is never reached — confirm
            # this schedule is intended.
            epsilon = max(0.01, 0.5 - 0.01 * (n_epi / 2000))

            state = env.reset()
            state = torch.tensor(preproess_state(state)).float()
            done = False

            action_list = []
            episode_reward = 0

            while not done:
                # Choose an action via a forward pass (epsilon-greedy).
                action = q_network.sample_action(state, epsilon)
                state_prime, reward, done, _ = env.step(action)

                action_list.append(action)

                # Reward shaping: a step that scores nothing is penalised with -1.
                reward = -1 if reward == 0 else reward

                state_prime = torch.tensor(preproess_state(state_prime)).float()

                # 0 on the terminal transition so train() drops the bootstrap term.
                done_mask = 0.0 if done else 1.0

                memory.put((state, action, reward, state_prime, done_mask))
                state = state_prime

                episode_reward += reward

                if render:
                    env.render()
                    time.sleep(0.01)

            if memory.size() > 2000:
                # Learn with one forward/backward pass per episode.
                train(q_network, target_network, memory, optimizer)

            result.append({
                'action_list': action_list,
                'reward': episode_reward
            })

            score += episode_reward

            if n_epi % print_interval == 0 and n_epi != 0:
                target_network.load_state_dict(q_network.state_dict())
                print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(
                    n_epi, score / print_interval, memory.size(), epsilon * 100))
                score = 0.0
    finally:
        # Release the emulator/window even when training is interrupted.
        env.close()


# Run training on a background thread so later cells (e.g. the one that sets
# `render`) can execute while it is in progress.
training_thread = threading.Thread(target=main)
training_thread.start()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Turn rendering on; main() checks this flag after every environment step.
render = True

In [78]:
# Build the frame from the per-episode records in `result` so this cell does
# not depend on a `df` left over in the kernel. list() takes a snapshot because
# the training thread may still be appending.
df = pd.DataFrame(list(result))
df.to_csv(f"result-{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}.csv", index=False)

2770    [4, 1, 7, 0, 2, 5, 5, 0, 3, 5, 7, 8, 5, 7, 3, 3, 5, 1, 5, 5, 5, 7, 4, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 3, 6, 7, 5, 5, 5, 4, 0, 5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 4, 5, 0, 5, 2, 5, 5, 4, 5, 5, 5, 5, 1, 0, 3, 5, 5, 0, 4, 6, 8, 4, 8, 5, 0, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 3, 5, 5, 2, 5, 4, 5, 5, ...]
Name: action_list, dtype: object

### VIEW MAX SCORE

In [2]:
# Load a previously saved training log (one row per episode: action_list, reward).
log = pd.read_csv('result-20220510175834.csv')
log

Unnamed: 0,action_list,reward
0,"[6, 7, 7, 3, 3, 3, 5, 1, 8, 3, 5, 3, 6, 8, 3, ...",-261.0
1,"[6, 1, 3, 5, 3, 3, 3, 3, 5, 3, 1, 6, 7, 1, 3, ...",-283.0
2,"[6, 3, 3, 3, 3, 3, 1, 3, 0, 6, 0, 8, 2, 0, 6, ...",-456.0
3,"[2, 8, 8, 8, 5, 8, 8, 6, 4, 8, 8, 7, 7, 4, 7, ...",142.0
4,"[0, 3, 1, 5, 7, 3, 2, 1, 1, 1, 1, 5, 1, 4, 1, ...",-172.0
...,...,...
3844,"[4, 6, 2, 8, 2, 2, 2, 3, 6, 2, 2, 8, 2, 5, 8, ...",-379.0
3845,"[0, 0, 2, 8, 5, 1, 5, 5, 1, 4, 8, 4, 1, 5, 5, ...",-351.0
3846,"[4, 3, 5, 3, 5, 0, 5, 5, 5, 6, 2, 4, 6, 1, 2, ...",-91.0
3847,"[5, 5, 5, 5, 5, 2, 5, 2, 5, 5, 3, 0, 0, 4, 5, ...",-300.0


In [3]:
# Keep the episode(s) whose total reward equals the highest reward in the log.
best_reward = log['reward'].max()
is_best = log['reward'] == best_reward
row = log[is_best]
row

Unnamed: 0,action_list,reward
2770,"[4, 1, 7, 0, 2, 5, 5, 0, 3, 5, 7, 8, 5, 7, 3, ...",2694.0


In [4]:
import ast

# The CSV stores each action list as its string representation; parse the
# first selected row back into a Python list.
serialized_actions = row['action_list'].iloc[0]
action_list = ast.literal_eval(serialized_actions)
action_list

[4,
 1,
 7,
 0,
 2,
 5,
 5,
 0,
 3,
 5,
 7,
 8,
 5,
 7,
 3,
 3,
 5,
 1,
 5,
 5,
 5,
 7,
 4,
 5,
 5,
 5,
 1,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 3,
 6,
 7,
 5,
 5,
 5,
 4,
 0,
 5,
 5,
 5,
 5,
 5,
 2,
 5,
 5,
 5,
 5,
 5,
 4,
 5,
 0,
 5,
 2,
 5,
 5,
 4,
 5,
 5,
 5,
 5,
 1,
 0,
 3,
 5,
 5,
 0,
 4,
 6,
 8,
 4,
 8,
 5,
 0,
 5,
 5,
 5,
 6,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 3,
 5,
 5,
 2,
 5,
 4,
 5,
 5,
 5,
 3,
 5,
 5,
 0,
 5,
 7,
 5,
 5,
 8,
 2,
 8,
 5,
 5,
 2,
 3,
 5,
 5,
 5,
 5,
 5,
 1,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 5,
 5,
 8,
 0,
 5,
 7,
 5,
 3,
 5,
 6,
 5,
 7,
 3,
 0,
 5,
 2,
 5,
 5,
 0,
 4,
 5,
 3,
 3,
 3,
 5,
 5,
 7,
 5,
 5,
 2,
 5,
 2,
 8,
 2,
 5,
 5,
 5,
 5,
 1,
 2,
 3,
 8,
 4,
 5,
 1,
 5,
 5,
 1,
 8,
 5,
 5,
 1,
 5,
 4,
 7,
 5,
 5,
 5,
 7,
 5,
 3,
 6,
 1,
 5,
 5,
 5,
 5,
 5,
 5,
 2,
 1,
 7,
 5,
 5,
 5,
 0,
 8,
 5,
 5,
 5,
 5,
 6,
 4,
 3,
 0,
 7,
 5,
 7,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 1,
 5,
 5,
 5,
 5,
 5,
 3,
 0,
 0,
 5,
 5,
 7,
 5,
 5,
 1,
 3,
 6,
 1,
 5,
 5,
 5,
 8,
 5,
 0,
 6,
 5,


In [5]:
import time

import gym

# Replay the recorded action sequence in a fresh environment and render it.
# NOTE(review): the environment is not seeded here and may be stochastic, so
# the replay is not guaranteed to reproduce the logged episode — confirm.
env = gym.make('MsPacman-v0')
env.reset()

try:
    for action in action_list:
        _, _, done, _ = env.step(action)
        env.render()
        time.sleep(0.01)

        # Stop once the episode has ended; stepping a finished env is invalid.
        if done:
            break
finally:
    # Always release the emulator/window, even if the replay is interrupted.
    env.close()

  "We strongly suggest supplying `render_mode` when "
