In [1]:
import pandas as pd
import datetime

In [2]:
# Notebook display settings: show up to 500 columns, and lift the column-width
# and row-count limits pandas applies when rendering a DataFrame.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)

In [3]:
# Per-episode training records; main() appends one dict per finished episode.
result = []
# Rendering switch that main() checks on every environment step; other cells
# rebind it to True/False.
render = False

### CODE

In [4]:
import gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np

import threading

# Hyperparameters
# Step size handed to optim.Adam in main().
# NOTE(review): 0.1 is unusually large for Adam and the logged scores never
# improve - confirm this value is intended.
learning_rate = 0.1
# Discount factor applied to the bootstrapped next-state value in train().
gamma = 0.98
# Maximum number of transitions kept by ReplayBuffer (deque maxlen).
buffer_limit = 50000
# Number of transitions requested from the replay buffer per train() call.
batch_size = 32

# Per-episode records appended by main(); rebinding it here discards any
# records collected before this cell ran.
result = []

class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions for experience replay.

    Each transition is a 5-tuple
    ``(state, action, reward, state_prime, done_mask)`` where ``state`` and
    ``state_prime`` are tensors of identical shape and the remaining items
    are Python scalars.
    """

    def __init__(self, limit=None):
        """Create an empty buffer.

        Args:
            limit: Maximum number of transitions to keep; the oldest entries
                are dropped first. Defaults to the module-level
                ``buffer_limit`` when omitted.
        """
        capacity = buffer_limit if limit is None else limit
        self.buffer = collections.deque(maxlen=capacity)

    def put(self, transition):
        """Append one ``(state, action, reward, state_prime, done_mask)`` tuple."""
        self.buffer.append(transition)

    def sample(self, n):
        """Draw ``n`` distinct transitions uniformly at random and batch them.

        Args:
            n: Number of transitions to draw.

        Returns:
            Tuple ``(states, actions, rewards, state_primes, done_masks)``.
            ``states`` / ``state_primes`` are the stored tensors stacked along
            a new leading dimension; ``actions`` is an int64 tensor of shape
            ``(n, 1)``; ``rewards`` and ``done_masks`` are float32 tensors of
            shape ``(n, 1)``.

        Raises:
            ValueError: If ``n`` exceeds the number of stored transitions
                (raised by ``random.sample``).
        """
        mini_batch = random.sample(self.buffer, n)
        state_list, action_list, reward_list, state_prime_list, done_mask_list = [], [], [], [], []

        for state, action, reward, state_prime, done_mask in mini_batch:
            state_list.append(state)
            action_list.append([action])
            reward_list.append([reward])
            state_prime_list.append(state_prime)
            done_mask_list.append([done_mask])

        # Pin the dtypes explicitly: rewards can be a mix of Python ints and
        # floats, so letting torch infer the dtype would make the reward
        # tensor int64 for some batches and float32 for others.
        return (
            torch.stack(state_list),
            torch.tensor(action_list, dtype=torch.int64),
            torch.tensor(reward_list, dtype=torch.float32),
            torch.stack(state_prime_list),
            torch.tensor(done_mask_list, dtype=torch.float32),
        )

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


class Qnet(nn.Module):
    """Convolutional Q-network mapping an RGB frame to one Q-value per action.

    Three conv -> ReLU -> max-pool stages are followed by two linear layers.
    The first linear layer expects 2304 flattened features, which is what a
    channel-first ``3 x 171 x 159`` input yields after the three stages; the
    final layer emits 9 values (one per action).
    """

    def __init__(self):
        super(Qnet, self).__init__()
        self._conv1 = nn.Conv2d(3, 16, (5, 3))
        self._max_pool1 = nn.MaxPool2d(kernel_size=(3, 2))

        self._conv2 = nn.Conv2d(16, 32, (5, 3))
        self._max_pool2 = nn.MaxPool2d(kernel_size=(3, 2))

        self._conv3 = nn.Conv2d(32, 32, (5, 3))
        self._max_pool3 = nn.MaxPool2d(kernel_size=(3, 2))

        self._ln1 = nn.Linear(2304, 64)
        self._ln2 = nn.Linear(64, 9)

    def forward(self, x):
        """Compute Q-values for a single frame or a batch of frames.

        Args:
            x: Float tensor shaped ``(3, H, W)`` (single frame) or
                ``(N, 3, H, W)`` (batch of any size ``N``).

        Returns:
            Tensor of shape ``(9,)`` for a single frame or ``(N, 9)`` for a
            batch.
        """
        x = self._max_pool1(F.relu(self._conv1(x)))
        x = self._max_pool2(F.relu(self._conv2(x)))
        x = self._max_pool3(F.relu(self._conv3(x)))

        if x.dim() == 3:
            # Unbatched input: flatten everything into one feature vector.
            x = x.view(-1)
        else:
            # Flatten per sample using the actual batch dimension, so the
            # network works for any batch size rather than one fixed value.
            x = x.view(x.size(0), -1)

        x = F.relu(self._ln1(x))
        return self._ln2(x)

    def sample_action(self, obs, epsilon):
        """Pick an action with an epsilon-greedy policy.

        Args:
            obs: Observation tensor accepted by ``forward``.
            epsilon: Probability of choosing a uniformly random action.

        Returns:
            int: Action index in ``[0, number_of_actions - 1]``.
        """
        if random.random() < epsilon:
            # Exploration: no need to evaluate the network at all.
            return random.randint(0, self._ln2.out_features - 1)

        # Exploitation: only the argmax is needed, so skip autograd bookkeeping.
        with torch.no_grad():
            return self.forward(obs).argmax().item()


def train(q_network, target_network, memory, optimizer):
    """Run one DQN gradient step on a mini-batch drawn from ``memory``.

    Args:
        q_network: Online network whose parameters ``optimizer`` updates.
        target_network: Network used to evaluate the next-state values.
        memory: Replay buffer exposing ``sample(n)`` that returns
            ``(states, actions, rewards, next_states, done_masks)`` tensors.
        optimizer: Optimizer built over ``q_network``'s parameters.

    The module-level ``batch_size`` and ``gamma`` set the mini-batch size and
    the discount factor.
    """
    state_list, action_list, reward_list, state_prime_list, \
    done_mask_list = memory.sample(batch_size)

    # Q(s, a) for the actions that were actually taken.
    q_action = q_network(state_list).gather(1, action_list)

    # The TD target is a constant with respect to the optimisation step.
    # Building it under no_grad keeps backward() from accumulating gradients
    # on the target network's parameters, which are never zeroed.
    with torch.no_grad():
        max_q_prime = target_network(state_prime_list).max(1)[0].unsqueeze(1)
        # done_mask is 0.0 on terminal transitions, which drops the bootstrap term.
        target = reward_list + gamma * max_q_prime * done_mask_list

    loss = F.smooth_l1_loss(q_action, target)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


def preproess_state(state):
    """Crop a frame and return its first three channels in channel-first order.

    Args:
        state: Array indexed as ``[row, column, channel]`` with at least three
            channels.

    Returns:
        np.ndarray of shape ``(3, rows, columns)`` holding rows ``1..171`` and
        columns ``1..159`` of the input (fewer if the input is smaller).
    """
    cropped = state[1:172, 1:160]
    # One 2-D plane per colour channel, stacked along a new leading axis.
    planes = [cropped[:, :, channel] for channel in range(3)]
    return np.asarray(planes)


def main():
    """Train a DQN agent on ``MsPacman-v0`` with an epsilon-greedy policy.

    For each of 10000 episodes the agent plays until the environment reports
    ``done``, storing every shaped transition in a ``ReplayBuffer``. After the
    episode, one ``train`` step runs if the buffer holds more than 2000
    transitions, and a summary dict (``action_list``, ``reward``, ``epsilon``)
    is appended to the module-level ``result`` list. Every ``print_interval``
    episodes the target network is re-synchronised with the online network and
    the mean shaped score is printed.

    Reward shaping: a raw reward of 0 becomes -1, a raw reward of 10 becomes
    50, and a drop in ``info['lives']`` subtracts a further 1000.

    The module-level ``render`` flag is re-read on every step, so rebinding it
    elsewhere switches rendering on or off mid-run. The environment is closed
    even if training raises.
    """
    import time  # function-scope: only needed to throttle rendering

    global render

    print_interval = 20

    env = gym.make('MsPacman-v0')
    try:
        q_network = Qnet()
        target_network = Qnet()
        target_network.load_state_dict(q_network.state_dict())

        memory = ReplayBuffer()
        optimizer = optim.Adam(q_network.parameters(), lr=learning_rate)
        score = 0.0

        for n_epi in range(10000):
            # Exploration rate decays linearly from 0.5 and is floored at 0.01.
            epsilon = max(0.01, 0.5 - 0.01 * (n_epi / 20))

            state = env.reset()
            state = torch.tensor(preproess_state(state)).float()
            done = False

            action_list = []
            episode_reward = 0

            # NOTE(review): assumes every episode starts with 3 lives - confirm.
            current_lives = 3

            while not done:
                # Forward pass (or random draw) to choose the action.
                action = q_network.sample_action(state, epsilon)
                state_prime, reward, done, info = env.step(action)

                action_list.append(action)

                reward = -1 if reward == 0 else reward
                reward = 50 if reward == 10 else reward

                if info['lives'] < current_lives:
                    reward -= 1000
                    current_lives = info['lives']

                state_prime = torch.tensor(preproess_state(state_prime)).float()

                # 0.0 on the terminal step so the bootstrap term is dropped in train().
                done_mask = 0.0 if done else 1.0

                memory.put((state, action, reward, state_prime, done_mask))
                state = state_prime

                episode_reward += reward

                if render:
                    env.render()
                    time.sleep(0.01)

            # Learning step: forward and backward pass on one mini-batch.
            # NOTE(review): only a single update is performed per episode -
            # confirm this is intended.
            if memory.size() > 2000:
                train(q_network, target_network, memory, optimizer)

            result.append({
                'action_list': action_list,
                'reward': episode_reward,
                'epsilon': epsilon
            })

            score += episode_reward

            if n_epi % print_interval == 0 and n_epi != 0:
                target_network.load_state_dict(q_network.state_dict())
                print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(
                    n_epi, score / print_interval, memory.size(), epsilon * 100))
                score = 0.0
    finally:
        # Release the emulator and any render window even if training failed.
        env.close()


# Start main() on a separate thread; other cells rebind `render` and read
# `result`, both of which main() uses.
threading.Thread(target=main).start()



  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Snapshot the per-episode records currently in `result` as a DataFrame
# (columns come from the dict keys: action_list, reward, epsilon).
df = pd.DataFrame(result)
df

Unnamed: 0,action_list,reward,epsilon
0,"[3, 3, 3, 3, 1, 3, 3, 3, 4, 3, 4, 3, 3, 4, 3, 3, 1, 0, 8, 3, 3, 4, 0, 8, 8, 3, 3, 5, 7, 3, 3, 7, 3, 3, 3, 4, 3, 7, 8, 4, 3, 1, 3, 3, 3, 3, 3, 0, 4, 8, 4, 3, 6, 3, 3, 7, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 3, 3, 2, 1, 3, 3, 2, 3, 3, 3, 7, 2, 3, 4, 3, 8, 6, 8, 7, 4, 3, 3, 1, 3, 3, 3, 3, 2, 3, 7, 3, 3, 3, 3, ...]",-1886.0,0.5
1,"[0, 3, 8, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5, 3, 3, 7, 3, 3, 3, 3, 7, 3, 4, 8, 3, 3, 0, 2, 0, 6, 3, 1, 3, 0, 4, 3, 7, 2, 1, 3, 3, 3, 3, 3, 3, 3, 5, 3, 3, 3, 3, 2, 1, 4, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 7, 3, 3, 6, 8, 3, 3, 3, 3, 3, 8, 3, 3, 3, 7, 3, 2, 5, 8, 8, 3, 3, 3, 1, 3, ...]",-2615.0,0.4995


In [15]:
# Switch rendering off; main() checks this global on every environment step.
render = False

n_episode :20, score : -2160.9, n_buffer : 17580, eps : 49.0%
n_episode :40, score : -2283.5, n_buffer : 34893, eps : 48.0%
n_episode :60, score : -2198.1, n_buffer : 50000, eps : 47.0%
n_episode :80, score : -2234.3, n_buffer : 50000, eps : 46.0%
n_episode :100, score : -2124.1, n_buffer : 50000, eps : 45.0%
n_episode :120, score : -2283.2, n_buffer : 50000, eps : 44.0%
n_episode :140, score : -2133.2, n_buffer : 50000, eps : 43.0%
n_episode :160, score : -2155.6, n_buffer : 50000, eps : 42.0%
n_episode :180, score : -1937.3, n_buffer : 50000, eps : 41.0%
n_episode :200, score : -1947.4, n_buffer : 50000, eps : 40.0%
n_episode :220, score : -2225.8, n_buffer : 50000, eps : 39.0%
n_episode :240, score : -2332.8, n_buffer : 50000, eps : 38.0%
n_episode :260, score : -2339.6, n_buffer : 50000, eps : 37.0%
n_episode :280, score : -1923.0, n_buffer : 50000, eps : 36.0%
n_episode :300, score : -2208.0, n_buffer : 50000, eps : 35.0%
n_episode :320, score : -2103.2, n_buffer : 50000, eps : 34

n_episode :2640, score : -2748.6, n_buffer : 50000, eps : 1.0%
n_episode :2660, score : -2591.8, n_buffer : 50000, eps : 1.0%
n_episode :2680, score : -2879.4, n_buffer : 50000, eps : 1.0%
n_episode :2700, score : -2817.9, n_buffer : 50000, eps : 1.0%
n_episode :2720, score : -2838.3, n_buffer : 50000, eps : 1.0%
n_episode :2740, score : -2801.7, n_buffer : 50000, eps : 1.0%
n_episode :2760, score : -2624.4, n_buffer : 50000, eps : 1.0%
n_episode :2780, score : -3047.9, n_buffer : 50000, eps : 1.0%
n_episode :2800, score : -2750.0, n_buffer : 50000, eps : 1.0%
n_episode :2820, score : -2625.5, n_buffer : 50000, eps : 1.0%
n_episode :2840, score : -2446.3, n_buffer : 50000, eps : 1.0%
n_episode :2860, score : -2772.4, n_buffer : 50000, eps : 1.0%
n_episode :2880, score : -3005.3, n_buffer : 50000, eps : 1.0%
n_episode :2900, score : -2865.1, n_buffer : 50000, eps : 1.0%
n_episode :2920, score : -2814.9, n_buffer : 50000, eps : 1.0%
n_episode :2940, score : -2899.1, n_buffer : 50000, eps

n_episode :5260, score : -2726.9, n_buffer : 50000, eps : 1.0%
n_episode :5280, score : -2560.6, n_buffer : 50000, eps : 1.0%
n_episode :5300, score : -2711.1, n_buffer : 50000, eps : 1.0%
n_episode :5320, score : -2762.1, n_buffer : 50000, eps : 1.0%
n_episode :5340, score : -2880.3, n_buffer : 50000, eps : 1.0%
n_episode :5360, score : -2574.4, n_buffer : 50000, eps : 1.0%
n_episode :5380, score : -2711.9, n_buffer : 50000, eps : 1.0%
n_episode :5400, score : -3018.8, n_buffer : 50000, eps : 1.0%
n_episode :5420, score : -2757.6, n_buffer : 50000, eps : 1.0%
n_episode :5440, score : -2818.8, n_buffer : 50000, eps : 1.0%
n_episode :5460, score : -3070.9, n_buffer : 50000, eps : 1.0%
n_episode :5480, score : -2906.1, n_buffer : 50000, eps : 1.0%
n_episode :5500, score : -2896.2, n_buffer : 50000, eps : 1.0%
n_episode :5520, score : -2993.3, n_buffer : 50000, eps : 1.0%
n_episode :5540, score : -2741.3, n_buffer : 50000, eps : 1.0%
n_episode :5560, score : -2933.5, n_buffer : 50000, eps

In [7]:
# NOTE(review): presumably a deliberate stop so that "Run All" halts before
# the cells below - confirm, and consider removing before sharing.
raise Exception

Exception: 

In [None]:
# Switch rendering on; main() checks this global on every environment step.
render = True

In [None]:
# Write the episode log to result-<YYYYmmddHHMMSS>.csv in the working
# directory, omitting the DataFrame index.
df.to_csv(f"result-{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}.csv", index=False)

2770    [4, 1, 7, 0, 2, 5, 5, 0, 3, 5, 7, 8, 5, 7, 3, 3, 5, 1, 5, 5, 5, 7, 4, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 3, 6, 7, 5, 5, 5, 4, 0, 5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 4, 5, 0, 5, 2, 5, 5, 4, 5, 5, 5, 5, 1, 0, 3, 5, 5, 0, 4, 6, 8, 4, 8, 5, 0, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 3, 5, 5, 2, 5, 4, 5, 5, ...]
Name: action_list, dtype: object

### VIEW MAX SCORE

In [None]:
# Load a previously saved episode log.
# NOTE(review): the filename is hard-coded to one specific run.
log = pd.read_csv('result-20220510175834.csv')
log

In [None]:
# Keep every episode whose reward equals the highest reward in the log.
row = log.loc[log['reward'].eq(log['reward'].max())]
row

In [None]:
import ast

# The log was read back from CSV, so the action list is stored as text;
# parse the first best-scoring row's text into a Python list.
action_list = ast.literal_eval(row['action_list'].iloc[0])
action_list

In [None]:
import time

import gym

# Replay the recorded actions of the selected episode with rendering on.
# NOTE(review): the environment is not seeded here, so the replay may not
# reproduce the recorded episode exactly - confirm.
env = gym.make('MsPacman-v0')
try:
    env.reset()

    for action in action_list:
        _, _, done, _ = env.step(action)
        env.render()
        time.sleep(0.01)

        # Stop once the episode is over instead of stepping a finished env.
        if done:
            break
finally:
    # Always release the emulator and the render window.
    env.close()

  "We strongly suggest supplying `render_mode` when "
