In [1]:
import sys
import os
import glob

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np

In [2]:
# Hyperparameters
learning_rate = 0.0005
gamma = 0.98
lmbda = 0.95
eps_clip = 0.1
K_epoch = 3
T_horizon = 20
env_name = 'PongDeterministic-v4'


In [3]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
# device = "cpu"
# preprocess a single frame
# crop image and downsample to 80x80
# stack two frames together as input
def preprocess_single(image, bkg_color = np.array([144, 72, 17])):
    img = np.mean(image[34:-16:2,::2]-bkg_color, axis=-1)/255.
    return torch.from_numpy(img).float().to(device)

# convert outputs of parallelEnv to inputs to pytorch neural net
# this is useful for batch processing especially on the GPU
def preprocess_batch(images, bkg_color = np.array([144, 72, 17])):
    list_of_images = np.asarray(images)
    if len(list_of_images.shape) < 5:
        list_of_images = np.expand_dims(list_of_images, 1)
    # subtract bkg and crop

    list_of_images_prepro = np.mean(list_of_images[:,:,34:-16:2,::2]-bkg_color,
                                    axis=-1)/255.
    batch_input = np.swapaxes(list_of_images_prepro,0,1)
    return torch.from_numpy(batch_input).float().to(device)

# model

In [4]:
class Flatten(nn.Module):
    def forward(self, x):
        return x.view(x.size(0), -1)
    
class PPO(nn.Module):
    def __init__(self, input_size=2, hidden_size=512, output_size=6):
        super().__init__()
        
        self.base = nn.Sequential(
            #80 -> 40
            nn.Conv2d(input_size, 32, 4, stride=2),
            nn.LeakyReLU(),
            #40->20
            nn.Conv2d(32, 64, 4, stride=2),
            nn.LeakyReLU(),
            #20->20
            nn.Conv2d(64, 32, 3, stride=1),
            nn.LeakyReLU(),
            Flatten(),
            nn.Linear(32*16*16, hidden_size),
            nn.LeakyReLU()
        )      
        
        self.data = []

        self.fc1 = nn.Linear(hidden_size, 256)
        self.fc_pi = nn.Linear(256, output_size)
        self.fc_v = nn.Linear(256, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def pi(self, x, softmax_dim=-1):
        x = self.base(x)
        x = F.leaky_relu(self.fc1(x))
        x = self.fc_pi(x)
        prob = F.softmax(x, dim=softmax_dim)
        return prob

    def v(self, x):
        x = self.base(x)
        x = F.leaky_relu(self.fc1(x))
        v = self.fc_v(x)
        return v
    
    def forward(self,x):
        return self.pi(x), self.v(x)

    def put_data(self, transition):
        self.data.append(transition)

    def make_batch(self):
        data = np.array(self.data)
        s = data[:,0]
        a = data[:,1].astype("int32")
        r = data[:,2].astype("float32")
        s_prime = data[:,3]
        prob_a = data[:,4].astype("float32")      
        done = data[:,5].astype("int32")
        
        
        return torch.cat([*s]).cuda(), torch.from_numpy(a).cuda().long(), torch.from_numpy(r).cuda().float(), torch.cat([*s_prime]).cuda(), torch.from_numpy(prob_a).cuda().float(), torch.from_numpy(done).cuda().float()
       
    def train_net(self):
        s, a, r, s_prime, prob_a, done_mask = self.make_batch()

        for i in range(K_epoch):  # 배치 하나를 K_epoch번 반복함
            td_target = r + gamma * self.v(s_prime) * done_mask
            delta = td_target - self.v(s)
            delta = delta.detach().cpu().numpy()

            advantage_lst = []
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_lst.append([advantage])
            advantage_lst.reverse()
            advantage = torch.tensor(advantage_lst, dtype=torch.float).to(device)

            pi = self.pi(s, softmax_dim=1)
            pi_a = pi.gather(1, a.unsqueeze(-1))
            
            
            # a/b == exp(log(a)-log(b))
            #새로운 policy 가 특정 action 에 주는 확률과 이전 policy 가 특정 action 에 주던 확률을 비교한다
            ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a.float()))

            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage
            loss = -torch.min(surr1, surr2) + \
                F.smooth_l1_loss(self.v(s), td_target.detach())

            self.optimizer.zero_grad()
            loss.mean().backward()
            print(loss.mean())
            self.optimizer.step()
        
            del loss, pi , advantage


env = gym.make(env_name)
s = env.reset()

import matplotlib.pyplot as plt
plt.imshow(s)

In [5]:
# if torch.cuda.is_available():
#     torch.set_default_tensor_type('torch.cuda.FloatTensor')
save_interval = 100
env = gym.make(env_name)
model = PPO(input_size=1)
model.to(device)
# model = nn.DataParallel(model)
score = 0.0
print_interval = 20

for n_epi in range(10000):  # 게임 1만판 진행
    s = env.reset()
    done = False
    while not done:
        # T_horizon 이 뭐였지 : T_horizon 동안 데이터 모으고 학습을 해본다.
        for t in range(T_horizon):
            
#             s1,re1,is_done_, _ = env.step()
#             s2,re2,is_done,_ = env.step(0)
#             s_processed = preprocess_batch([s1,s2])
            
            s_processed = preprocess_single(s).expand(1,1,80,80)
            prob = model.pi(s_processed)
            m = Categorical(prob)
            a = m.sample().item()
            
            s_prime, r, done, info = env.step(a)
#             s2,_,_,_ = env.step(0)
            s = s_prime
            s_prime_processed = preprocess_single(s_prime).expand(1,1,80,80)
            
#             s_prime_processed = preprocess_batch([s_prime,s2])

            model.put_data((s_processed, a, r/100.0, s_prime_processed, prob.squeeze(0)[a].item(), done))
            score += r
            
#             del s_processed, s_prime_processed
            
            if done:
                break

            
        model.train_net()

    if n_epi % print_interval == 0 and n_epi != 0:
        print("# of episode :{}, avg score : {:.1f}".format(
            n_epi, score/print_interval))
        score = 0.0
        torch.save(model.state_dict(),
                   '{}_{}.pth'.format(env_name, n_epi))

env.close()



RuntimeError: Expected tensor for argument #1 'input' to have the same device as tensor for argument #2 'weight'; but device 0 does not equal 1 (while checking arguments for cudnn_convolution)

In [None]:
 torch.save(model.state_dict(),
                   '{}_{}.pth'.format(env_name, n_epi))

In [None]:
s_processed.shape

In [None]:
s_processed.expand(1,1,80,80).shape

In [None]:
s_processed.shape