In [1]:
#!/usr/bin/env python
from collections import deque
import os
import random
import gym
import torch
from torch.distributions import Categorical
import torch.nn.functional as F
from IPython.display import clear_output
import numpy as np
from skimage.color import rgb2gray
from skimage.transform import rescale



class QNetwork(torch.nn.Module):
    """Convolutional action-value network over a stack of frames.

    The network is two strided convolutions followed by two fully connected
    layers.  Tensors use PyTorch's channels-first layout; for an 80x80 input
    the shapes are:

        (N, num_frames, 80, 80) -> conv1 -> (N, 16, 20, 20)
                                -> conv2 -> (N, 32, 10, 10)
                                -> flatten -> (N, 3200)
                                -> fc1 -> (N, 256)
                                -> fc2 -> (N, num_actions)

    ``fc1`` is fixed at 3200 input features, so the spatial size of the input
    must be 80x80.

    Args:
        num_frames: Number of stacked frames, used as the input channel count.
        num_actions: Number of outputs (one value per action).
    """

    def __init__(self, num_frames= 4, num_actions=4):
        super().__init__()
        nn = torch.nn
        self.num_frames = num_frames

        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(num_frames, 16, kernel_size=8, stride=4, padding=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=4, stride=2, padding=1)

        # Fully connected head; 3200 = 32 channels * 10 * 10.
        self.fc1 = nn.Linear(3200, 256)
        self.fc2 = nn.Linear(256, num_actions)

        # Shared non-linearity applied after every layer except the last.
        self.relu = nn.ReLU()

    def flatten(self, x):
        """Collapse every dimension after the first into a single one."""
        return x.view(x.size(0), -1)

    def forward(self, x):
        """Return one value per action for each item in the batch ``x``."""
        features = self.relu(self.conv1(x))
        features = self.relu(self.conv2(features))
        hidden = self.relu(self.fc1(self.flatten(features)))
        # No activation on the output layer: the values are unbounded.
        return self.fc2(hidden)
    
    
    
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# gym environment
#env = gym.make("PongNoFrameskip-v4")
env = gym.make("Breakout-v0")

# network and optimizer
# Size the output layer from the environment instead of relying on the
# QNetwork default of 4, so that switching environments (e.g. the Pong line
# above) cannot leave valid actions without a Q-value.
n_actions = env.action_space.n
Q = QNetwork(num_actions=n_actions).to(device)
optimizer = torch.optim.Adam(Q.parameters(), lr=0.0005)

# target network: starts as an exact copy of Q
Q_target = QNetwork(num_actions=n_actions).to(device)
Q_target.load_state_dict(Q.state_dict())


# Replay buffer of [state, action, state_next, reward, done] transitions;
# the oldest entries are dropped once 100000 are stored.
history = deque(maxlen=100000)  # replay buffer
discount = 0.99  # discount factor gamma

def update_Q(Q, Q_target, optimizer, batch_size=32, buffer=None, gamma=None):
    """Run one gradient step of Q-learning on a random replay minibatch.

    Samples up to ``batch_size`` transitions, builds the targets
    ``reward`` (terminal) or ``reward + gamma * max_a Q_target(state_next)``
    (non-terminal), and minimises the summed squared error between those
    targets and ``Q(state)[action]``.  The whole minibatch is evaluated in a
    single batched forward pass per network.

    Args:
        Q: Network being trained.
        Q_target: Network used to evaluate the next state; it is only read
            (no gradients flow into it).
        optimizer: Optimizer over the parameters of ``Q``.
        batch_size: Maximum number of transitions to sample.
        buffer: Sequence of ``[state, action, state_next, reward, done]``
            transitions.  Defaults to the module-level ``history``.  Each
            stored state must have a leading batch dimension of size 1.
        gamma: Discount factor.  Defaults to the module-level ``discount``.

    Returns:
        The summed squared error of the minibatch as a Python float, or
        ``None`` when the buffer is empty (in which case nothing is updated).
    """
    if buffer is None:
        buffer = history
    if gamma is None:
        gamma = discount

    # Nothing to learn from yet; previously this crashed on `0 .backward()`.
    if len(buffer) == 0:
        return None

    batch = random.sample(buffer, min(batch_size, len(buffer)))

    # Run on whatever device the trained network lives on.
    dev = next(Q.parameters()).device

    # Each stored state is (1, ...), so concatenating yields (B, ...).
    states = torch.cat([transition[0] for transition in batch]).to(dev)
    next_states = torch.cat([transition[2] for transition in batch]).to(dev)
    # Actions may be Python ints or 0-d tensors; int() accepts both.
    actions = torch.tensor(
        [int(transition[1]) for transition in batch],
        dtype=torch.long, device=dev)
    rewards = torch.tensor(
        [float(transition[3]) for transition in batch],
        dtype=torch.float32, device=dev)
    # 0.0 for terminal transitions so the bootstrap term is dropped.
    not_done = torch.tensor(
        [0.0 if transition[4] else 1.0 for transition in batch],
        dtype=torch.float32, device=dev)

    with torch.no_grad():
        best_next = Q_target(next_states).max(dim=1)[0]
        targets = rewards + gamma * not_done * best_next

    # Pick Q(state)[action] for every row of the batch.
    predicted = Q(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    loss = ((targets - predicted) ** 2).sum()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

def process(state):
    """Turn a raw RGB frame into a cropped, half-resolution grayscale image.

    Rows 35 through 194 of ``state`` are kept (all columns and channels), the
    crop is converted with ``rgb2gray`` and then shrunk with
    ``rescale(..., scale=0.5)``.

    Args:
        state: Frame indexed as (row, column, channel).

    Returns:
        The rescaled grayscale image.
        NOTE(review): presumably 80x80 for a 210x160 Atari frame, which is
        what QNetwork expects - confirm against the environment.
    """
    cropped = state[35:195, :, :]
    gray = rgb2gray(cropped)
    half_size = rescale(gray, scale=0.5)
    return half_size
    




In [None]:
max_time_steps = 1000

# probability of taking a uniformly random action (epsilon-greedy exploration)
epsilon = 0.05

# for computing average reward over 100 episodes
reward_history = deque(maxlen=100)


# for updating target network
target_interval = 1000
target_counter = 0

# training
for episode in range(1500):
    # sum of accumulated rewards
    rewards = 0

    # get initial observation; the frame stack starts as four copies of it
    observation = env.reset()
    stack = [process(observation)] * 4
    # States are kept on the CPU because they are stored in the replay
    # buffer; holding every transition on the accelerator can exhaust its
    # memory as the buffer fills.  They are moved to `device` only for the
    # forward pass.
    state = torch.from_numpy(np.array(stack)).float().unsqueeze(0)

    # loop until an episode ends
    for t in range(1, max_time_steps + 1):
        # display current environment
        #env.render()

        # epsilon greedy policy for current observation
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                # .item() gives the environment (and the replay buffer) a
                # plain Python int rather than a 0-d, possibly CUDA, tensor.
                action = torch.argmax(Q(state.to(device))).item()

        # get next observation and current reward for the chosen action
        observation_next, reward, done, info = env.step(action)

        # slide the frame window: drop the oldest frame, append the newest
        stack.pop(0)
        stack.append(process(observation_next))
        state_next = torch.from_numpy(np.array(stack)).float().unsqueeze(0)

        # collect reward
        rewards += reward

        # collect a transition
        history.append([state, action, state_next, reward, done])

        update_Q(Q,Q_target,optimizer)

        # update target network
        target_counter += 1
        if target_counter % target_interval == 0:
            Q_target.load_state_dict(Q.state_dict())

        # The environment is reused by the next episode, so it is not closed
        # here; it is closed once after training finishes.
        if done:
            break

        # pass observation to the next step
        state = state_next

    # compute average reward
    reward_history.append(rewards)
    avg = sum(reward_history) / len(reward_history)
    print('episode: {}, reward: {:.1f}, avg: {:.1f}'.format(episode, rewards, avg))

env.close()


# exist_ok avoids the check-then-create race of a separate exists() test
os.makedirs("./param", exist_ok=True)
torch.save(Q.state_dict(), 'param/Q_net_params.pkl')





episode: 0, reward: 2.0, avg: 2.0
episode: 1, reward: 2.0, avg: 2.0
episode: 2, reward: 1.0, avg: 1.7
episode: 3, reward: 0.0, avg: 1.2
episode: 4, reward: 1.0, avg: 1.2
episode: 5, reward: 0.0, avg: 1.0
episode: 6, reward: 1.0, avg: 1.0
episode: 7, reward: 3.0, avg: 1.2
episode: 8, reward: 2.0, avg: 1.3
episode: 9, reward: 2.0, avg: 1.4
episode: 10, reward: 3.0, avg: 1.5
episode: 11, reward: 1.0, avg: 1.5
episode: 12, reward: 0.0, avg: 1.4
episode: 13, reward: 0.0, avg: 1.3
episode: 14, reward: 1.0, avg: 1.3
episode: 15, reward: 2.0, avg: 1.3
episode: 16, reward: 2.0, avg: 1.4
episode: 17, reward: 0.0, avg: 1.3
episode: 18, reward: 0.0, avg: 1.2
episode: 19, reward: 0.0, avg: 1.1
episode: 20, reward: 1.0, avg: 1.1
episode: 21, reward: 0.0, avg: 1.1
episode: 22, reward: 1.0, avg: 1.1
episode: 23, reward: 1.0, avg: 1.1
episode: 24, reward: 0.0, avg: 1.0
episode: 25, reward: 0.0, avg: 1.0
episode: 26, reward: 1.0, avg: 1.0
episode: 27, reward: 5.0, avg: 1.1
episode: 28, reward: 0.0, avg:

In [None]:
from collections import deque
import os
import random
import gym
import torch
from torch.distributions import Categorical
import torch.nn.functional as F
from IPython.display import clear_output
import numpy as np
from skimage.color import rgb2gray
from skimage.transform import rescale
from time import sleep



class QNetwork(torch.nn.Module):
    """Convolutional action-value network over a stack of frames.

    The network is two strided convolutions followed by two fully connected
    layers.  Tensors use PyTorch's channels-first layout; for an 80x80 input
    the shapes are:

        (N, num_frames, 80, 80) -> conv1 -> (N, 16, 20, 20)
                                -> conv2 -> (N, 32, 10, 10)
                                -> flatten -> (N, 3200)
                                -> fc1 -> (N, 256)
                                -> fc2 -> (N, num_actions)

    ``fc1`` is fixed at 3200 input features, so the spatial size of the input
    must be 80x80.

    Args:
        num_frames: Number of stacked frames, used as the input channel count.
        num_actions: Number of outputs (one value per action).
    """

    def __init__(self, num_frames= 4, num_actions=4):
        super().__init__()
        nn = torch.nn
        self.num_frames = num_frames

        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(num_frames, 16, kernel_size=8, stride=4, padding=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=4, stride=2, padding=1)

        # Fully connected head; 3200 = 32 channels * 10 * 10.
        self.fc1 = nn.Linear(3200, 256)
        self.fc2 = nn.Linear(256, num_actions)

        # Shared non-linearity applied after every layer except the last.
        self.relu = nn.ReLU()

    def flatten(self, x):
        """Collapse every dimension after the first into a single one."""
        return x.view(x.size(0), -1)

    def forward(self, x):
        """Return one value per action for each item in the batch ``x``."""
        features = self.relu(self.conv1(x))
        features = self.relu(self.conv2(features))
        hidden = self.relu(self.fc1(self.flatten(features)))
        # No activation on the output layer: the values are unbounded.
        return self.fc2(hidden)
    
    
def process(state):
    """Turn a raw RGB frame into a cropped, half-resolution grayscale image.

    Rows 35 through 194 of ``state`` are kept (all columns and channels), the
    crop is converted with ``rgb2gray`` and then shrunk with
    ``rescale(..., scale=0.5)``.

    Args:
        state: Frame indexed as (row, column, channel).

    Returns:
        The rescaled grayscale image.
        NOTE(review): presumably 80x80 for a 210x160 Atari frame, which is
        what QNetwork expects - confirm against the environment.
    """
    cropped = state[35:195, :, :]
    gray = rgb2gray(cropped)
    half_size = rescale(gray, scale=0.5)
    return half_size
    
    
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Q = QNetwork().to(device)
# map_location lets a checkpoint saved from a GPU run be loaded on a
# CPU-only machine (and vice versa).
Q.load_state_dict(torch.load('param/Q_net_params.pkl', map_location=device))

env = gym.make("Breakout-v0")


# TEST: play 10 episodes with the greedy policy of the loaded network
try:
    for episode in range(10):  # episode loop
        # the frame stack starts as four copies of the first frame
        stack = [process(env.reset())] * 4
        state = torch.from_numpy(np.array(stack)).float().unsqueeze(0)

        done = False
        while not done:
            env.render()

            # take the greedy action; no gradients are needed for playback,
            # and .item() hands the environment a plain Python int instead
            # of a 0-d (possibly CUDA) tensor
            with torch.no_grad():
                action = torch.argmax(Q(state.to(device))).item()
            next_state, reward, done, info = env.step(action)

            # slide the frame window: drop the oldest frame, append the newest
            stack.pop(0)
            stack.append(process(next_state))
            state = torch.from_numpy(np.array(stack)).float().unsqueeze(0)

            # slow playback down so it can be watched
            sleep(0.03)
finally:
    # close the environment even if playback is interrupted
    env.close()