In [1]:
import gym
import time
import gym.spaces
import random
import numpy as np
import warnings
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as mpl
from collections import deque
from common.replay_buffer import ReplayBuffer
from common.utils import make_experience, from_experience
from common.device import device

warnings.filterwarnings('ignore')

In [2]:
class BitFlipEnv:
    """Toy environment whose state is a vector of `bits` binary values.

    Every action is an index into that vector; taking it flips the bit
    stored there. The environment itself supplies no reward, termination
    flag or info: `step` returns `None` for all three.
    """

    def __init__(self, bits):
        """Remember the vector length and draw an initial random state."""
        self.bits = bits
        # Placeholder; replaced right away by the random draw in reset().
        self.state = np.zeros(bits)
        self.reset()

    def reset(self):
        """Draw a fresh random 0/1 vector and return a copy of it."""
        fresh_state = np.random.randint(0, 2, size=self.bits)
        self.state = fresh_state
        return self.state.copy()

    def step(self, action):
        """Flip the bit at index `action`.

        Returns:
            A 4-tuple `(state_copy, None, None, None)`.
        """
        flipped = 1 - self.state[action]
        self.state[action] = flipped
        observation = self.state.copy()
        return observation, None, None, None

    def render(self):
        """Print the current bit vector as a plain Python list."""
        bits_as_list = self.state.tolist()
        print("State: {}".format(bits_as_list))

In [3]:
# Global environment instance. The hyperparameter cell reads its observation and
# action spaces, and DQNAgent.train / DQNAgent.eval_episode step it directly.
env = gym.make('LunarLander-v2')
# Disabled alternative: run on the BitFlipEnv defined above instead of LunarLander.
# BITS = 50
# env = BitFlipEnv(BITS)

In [4]:
# Hyperparameters read as module-level globals by DuelingQNetwork / DQNAgent.

# Passed as ReplayBuffer(BUFFER_SIZE, BATCH_SIZE); presumably capacity and
# sample size -- confirm against common.replay_buffer.
BUFFER_SIZE = int(1e6)
# DQNAgent.optimize() only learns once the buffer holds more than BATCH_SIZE items.
BATCH_SIZE = 128
# Discount factor applied to the bootstrapped next-state value in the TD target.
GAMMA = 0.98
# Weight of the ONLINE network in soft_update: target <- TAU*local + (1-TAU)*target.
# NOTE(review): at 0.95 the target network is almost fully overwritten at every
# cycle; confirm this is intended rather than the inverse (1 - TAU) convention.
TAU = 0.95
# Training is organised as EPOCHS x CYCLES x EPISODES nested loops.
EPOCHS = 200
CYCLES = 50
EPISODES = 16
# Upper bound on environment steps per episode (training and evaluation).
MAX_STEPS = 1000
# Only referenced by the commented-out HER relabelling code in DQNAgent.train.
FUTURE_K = 4
# Number of optimize() calls made at the end of every cycle.
OPTIMS = 50
# Network input / output sizes taken from the environment's spaces.
STATE_SIZE = env.observation_space.shape[0] # HER variant (state concatenated with goal): env.observation_space.shape[0] * 2
ACTION_SIZE = env.action_space.n
# Learning rate handed to the Adam optimizer.
LR = 0.001
# Epsilon-greedy schedule: eps starts at EPS_START and is multiplied by
# EPS_DECAY after every epoch, never dropping below EPS_END.
EPS_START = 1.0
EPS_END = 0.0
EPS_DECAY = 0.95
# Training stops once the mean evaluation score reaches ENV_SOLVED ...
ENV_SOLVED = 200
# ... where the mean is taken over TIMES_SOLVED evaluation episodes.
TIMES_SOLVED = 100

In [5]:
class DuelingQNetwork(nn.Module):
    """Dueling Q-network: a shared trunk feeding a state-value head and an advantage head.

    The two heads are combined as ``Q = V + A - mean(A)``, with the mean taken
    over the action axis, so the mean of the Q-values over actions equals the
    value head's output.

    Args:
        state_size: Number of input features per state.
        action_size: Number of discrete actions (width of the advantage head).
    """

    def __init__(self, state_size, action_size):
        super().__init__()

        # NOTE(review): the trunk ends with a Linear layer and both heads start
        # with one, with no activation in between -- confirm that is intended.
        # Left unchanged so existing checkpoints keep producing the same outputs.
        self.features = nn.Sequential(
            nn.Linear(state_size, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
        )

        self.advantage = nn.Sequential(
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, action_size),
        )

        self.value = nn.Sequential(
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, state):
        """Return Q-values for `state`.

        Args:
            state: Float tensor whose last dimension is `state_size`. Both
                batched ``(batch, state_size)`` and unbatched ``(state_size,)``
                inputs are accepted.

        Returns:
            Tensor with the same leading dimensions as `state` and a last
            dimension of `action_size`.
        """
        x = self.features(state)
        advantage = self.advantage(x)
        value = self.value(x)

        # Average over the action axis, which is always the last one. Using -1
        # instead of a hard-coded 1 keeps batched behaviour identical while
        # also supporting unbatched (1-D) states.
        return value + advantage - advantage.mean(dim=-1, keepdim=True)

In [6]:
class DQNAgent:
    """Double-DQN agent built on a dueling Q-network with epsilon-greedy exploration.

    The agent owns an online network (`qn_local`), a target network
    (`qn_target`), an Adam optimizer for the online network and a replay
    buffer. It reads the module-level hyperparameters and steps the global
    `env` directly.

    A Hindsight Experience Replay (HER) variant is kept as commented-out code
    in `train` / `eval_episode`; `make_goal` and `compute_reward` exist only
    for that variant.
    """

    def __init__(self):
        """Build both networks, the optimizer and the replay buffer."""
        self.qn_local = DuelingQNetwork(STATE_SIZE,
                                        ACTION_SIZE).to(device)

        self.qn_target = DuelingQNetwork(STATE_SIZE,
                                         ACTION_SIZE).to(device)

        # tau = 1 copies the online weights verbatim, so both networks start equal.
        self.soft_update(1.)

        self.optimizer = optim.Adam(self.qn_local.parameters(), lr=LR)
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)

    def act(self, state, eps=0.):
        """Choose an action epsilon-greedily.

        Args:
            state: Observation as a numpy array (converted via `torch.from_numpy`).
            eps: Probability of taking a uniformly random action; the default
                of 0 makes the choice greedy.

        Returns:
            Index of the chosen action.
        """
        if random.random() > eps:
            # Greedy branch: the forward pass is only needed here, so it is
            # skipped entirely when a random action is taken.
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)

            self.qn_local.eval()
            with torch.no_grad():
                action_values = self.qn_local(state)
            self.qn_local.train()

            return np.argmax(action_values.cpu().numpy())

        return random.choice(np.arange(ACTION_SIZE))

    def optimize(self):
        """Run one learning step if the buffer holds more than BATCH_SIZE items.

        Returns:
            The loss of the step as a float, or None when no step was taken.
        """
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            return self.learn(experiences)
        return None

    def learn(self, experiences):
        """Perform one Double-DQN gradient step on a sampled batch.

        Args:
            experiences: Batch as returned by `self.memory.sample()`; unpacked
                with `from_experience` into states, actions, rewards,
                next_states and dones.
                (assumes actions/rewards/dones carry a trailing dimension of
                size 1 so they line up with `gather(-1, ...)` -- TODO confirm
                in common.utils)

        Returns:
            The MSE loss of this step as a Python float.
        """
        (states,
         actions,
         rewards,
         next_states,
         dones) = from_experience(experiences)

        # The whole target is a constant w.r.t. the optimisation, so build it
        # without tracking gradients.
        with torch.no_grad():
            # Double DQN: the online network selects the next action ...
            best_action = self.qn_local(next_states).argmax(-1, keepdim=True)
            # ... and the target network evaluates it.
            max_q = self.qn_target(next_states).gather(-1, best_action)
            # No bootstrapping from states flagged as done.
            q_targets = rewards + (GAMMA * max_q * (1 - dones))

        q_expected = self.qn_local(states).gather(-1, actions)

        loss = F.mse_loss(q_expected, q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def soft_update(self, tau):
        """Blend online weights into the target network.

        Each target parameter becomes ``tau * local + (1 - tau) * target``, so
        ``tau = 1`` is a hard copy and ``tau = 0`` leaves the target untouched.

        Args:
            tau: Weight given to the online network's parameters.
        """
        with torch.no_grad():
            for target_param, local_param in zip(self.qn_target.parameters(),
                                                 self.qn_local.parameters()):
                target_param.copy_(tau * local_param + (1.0 - tau) * target_param)

    def add_experience(self, state, action, reward, next_state, done):
        """Wrap one transition with `make_experience` and store it in the buffer."""
        experience = make_experience(state,
                                     action,
                                     reward,
                                     next_state,
                                     done)
        self.memory.add(experience)

    def make_goal(self):
        """Return the fixed 8-element goal vector used by the HER variant.

        NOTE(review): presumably the desired LunarLander observation (zeros
        with the last two entries set to 1) -- confirm against the env docs.
        """
        return np.array([0., 0., 0., 0., 0., 0., 1., 1.])

    def compute_reward(self, state, goal, eps=0.001):
        """Sparse goal-reaching reward used by the HER variant.

        Args:
            state: Achieved state vector.
            goal: Goal vector of the same shape.
            eps: The goal counts as reached when the summed absolute
                difference between `goal` and `state` is below this value.

        Returns:
            Tuple `(reward, done)` where reward is 0.0 when the goal is
            reached and -1.0 otherwise.
        """
        done = np.sum(np.abs(goal - state)) < eps
        reward = 0. if done else -1.
        return reward, done

    def eval_episode(self):
        """Run TIMES_SOLVED greedy episodes and return their mean total reward."""
        total_reward = 0
        goal = self.make_goal()  # only consumed by the disabled HER line below

        for _ in range(TIMES_SOLVED):
            state = env.reset()

            for step in range(MAX_STEPS):
                # HER variant (disabled):
                # action = self.act(np.concatenate([state, goal]))
                action = self.act(state)
                state, reward, done, _ = env.step(action)

                total_reward += reward

                if done:
                    break

        return total_reward / TIMES_SOLVED

    def train(self):
        """Train the agent and print progress.

        Structure: EPOCHS x CYCLES x EPISODES. Every episode is collected
        into a trajectory and then pushed to the replay buffer. After each
        cycle the agent runs OPTIMS optimisation steps followed by one
        `soft_update(TAU)`. Every 5 epochs the agent is evaluated with
        `eval_episode`; training stops once the mean score reaches
        ENV_SOLVED. Epsilon is decayed by EPS_DECAY after every epoch and
        floored at EPS_END.
        """
        eps = EPS_START
        # Result of the most recent optimize() call; None until a gradient
        # step has actually run (buffer too small, or OPTIMS == 0).
        loss = None

        for epoch in range(1, EPOCHS+1):

            # Only the disabled HER variant increments this counter, so the
            # reported success rate is currently always 0.
            success = 0

            for cycle in range(CYCLES):

                for episode in range(EPISODES):

                    trajectory = []
                    state = env.reset()
                    goal = self.make_goal()  # only consumed by the disabled HER lines

                    for step in range(MAX_STEPS):
                        # HER variant (disabled):
                        # action = self.act(np.concatenate([state, goal]), eps)
                        action = self.act(state, eps)
                        next_state, env_reward, env_done, _ = env.step(action)
                        # HER variant (disabled):
                        # reward, done = self.compute_reward(next_state, goal)
                        # trajectory.append(make_experience(state, action, env_reward, next_state, done))
                        trajectory.append(make_experience(state, action, env_reward, next_state, env_done))

                        state = next_state

                        # HER variant (disabled):
                        # if done: success += 1

                        if env_done:
                            break

                    steps_taken = len(trajectory)

                    # The episode is buffered first and stored afterwards so the
                    # (disabled) HER relabelling can look at future steps.
                    for t in range(steps_taken):
                        state, action, reward, next_state, done = trajectory[t]

                        # HER variant (disabled):
                        # self.add_experience(np.concatenate([state, goal]),
                        #                     action,
                        #                     reward,
                        #                     np.concatenate([next_state, goal]),
                        #                     done)

                        self.add_experience(state,
                                            action,
                                            reward,
                                            next_state,
                                            done)

                        # HER variant (disabled): relabel with FUTURE_K achieved goals.
                        # for _ in range(FUTURE_K):
                        #     future = np.random.randint(t, steps_taken)
                        #     achieved_goal = trajectory[future].next_state
                        #     reward, done = self.compute_reward(next_state, achieved_goal)
                        #
                        #     self.add_experience(np.concatenate([state, achieved_goal]),
                        #                         action,
                        #                         reward,
                        #                         np.concatenate([next_state, achieved_goal]),
                        #                         done)

                # End Episode

                for _ in range(OPTIMS):
                    loss = self.optimize()

                self.soft_update(TAU)

            # End Cycle

            success_rate = success / (EPISODES * CYCLES)

            # optimize() may have returned None; format defensively so the
            # progress line never raises. Output is unchanged for float losses.
            loss_text = 'n/a' if loss is None else '{:.4f}'.format(loss)

            print('\rEpoch {}\tExploration: {:.2f}%\tSuccess Rate: {:.2}\tLast Loss: {}'.format(
                epoch,
                100*eps,
                success_rate,
                loss_text
            ), end='')

            if epoch % 5 == 0:
                print('\nRunning evaluation...')

                mean_score = self.eval_episode()

                if mean_score >= ENV_SOLVED:
                    print('Environment solved {} times consecutively!'.format(TIMES_SOLVED))
                    print('Avg score: {:.3f}'.format(mean_score))
                    break
                else:
                    print('No success. Avg score: {:.3f}'.format(mean_score))

            eps = max(EPS_END, EPS_DECAY*eps)

In [7]:
# Instantiate the agent; its constructor builds both Q-networks, the optimizer and the replay buffer.
agent = DQNAgent()

In [8]:
# Run training; per-epoch progress and the evaluation result every 5 epochs are printed below.
agent.train()

Epoch 5	Exploration: 81.45%	Success Rate: 0.0	Last Loss: 3.957032
Running evaluation...
No success. Avg score: -197.330
Epoch 10	Exploration: 63.02%	Success Rate: 0.0	Last Loss: 9.26015
Running evaluation...
No success. Avg score: -118.832
Epoch 15	Exploration: 48.77%	Success Rate: 0.0	Last Loss: 20.6801
Running evaluation...
No success. Avg score: -11.940
Epoch 20	Exploration: 37.74%	Success Rate: 0.0	Last Loss: 7.44047
Running evaluation...
Environment solved 100 times consecutively!
Avg score: 208.168
