# Bitflipping with DQN and Hindsight Experience Replay (HER)

### Packages

In [1]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import progressbar as pb           # tracking time while training
import torch

from Bitflipping_Environment import BitFlippingEnv
from dqn_agent import Agent

### Testing Environment

In [2]:
def test_bitflipping(n, n_episodes=5, policy=None, render=False):
    """Roll out episodes on a fresh `BitFlippingEnv(n)` and return the mean outcome.

    Args:
        n: passed to `BitFlippingEnv`; also the maximum number of steps per
            episode and the exclusive upper bound for random actions.
        n_episodes: number of episodes to run.
        policy: object exposing `act(state_goal, eps=...)`. When None, actions
            are drawn uniformly with `np.random.randint(0, n)`.
        render: when True, `env.render()` is called after the reset and after
            every step.

    Returns:
        `np.mean` over one value per episode, where each value is `int(info)`
        for the last `info` returned by `env.step` in that episode
        (presumably the success flag - confirm against BitFlippingEnv).
    """
    env = BitFlippingEnv(n)
    outcomes = []

    for _episode in range(n_episodes):
        state, _, _, _ = env.reset()
        if render:
            env.render()

        for _step in range(n):
            if policy is not None:
                # The policy sees the current bits and the goal as one flat vector;
                # eps=0 requests purely greedy actions.
                observation = np.concatenate([state['obs'], state['goal']])
                action = policy.act(observation, eps=0)
            else:
                action = np.random.randint(0, n)

            state, _reward, done, info = env.step(action)
            if render:
                env.render()
            if done:
                break

        outcomes.append(int(info))

    return np.mean(outcomes)

In [3]:
# Smoke-test the environment: 3 episodes on a 5-bit instance with uniformly random
# actions (policy=None), rendering every step. The cell's value is the mean of the
# per-episode outcomes returned by test_bitflipping.
test_bitflipping(n=5, n_episodes=3, render=True)

Step  0    Bits: [0 0 0 1 0]   Goal: [1 1 0 1 0]   Success: False
Step  1    Bits: [0 0 1 1 0]   Goal: [1 1 0 1 0]   Success: False
Step  2    Bits: [0 0 1 0 0]   Goal: [1 1 0 1 0]   Success: False
Step  3    Bits: [1 0 1 0 0]   Goal: [1 1 0 1 0]   Success: False
Step  4    Bits: [1 0 0 0 0]   Goal: [1 1 0 1 0]   Success: False
Step  5    Bits: [1 0 0 1 0]   Goal: [1 1 0 1 0]   Success: False
DONE in 5 timesteps, success: False 

Step  0    Bits: [0 0 0 0 1]   Goal: [1 0 1 1 0]   Success: False
Step  1    Bits: [0 0 1 0 1]   Goal: [1 0 1 1 0]   Success: False
Step  2    Bits: [0 1 1 0 1]   Goal: [1 0 1 1 0]   Success: False
Step  3    Bits: [1 1 1 0 1]   Goal: [1 0 1 1 0]   Success: False
Step  4    Bits: [1 1 0 0 1]   Goal: [1 0 1 1 0]   Success: False
Step  5    Bits: [1 0 0 0 1]   Goal: [1 0 1 1 0]   Success: False
DONE in 5 timesteps, success: False 

Step  0    Bits: [0 0 1 0 0]   Goal: [0 1 1 1 0]   Success: False
Step  1    Bits: [0 1 1 0 0]   Goal: [0 1 1 1 0]   Success: False


0.3333333333333333

### Training Setup

In [4]:
DEFAULT_PARAMS = dict(
    # --- environment / reproducibility ---
    n_bits=20,                  # number of bits to flip (the goal has the same number of bits)
    seed=0,                     # seed handed to set_seeds (torch, numpy, random)

    # --- exploration ---
    eps=0.2,                    # probability of a random action in the epsilon-greedy policy

    # --- replay buffer / optimisation ---
    buffer_size=10 ** 6,        # replay-buffer capacity
    batch_size=128,             # mini-batch size
    gamma=0.98,                 # discount factor
    # TODO (open note from the original author, placed directly before 'tau')
    tau=0.05,                   # soft-update rate of the target network; 1 - tau is the polyak coefficient
    lr=0.001,                   # learning rate

    # --- training schedule (values quoted from the HER paper) ---
    replay_strategy='future',   # HER replay strategy: 'final' or 'future'
    n_epochs=200,               # epochs; HER paper: 200 (i.e. at most 8e6 timesteps)
    n_cycles=50,                # cycles per epoch; HER paper: 50
    n_episodes=16,              # episodes collected per cycle; HER paper: 16
    n_optim=40,                 # optimisation steps per cycle; HER paper: 40
)


def set_seeds(seed: int = 0):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global generator and torch (CPU and all
    CUDA devices), and switches cuDNN to deterministic mode with benchmarking
    turned off.

    Args:
        seed: value passed to each seeding call.
    """
    # Python and NumPy global generators
    random.seed(seed)
    np.random.seed(seed)

    # torch generators (CPU, then every CUDA device)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN: deterministic algorithms only, no auto-tuning
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Apply the configured seed to torch, numpy and random.
# NOTE(review): the random-action smoke test in the earlier cell runs before this
# call, so its actions are not covered by the seed.
set_seeds(DEFAULT_PARAMS['seed'])

In [5]:
def train(n, agent):
    """Train `agent` on an `n`-bit `BitFlippingEnv` with DQN plus HER relabelling.

    The schedule comes from DEFAULT_PARAMS: `n_epochs` epochs, each consisting
    of `n_cycles` cycles. One cycle collects `n_episodes` episodes (each stored
    through `agent.store_episode` and `agent.store_episode_HER`) and then calls
    `agent.learn()` `n_optim` times. Training stops early once the last 10
    recorded episodes all succeeded.

    Args:
        n: number of bits. Sizes the environment and bounds the number of steps
            per episode.
        agent: object providing `act`, `store_episode`, `store_episode_HER`,
            `learn`, `soft_update`, `qnetwork_local`, `qnetwork_target` and `tau`.

    Returns:
        list[int]: one entry per collected episode, `int(info)` of the last
        `info` returned by `env.step` in that episode (presumably the success
        flag - confirm against BitFlippingEnv).
    """
    print("Training DQN on Bitflipping with", n, "bits for", DEFAULT_PARAMS['n_epochs'], "epochs...")

    # widget bar to display progress during training
    widget = ['training loop: ', pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA()]
    timer = pb.ProgressBar(widgets=widget, maxval=DEFAULT_PARAMS['n_epochs']).start()

    env = BitFlippingEnv(n)
    success = []
    eps = DEFAULT_PARAMS['eps']

    # Integer logging interval (5 epochs with the default 50 cycles); avoids the
    # float modulo the interval was previously computed with.
    log_every = max(1, DEFAULT_PARAMS['n_cycles'] // 10)

    for i_epoch in range(1, DEFAULT_PARAMS['n_epochs'] + 1):
        for i_cycle in range(DEFAULT_PARAMS['n_cycles']):
            for i_episode in range(DEFAULT_PARAMS['n_episodes']):
                state, _, _, _ = env.reset()
                state_ep, act_ep, reward_ep, next_state_ep, done_ep = [], [], [], [], []
                episode_success = 0

                # Bound the episode by the `n` the environment was built with,
                # not by the global default, so train(n, ...) works for any n.
                for t in range(n):
                    state_goal = np.concatenate([state['obs'], state['goal']])
                    action = agent.act(state_goal, eps)
                    next_state, reward, done, info = env.step(action)

                    state_ep.append(state.copy())
                    act_ep.append(action)
                    reward_ep.append(reward)
                    next_state_ep.append(next_state.copy())
                    done_ep.append(done)

                    episode_success = int(info)
                    if done:
                        break
                    state = next_state

                # Record one outcome per episode (not per timestep) so the stop
                # criterion, the log line and the 'Episode #' plot all refer to episodes.
                success.append(episode_success)

                agent.store_episode(state_ep, act_ep, reward_ep, next_state_ep, done_ep)

                # HER additional goals
                agent.store_episode_HER(state_ep, act_ep, reward_ep, next_state_ep, done_ep,
                                        replay_strategy=DEFAULT_PARAMS['replay_strategy'])

            for _ in range(DEFAULT_PARAMS['n_optim']):
                agent.learn()

        # NOTE(review): the target network is soft-updated once per epoch here,
        # whereas the HER paper updates it after every cycle - confirm whether
        # agent.learn() already performs its own target update.
        agent.soft_update(agent.qnetwork_local, agent.qnetwork_target, agent.tau)

        # stop training once the last 10 episodes were all successful
        if np.mean(success[-10:]) > 0.999:
            print("\n learning done")
            break

        if i_epoch % log_every == 0:
            print('\rEpoch {} \t Success: {:.4f}'.format(i_epoch, np.mean(success[-10:])))

        timer.update(i_epoch)
    timer.finish()
    return success

### Start training DQN Agent

In [None]:
# Build the DQN agent from the shared hyperparameters; the bit count is used
# for both of the first two arguments (state size and action size).
agent = Agent(
    DEFAULT_PARAMS['n_bits'],
    DEFAULT_PARAMS['n_bits'],
    DEFAULT_PARAMS['batch_size'],
    DEFAULT_PARAMS['buffer_size'],
    DEFAULT_PARAMS['gamma'],
    DEFAULT_PARAMS['tau'],
    DEFAULT_PARAMS['lr'],
)

# Run the full training schedule and keep the list returned by train().
success = train(DEFAULT_PARAMS['n_bits'], agent)

Training DQN on Bitflipping with 20 bits for 200 epochs...


training loop:   2% |                                          | ETA:   1:28:59

Epoch 5 	 Success: 0.0625


training loop:   4% |#                                         | ETA:   1:28:05

Epoch 10 	 Success: 0.5437


training loop:   7% |##                                        | ETA:   1:27:16

Epoch 15 	 Success: 0.8625


training loop:   9% |###                                       | ETA:   1:27:33

Epoch 20 	 Success: 0.8875


training loop:  12% |#####                                     | ETA:   1:26:05

Epoch 25 	 Success: 0.8938


training loop:  14% |######                                    | ETA:   1:27:05

Epoch 30 	 Success: 0.8938


training loop:  17% |#######                                   | ETA:   1:29:01

Epoch 35 	 Success: 0.9313


training loop:  19% |########                                  | ETA:   1:29:55

In [None]:
# Plot rolling average of success
N = 300  # requested rolling-window length (number of `success` entries averaged per point)

# Clamp the window to the amount of recorded data so a short run still yields a
# curve instead of an empty plot.
window = max(1, min(N, len(success)))
rolling_avg = (
    pd.Series(success, dtype=float)
    .rolling(window=window)
    .mean()
    .iloc[window - 1:]      # drop the leading NaNs of the incomplete windows
    .values
)

# Explicit figure/axes interface instead of mixing it with the pyplot state machine.
fig, ax = plt.subplots()
ax.plot(np.arange(len(rolling_avg)), rolling_avg)
ax.set_title('Rolling mean of success (window = {})'.format(window))
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

In [None]:
# saving trained network
checkpoint_dir = './trained'
# torch.save does not create missing parent directories, so make sure it exists.
os.makedirs(checkpoint_dir, exist_ok=True)
torch.save(agent.qnetwork_local.state_dict(),
           checkpoint_dir + '/checkpoint_' + str(DEFAULT_PARAMS['n_bits']) + 'bits.pth')

### Test trained Agent

In [None]:
# Evaluate with the same bit count the agent was trained (and checkpointed) with,
# so the network, the checkpoint and the environment all agree on the dimensions.
n = DEFAULT_PARAMS['n_bits']

# Re-create the agent exactly as in the training cell so its networks match the
# saved state dict.
agent = Agent(n, n,
              DEFAULT_PARAMS['batch_size'], DEFAULT_PARAMS['buffer_size'], DEFAULT_PARAMS['gamma'],
              DEFAULT_PARAMS['tau'], DEFAULT_PARAMS['lr'])

# load the weights from the checkpoint written by the saving cell
agent.qnetwork_local.load_state_dict(torch.load('./trained/checkpoint_' + str(n) + 'bits.pth'))

# Greedy rollouts (test_bitflipping calls policy.act with eps=0) on an n-bit environment.
test_bitflipping(n=n, n_episodes=3, policy=agent, render=True)