In [None]:
%matplotlib inline

import numpy as np
import tensorflow as tf
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
# depending on the classification model used, we might need to import other packages
# from sklearn import svm
# from sklearn.ensemble import RandomForestClassifier

from datasets import DatasetUCI
from envs import LalEnvTargetAccuracy

from helpers import Minibatch, ReplayBuffer
from dqn import DQN
from Test_AL import policy_rl

# Setup and initialisation

#### Parameters for dataset and model

In [None]:
# Passed to DatasetUCI as `n_state_estimation` and to the DQN agent as `observation_length`.
N_STATE_ESTIMATION = 30
# Passed to DatasetUCI as `size`.
SIZE = 100
# if we want to train and test RL on the same dataset, use even and odd datapoints for training and testing correspondingly
SUBSET = -1 # -1 for using all datapoints, 0 for even, 1 for odd
N_JOBS = 1 # can set more if we want to parallelise
# remove the dataset that will be used for testing
# ['australian', 'breast_cancer', 'diabetis', 'flare_solar', 'german', 'heart', 'mushrooms', 'waveform', 'wdbc']
possible_dataset_names = ['breast_cancer', 'diabetis', 'flare_solar', 
                          'german', 'heart', 'mushrooms', 'waveform', 'wdbc']
test_dataset_names = ['australian']
# The quality is measured according to a given quality measure `quality_method`.
QUALITY_METHOD = metrics.accuracy_score
# The `tolerance_level` is the proportion of max quality that needs to be achieved in order to terminate an episode.
TOLERANCE_LEVEL = 0.98

Initialise a dataset that will contain a sample of datapoints from one of the indicated classes.

In [None]:
# Training dataset, built from the names in possible_dataset_names.
dataset = DatasetUCI(
    possible_dataset_names,
    n_state_estimation=N_STATE_ESTIMATION,
    subset=SUBSET,
    size=SIZE,
)
# Test dataset, used to measure test error along with training.
# It is always created with subset=1 (odd datapoints), independently of SUBSET.
dataset_test = DatasetUCI(
    test_dataset_names,
    n_state_estimation=N_STATE_ESTIMATION,
    subset=1,
    size=SIZE,
)

Initialise a model that would be used for training a classifier. <br>
It can be, for example, Logistic regression: <br>
`LogisticRegression(n_jobs=N_JOBS)` <br>
SVM: <br>
`svm.SVC(probability=True)`

In [None]:
# Classifier instance that is handed to both the training and the test environment.
model = LogisticRegression(n_jobs=N_JOBS)

Initialise the environment

In [None]:
# The training and test environments differ only in the dataset they wrap;
# the classifier instance, quality measure and tolerance are shared.
env_settings = dict(quality_method=QUALITY_METHOD, tolerance_level=TOLERANCE_LEVEL)
env = LalEnvTargetAccuracy(dataset, model, **env_settings)
env_test = LalEnvTargetAccuracy(dataset_test, model, **env_settings)
# Start from an empty default TensorFlow graph.
tf.reset_default_graph()

#### Parameters for training RL

In [None]:
DIRNAME = './agents/1-australian-logreg-8-to-1/' # The resulting agent of this experiment will be written in a file

# Replay buffer parameters.
# NOTE(review): REPLAY_BUFFER_SIZE is a float literal; presumably ReplayBuffer converts it to an int -- confirm.
REPLAY_BUFFER_SIZE = 1e4
PRIOROTIZED_REPLAY_EXPONENT = 3

# Agent parameters.
BATCH_SIZE = 32
LEARNING_RATE = 1e-3
TARGET_COPY_FACTOR = 0.01
BIAS_INITIALIZATION = 0 # default 0 # will be set to minus half of average duration during warm start experiments

# Warm start parameters.
WARM_START_EPISODES = 128 # reduce for test
NN_UPDATES_PER_WARM_START = 100

# Episode simulation parameters.
# Epsilon decreases linearly with the training iteration from EPSILON_START
# and is floored at EPSILON_END, which is reached after EPSILON_STEPS iterations.
EPSILON_START = 1
EPSILON_END = 0.1
EPSILON_STEPS = 1000

# Training parameters
TRAINING_ITERATIONS = 1000 # reduce for test
TRAINING_EPISODES_PER_ITERATION = 10 # at each training iteration x episodes are simulated
NN_UPDATES_PER_ITERATION = 60 # at each training iteration x gradient steps are made

# Validation and test parameters
N_VALIDATION = 500 # reduce for test
N_TEST = 500 # reduce for test
VALIDATION_TEST_FREQUENCY = 100 # every x iterations val and test are performed

Initialise replay buffer

In [None]:
# Buffer that stores transitions from the warm-start and training episodes.
replay_buffer = ReplayBuffer(buffer_size=REPLAY_BUFFER_SIZE, prior_exp=PRIOROTIZED_REPLAY_EXPONENT)

# Warm start

Warm-start the replay buffer with random episodes. 

Collect episodes

In [None]:
def _simulate_random_episode(environment, buffer):
    """Play one episode with uniformly random actions, storing every transition.

    Each step is written to `buffer` with `store_transition`.
    Returns the number of steps made before the environment reported a terminal state.
    """
    # current_state: vector representation of the state of the environment (depends on classifier)
    # candidate_states: vector representations of all actions available at the next step
    current_state, candidate_states = environment.reset()
    n_steps = 0
    finished = False
    while not finished:
        # Pick one of the available actions at random.
        action = np.random.randint(0, environment.n_actions)
        # Vector of the taken action; read before `step` replaces the candidates.
        taken_action_state = candidate_states[:, action]
        following_state, candidate_states, reward, finished = environment.step(action)
        buffer.store_transition(current_state, taken_action_state, reward,
                                following_state, candidate_states, finished)
        # Move on to the next step.
        current_state = following_state
        n_steps += 1
    return n_steps


# Durations of all warm-start episodes, needed for the average below.
episode_durations = []
for _ in range(WARM_START_EPISODES):
    print('.', end='')
    episode_durations.append(_simulate_random_episode(env, replay_buffer))
# Average duration of the episodes generated during the warm start procedure.
av_episode_duration = np.mean(episode_durations)
print('Average episode duration = ', av_episode_duration)

# The agent's bias is initialised to minus half of the average episode duration.
BIAS_INITIALIZATION = -av_episode_duration/2

Initialize the DQN agent

In [None]:
# Build the DQN agent from the parameters defined above.
# bias_average receives BIAS_INITIALIZATION, which the warm-start cell has
# overwritten with minus half of the average random-episode duration.
agent = DQN(experiment_dir=DIRNAME,
            observation_length=N_STATE_ESTIMATION,
            learning_rate=LEARNING_RATE,
            batch_size=BATCH_SIZE,
            target_copy_factor=TARGET_COPY_FACTOR,
            bias_average=BIAS_INITIALIZATION,
           )

Do updates of the network based on warm start episodes

In [None]:
# Train the agent on the transitions collected during the warm start.
for _update in range(NN_UPDATES_PER_WARM_START):
    # One dot per update as a progress indicator.
    print('.', end='')
    # Draw a batch from the replay buffer according to its sampling probabilities.
    minibatch = replay_buffer.sample_minibatch(BATCH_SIZE)
    # A training step yields the temporal difference errors of the batch; they are
    # fed back so the sampling probabilities of these datapoints follow the error.
    replay_buffer.update_td_errors(agent.train(minibatch), minibatch.indeces)

# Train RL

Run multiple training iterations. Each iteration consists of:
- generating episodes following agent's actions with exploration
- validation and test episodes for evaluating performance
- Q-network updates


In [None]:
# Presumably meant to collect the total reward of each training episode -- TODO confirm.
train_episode_rewards = []
# Running count of simulated training episodes; used as the step of the per-episode summaries.
i_episode = 0

In [None]:
def _mean_episode_duration(rl_agent, eval_env, n_episodes):
    """Run `n_episodes` episodes in `eval_env` with `policy_rl` and return their mean duration.

    Actions come from `policy_rl(rl_agent, state, next_action_state)` only, and
    nothing is written to the replay buffer. The duration of an episode is
    `len(eval_env.episode_qualities)` once the environment reports a terminal state.
    """
    durations = []
    for _ in range(n_episodes):
        state, next_action_state = eval_env.reset()
        done = False
        while not done:
            action = policy_rl(rl_agent, state, next_action_state)
            state, next_action_state, _, done = eval_env.step(action)
        durations.append(len(eval_env.episode_qualities))
    return np.mean(durations)


for iteration in range(TRAINING_ITERATIONS):
    # GENERATE NEW EPISODES
    # Compute epsilon value according to the schedule:
    # linear decrease from EPSILON_START, floored at EPSILON_END.
    epsilon = max(EPSILON_END, EPSILON_START-iteration*(EPSILON_START-EPSILON_END)/EPSILON_STEPS)
    print(iteration, end=': ')
    # Simulate training episodes.
    for _ in range(TRAINING_EPISODES_PER_ITERATION):
        # Reset the environment to start a new episode.
        classifier_state, next_action_state = env.reset()
        print(".", end='')
        terminal = False
        # Keep track of stats of episode to analyse it in tensorboard.
        episode_reward = 0
        episode_duration = 0
        episode_summary = tf.Summary()
        # Run an episode.
        while not terminal:
            # Let an agent choose an action.
            action = agent.get_action(classifier_state, next_action_state)
            # Get a prob of a datapoint corresponding to an action chosen by an agent.
            # It is needed just for the tensorboard analysis.
            rlchosen_action_state = next_action_state[0,action]
            # With epsilon probability, take a random action.
            if np.random.ranf() < epsilon:
                action = np.random.randint(0, env.n_actions)
            # taken_action_state is a vector that corresponds to a taken action
            taken_action_state = next_action_state[:,action]
            # Make another step.
            next_classifier_state, next_action_state, reward, terminal = env.step(action)
            # Store a step in replay buffer
            replay_buffer.store_transition(classifier_state,
                                           taken_action_state,
                                           reward,
                                           next_classifier_state,
                                           next_action_state,
                                           terminal)
            # Change a state of environment.
            classifier_state = next_classifier_state
            # Keep track of stats and add summaries to tensorboard.
            episode_reward += reward
            episode_duration += 1
            episode_summary.value.add(simple_value=rlchosen_action_state,
                                      tag="episode/rlchosen_action_state")
            episode_summary.value.add(simple_value=taken_action_state[0],
                                      tag="episode/taken_action_state")
        # Add summaries to tensorboard
        episode_summary.value.add(simple_value=episode_reward,
                                  tag="episode/episode_reward")
        episode_summary.value.add(simple_value=episode_duration,
                                  tag="episode/episode_duration")
        # Record the episode reward so that train_episode_rewards is actually populated.
        train_episode_rewards.append(episode_reward)
        i_episode += 1
        agent.summary_writer.add_summary(episode_summary, i_episode)
        agent.summary_writer.flush()

    # VALIDATION AND TEST EPISODES
    # Per-iteration summary; it is written with `iteration` as its step,
    # unlike the per-episode summaries above, which use `i_episode`.
    iteration_summary = tf.Summary()
    if iteration % VALIDATION_TEST_FREQUENCY == 0:
        # Validation episodes are run. Use env for it.
        iteration_summary.value.add(simple_value=_mean_episode_duration(agent, env, N_VALIDATION),
                                    tag="episode/train_duration")
        # Test episodes are run. Use env_test for it.
        iteration_summary.value.add(simple_value=_mean_episode_duration(agent, env_test, N_TEST),
                                    tag="episode/test_duration")

    iteration_summary.value.add(simple_value=epsilon,
                                tag="episode/epsilon")
    agent.summary_writer.add_summary(iteration_summary, iteration)
    agent.summary_writer.flush()

    # NEURAL NETWORK UPDATES
    for _ in range(NN_UPDATES_PER_ITERATION):
        # Sample a batch, train on it, and feed the TD errors back to the buffer.
        minibatch = replay_buffer.sample_minibatch(BATCH_SIZE)
        td_error = agent.train(minibatch)
        replay_buffer.update_td_errors(td_error, minibatch.indeces)

#### To see the results in tensorboard

on the server:
tensorboard --logdir=./

on the computer:
ssh -N -f -L localhost:6006:localhost:6006 konyushk@iccvlabsrv20.iccluster.epfl.ch && open http://localhost:6006