In [1]:
import numpy as np
import pandas as pd
import time
import copy
import random
from tqdm import tqdm
import os
#import keras.backend.tensorflow_backend as backend
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
import tensorflow as tf
from collections import deque

#from PIL import Image
#import cv2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, auc, roc_auc_score, roc_curve

#### Getting reproducible results I think

In [2]:
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
os.environ['PYTHONHASHSEED']=str(SEED)
tf.random.set_seed(SEED)
#tf.set_random_seed(SEED)

In [3]:
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)




#### The data

In [4]:
df = pd.read_csv('data/dataset_10000.csv')
df.head()

Unnamed: 0,length,width,height,label
0,7,7,49,B
1,20,13,75,B
2,29,2,81,A
3,15,6,39,A
4,11,9,71,B


In [5]:
class_dict = {'A':0, 'B':1, 'C':2}
df['label'] = df['label'].replace(class_dict)
df.head()

Unnamed: 0,length,width,height,label
0,7,7,49,1
1,20,13,75,1
2,29,2,81,0
3,15,6,39,0
4,11,9,71,1


In [6]:
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1, stratify=y_trainval, random_state=42)

In [7]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
X_train, y_train = np.array(X_train), np.array(y_train)
X_val, y_val = np.array(X_val), np.array(y_val)
X_test, y_test = np.array(X_test), np.array(y_test)

#### Setting constants

In [8]:
DISCOUNT = 0.99
REPLAY_MEMORY_SIZE = 50_000  # How many last steps to keep for model training
MIN_REPLAY_MEMORY_SIZE = 1_000  # Minimum number of steps in a memory to start training
#MINIBATCH_SIZE = 64  # How many steps (samples) to use for training
MINIBATCH_SIZE  = 5
UPDATE_TARGET_EVERY = 5  # Terminal states (end of episodes)
MODEL_NAME = 'dqn_keras_example'
MIN_REWARD = -200  # For model save
MEMORY_FRACTION = 0.20

# Environment settings
EPISODES = 20_000

# Exploration settings
epsilon = 1  # not a constant, going to be decayed
EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.001

#  Stats settings
AGGREGATE_STATS_EVERY = 50  # episodes
SHOW_PREVIEW = False

#### The Environment

In [9]:
class Env:
    def __init__(self, X, Y, random=True):
        self.X = X
        self.Y = Y
        self.x = np.zeros((1, 3), dtype=np.float32)
        self.y = -1
        self.num_classes = 3
        self.sample_num = len(X)
        self.state = np.zeros((1, 3), dtype=np.float32)
        self.total_reward = 0
        self.trajectory = []
        self.episode_length = 0
        self.random = random
        self.idx = 0
        self.available_actions = np.zeros((1, 6), dtype=np.float32)
        
    def reset(self, i): #I am going to go through the data sequentially
        #print(f'Current epsiode completed. Resetting to index {i}')
        if i < self.sample_num:
            self.trajectory = []
            self.total_reward = 0
            self.episode_length = 0
            self.state = np.zeros((1, 3), dtype=np.float32)
            self.x, self.y = self.X[i], self.Y[i]
            self.available_actions = np.zeros((1, 6), dtype=np.float32)
            #return self.state, self.available_actions
            return self.state
        else:
            pass
        
    def get_next_state(self, action):
        self.available_actions[0, action] =1
        if action < self.num_classes: #the classes
            #next_state = None
            next_state = copy.deepcopy(self.state)
        elif (action >=3) & (action <=5):
            feature_idx = action - 3
            self.x = self.x.reshape(-1, 3)
            x_value = self.x[0, feature_idx]
            next_state = copy.deepcopy(self.state)
            next_state[0, feature_idx] = x_value
        return next_state
    
    def step(self, action):
        ep_length = 1
        reward = 0
        next_state = self.get_next_state(action)
        if action < 3:
            if action == self.y:
                reward += 1
            else:
                reward -= 1
            y_actual = self.y 
            y_pred = action
            done = True
        else:
            reward += 0
            y_actual = np.nan
            y_pred = np.nan
            done=False
            
        self.total_reward+=reward
        self.episode_length+= ep_length
        total_reward_metric = self.total_reward 
        total_length_metric = self.episode_length
        
        info = {'episode_length':total_length_metric, 'total_reward': total_reward_metric, 'y_actual':y_actual, 
                   'y_pred': y_pred}
        #print(f'The metrics: {metrics}')
        if not self.random:
            self.idx +=1
        else:
            self.idx = random.randint(0, self.sample_num-1)
        return next_state, reward, done, info

In [10]:
env = Env(X_train, y_train)
# For stats
ep_rewards = [-200]

# Create models folder
if not os.path.isdir('models'):
    os.makedirs('models')

#### The DQN Agent

In [11]:
class ModifiedTensorBoard(TensorBoard):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.step = 1
        self.writer = tf.summary.create_file_writer(self.log_dir)
        self._log_write_dir = self.log_dir

    def set_model(self, model):
        self.model = model

        self._train_dir = os.path.join(self._log_write_dir, 'train')
        self._train_step = self.model._train_counter

        self._val_dir = os.path.join(self._log_write_dir, 'validation')
        self._val_step = self.model._test_counter

        self._should_write_train_graph = False

    def on_epoch_end(self, epoch, logs=None):
        self.update_stats(**logs)

    def on_batch_end(self, batch, logs=None):
        pass

    def on_train_end(self, _):
        pass

    def update_stats(self, **stats):
        with self.writer.as_default():
            for key, value in stats.items():
                tf.summary.scalar(key, value, step = self.step)
                self.writer.flush()

In [12]:
class DQNAgent:
    def __init__(self):

        # Main model
        self.model = self.create_model2()

        # Target network
        self.target_model = self.create_model2()
        self.target_model.set_weights(self.model.get_weights())

        # An array with last n steps for training
        self.replay_memory = deque(maxlen=50000)

        # Custom tensorboard object
        self.tensorboard = ModifiedTensorBoard(log_dir="logs/{}-{}".format('keras_dqn_all_feats', int(time.time())))

        # Used to count when to update target network with main network's weights
        self.target_update_counter = 0

    def create_model(self):
        model = Sequential()
        model.add(Dense(units = 64, input_dim = 3, activation = 'relu'))
        model.add(Dense(units = 32, activation='relu'))
        model.add(Dense(units = 8, activation = 'relu'))
        model.add(Dense(6, activation = 'linear'))
        #model.compile(loss='mse', optimizer = Adam(learning_rate=0.001))
        model.compile(loss='mse', optimizer=Adam(lr=0.001), metrics=['accuracy'])
        return model
    
    def create_model2(self):
        model = Sequential()
        model.add(Dropout(0.5, input_shape=(1, 3)))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(8, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(6, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=0.001), metrics=['accuracy'])
        return model
    
    # Adds step's data to a memory replay array
    # (observation space, action, reward, new observation space, done)
    def update_replay_memory(self, transition):
        self.replay_memory.append(transition)

    # Trains main network every step during episode
    def train(self, terminal_state, step):

        # Start training only if certain number of samples is already saved
        if len(self.replay_memory) < 1000:
            return

        # Get a minibatch of random samples from memory replay table
        minibatch = random.sample(self.replay_memory, 64)

        # Get current states from minibatch, then query NN model for Q values
        current_states = np.array([transition[0] for transition in minibatch])
        current_qs_list = self.model.predict(current_states)

        # Get future states from minibatch, then query NN model for Q values
        # When using target network, query it, otherwise main network should be queried
        new_current_states = np.array([transition[3] for transition in minibatch])
        future_qs_list = self.target_model.predict(new_current_states)

        X = []
        y = []

        # Now we need to enumerate our batches
        for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):

            # If not a terminal state, get new q from future states, otherwise set it to 0
            # almost like with Q Learning, but we use just part of equation here
            if not done:
                max_future_q = np.max(future_qs_list[index])
                new_q = reward + DISCOUNT * max_future_q
            else:
                new_q = reward

            # Update Q value for given state
            #print(f'current qs list shape: {current_qs_list.shape}')
            current_qs = current_qs_list[index]
            #print(f'current qs type: {type(current_qs)}')
            #print(f'current qs shape: {current_qs.shape}')
            #print(f'current qs: {current_qs}')
            #print(f'action: {action}')
            
            try:
                current_qs[action] = new_q
            except:
                current_qs = current_qs.reshape(6,)
                current_qs[action] = new_q

            # And append to our training data
            X.append(current_state)
            y.append(current_qs)

        # Fit on all samples as one batch, log only on terminal state
        self.model.fit(np.asarray(X).astype('float32'), np.asarray(y).astype('float32'), batch_size=64, verbose=0, shuffle=False, callbacks=[self.tensorboard] if terminal_state else None)

        # Update target network counter every episode
        if terminal_state: #if done==True
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter > 5:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

    # Queries main network for Q values given current observation space (environment state)
    def get_qs(self, state):
        return self.model.predict(np.array(state).reshape(-1, *state.shape))[0]

In [13]:
agent = DQNAgent()

#for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
for episode in range(1, 5000):
    if episode%1000 == 0:
        print(f'Episode: {episode}')
    agent.tensorboard.step = episode

    # Restarting episode - reset episode reward and step number
    episode_reward = 0
    step = 1

    #i = random.randint(0, env.sample_num-1)
    current_state = env.reset(env.idx)
    #print(f'X: {env.x}')
    #print(f'y: {env.y}')

    # Reset flag and start iterating until episode ends
    done = False
    while not done:
        #print(f'Step: {step}')

        if np.random.random() > epsilon:
            #current_state = current_state.reshape((3,))
            q_values = agent.get_qs(current_state)
            #q_values = self.dqn.predict_q(state) #get q values for all actions shape(1, 31)
            #print(f'q values: {q_values}')
            available_q_values = q_values[env.available_actions==0] #q-values for actions not yet selected e.g. shape(20,)
            action_q = np.max(available_q_values)
            #action_q = np.max(q_values)
            action = np.where(q_values[0]==action_q)[0][0] #index of action with max q-value
        else:
            #print('Choosing randomly')
            available_indices = np.where(env.available_actions==0)[1]
            action = random.choice(available_indices)  
            #action_index = random.randrange(self.n_actions) 
            
        new_state, reward, done, info = env.step(action)

        # Transform new continous state to new discrete state and count reward
        episode_reward += reward

        #if SHOW_PREVIEW and not episode % AGGREGATE_STATS_EVERY:
        #    env.render()
        
        #print(f'current state: {current_state}')
        #print(f'action: {action}')
        #print(f'reward: {reward}')
        #print(f'new state: {new_state}')
        #print(f'done: {done}')
        
        # Every step we update replay memory and train main network
        agent.update_replay_memory((current_state, action, reward, new_state, done))
        #print(f'TRAINING AGENT')
        agent.train(done, step)

        current_state = new_state
        step += 1

    # Append episode reward to a list and log stats (every given number of episodes)
    ep_rewards.append(episode_reward)
    if not episode % AGGREGATE_STATS_EVERY or episode == 1:
        average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:])/len(ep_rewards[-AGGREGATE_STATS_EVERY:])
        min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
        max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
        agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)

        # Save model, but only when min reward is greater or equal a set value
        if min_reward >= MIN_REWARD:
            #agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')
            agent.model.save(f'models/{MODEL_NAME}.model')

    # Decay epsilon
    if epsilon > MIN_EPSILON:
        epsilon *= EPSILON_DECAY
        epsilon = max(MIN_EPSILON, epsilon)

  super(Adam, self).__init__(name, **kwargs)


INFO:tensorflow:Assets written to: models/dqn_keras_example.model\assets
INFO:tensorflow:Assets written to: models/dqn_keras_example.model\assets
INFO:tensorflow:Assets written to: models/dqn_keras_example.model\assets
INFO:tensorflow:Assets written to: models/dqn_keras_example.model\assets
INFO:tensorflow:Assets written to: models/dqn_keras_example.model\assets
INFO:tensorflow:Assets written to: models/dqn_keras_example.model\assets
INFO:tensorflow:Assets written to: models/dqn_keras_example.model\assets
INFO:tensorflow:Assets written to: models/dqn_keras_example.model\assets
INFO:tensorflow:Assets written to: models/dqn_keras_example.model\assets
INFO:tensorflow:Assets written to: models/dqn_keras_example.model\assets
INFO:tensorflow:Assets written to: models/dqn_keras_example.model\assets
INFO:tensorflow:Assets written to: models/dqn_keras_example.model\assets




ValueError: setting an array element with a sequence.

#### Performance Evaluation

In [None]:
from sklearn.preprocessing import label_binarize, LabelBinarizer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve, auc

In [None]:
def multiclass(actual_class, pred_class, average = "macro"):

    #creating a set of all the unique classes using the actual class list
    unique_class = set(actual_class)
    roc_auc_dict = {}
    for per_class in unique_class:
        #creating a list of all the classes except the current class 
        other_class = [x for x in unique_class if x != per_class]

        #marking the current class as 1 and all other classes as 0
        new_actual_class = [0 if x in other_class else 1 for x in actual_class]
        new_pred_class = [0 if x in other_class else 1 for x in pred_class]

        #using the sklearn metrics method to calculate the roc_auc_score
        roc_auc = roc_auc_score(new_actual_class, new_pred_class, average = average)
        roc_auc_dict[per_class] = roc_auc
    #print(f'Roc auc dict: {roc_auc_dict}')
    avg = sum(roc_auc_dict.values()) / len(roc_auc_dict)
    
    #return roc_auc_dict
    return avg

In [None]:
def test(ytest, ypred):
    acc = accuracy_score(ytest, ypred)
    f1 = f1_score(ytest, ypred, average ='macro', labels=np.unique(ytest))
    try:
        roc_auc = multiclass(ytest, ypred)
    except:
        roc_auc = None
    return acc, f1, roc_auc

In [None]:
testing_env = Env(X_test, y_test,random=False)
test_df = pd.DataFrame()
testing_env.reset(testing_env.idx)
for episode in tqdm(range(1, len(X_test)+1), ascii=True, unit='episodes'):
    #print(f'episode: {episode}, index: {testing_env.idx}')

    # Update tensorboard step every episode
    agent.tensorboard.step = episode

    # Restarting episode - reset episode reward and step number
    episode_reward = 0
    step = 1

    # Reset environment and get initial state
    current_state = testing_env.reset(testing_env.idx)
    #print(f'current state: {current_state}')
    #print(f'current y: {testing_env.y}')

    # Reset flag and start iterating until episode ends
    done = False
    while not done:

        action = np.argmax(agent.get_qs(current_state))
        #print(f'action using agent: {action}')
        new_state, reward, done, info = testing_env.step(action)
        #print(f'new state: {new_state}')
        #print(f'reward: {reward}')
        #print(f'done: {done}')
        #print(f'info: {info}')
        if done == True:
            test_df = test_df.append(info, ignore_index=True)

        # Transform new continous state to new discrete state and count reward
        episode_reward += reward

        #if SHOW_PREVIEW and not episode % 1000:
        #    testing_env.render()

        # Every step we update replay memory and train main network
        #agent.update_replay_memory((current_state, action, reward, new_state, done))
        #agent.train(done, step)

        current_state = new_state
        step += 1

    # Append episode reward to a list and log stats (every given number of episodes)
#     ep_rewards.append(episode_reward)
#     if not episode % 1000 or episode == 1:
#         average_reward = sum(ep_rewards[-1000:])/len(ep_rewards[-1000:])
#         min_reward = min(ep_rewards[-1000:])
#         max_reward = max(ep_rewards[-1000:])
#         agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)

#         # Save model, but only when min reward is greater or equal a set value
#         #if min_reward >= MIN_REWARD:
#         if min_reward >= -200:
#             #agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')
#             agent.model.save(f'models/keras_dqn_all_feats.model')
#     # Decay epsilon
#     if epsilon > 0.001:
#         epsilon *= 0.99975
#         epsilon = max(0.001, epsilon)