In [1]:
import pandas as pd
import numpy as np
import random
import os
import tensorflow
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tensorflow.set_random_seed(SEED)
os.environ['PYTHONHASHSEED']=str(SEED)

#### The data

In [3]:
df = pd.read_csv('data/dataset_10000.csv')
class_dict = {'A':0, 'B':1, 'C':2}
df['label'] = df['label'].replace(class_dict)
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train, y_train = np.array(X_train), np.array(y_train)
X_test, y_test = np.array(X_test), np.array(y_test)

In [4]:
np.unique(y_test)

array([0, 1, 2], dtype=int64)

#### The Environment

In [5]:
import copy
from gym import Env
from gym.spaces import Discrete, Box

In [6]:
class SyntheticEnv(Env):
    def __init__(self, X, Y, random=True):
        super(SyntheticEnv, self).__init__()
        self.action_space = Discrete(6)
        self.observation_space = Box(0, 1.5, (3,))
        self.actions = ['A', 'B', 'C', 'length', 'width', 'height']
        self.max_steps = 7
        self.X = X
        self.Y = Y
        self.sample_num = len(X)
        self.idx = -1
        self.x = np.zeros((3,), dtype=np.float32)
        self.y = np.nan
        self.state = np.zeros((3,), dtype=np.float32)
        self.num_classes = 3
        self.episode_length = 0
        self.trajectory = []
        self.total_reward = 0
        self.random = random
        
    
    def step(self, action):
        #print('A step in the environment')
        #print(f'action: {action}')
        self.episode_length += 1
        reward = 0
        if self.episode_length == self.max_steps: # episode too long
            #print('Reached max steps')
            reward -=1
            self.total_reward -=1
            terminated = True
            done = True
            y_actual = self.y
            y_pred = np.nan
        elif action < self.num_classes: #diagnosis (terminal action)
            #print('Terminal action')
            if action == self.y:
                reward +=1
                self.total_reward += 1
            else:
                reward -= 1
                self.total_reward -= 1
            terminated = False
            done = True
            y_actual = self.y
            y_pred = action
        elif self.actions[action] in self.trajectory: #action already picked 
            #print('Repeated action')
            terminated = False
            reward -= 1
            self.total_reward -= 1
            done = False
            y_actual = np.nan
            y_pred = np.nan
        else: #new feature being acquired
            #print('Acquiring new feature')
            terminated = False
            reward += 1
            self.total_reward += 1
            done = False
            self.state = self.get_next_state(action-self.num_classes)
            y_actual = np.nan
            y_pred = np.nan
        self.trajectory.append(self.actions[action])
        info = {'index': self.idx, 'episode_length':self.episode_length, 'reward': self.total_reward, 'y_pred': y_pred, 
                'y_actual': y_actual, 'trajectory':self.trajectory, 'terminated':terminated}
        #self.render()
        return self.state, reward, done, info
            
    
    def render(self):
        print(f'STEP {self.episode_length} for index {self.idx}')
        print(f'x: {self.x}')
        print(f'y: {self.y}')
        print(f'Current state: {self.state}')
        print(f'Total reward: {self.total_reward}')
        print(f'Trajectory: {self.trajectory}')
        
            
    
    def reset(self):
        #print('RESETTING THE ENVIRONMENT')
        if self.random:
            self.idx = random.randint(0, self.sample_num-1)
        else:
            self.idx += 1
            if self.idx == len(self.X):
                raise StopIteration()
        #print(f'New idx: {self.idx}')
        self.x, self.y = self.X[self.idx], self.Y[self.idx]
        #print(f'New x: {self.x}')
        #print(f'New y: {self.y}')
        self.state = np.zeros((3,), dtype=np.float32)
        #print(f'New state: {self.state}')
        self.trajectory = []
        #print(f'New trajectory: {self.trajectory}')
        self.episode_length = 0
        #print(f'New episode length: {self.episode_length}')
        self.total_reward = 0
        #print(f'New total reward: {self.total_reward}')
        return self.state
        
    
    def get_next_state(self, feature_idx):
        self.x = self.x.reshape(-1, 3)
        x_value = self.x[0, feature_idx]
        next_state = copy.deepcopy(self.state)
        next_state[feature_idx] = x_value
        return next_state

In [7]:
# from stable_baselines.common.env_checker import check_env
# from stable_baselines.common.policies import MlpPolicy
# from stable_baselines.common.vec_env import DummyVecEnv
# from stable_baselines import PPO2
import time
import tensorflow as tf
from gym.wrappers.time_limit import TimeLimit

from baselines.ppo2 import ppo2
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

from baselines import bench
from baselines import logger
from baselines import deepq
from baselines.common.tf_util import make_session

#### DQN

In [8]:
def deepq_dqn():
    logger.configure(dir='./logs/synthetic_deepq_dqn', format_strs=['stdout', 'tensorboard'])
    env = SyntheticEnv(X_train, y_train)
    env = bench.Monitor(env, logger.get_dir())

    model = deepq.learn(
        env,
        'mlp',
        num_layers=1,
        num_hidden=64,
        activation=tf.nn.relu,
        hiddens=[32],
        dueling=False,
        lr=1e-4,
        total_timesteps=int(1.2e5),
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
    )

    model.save('models/synthetic_deepq_dqn.pkl')
    env.close()

    return model

start_time = time.time()
dqn_model = deepq_dqn()
print("DQN Training Time:", time.time() - start_time)

Logging to ./logs/synthetic_deepq_dqn
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use tf.cast instead.
--------------------------------------
| % time spent exploring  | 98       |
| episodes                | 100      |
| mean 100 episode reward | 0.3      |
| steps                   | 207      |
--------------------------------------


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


--------------------------------------
| % time spent exploring  | 96       |
| episodes                | 200      |
| mean 100 episode reward | 0.2      |
| steps                   | 426      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 94       |
| episodes                | 300      |
| mean 100 episode reward | 0.2      |
| steps                   | 623      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 93       |
| episodes                | 400      |
| mean 100 episode reward | 0        |
| steps                   | 788      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 91       |
| episodes                | 500      |
| mean 100 episode reward | 0.2      |
| steps                   | 999      |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 38       |
| episodes                | 3.8e+03  |
| mean 100 episode reward | -0.5     |
| steps                   | 7.5e+03  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 36       |
| episodes                | 3.9e+03  |
| mean 100 episode reward | -0.4     |
| steps                   | 7.67e+03 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 35       |
| episodes                | 4e+03    |
| mean 100 episode reward | -0.4     |
| steps                   | 7.81e+03 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 34       |
| episodes                | 4.1e+03  |
| mean 100 episode reward | -0.4     |
| steps                   | 7.98e+03 |
--------------------------------------
--------------------------------------
| % time spent exploring 

Saving model due to mean reward increase: None -> 1.0
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 7.4e+03  |
| mean 100 episode reward | 1        |
| steps                   | 2e+04    |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 7.5e+03  |
| mean 100 episode reward | 0.9      |
| steps                   | 2.05e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 7.6e+03  |
| mean 100 episode reward | 1.6      |
| steps                   | 2.1e+04  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 7.7e+03  |
| mean 100 episode reward | 1.3      |
| steps                   | 2.15e+04 |
--------------------------------------
----------

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.09e+04 |
| mean 100 episode reward | 2.2      |
| steps                   | 3.65e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.1e+04  |
| mean 100 episode reward | 2.1      |
| steps                   | 3.69e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.11e+04 |
| mean 100 episode reward | 2.5      |
| steps                   | 3.74e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.12e+04 |
| mean 100 episode reward | 1.8      |
| steps                   | 3.79e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.44e+04 |
| mean 100 episode reward | 2.2      |
| steps                   | 5.22e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.45e+04 |
| mean 100 episode reward | 2.9      |
| steps                   | 5.26e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.46e+04 |
| mean 100 episode reward | 2.5      |
| steps                   | 5.3e+04  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.47e+04 |
| mean 100 episode reward | 2.5      |
| steps                   | 5.35e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.79e+04 |
| mean 100 episode reward | 3        |
| steps                   | 6.74e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.8e+04  |
| mean 100 episode reward | 2.7      |
| steps                   | 6.78e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.81e+04 |
| mean 100 episode reward | 2.9      |
| steps                   | 6.83e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.82e+04 |
| mean 100 episode reward | 3.2      |
| steps                   | 6.87e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.14e+04 |
| mean 100 episode reward | 3.1      |
| steps                   | 8.26e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.15e+04 |
| mean 100 episode reward | 2.6      |
| steps                   | 8.3e+04  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.16e+04 |
| mean 100 episode reward | 3.2      |
| steps                   | 8.35e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.17e+04 |
| mean 100 episode reward | 3.3      |
| steps                   | 8.39e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.5e+04  |
| mean 100 episode reward | 2.8      |
| steps                   | 9.81e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.51e+04 |
| mean 100 episode reward | 2.8      |
| steps                   | 9.85e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.52e+04 |
| mean 100 episode reward | 2.8      |
| steps                   | 9.89e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.53e+04 |
| mean 100 episode reward | 2.7      |
| steps                   | 9.94e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.86e+04 |
| mean 100 episode reward | 3.1      |
| steps                   | 1.14e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.87e+04 |
| mean 100 episode reward | 3.2      |
| steps                   | 1.14e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.88e+04 |
| mean 100 episode reward | 3.1      |
| steps                   | 1.14e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.89e+04 |
| mean 100 episode reward | 3        |
| steps                   | 1.15e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring 

#### Performance Evaluation

In [9]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, auc, roc_curve

In [10]:
def multiclass(actual_class, pred_class, average = 'macro'):

    unique_class = set(actual_class)
    roc_auc_dict = {}
    for per_class in unique_class:
        other_class = [x for x in unique_class if x != per_class]
        new_actual_class = [0 if x in other_class else 1 for x in actual_class]
        new_pred_class = [0 if x in other_class else 1 for x in pred_class]
        roc_auc = roc_auc_score(new_actual_class, new_pred_class, average = average)
        roc_auc_dict[per_class] = roc_auc
    avg = sum(roc_auc_dict.values()) / len(roc_auc_dict)
    return avg

In [11]:
def test(ytest, ypred):
    acc = accuracy_score(ytest, ypred)
    f1 = f1_score(ytest, ypred, average ='macro', labels=np.unique(ytest))
    try:
        roc_auc = multiclass(ytest, ypred)
    except:
        roc_auc = None
    return acc, f1, roc_auc

In [12]:
def get_avg_length_reward(df):
    length = np.mean(df.episode_length)
    reward = np.mean(df.reward)
    return length, reward

In [13]:
def synthetic_dqn_eval(dqn_model):
    attempts, correct = 0,0
    test_df = pd.DataFrame()

    env = SyntheticEnv(X_test, y_test, random=False)

    try:
        while True:
            obs, done = env.reset(), False
            while not done:
                obs, rew, done,info = env.step(dqn_model(obs[None])[0])
                #if (done==True) & (np.isfinite(info['y_pred'])):
                if done == True:
                    test_df = test_df.append(info, ignore_index=True)
                #print('....................TEST DF ....................')
                #if len(test_df) != 0:
                #    print(test_df.head())

    except StopIteration:
        print('Testing done.....')
    return test_df

test_df = synthetic_dqn_eval(dqn_model)

Testing done.....


In [14]:
len(X_test), len(test_df)

(3000, 3000)

In [15]:
y_pred_df = test_df[test_df['y_pred'].notna()]
success_df = y_pred_df[y_pred_df['y_pred']== y_pred_df['y_actual']]
len(success_df)

2195

In [16]:
success_rate = len(success_df)/len(test_df)*100
success_rate

73.16666666666667

In [17]:
#eavg length and return 
avg_length, avg_return = get_avg_length_reward(test_df)
avg_length, avg_return

(4.3806666666666665, 2.8213333333333335)

In [18]:
acc, f1, roc_auc = test(y_pred_df['y_actual'], y_pred_df['y_pred'])
acc, f1, roc_auc

(0.8468364197530864, 0.8438350607706327, 0.8745244159312989)

In [19]:
#### Dueling DQN

#### Analysis for episodes that exceed maximum length

In [20]:
max_length_df = test_df[test_df['y_pred'].isna()]
max_length_df.head()

Unnamed: 0,episode_length,index,reward,terminated,trajectory,y_actual,y_pred
1,7.0,1.0,-1.0,1.0,"[height, length, width, width, width, width, w...",1.0,
12,7.0,12.0,-1.0,1.0,"[height, length, width, width, width, width, w...",0.0,
18,7.0,18.0,-1.0,1.0,"[height, length, width, width, width, width, w...",1.0,
19,7.0,19.0,-1.0,1.0,"[height, length, width, width, width, width, w...",1.0,
20,7.0,20.0,-1.0,1.0,"[height, length, width, width, width, width, w...",0.0,


In [21]:
len(max_length_df)

408

In [22]:
max_length_df.tail()

Unnamed: 0,episode_length,index,reward,terminated,trajectory,y_actual,y_pred
2948,7.0,2948.0,-1.0,1.0,"[height, length, width, width, width, width, w...",0.0,
2961,7.0,2961.0,-3.0,1.0,"[height, length, length, length, length, lengt...",1.0,
2968,7.0,2968.0,-3.0,1.0,"[height, length, length, length, length, lengt...",2.0,
2978,7.0,2978.0,-3.0,1.0,"[height, length, length, length, length, lengt...",2.0,
2986,7.0,2986.0,-3.0,1.0,"[height, length, length, length, length, lengt...",1.0,


In [23]:
# Look at misdiagnosed episodes

# Look at episodes that exceed max length