In [164]:
import pandas as pd
import numpy as np
import random
import copy

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize, LabelBinarizer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve, auc

#### The data

In [165]:
# Read the dataset from disk and preview its first rows.
dataset_path = 'data/balanced_q_dataset.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,length,width,height,label
0,0,0,0,0
1,1,1,1,1
2,1,0,0,0
3,0,1,0,2
4,1,1,1,1


In [166]:
# Every column except the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Single stratified 70/30 train/test split (no separate validation set),
# with each part converted to a plain NumPy array.
X_train, X_test, y_train, y_test = (
    np.array(part)
    for part in train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((525, 3), (225, 3), (525,), (225,))

In [167]:
class Env:
    """Episodic environment that classifies one sample by querying its features.

    Each episode is bound to a single sample ``(X[i], Y[i])`` chosen through
    ``reset(i)``.  The action space has six actions:

    * ``0, 1, 2`` -- predict that class and terminate the episode
      (reward ``+1`` if it equals the sample's label, ``-1`` otherwise);
    * ``3, 4, 5`` -- reveal feature ``action - 3`` of the sample by writing its
      value into the state (reward ``0`` the first time, ``-1`` when the same
      query is repeated within the episode).

    The state is a ``(1, 3)`` int32 array that starts as all zeros and
    accumulates the revealed feature values.  Note that a revealed feature
    whose value is 0 is therefore indistinguishable from an unrevealed one.
    """

    def __init__(self, X, Y):
        """Store the dataset and initialise per-episode bookkeeping.

        Args:
            X: indexable collection of samples; ``X[i]`` must hold 3 feature values.
            Y: indexable collection of labels aligned with ``X``.
        """
        self.X = X
        self.Y = Y
        # Placeholders for the current sample until reset() selects a real one.
        self.x = np.zeros((1, 3), dtype=np.int32)
        self.y = -1
        self.num_classes = 3
        self.sample_num = len(X)
        self.state = np.zeros((1, 3), dtype=np.int32)
        self.total_reward = 0
        self.trajectory = []
        self.episode_length = 0
        self.action_space = np.array([0, 1, 2, 3, 4, 5])

    def reset(self, i):
        """Start a new episode on sample ``i``.

        Args:
            i: index of the sample to load.

        Returns:
            The initial all-zero state, or ``None`` (leaving the environment
            untouched) when ``i`` is not smaller than the number of samples.
        """
        if i >= self.sample_num:
            return None
        self.trajectory = []
        self.total_reward = 0
        self.episode_length = 0
        self.state = np.zeros((1, 3), dtype=np.int32)
        self.x, self.y = self.X[i], self.Y[i]
        return self.state

    def get_next_state(self, action):
        """Compute the state that results from ``action`` without storing it.

        Args:
            action: integer action id.

        Returns:
            ``None`` for a class-prediction action (terminal) or an unknown
            action; otherwise a copy of the current state with the queried
            feature's value filled in.
        """
        if action < self.num_classes:
            # Class predictions end the episode, so there is no next state.
            return None
        if 3 <= action <= 5:
            feature_idx = action - 3
            # Normalise the current sample to shape (1, 3) so it can be
            # indexed the same way as the state.
            self.x = self.x.reshape(-1, 3)
            next_state = copy.deepcopy(self.state)
            next_state[0, feature_idx] = self.x[0, feature_idx]
            return next_state
        print('unknown action')
        return None

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done, info)``.

        ``info`` contains the running ``episode_length`` and ``total_reward``
        plus ``y_actual`` / ``y_pred``, which are NaN unless the action was a
        class prediction.
        """
        # The repeat check must happen BEFORE the action is recorded;
        # otherwise every action would be found in the trajectory and every
        # feature query would be penalised as a repeat.
        repeated = action in self.trajectory
        self.trajectory.append(action)
        next_state = self.get_next_state(action)

        y_actual, y_pred = np.nan, np.nan
        if action < self.num_classes:
            reward = 1 if action == self.y else -1
            y_actual, y_pred = self.y, action
            done = True
        else:
            done = False
            # Repeated queries and unknown actions (no next state) are
            # penalised; a first-time feature query is free.
            reward = -1 if (repeated or next_state is None) else 0
            if next_state is not None:
                # Persist the revealed feature so later queries build on it
                # instead of restarting from the all-zero state.
                self.state = next_state

        self.total_reward += reward
        self.episode_length += 1

        info = {'episode_length': self.episode_length, 'total_reward': self.total_reward,
                'y_actual': y_actual, 'y_pred': y_pred}
        return next_state, reward, done, info

In [168]:
class TestingEnv(Env):
    """Evaluation variant of ``Env`` that also reports the action pathway.

    Differences from ``Env`` visible here: the first sample is loaded at
    construction time, ``idx`` counts finished episodes, and ``step`` returns
    ``(next_state, info)`` instead of the four-tuple returned by ``Env.step``.
    """

    def __init__(self, X, y):
        """Initialise the base environment and load sample 0 as the current one."""
        super().__init__(X, y)
        self.x, self.y = self.X[0], self.Y[0]
        # Number of episodes completed so far.  step() only increments it; it
        # does not load the next sample -- presumably the caller passes it to
        # reset(); verify against the evaluation loop.
        self.idx = 0
        self.episode_length = len(self.trajectory)

    def step(self, action, name=None):
        """Apply ``action`` and return ``(next_state, info)``.

        Args:
            action: integer action id (0-2 predict a class, 3-5 query a feature).
            name: unused; kept (now optional) for compatibility with existing callers.

        Returns:
            ``next_state`` as produced by ``get_next_state`` and an ``info``
            dict with ``episode_length``, ``total_reward``, ``y_actual``,
            ``y_pred`` (NaN unless a class was predicted), ``pathway`` and ``done``.
        """
        # Check for a repeat BEFORE recording the action; testing after the
        # append would flag every feature query as a repeat of itself.
        repeated = action in self.trajectory
        self.trajectory.append(action)
        next_state = self.get_next_state(action)

        y_actual, y_pred = np.nan, np.nan
        if action < 3:
            reward = 1 if action == self.y else -1
            y_actual, y_pred = self.y, action
            done = True
            self.idx += 1
        else:
            done = False
            # Repeated queries and unknown actions (no next state) are
            # penalised; a first-time feature query is free.
            reward = -1 if (repeated or next_state is None) else 0
            if next_state is not None:
                # Keep the revealed feature so the state accumulates across
                # queries instead of restarting from zeros each step.
                self.state = next_state

        self.total_reward += reward
        # Snapshot the trajectory so the reported pathway matches the reported
        # episode_length even if the environment keeps stepping afterwards.
        pathway = list(self.trajectory)
        info = {'episode_length': len(pathway), 'total_reward': self.total_reward,
                'y_actual': y_actual, 'y_pred': y_pred,
                'pathway': pathway, 'done': done}
        return next_state, info

#### Q-learning training

In [169]:
from IPython.display import clear_output

In [170]:
# Q-table with one row per encoded state (8 rows) and one column per action
# (6 columns); every entry starts at zero.
q_table = np.zeros((8, 6), dtype=float)
q_table

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [171]:
# Encode each 3-bit feature vector as a single integer state id.
# Keys are the ids as strings; values are the binary expansion of the id,
# most significant bit first (e.g. '5' -> [1, 0, 1]).
states_dict = {str(code): [int(bit) for bit in format(code, '03b')] for code in range(8)}


def get_state(arr):
    """Return the integer state id for a ``(1, 3)``-shaped state.

    Args:
        arr: nested sequence or array whose first row holds three 0/1 values.

    Returns:
        The id (0-7) whose entry in ``states_dict`` equals that first row.

    Raises:
        ValueError: if the first row matches no entry of ``states_dict``.
    """
    observed = list(arr[0])
    for key, bits in states_dict.items():
        if bits == observed:
            return int(key)
    raise ValueError(f'{observed} is not in list')

In [172]:
# Sanity check: the all-zero feature vector should map to state id 0.
get_state([[0,0,0]])

0

In [173]:
#%%time

# Tabular Q-learning over the training samples.
# Each epoch runs one episode on a randomly drawn training sample: actions are
# chosen epsilon-greedily and the Q-table is updated after every step.
# NOTE: q_table is updated in place and is not re-created here; re-run the cell
# that defines it to train from scratch.
alpha = 0.1      # learning rate
gamma = 0.7      # discount factor
epsilon = 0.1    # probability of taking a random (exploratory) action
epochs = 100000  # number of training episodes
env = Env(X_train, y_train)

# Seed the RNG used for sample selection and exploration so runs are repeatable.
random.seed(42)

for epoch in range(1, epochs + 1):
    if epoch % 10000 == 0:
        # Replace the previous progress line instead of accumulating output.
        clear_output(wait=True)
        print(f'Epoch {epoch}')

    idx = random.randint(0, env.sample_num - 1)
    state = get_state(env.reset(idx))  # start a fresh episode on sample idx
    done = False

    while not done:
        if random.uniform(0, 1) < epsilon:
            action = random.choice(env.action_space)  # explore
        else:
            action = np.argmax(q_table[state])        # exploit learned values

        next_obs, reward, done, info = env.step(action)

        if next_obs is None:
            # Terminal transition: there is no successor state, so the
            # bootstrap term is 0.  Indexing q_table with None would act as
            # np.newaxis and silently yield the maximum of the WHOLE table.
            next_state = None
            next_max = 0.0
        else:
            next_state = get_state(next_obs)
            next_max = np.max(q_table[next_state])

        old_value = q_table[state, action]
        q_table[state, action] = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

        state = next_state

Epoch 100000


In [174]:
# Inspect a single training sample (row 500 of X_train).
X_train[500]

array([0, 1, 0], dtype=int64)