In [1]:
import numpy as np
import gym
from gym_ianna.envs.ianna_env import IANNAEnv
import tensorflow as tf



In [2]:
# Hyper-parameters shared by the cells below.

# Layer sizes
INPUT_DIM = 32   # width of the input vector fed to the network
OUTPUT_DIM = 15  # number of output logits produced by the network
H_SIZE = 50      # neurons per hidden layer

# Optimisation
ETA = 0.01       # learning rate

In [3]:
# Build the feed-forward network: INPUT_DIM -> H_SIZE (ReLU) -> H_SIZE (ReLU) -> OUTPUT_DIM logits.
tf.reset_default_graph()

input_x = tf.placeholder(tf.float32, [None, INPUT_DIM], name="input_x")


def _layer_params(weight_name, bias_name, n_in, n_out):
    """Create one layer's weight matrix and bias row, both Xavier-initialised.

    The weight variable is created before the bias variable so that the
    variable creation order matches a hand-written W/b pair.
    """
    weights = tf.get_variable(
        weight_name,
        shape=[n_in, n_out],
        initializer=tf.contrib.layers.xavier_initializer(),
    )
    bias = tf.get_variable(
        name=bias_name,
        shape=[1, n_out],
        initializer=tf.contrib.layers.xavier_initializer(),
    )
    return weights, bias


# First hidden layer
W1, b1 = _layer_params("W1", "b1", INPUT_DIM, H_SIZE)
layer1 = tf.nn.relu(tf.matmul(input_x, W1) + b1)

# Second hidden layer
W2, b2 = _layer_params("W2", "b2", H_SIZE, H_SIZE)
layer2 = tf.nn.relu(tf.matmul(layer1, W2) + b2)

# Output layer: raw logits, no activation
W3, b3 = _layer_params("W3", "b3", H_SIZE, OUTPUT_DIM)
y_predicted = tf.matmul(layer2, W3) + b3


In [4]:
# Open an interactive session and load previously saved values for every
# variable that exists in the graph at this point.
sess = tf.InteractiveSession()

saver = tf.train.Saver()
saver.restore(sess, "../models/ianna-nn-supervised")


INFO:tensorflow:Restoring parameters from ../models/ianna-nn-supervised


[2017-08-12 23:58:56,734] Restoring parameters from ../models/ianna-nn-supervised


In [5]:
# Policy-gradient part of the graph: placeholders for the sampled actions and
# their advantages, the loss, and the Adam update op.
tvars = tf.trainable_variables()
# One row per step; fed with the action vector the agent actually sampled.
y_selected_by_agent = tf.placeholder(tf.float32,[None,OUTPUT_DIM], name="y_selected_by_agent")
# One scalar per step; note the graph name of this placeholder is "reward_signal".
advantages = tf.placeholder(tf.float32,[None,1], name="reward_signal")

# The loss function. Minimising it makes actions that received a positive
# advantage more likely and actions that received a negative advantage less likely.

# Action probabilities, used for sampling.
probs = tf.nn.softmax(y_predicted)
# Per-step cross entropy between the logits and the selected action, i.e.
# -sum(y_selected_by_agent * log(probs)) along the action axis.
ce1 = tf.nn.softmax_cross_entropy_with_logits(logits=y_predicted, labels=y_selected_by_agent)

# Weight every step's cross entropy by its advantage (element-wise product of
# two [N, 1] columns), then average over the steps.
mul_by_adv = tf.multiply(advantages, tf.reshape(ce1, [-1, 1]))
loss_new = tf.reduce_mean(mul_by_adv)

adam = tf.train.AdamOptimizer(learning_rate=ETA) # Adam optimizer
# compute_gradients returns one (gradient, variable) pair per entry of tvars.
# The pairs are applied as-is by the next line; nothing is accumulated between updates.
newGrads = adam.compute_gradients(loss_new, var_list=tvars)
updateGrads = adam.apply_gradients(newGrads)



In [6]:
GAMMA = 0.9 #Discount factor

def discount_rewards(r, gamma=None):
    """Compute the discounted return for every step of an episode.

    For each step t the result is r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...

    Args:
        r: array of per-step rewards, indexed by time along the first axis.
            A 1D array or an (n, 1) column both work. Integer and boolean
            arrays are accepted; the result is then float64 so the discounted
            values are not truncated.
        gamma: discount factor. When None (the default) the module-level
            GAMMA is read at call time.

    Returns:
        Array with the same shape as `r` holding the discounted returns.
        Floating-point inputs keep their dtype.
    """
    if gamma is None:
        gamma = GAMMA
    r = np.asarray(r)
    # zeros_like(r) alone would inherit an integer dtype and silently truncate
    # every discounted value on assignment.
    out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
    discounted_r = np.zeros_like(r, dtype=out_dtype)
    running_add = 0
    # Walk backwards so each step can reuse the return of the step after it.
    for t in reversed(range(r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r


In [7]:
# Quick sanity check of discount_rewards: the last entry stays undiscounted and
# each earlier entry is its own reward plus GAMMA times the entry after it.
discount_rewards(np.array([1.0, 10.0, 3.0, 6.0]))

array([ 16.804,  17.56 ,   8.4  ,   6.   ])

In [8]:
#        IANNA actions would be:
#        0) action_type:            back[0], filter[1], group[2]
#        1) col_id:                 [0..num_of_columns-1]
#        2) filter_operator:        LT[0], GT[1] if the selected column was numeric (maybe change semantics if column is STR?)
#        3) filter_decile:          [0..9] the filter operand  
#        4) aggregation column_id:  [0..num_of_columns - 1] (what do we do if the selected col is also grouped_by?)
#        5) aggregation type:       MEAN[0], COUNT[1], SUM[2], MIN[3], MAX[4]

def build_ianna_action_from_grouped_by_field(grouped_by_field):
    """Return the six-slot IANNA action list for a "group" on the given field.

    Slot 0 is the action type (2), slot 1 is `grouped_by_field`, and the
    remaining four slots are set to 0.
    """
    group_action_type = 2
    zeroed_slots = [0] * 4
    return [group_action_type, grouped_by_field] + zeroed_slots


In [9]:
# Create the environment from its gym id.
# NOTE(review): presumably the gym_ianna import at the top of the notebook is
# what registers "IANNA-v0" with gym — confirm.
env = gym.make("IANNA-v0")


[2017-08-12 23:58:59,850] Making new env: IANNA-v0


reading input /home/ihadanny/src/IANNA/gym_ianna/envs/../../data/1.tsv
observation space from [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] to [8648 8648   10    2    2    3 8148    0 3692  180   10 8648    1  118  157
  208  206    1    1    1    1    1    1    1    1    1    1    1    1    1
    1    1] expected shape (32,)


In [16]:
# Per-step buffers for the episode in progress: network inputs, rewards and
# the sampled action vectors. They are emptied after every policy update.
xs, drs, ys = [], [], []
reward_sum = 0
episode_number = 1
total_episodes = 5
init = tf.global_variables_initializer()

reward_per_episode = []
running_reward = 0
# Training
with tf.Session() as sess:
    # Initialise every variable first (this covers the optimizer's own
    # variables), then load the checkpoint on top. Running only the
    # initializer in this fresh session would train from random weights.
    # `saver` was created before the optimizer was added to the graph, so the
    # restore only touches the variables that existed at that point.
    sess.run(init)
    saver.restore(sess, "../models/ianna-nn-supervised")

    state = env.reset() # Initial state of the environment
    print(state)

    while episode_number <= total_episodes:
        # Format the state for placeholder
        x = np.reshape(state, [1, INPUT_DIM])

        # Run policy network
        probs_result = sess.run(probs, feed_dict={input_x: x})

        # The softmax output is float32; once cast to float64 its sum can land
        # slightly above 1, which np.random.multinomial rejects. Renormalise.
        action_probs = probs_result[0].astype(np.float64)
        action_probs /= action_probs.sum()
        y = np.random.multinomial(1, action_probs)
        field = np.argmax(y)
        action = build_ianna_action_from_grouped_by_field(field)

        xs.append(x)
        ys.append(y)

        # take action for the state
        state, reward, done, info = env.step(action)
        print(state, reward)
        reward_sum += reward

        drs.append(reward) # store reward after action is taken

        if done:
            # Stack the memory arrays to feed in session
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            # Force float so the in-place normalisation below is always valid.
            epr = np.vstack(drs).astype(np.float64)

            xs, drs, ys = [], [], [] #Reset Arrays

            # Compute the discounted reward and standardise it
            discounted_epr = discount_rewards(epr)
            discounted_epr -= np.mean(discounted_epr)
            reward_std = np.std(discounted_epr)
            # When every return in the episode is identical the std is 0;
            # dividing would feed NaN advantages into the update.
            if reward_std > 0:
                discounted_epr /= reward_std

            sess.run(updateGrads, feed_dict={input_x: epx, y_selected_by_agent: epy, advantages: discounted_epr})

            reward_per_episode.append(reward_sum)
            # Print details of the present model
            running_reward = np.mean(reward_per_episode[-100:])
            print('Episode %d: Average reward for episode %f.  Running average reward %f.' % (episode_number, reward_sum, running_reward))

            # Advance the counter only after reporting, so the log shows the
            # number of the episode that just finished.
            episode_number += 1
            reward_sum = 0
            # Start the next episode from the environment's fresh observation.
            state = env.reset()

# episode_number is one past the last finished episode.
print(episode_number - 1,'Episodes completed.')


reading input /home/ihadanny/src/IANNA/gym_ianna/envs/../../data/1.tsv
[8648 8648   10    2    2    3 8148    0 3692  180   10 8648    1  118  157
  208  206    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
group length
[8648   10   10    2    2    3 8148    0 3692  180   10 8648    1  118  157
  208  206    0    0    0    0    0    0    0    0    1    0    0    0    0
    0    0] 1.0
group tcp_dstport
[8648  164   10    2    2    3 8148    0 3692  180   10 8648    1  118  157
  208  206    0    0    0    0    0    0    0    0    1    0    0    0    1
    0    0] 1.0
group tcp_dstport
[8648  164   10    2    2    3 8148    0 3692  180   10 8648    1  118  157
  208  206    0    0    0    0    0    0    0    0    1    0    0    0    1
    0    0] 0.0
group tcp_dstport
[8648  164   10    2    2    3 8148    0 3692  180   10 8648    1  118  157
  208  206    0    0    0    0    0    0    0    0    1    0    0    0    1
    0    0] 0.0
group tcp_dstport
[8648  1

[8648  158   10    2    2    3 8148    0 3692  180   10 8648    1  118  157
  208  206    0    0    0    0    0    0    0    0    0    0    0    0    1
    0    0] 0.0
group tcp_dstport
[8648  158   10    2    2    3 8148    0 3692  180   10 8648    1  118  157
  208  206    0    0    0    0    0    0    0    0    0    0    0    0    1
    0    0] 0.0
group tcp_dstport
[8648  158   10    2    2    3 8148    0 3692  180   10 8648    1  118  157
  208  206    0    0    0    0    0    0    0    0    0    0    0    0    1
    0    0] 0.0
group tcp_dstport
[8648  158   10    2    2    3 8148    0 3692  180   10 8648    1  118  157
  208  206    0    0    0    0    0    0    0    0    0    0    0    0    1
    0    0] 0.0
group tcp_dstport
[8648  158   10    2    2    3 8148    0 3692  180   10 8648    1  118  157
  208  206    0    0    0    0    0    0    0    0    0    0    0    0    1
    0    0] 0.0
group tcp_dstport
[8648  158   10    2    2    3 8148    0 3692  180   10 8648    1  118