# Dependencies

In [None]:
import numpy as np
import random
import gym
import gym_cps
import matplotlib.pyplot as plt
import pickle

# Learning (Attacker & Defender)

In [None]:
## Create the environment
env = gym.make('cps-v0')

## Initialize Q-tables (all zeros), one row per entry of env.state.
## Attacker columns: 4 slots x number of attacker values of the first state.
## Defender columns: 16 slots x number of defender values of the first state.
a_q_shape = (len(env.state), 4 * len(env.state[0].attacker_value))
d_q_shape = (len(env.state), 16 * len(env.state[0].defender_value))

a_q_table = np.zeros(a_q_shape)
d_q_table = np.zeros(d_q_shape)

In [None]:
## Hyperparameters
gamma = 0.95              ## Discount rate applied to the best next-state Q-value
learning_rate = 0.6       ## Step size of each Q-table update; 0: learn nothing, 1: a lot
max_steps = 99            ## Upper bound on the steps played in one episode
total_episodes = 100000   ## Number of training episodes

# Exploration parameters (epsilon-greedy)
min_epsilon = 0.01        # floor the exploration rate decays towards
max_epsilon = 1.0         # exploration rate at the first episode
epsilon = max_epsilon     # current exploration rate; starts fully exploratory
decay_rate = 0.005        # exponential decay constant applied once per episode

In [None]:
# Containers for the recorded win factors and paths; each pair is two
# independent empty lists (human / machine).
w_factor_human, w_factor_machine = [], []
data_path_human, data_path_machine = [], []

In [None]:
# Train the attacker and defender Q-tables with epsilon-greedy play.
#
# Each episode resets the environment, then alternates an attacker move and a
# defender move for at most `max_steps` steps. Results are collected in:
#   paths            -- one [attacker_path, defender_path] pair per episode
#   w_factor_machine -- the win factor of every episode that ended with `done`

# All_paths
paths = []

## TODO: CHANGE TO w_factor_human WHEN NEEDED
# Re-running this cell restarts from a fresh list seeded with a single 1.
# NOTE(review): any later count of 1s includes this seed entry -- confirm intended.
w_factor_machine = [1]

# Q-table column layout used below: column = slot * width + value index.
# NOTE(review): 5 and 6 are presumably len(attacker_value) and
# len(defender_value) of the environment nodes -- confirm against the cell
# that allocates the tables.
attack_width = 5
defense_width = 6

# Run for life or for a specific episode amount
for episode in range(total_episodes):
    step = 0

    if episode % 10000 == 0:
        print("Episode: " + str(episode))

    ## Get Environment Parameters
    env.observation, env.d_observation = env.reset()
    done = env.done
    win = env.win

    ## TODO: REMOVE COMMENT WHEN WE NEED TO LEARN ON SPECIFIC SUBNET
#     observation = env.state[10]
#     env.observation = observation
#     env.d_observation = observation

    win_factor = env.win_factor

    ## Start both paths at the initial observations
    a_path = [env.observation.name]
    d_path = [env.d_observation.name]

    while step < max_steps:
        step += 1

        #####################
        ##### Attacker action
        #####################

        cur_observation = env.observation

        ## Exploration/Exploitation Trade-Off
        exp_exp_tradeoff = random.uniform(0, 1)

        ## If this number is greater than epsilon: EXPLOITATION (take biggest q-value of this state.)
        if exp_exp_tradeoff > epsilon:
            # The moves list is sorted in place so that slot i of the Q-table
            # row always refers to the i-th smallest reachable node index.
            sorted_moves = cur_observation.moves
            sorted_moves.sort()

            # Only columns that belong to an existing move are candidates.
            # Columns of missing moves are never updated and stay at 0, so
            # they could otherwise win the argmax once the real Q-values
            # turn negative and make sorted_moves[index] fail.
            valid_columns = len(sorted_moves) * attack_width
            best_column = int(np.argmax(a_q_table[cur_observation.index, :valid_columns]))

            # Split the flat column into (move slot, attack value index).
            index, attack_val_index = divmod(best_column, attack_width)

            action = [cur_observation, env.state[sorted_moves[index]], attack_val_index]

        ## Else if this value is less than or equal to epsilon: EXPLORATION (take a random attacker choice of this state.)
        else:
            action = env.action_sample_jk(cur_observation)

        ## Use action a and observe reward, next state.
        # NOTE: reward_b returned here is overwritten by the defender step below.
        new_observation, reward_a, reward_b, done, win_factor, win_step, win = env.attack_step_1_jk(action)

        ## Update Q-table
        ## Bellman equation
        ## Q(S,A) = Q(S,A) + lr[R(S,A) + gamma*max(Q(S',:)) - Q(S,A)]

        # Get & sort observation moves (a sampled action skipped the sort above).
        obs_moves = cur_observation.moves
        obs_moves.sort()

        # Column of the action taken: slot of the target node, then value index.
        section = obs_moves.index(action[1].index)
        indx_val = (attack_width * section) + action[2]

        # update
        a_row = cur_observation.index
        best_next = np.max(a_q_table[new_observation.index, :])
        a_q_table[a_row, indx_val] = a_q_table[a_row, indx_val] + learning_rate * (
            reward_a + gamma * best_next - a_q_table[a_row, indx_val]
        )

        # The defender receives the attacker action extended with win_step.
        action.append(win_step)

        #####################
        ##### Defender action
        #####################

        d_cur_observation = env.d_observation

        ## Exploration/Exploitation Trade-Off
        exp_exp_tradeoff = random.uniform(0, 1)

        # NOTE(review): the defender table is read and written at the
        # attacker's pre-move observation (cur_observation), not at
        # d_cur_observation -- confirm this is intended.
        d_row = cur_observation.index

        ## If this number is greater than epsilon: EXPLOITATION (take biggest q-value of this state.)
        if exp_exp_tradeoff > epsilon:
            best_column = int(np.argmax(d_q_table[d_row, :]))

            # Split the flat column into (node index, defense value index).
            index, defense_val_index = divmod(best_column, defense_width)

            d_action = [action, d_cur_observation, env.state[index], defense_val_index]

        ## Else if this value is less than or equal to epsilon: EXPLORATION (take a random defender choice of this state.)
        else:
            d_action = env.defender_action_sample_jk_jk2(action, d_cur_observation)

        ## Use action a and observe reward, next state.
        # d_action = [action, d_observation, next_node, next_node_defense_index]
        new_d_observation, reward_b = env.defense_step_jk_jk2(d_action)

        # Column of the defense taken: defended node index, then value index.
        section = d_action[2].index
        indx_val = (defense_width * section) + d_action[3]

        # update
        # NOTE(review): unlike the attacker update this only accumulates
        # learning_rate * reward_b (no discounted next-state term, no
        # subtraction of the old value) -- confirm this is intended.
        d_q_table[d_row, indx_val] = d_q_table[d_row, indx_val] + learning_rate * (reward_b)

        ## Update Observation
        # Both names are re-read from env at the top of the next step.
        cur_observation = new_observation
        d_cur_observation = new_d_observation

        a_path.append(env.observation.name + " " + str(action[2]))
        d_path.append(env.d_observation.name + " " + str(d_action[3]))

        ## Check if game is Done
        if done:
            ## TODO
            # Only finished games are recorded; an episode that runs out of
            # steps adds nothing to w_factor_machine.
            w_factor_machine.append(win_factor)
            break

    # Record the episode once. Appending inside the step loop stored the same
    # two list objects again on every step.
    paths.append([a_path, d_path])

    # Reduce epsilon; because we need less exploration
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

print("Done")

In [None]:
# Tally the recorded win factors: -1 is an attacker result, 1 a defender
# result. Printed in that order, one count per line.
for outcome in (-1, 1):
    print(w_factor_machine.count(outcome))