# Module loading and functions defined

In [1]:
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
import hiive.mdptoolbox.example as mdpex
import pandas as pd
import seaborn as sns
import numpy as np
import sys
import gym
import os
from numpy.random import choice

np.random.seed(56)

In [2]:
def running_mean(x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0)) 
    return (cumsum[N:] - cumsum[:-N]) / float(N)

def test_policy(P, R, policy, test_count=1000, gamma=0.9):
    num_state = P.shape[-1]
    total_episode = num_state * test_count
    # start in each state
    total_reward = 0
    for state in range(num_state):
        state_reward = 0
        for state_episode in range(test_count):
            episode_reward = 0
            disc_rate = 1
            while True:
                # take step
                action = policy[state]
                # get next step using P
                probs = P[action][state]
                candidates = list(range(len(P[action][state])))
                next_state =  choice(candidates, 1, p=probs)[0]
                # get the reward
                reward = R[state][action] * disc_rate
                episode_reward += reward
                # when go back to 0 ended
                disc_rate *= gamma
                if next_state == 0:
                    break
            state_reward += episode_reward
        total_reward += state_reward
    return total_reward / total_episode

def trainVI(P, R, discount=0.9, epsilon=[1e-9]):
    vi_df = pd.DataFrame(columns=["Epsilon", "Policy", "Iteration", 
                                  "Time", "Reward", "Value Function"])
    for eps in epsilon:
        vi = ValueIteration(P, R, gamma=discount, epsilon=eps, max_iter=int(1e15))
        vi.run()
        reward = test_policy(P, R, vi.policy)
        info = [float(eps), vi.policy, vi.iter, vi.time, reward, vi.V]
        df_length = len(vi_df)
        vi_df.loc[df_length] = info
    return vi_df

In [3]:
def trainQ(P, R, discount=0.9, alpha_dec=[0.99], alpha_min=[0.001], 
            epsilon=[1.0], epsilon_decay=[0.99], n_iter=[1000000]):
    q_df = pd.DataFrame(columns=["Iterations", "Alpha Decay", "Alpha Min", 
                                 "Epsilon", "Epsilon Decay", "Reward",
                                 "Time", "Policy", "Value Function",
                                 "Training Rewards"])
    
    count = 0
    for i in n_iter:
        for eps in epsilon:
            for eps_dec in epsilon_decay:
                for a_dec in alpha_dec:
                    for a_min in alpha_min:
                        q = QLearning(P, R, discount, alpha_decay=a_dec, 
                                      alpha_min=a_min, epsilon=eps, 
                                      epsilon_decay=eps_dec, n_iter=i)
                        q.run()
                        reward = test_policy(P, R, q.policy)
                        count += 1
                        print("{}: {}".format(count, reward))
                        st = q.run_stats
                        rews = [s['Reward'] for s in st]
                        info = [i, a_dec, a_min, eps, eps_dec, reward, 
                                q.time, q.policy, q.V, rews]
                        
                        df_length = len(q_df)
                        q_df.loc[df_length] = info
    return q_df

In [4]:
def train_pi(P, R, gamma = [0.9], max_iter = [1e6]):
    result = {}
    for g in gamma:
        result[g] = {}
        for itr in max_iter:
            result[g][itr] = {}
            pi = PolicyIteration(P, R, gamma=g, max_iter=itr)
            pi.run()
            pi_pol = pi.policy
            pi_reward = test_policy(P, R, pi_pol)
            pi_iter = pi.iter
            pi_time = pi.time
            result[g][itr]["policy"] = pi_pol
            result[g][itr]["reward"] = pi_reward
            result[g][itr]["iteration"] = pi_iter
            result[g][itr]["time"] = pi_time
    return result

## model building: size of 10

In [5]:
num_state = 10
reward1 = 4
reward2 = 2
fire_probability = 0.1
P_10, R_10 = mdpex.forest(S=num_state, r1=reward1, r2=reward2, p=0.1)

## VI: 10

In [6]:
vi_10_result = trainVI(P_10, R_10, epsilon=[1e-1, 1e-2, 1e-4, 1e-6, 1e-8, 1e-10, 1e-12, 1e-14, 1e-16])
vi_10_result

Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward,Value Function
0,0.1,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)",16,0.003708,2.092869,"(3.772106445047769, 4.513314520588507, 5.42838..."
1,0.01,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)",16,0.000641,2.123551,"(3.772106445047769, 4.513314520588507, 5.42838..."
2,0.0001,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)",16,0.000728,2.088074,"(3.772106445047769, 4.513314520588507, 5.42838..."
3,1e-06,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)",16,0.000641,2.1152,"(3.772106445047769, 4.513314520588507, 5.42838..."
4,1e-08,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)",16,0.00083,2.136778,"(3.772106445047769, 4.513314520588507, 5.42838..."
5,1e-10,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)",16,0.000774,2.09953,"(3.772106445047769, 4.513314520588507, 5.42838..."
6,1e-12,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)",16,0.00067,2.130398,"(3.772106445047769, 4.513314520588507, 5.42838..."
7,1e-14,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)",170,0.00687,2.165125,"(6.0037852114428425, 6.744993286983577, 7.6600..."
8,1e-16,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)",192,0.008128,2.108321,"(6.003785392141506, 6.744993467682242, 7.66006..."


## PI:10

In [7]:
pi_10_result = train_pi(P_10, R_10, gamma = [0.99, 0.9, 0.8, 0.7, 0.6, 0.5], max_iter = [1e4, 1e5, 1e6])
pi_10_result

{0.99: {10000.0: {'policy': (0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
   'reward': 2.091547045081238,
   'iteration': 9,
   'time': 0.00975489616394043},
  100000.0: {'policy': (0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
   'reward': 2.1633066396664224,
   'iteration': 9,
   'time': 0.002244234085083008},
  1000000.0: {'policy': (0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
   'reward': 2.0790002298047154,
   'iteration': 9,
   'time': 0.0024559497833251953}},
 0.9: {10000.0: {'policy': (0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
   'reward': 2.1004140803238083,
   'iteration': 9,
   'time': 0.0030319690704345703},
  100000.0: {'policy': (0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
   'reward': 2.110752373589253,
   'iteration': 9,
   'time': 0.002566814422607422},
  1000000.0: {'policy': (0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
   'reward': 2.0498449287300913,
   'iteration': 9,
   'time': 0.0030012130737304688}},
 0.8: {10000.0: {'policy': (0, 1, 1, 1, 0, 0, 0, 0, 0, 0),
   'reward': 2.353651793240591,
   'iteration': 6,
   'time': 0.002204179763793945

## QL: 10

In [None]:
alpha_decs = [0.9, 0.99, 0.999]
alpha_mins =[0.001, 0.0001]
eps = [10.0, 1.0]
eps_dec = [0.9, 0.99, 0.999]
iters = [1e6, 1e7, 1e8]
ql_10_result = trainQ(P_10, R_10, discount=0.9, alpha_dec=alpha_decs, alpha_min=alpha_mins, 
            epsilon=eps, epsilon_decay=eps_dec, n_iter=iters)