In [36]:
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
import gym
import numpy as np
from numpy.random import choice
import pandas as pd

In [76]:
np.random.seed(44)
def getTestPolicyRewards(P, R, policy, test_count=1000, gamma=0.9):
    states = P.shape[-1]
    total_episodes = states * test_count
    rewards = 0
    
    for state in range(states):
        state_reward = 0
        for episode in range(test_count):
            episode_reward = 0
            discount_rate = 1
            while True:
                action = policy[state]
                probs = P[action][state]
                cand = list(range(len(P[action][state])))
                next_state = choice(cand, 1, p=probs)[0]
                reward = R[state][action] * discount_rate
                episode_reward += reward
                discount_rate *= gamma
                if next_state == 0:
                    break
            state_reward += episode_reward
        rewards += state_reward
    return rewards / total_episodes

def viTrain(P, R, discount=0.9, epsilon=[1e-9]):
    vi_df = pd.DataFrame(columns=["Epsilon", "Policy", "Iteration","Value Function", "Time", "Reward"])
    for e in epsilon:
        vi = ValueIteration(P, R, gamma=discount, epsilon=e, max_iter=int(1e15))
        vi.run()
        reward = getTestPolicyRewards(P, R, vi.policy)
        information = [float(e), vi.policy, vi.iter, vi.V, vi.time, reward]
        df_len = len(vi_df)
        vi_df.loc[df_len] = information
    return vi_df

def getPolicyIteration(P, R, gamma=0.9):
    pi = PolicyIteration(P, R, gamma=gamma, max_iter=1e6)
    pi.run()
    pi_policy = pi.policy
    pi_reward = getTestPolicyRewards(P, R, pi_policy)
    pi_iteration = pi.iter
    pi_time = pi.time
    print("Policy Iteration: ", pi_iteration)
    print("Policy reward: ", pi_reward)
    print("Policy time: ", pi_time)
    return pi_iteration, pi_reward, pi_time, pi_policy

def generate_vi_pi(state, r1, r2, p, epsilons):
    P, R = forest(S=state, r1=r1, r2=r2, p=p)
    vi_df = viTrain(P, R, epsilon=epsilons)
    pi_iter, pi_reward, pi_time, pi_policy = getPolicyIteration(P, R)
    return vi_df, [pi_iter, pi_reward, pi_time, pi_policy]

def trainQLearning(P, R, discount=0.9, alpha_dec=[0.99], alpha_min=[0.001], epsilons=[1.0], epsilon_decays=[0.99], n_iteration=[1000000], mute=True):
    count = 0
    q_df = pd.DataFrame(columns=["Iterations", "Alpha Decay", "Alpha Minimum", "Epsilon", "Epsilon Decay", "Policy", "Value Function", "Time", "Reward", "Training Reward"])
    for n in n_iteration:
        for e in epsilons:
            for ed in epsilon_decays:
                for adec in alpha_dec:
                    for amin in alpha_min:
                        q = QLearning(P, R, discount, alpha_decay=adec, alpha_min=amin, epsilon=e, epsilon_decay=ed, n_iter=n)
                        q.run()
                        reward = getTestPolicyRewards(P, R, q.policy)
                        count += 1
                        if(mute == False):
                            print("{}: {}".format(count, reward))
                        qStats = q.run_stats
                        training_rewards = [stat['Reward'] for stat in qStats]
                        information = [n, adec, amin, e, ed, q.policy, q.V, q.time, reward, training_rewards]
                        df_len = len(q_df)
                        q_df.loc[df_len] = information
    return q_df
                        

In [72]:
#State is 50
value_iteration, policy_iteration = generate_vi_pi(50, 8, 12, 0.1,[1e-1, 1e-2, 1e-5, 1e-10, 1e-15, 1e-20])
value_iteration

Policy Iteration:  13
Policy reward:  2.408908999354387
Policy time:  0.0034630298614501953


Unnamed: 0,Epsilon,Policy,Iteration,Value Function,Time,Reward
0,0.1,"(0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",32,"(4.3127984944130136, 4.864633290845182, 4.8646...",0.004356,2.325956
1,0.01,"(0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",43,"(4.424067013443481, 4.976617348419744, 4.97661...",0.001894,2.472202
2,1e-05,"(0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",76,"(4.473560831234312, 5.026046957818786, 5.02604...",0.003299,2.338835
3,1e-10,"(0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",130,"(4.475132788012464, 5.0276189758570675, 5.0276...",0.005841,2.527056
4,1e-15,"(0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",187,"(4.475138108399614, 5.027624296244919, 5.02762...",0.00801,2.343221
5,1e-20,"(0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",241,"(4.475138121502507, 5.027624309347811, 5.02762...",0.01034,2.405065


In [81]:
value_iteration.Policy == policy_iteration[3]

0    True
1    True
2    True
3    True
4    True
5    True
Name: Policy, dtype: bool

In [95]:
policy_iteration

[13,
 2.408908999354387,
 0.0034630298614501953,
 (0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)]

In [80]:
alpha_decs = [0.9, 0.99, 0.999]
alpha_mins = [0.1, 0.01, 0.001]
eps = [10, 1]
eps_desc = [0.99, 0.999]
iterations = [1000000, 10000000]
P, R = forest(S=50, r1=8, r2=12, p=0.1)
q_df = trainQLearning(P, R, discount=0.9, alpha_dec=alpha_decs, alpha_min=alpha_mins, epsilons=eps, epsilon_decays=eps_desc, n_iteration=iterations, mute=False)
q_df

1: 2.3672632649274976
2: 1.25
3: 2.8810170911019823
4: 2.447725349466523
5: 2.747970541104021
6: 2.708224541392938
7: 2.5313223212341307
8: 2.636142286042605
9: 2.5210701478861246
10: 2.4934579943124002
11: 2.6649547696471414
12: 1.4
13: 2.499258700492977
14: 2.870729674563188
15: 2.918497515099719
16: 2.397493710582085
17: 1.25
18: 2.700688043057648
19: 2.4169734131063887
20: 2.6686297750115306
21: 2.883927345526327
22: 2.415184952486324
23: 2.6872247069933715
24: 2.811686621351204
25: 2.405617950537327
26: 2.3992912399240973
27: 2.3921664689132895
28: 2.372331514694544
29: 1.25
30: 2.8420910945232687
31: 2.437932631439102
32: 1.25
33: 1.4
34: 2.446687179986429
35: 2.6783041913561396
36: 2.799835479182756
37: 2.422992540446031
38: 2.3949819539124
39: 2.9370184913126023
40: 2.453280614621613
41: 2.5920343879593273
42: 2.775899525687535
43: 2.39828204585724
44: 2.3929274005451746
45: 2.5270402818187065
46: 2.3119838645529946
47: 2.5434409223207033
48: 1.35
49: 2.4897324193904735
50: 2.5

Unnamed: 0,Iterations,Alpha Decay,Alpha Minimum,Epsilon,Epsilon Decay,Policy,Value Function,Time,Reward,Training Reward
0,1000000,0.900,0.100,10,0.990,"(0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","(4.469940568703537, 4.999107942040692, 5.01777...",37.652638,2.367263,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
1,1000000,0.900,0.010,10,0.990,"(0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, ...","(4.488879057038657, 5.0342657596858, 5.0384810...",34.013754,1.250000,"[0.0, 0.0, 0.0, 12.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,1000000,0.900,0.001,10,0.990,"(0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...","(4.471109138557807, 5.023890064107351, 5.03091...",33.842457,2.881017,"[1.0, 0.0, 0.0, 0.0, 1.0, 12.0, 1.0, 0.0, 1.0,..."
3,1000000,0.990,0.100,10,0.990,"(0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","(4.515859561164833, 5.043128194420223, 5.01896...",38.112448,2.447725,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1000000,0.990,0.010,10,0.990,"(0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...","(4.469634871122391, 5.0237125620087895, 5.0257...",36.855496,2.747971,"[0.0, 0.0, 0.0, 8.0, 0.0, 0.0, 1.0, 8.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...
67,10000000,0.990,0.010,1,0.999,"(0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","(4.47447118539119, 5.020756964337042, 5.025379...",358.449273,2.441278,"[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, ..."
68,10000000,0.990,0.001,1,0.999,"(0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","(4.47154530812218, 5.0214179302789645, 5.02355...",356.195920,1.400000,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, ..."
69,10000000,0.999,0.100,1,0.999,"(0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","(4.4845652843315555, 5.027855693372516, 5.0268...",358.759768,2.395357,"[1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, ..."
70,10000000,0.999,0.010,1,0.999,"(0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","(4.46722747648816, 5.033604452680821, 5.033757...",358.971982,2.442581,"[0.0, 8.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, ..."


In [90]:
maxValIndex = q_df['Reward'].idxmax()
getTestPolicyRewards(P, R, q_df.Policy[maxValIndex])

2.8951425096440135

In [93]:
q_df.groupby("Iterations").mean()

Unnamed: 0_level_0,Alpha Decay,Alpha Minimum,Epsilon Decay,Time,Reward
Iterations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000000,0.963,0.037,0.9945,36.663847,2.384547
10000000,0.963,0.037,0.9945,371.552266,2.43133


In [94]:
q_df.groupby("Epsilon Decay").mean()

Unnamed: 0_level_0,Alpha Decay,Alpha Minimum,Time,Reward
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.99,0.963,0.037,205.925175,2.523138
0.999,0.963,0.037,202.290938,2.292739
