In [5]:
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
import gym
import numpy as np
from numpy.random import choice
import pandas as pd

In [6]:
np.random.seed(44)
def getTestPolicyRewards(P, R, policy, test_count=1000, gamma=0.9):
    states = P.shape[-1]
    total_episodes = states * test_count
    rewards = 0
    
    for state in range(states):
        state_reward = 0
        for episode in range(test_count):
            episode_reward = 0
            discount_rate = 1
            while True:
                action = policy[state]
                probs = P[action][state]
                cand = list(range(len(P[action][state])))
                next_state = choice(cand, 1, p=probs)[0]
                reward = R[state][action] * discount_rate
                episode_reward += reward
                discount_rate *= gamma
                if next_state == 0:
                    break
            state_reward += episode_reward
        rewards += state_reward
    return rewards / total_episodes

def viTrain(P, R, discount=0.9, epsilon=[1e-9]):
    vi_df = pd.DataFrame(columns=["Epsilon", "Policy", "Iteration","Value Function", "Time", "Reward"])
    for e in epsilon:
        vi = ValueIteration(P, R, gamma=discount, epsilon=e, max_iter=int(1e15))
        vi.run()
        reward = getTestPolicyRewards(P, R, vi.policy)
        information = [float(e), vi.policy, vi.iter, vi.V, vi.time, reward]
        df_len = len(vi_df)
        vi_df.loc[df_len] = information
    return vi_df

def getPolicyIteration(P, R, gamma=0.9):
    pi = PolicyIteration(P, R, gamma=gamma, max_iter=1e6)
    pi.run()
    pi_policy = pi.policy
    pi_reward = getTestPolicyRewards(P, R, pi_policy)
    pi_iteration = pi.iter
    pi_time = pi.time
    print("Policy Iteration: ", pi_iteration)
    print("Policy reward: ", pi_reward)
    print("Policy time: ", pi_time)
    return pi_iteration, pi_reward, pi_time, pi_policy

def generate_vi_pi(state, r1, r2, p, epsilons):
    P, R = forest(S=state, r1=r1, r2=r2, p=p)
    vi_df = viTrain(P, R, epsilon=epsilons)
    pi_iter, pi_reward, pi_time, pi_policy = getPolicyIteration(P, R)
    return vi_df, [pi_iter, pi_reward, pi_time, pi_policy]

def trainQLearning(P, R, discount=0.9, alpha_dec=[0.99], alpha_min=[0.001], epsilons=[1.0], epsilon_decays=[0.99], n_iteration=[1000000], mute=True):
    count = 0
    q_df = pd.DataFrame(columns=["Iterations", "Alpha Decay", "Alpha Minimum", "Epsilon", "Epsilon Decay", "Policy", "Value Function", "Time", "Reward", "Training Reward"])
    for n in n_iteration:
        for e in epsilons:
            for ed in epsilon_decays:
                for adec in alpha_dec:
                    for amin in alpha_min:
                        q = QLearning(P, R, discount, alpha_decay=adec, alpha_min=amin, epsilon=e, epsilon_decay=ed, n_iter=n)
                        q.run()
                        reward = getTestPolicyRewards(P, R, q.policy)
                        count += 1
                        if(mute == False):
                            print("{}: {}".format(count, reward))
                        qStats = q.run_stats
                        training_rewards = [stat['Reward'] for stat in qStats]
                        information = [n, adec, amin, e, ed, q.policy, q.V, q.time, reward, training_rewards]
                        df_len = len(q_df)
                        q_df.loc[df_len] = information
    return q_df


In [17]:
##State is 400
value_iteration, policy_iteration = generate_vi_pi(400, 8, 12, 0.1,[1e-1, 1e-2, 1e-5, 1e-10, 1e-15, 1e-20])
value_iteration

Policy Iteration:  13
Policy reward:  1.0674342416971598
Policy time:  0.025776147842407227


Unnamed: 0,Epsilon,Policy,Iteration,Value Function,Time,Reward
0,0.1,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",32,"(4.3127984944130136, 4.864633290845182, 4.8646...",0.004052,1.067377
1,0.01,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",43,"(4.424067013443481, 4.976617348419744, 4.97661...",0.003449,1.069081
2,1e-05,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",76,"(4.473560831234312, 5.026046957818786, 5.02604...",0.005906,1.069298
3,1e-10,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",130,"(4.475132788012464, 5.0276189758570675, 5.0276...",0.010277,1.070007
4,1e-15,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",187,"(4.475138108399614, 5.027624296244919, 5.02762...",0.014774,1.070754
5,1e-20,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",241,"(4.475138121502507, 5.027624309347811, 5.02762...",0.021649,1.068822


In [13]:
value_iteration.Policy == policy_iteration[3]

0    True
1    True
2    True
3    True
4    True
5    True
Name: Policy, dtype: bool

In [12]:
alpha_decs = [0.9, 0.99, 0.999]
alpha_mins = [0.1, 0.01, 0.001]
eps = [10, 1]
eps_desc = [0.99, 0.999]
iterations = [1000000, 10000000]
P, R = forest(S=400, r1=8, r2=12, p=0.1)
q_df = trainQLearning(P, R, discount=0.9, alpha_dec=alpha_decs, alpha_min=alpha_mins, epsilons=eps, epsilon_decays=eps_desc, n_iteration=iterations, mute=False)
q_df

1: 0.5114633976057248
2: 0.6522015954764234
3: 0.796424113227919
4: 0.4775714511269725
5: 0.6160685431574414
6: 0.7428936792547435
7: 0.4879100019808509
8: 0.6763940904592289
9: 0.6843105601565413
10: 0.5247009635470448
11: 0.6356543535026414
12: 0.745198034774708
13: 0.504044985372834
14: 0.6602862881629655
15: 0.7610940128008278
16: 0.5428383138617116
17: 0.6482581570063967
18: 0.6774602587566735
19: 0.5299017195584294
20: 0.6500479795723436
21: 0.6325
22: 0.5076332558363121
23: 0.6612042467041543
24: 0.7072009393322549
25: 0.52417062712056
26: 0.5775
27: 0.6710863084760449
28: 0.4275
29: 0.5425
30: 0.7403225623314648
31: 0.5227531607291023
32: 0.59
33: 0.7202390540146003
34: 0.5022611310383219
35: 0.6398085599360729
36: 0.6910320920153414
37: 0.753051839771062
38: 0.622695725931926
39: 0.793904652045651
40: 0.7668728253983939
41: 0.6418867160872267
42: 0.7862093391952016
43: 0.7526336124055749
44: 0.6443404527622312
45: 0.740582937670518
46: 0.7566326231680599
47: 0.670269468555473


Unnamed: 0,Iterations,Alpha Decay,Alpha Minimum,Epsilon,Epsilon Decay,Policy,Value Function,Time,Reward,Training Reward
0,1000000,0.900,0.100,10,0.990,"(0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, ...","(4.505144504528415, 5.076296030935171, 5.05289...",55.050454,0.511463,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
1,1000000,0.900,0.010,10,0.990,"(0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, ...","(4.512876108899058, 5.04706670699751, 5.030208...",50.489033,0.652202,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1000000,0.900,0.001,10,0.990,"(0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, ...","(4.478299378624277, 5.030681240487488, 5.02916...",53.815774,0.796424,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, ..."
3,1000000,0.990,0.100,10,0.990,"(0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, ...","(4.550343231775324, 5.0907555626194245, 5.0681...",52.599838,0.477571,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1000000,0.990,0.010,10,0.990,"(0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, ...","(4.47284288846939, 5.044091989670913, 5.047727...",50.104879,0.616069,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...
67,10000000,0.990,0.010,1,0.999,"(0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, ...","(4.464395056775674, 5.01789717393831, 5.034932...",496.898760,0.655811,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
68,10000000,0.990,0.001,1,0.999,"(0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, ...","(4.475154745605199, 5.026786508688194, 5.02722...",477.230682,0.800174,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
69,10000000,0.999,0.100,1,0.999,"(0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...","(4.548392053314639, 5.074631730602794, 5.02396...",505.711399,0.800767,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
70,10000000,0.999,0.010,1,0.999,"(0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, ...","(4.454633554886332, 4.999693341597504, 5.02421...",505.545032,0.665244,"[0.0, 0.0, 8.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."


In [14]:
maxValIndex = q_df['Reward'].idxmax()
getTestPolicyRewards(P, R, q_df.Policy[maxValIndex])

0.7990855273518407

In [15]:
q_df.groupby("Iterations").mean()

Unnamed: 0_level_0,Alpha Decay,Alpha Minimum,Epsilon Decay,Time,Reward
Iterations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000000,0.963,0.037,0.9945,53.419131,0.616179
10000000,0.963,0.037,0.9945,494.25185,0.729164


In [16]:
q_df.groupby("Epsilon Decay").mean()

Unnamed: 0_level_0,Alpha Decay,Alpha Minimum,Time,Reward
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.99,0.963,0.037,277.076172,0.668607
0.999,0.963,0.037,270.594809,0.676736
