Frozen Lake

In [None]:
import numpy as np
import gym
from gym import wrappers
import time
import sys
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from gym.envs.toy_text.frozen_lake import generate_random_map, FrozenLakeEnv
import glob
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
# import hiive_mdptoolbox.example
# import hiive_mdptoolbox
import numpy as np
import os
from numpy.random import choice
import pandas as pd
import seaborn as sns
# Seed NumPy's global random number generator for this notebook session.
np.random.seed(44)

In [None]:
# Re-seed NumPy's global RNG, then build the random 20x20 lake layout that
# run_Frozen_Lake looks up under MAPS["20x20"].
# NOTE(review): presumably the seed is meant to make the generated map
# reproducible -- confirm generate_random_map draws from NumPy's global RNG in
# the installed gym version.
np.random.seed(10)
twenty = generate_random_map(20)
MAPS = {
    "20x20": twenty
}

In [None]:
def run_process(env, policy, gamma, render = True):
    """Play one episode following ``policy`` and return its discounted return.

    Args:
        env: Environment exposing ``reset()``, ``render()`` and a ``step(action)``
            that returns ``(observation, reward, done, info)``.
        policy: Indexable mapping from observation to action; each entry is
            cast to ``int`` before being passed to ``env.step``.
        gamma: Discount factor; the reward at step ``t`` is weighted by
            ``gamma ** t``.
        render: When true, ``env.render()`` is called before every step.

    Returns:
        The sum of discounted rewards collected until ``done`` is reported.
    """
    state = env.reset()
    discounted_return = 0
    elapsed_steps = 0
    finished = False
    while not finished:
        if render:
            env.render()
        action = int(policy[state])
        state, reward, finished, _info = env.step(action)
        discounted_return += gamma ** elapsed_steps * reward
        elapsed_steps += 1
    return discounted_return

def evaluate_policy(env, policy, gamma , n = 100):
    """Return the mean discounted return of ``policy`` over ``n`` episodes.

    Each episode is played with ``run_process`` with rendering turned off.
    """
    episode_returns = []
    for _ in range(n):
        episode_returns.append(run_process(env, policy, gamma, False))
    return np.mean(episode_returns)

def get_policy(env,v, gamma):
    """Extract the greedy policy for the state-value estimate ``v``.

    Args:
        env: Object exposing ``nS``, ``nA`` and ``P[s][a]``, an iterable of
            ``(probability, next_state, reward, done)`` tuples.
        v: State values indexed by state.
        gamma: Discount factor used in the one-step look-ahead.

    Returns:
        Float array of length ``env.nS`` holding, for each state, the index of
        the action with the largest one-step backed-up value (the first such
        action on ties).
    """
    n_states, n_actions = env.nS, env.nA
    greedy = np.zeros(n_states)
    for state in range(n_states):
        action_values = np.zeros(n_actions)
        for action in range(n_actions):
            backed_up = 0
            for prob, next_state, reward, _ in env.P[state][action]:
                backed_up += prob * (reward + gamma * v[next_state])
            action_values[action] = backed_up
        greedy[state] = np.argmax(action_values)
    return greedy

def compute_policy(env, policy, gamma):
    """Iteratively evaluate ``policy`` and return its state-value function.

    Args:
        env: Object exposing ``nS`` and ``P[s][a]``, an iterable of
            ``(probability, next_state, reward, done)`` tuples.
        policy: Action to take in each state, indexed by state. Entries may be
            floats (``get_policy`` returns a float array); they are cast to
            ``int`` before indexing ``env.P``, as ``run_process`` does.
        gamma: Discount factor.

    Returns:
        Array of length ``env.nS`` with the value of every state under
        ``policy``. Sweeps stop once the summed absolute change between two
        consecutive sweeps is at most ``1e-5``.
    """
    v = np.zeros(env.nS)
    eps = 1e-5
    while True:
        prev_v = np.copy(v)
        for s in range(env.nS):
            # Cast so float-valued policies also work when env.P[s] is a
            # sequence rather than a dict.
            policy_a = int(policy[s])
            v[s] = sum(
                p * (r + gamma * prev_v[s_])
                for p, s_, r, _ in env.P[s][policy_a]
            )
        if np.sum(np.fabs(prev_v - v)) <= eps:
            break
    return v

def run_policy_iteration(env, gamma, max_iters=200000):
    """Run policy iteration from a random initial policy.

    Args:
        env: Object exposing ``nS``, ``nA`` and the transition table ``P``
            consumed by ``compute_policy`` and ``get_policy``.
        gamma: Discount factor.
        max_iters: Upper bound on evaluate/improve rounds.

    Returns:
        ``(policy, k)`` where ``policy`` is the last policy held and ``k`` is
        the number of rounds performed. ``k`` equals ``max_iters`` when the
        policy never stabilised within the budget.
    """
    policy = np.random.choice(env.nA, size=(env.nS))
    # Default for the non-converged case; previously `k` stayed unbound and the
    # return statement raised UnboundLocalError.
    k = max_iters
    for i in range(max_iters):
        old_policy_v = compute_policy(env, policy, gamma)
        new_policy = get_policy(env, old_policy_v, gamma)
        if np.all(policy == new_policy):
            k = i + 1
            break
        policy = new_policy
    return policy, k

def run_value_iteration(env, gamma, max_iters=100000, eps=1e-20):
    """Run synchronous value iteration.

    Args:
        env: Object exposing ``nS``, ``nA`` and ``P[s][a]``, an iterable of
            ``(probability, next_state, reward, done)`` tuples.
        gamma: Discount factor.
        max_iters: Upper bound on the number of sweeps.
        eps: Stop once the summed absolute change between two consecutive
            sweeps is at most this value.

    Returns:
        ``(v, k)`` where ``v`` is the value array of length ``env.nS`` and
        ``k`` is the number of sweeps performed. ``k`` equals ``max_iters``
        when the tolerance was never reached.
    """
    v = np.zeros(env.nS)  # initialize value-function
    # Default for the non-converged case; previously `k` stayed unbound and the
    # return statement raised UnboundLocalError.
    k = max_iters
    for i in range(max_iters):
        prev_v = np.copy(v)
        for s in range(env.nS):
            q_sa = [
                sum(p * (r + gamma * prev_v[s_]) for p, s_, r, _ in env.P[s][a])
                for a in range(env.nA)
            ]
            v[s] = max(q_sa)
        if np.sum(np.fabs(prev_v - v)) <= eps:
            k = i + 1
            break
    return v, k

def show_policy_map(title, policy, map_desc, color_map, direction_map):
    """Render a policy as a coloured grid of arrows and save it as a PNG.

    Args:
        title: Figure title; the image is written to ``title + '.png'``.
        policy: 2-D array of actions, one per grid cell.
        map_desc: 2-D array of tile markers, indexed like ``policy``.
        color_map: Mapping from a tile marker to a face colour.
        direction_map: Mapping from an action to the glyph drawn in the cell.

    Returns:
        The ``matplotlib.pyplot`` module (the figure itself is closed).
    """
    fig = plt.figure()
    n_rows = policy.shape[0]
    n_cols = policy.shape[1]
    ax = fig.add_subplot(111, xlim=(0, n_cols), ylim=(0, n_rows))
    # Wide grids need a smaller glyph to fit inside each cell.
    font_size = 'small' if n_cols > 16 else 'x-large'
    plt.title(title)
    for row in range(n_rows):
        # Row 0 of the map is drawn at the top of the axes.
        y = n_rows - row - 1
        for col in range(n_cols):
            cell = plt.Rectangle([col, y], 1, 1)
            cell.set_facecolor(color_map[map_desc[row, col]])
            ax.add_patch(cell)
            ax.text(
                col + 0.5,
                y + 0.5,
                direction_map[policy[row, col]],
                weight='bold',
                size=font_size,
                horizontalalignment='center',
                verticalalignment='center',
                color='w',
            )

    plt.axis('off')
    plt.xlim((0, n_cols))
    plt.ylim((0, n_rows))
    plt.tight_layout()
    plt.savefig(title + '.png')
    plt.close()

    return plt

In [None]:
def _make_frozen_lake_env(option):
    """Return ``(env, size)``: the unwrapped FrozenLake env and its grid side.

    ``"4x4"`` selects ``gym.make('FrozenLake-v1')``; any other value selects
    the map stored under ``MAPS["20x20"]``.
    """
    if option == "4x4":
        env = gym.make('FrozenLake-v1')
        size = 4
    else:
        env = FrozenLakeEnv(desc=MAPS["20x20"])
        size = 20
    return env.unwrapped, size


def _plot_curve(x, y, xlabel, ylabel, title):
    """Draw one gridded line plot and show it."""
    plt.plot(x, y)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid()
    plt.show()


def _chunk_means(values, chunk_size):
    """Return the mean of each consecutive ``chunk_size``-long slice."""
    means = []
    for start in range(0, len(values), chunk_size):
        chunk = values[start:start + chunk_size]
        means.append(sum(chunk) / len(chunk))
    return means


def run_Frozen_Lake(option):
    """Run policy iteration, value iteration and Q-learning on FrozenLake.

    For the two planning algorithms the discount factor is swept over
    0.05, 0.15, ..., 0.95; each resulting policy is evaluated, saved as a
    policy-map PNG via ``show_policy_map`` and summarised in time / reward /
    iteration plots. Q-learning is then run once per entry of a six-value
    epsilon list and summarised in reward, time and Q-table plots.

    Args:
        option: ``"4x4"`` for the stock ``FrozenLake-v1`` map; any other value
            uses the 20x20 map in ``MAPS``.

    Changes relative to the earlier version of this function:
        * Solver timings cover only the solver (plus greedy-policy extraction
          for value iteration), not the evaluation rollouts or PNG rendering.
        * Q-learning timings are per run instead of cumulative.
        * Q-learning legends and axes are derived from the sweep values (the
          last one is 0.90, previously mislabelled 0.95).
        * The copy-pasted epsilon legend on the best-value plot is removed.
        * Policy-iteration map titles now carry ``Gamma: <value>`` like the
          value-iteration ones, which also changes the saved file names.
    """
    # 0 = left; 1 = down; 2 = right;  3 = up
    env, size = _make_frozen_lake_env(option)
    desc = env.unwrapped.desc
    gammas = [(i + 0.5) / 10 for i in range(10)]
    prefix = 'Frozen Lake ' + option

    ### POLICY ITERATION ####
    print('POLICY ITERATION WITH FROZEN LAKE ' + option)
    pi_times, pi_scores, pi_iters = [], [], []
    for i, gamma in enumerate(gammas):
        st = time.time()
        best_policy, k = run_policy_iteration(env, gamma=gamma)
        pi_times.append(time.time() - st)
        pi_scores.append(evaluate_policy(env, best_policy, gamma=gamma))
        pi_iters.append(k)
        show_policy_map(
            'Frozen Lake  ' + option + ' Policy Map Iteration ' + str(i)
            + ' (Policy Iteration) ' + 'Gamma: ' + str(gamma),
            best_policy.reshape(size, size), desc, colors(), directions())

    _plot_curve(gammas, pi_times, 'Gammas', 'Execution Time (s)',
                prefix + ' - Policy Iteration - Execution Time Analysis')
    _plot_curve(gammas, pi_scores, 'Gammas', 'Average Rewards',
                prefix + ' - Policy Iteration - Reward Analysis')
    _plot_curve(gammas, pi_iters, 'Gammas', 'Iterations to Converge',
                prefix + ' - Policy Iteration - Convergence Analysis')

    ### VALUE ITERATION ###
    print('VALUE ITERATION WITH FROZEN LAKE ' + option)
    vi_times, vi_scores, vi_iters, best_vals = [], [], [], []
    for i, gamma in enumerate(gammas):
        st = time.time()
        best_value, k = run_value_iteration(env, gamma=gamma)
        policy = get_policy(env, best_value, gamma=gamma)
        vi_times.append(time.time() - st)
        vi_scores.append(evaluate_policy(env, policy, gamma=gamma, n=1000))
        vi_iters.append(k)
        best_vals.append(best_value)
        show_policy_map(
            'Frozen Lake  ' + option + ' Policy Map Iteration ' + str(i)
            + ' (Value Iteration) ' + 'Gamma: ' + str(gamma),
            policy.reshape(size, size), desc, colors(), directions())

    _plot_curve(gammas, vi_times, 'Gammas', 'Execution Time (s)',
                prefix + ' - Value Iteration - Execution Time Analysis')
    _plot_curve(gammas, vi_scores, 'Gammas', 'Average Rewards',
                prefix + ' - Value Iteration - Reward Analysis')
    _plot_curve(gammas, vi_iters, 'Gammas', 'Iterations to Converge',
                prefix + ' - Value Iteration - Convergence Analysis')
    # best_vals holds one value array per gamma, so this draws one line per
    # state; no legend is attached.
    _plot_curve(gammas, best_vals, 'Gammas', 'Optimal Value',
                prefix + ' - Value Iteration - Best Value Analysis')

    ### Q-LEARNING #####
    print('Q LEARNING WITH FROZEN LAKE ' + option)
    epsilons = [0.05, 0.15, 0.25, 0.5, 0.75, 0.90]
    epsilon_labels = ['{:.2f}'.format(value) for value in epsilons]
    alpha = 0.85
    gamma = 0.95
    episodes = 30000
    max_steps = 1000000
    chunk_size = int(episodes / 50)
    q_times, q_tables, q_averages = [], [], []
    for epsilon in epsilons:
        # Restart the clock for every run so the times are per-epsilon.
        st = time.time()
        env, _ = _make_frozen_lake_env(option)
        Q = np.zeros((env.observation_space.n, env.action_space.n))
        rewards = []
        # NOTE(review): `greedy_prob` is the probability of taking the greedy
        # action (not of exploring), and it is replaced by a fixed schedule
        # after every episode, so the sweep value only affects the first
        # episode of each run. This reproduces the earlier behaviour exactly;
        # confirm whether a truly constant epsilon was intended.
        greedy_prob = epsilon
        for episode in range(episodes):
            state = env.reset()
            done = False
            t_reward = 0
            for _ in range(max_steps):
                if done:
                    break
                current = state
                if np.random.rand() < greedy_prob:
                    action = np.argmax(Q[current, :])
                else:
                    action = env.action_space.sample()

                state, reward, done, info = env.step(action)
                t_reward += reward
                Q[current, action] += alpha * (
                    reward + gamma * np.max(Q[state, :]) - Q[current, action])
            greedy_prob = (1 - 2.71 ** (-episode / 1000))
            rewards.append(t_reward)

        env.close()
        q_times.append(time.time() - st)
        q_tables.append(Q)
        q_averages.append(_chunk_means(rewards, chunk_size))

    # Average reward per chunk of episodes, one curve per sweep value.
    episode_marks = range(0, episodes, chunk_size)
    for label, averages in zip(epsilon_labels, q_averages):
        plt.plot(episode_marks, averages, label='epsilon=' + label)
    plt.legend()
    plt.xlabel('Iterations')
    plt.grid()
    plt.title(prefix + ' - Q Learning - Constant Epsilon')
    plt.ylabel('Average Reward')
    plt.show()

    plt.plot(epsilons, q_times)
    plt.xlabel('Epsilon Values')
    plt.grid()
    plt.title(prefix + ' - Q Learning')
    plt.ylabel('Execution Time (s)')
    plt.show()

    # Learned Q-tables side by side; one colorbar on the last panel.
    for position, (label, table) in enumerate(zip(epsilon_labels, q_tables), start=1):
        plt.subplot(1, len(epsilons), position)
        plt.title('Epsilon=' + label)
        plt.imshow(table)
    plt.colorbar()

    plt.show()

In [None]:
def colors():
    """Return the face colour for each FrozenLake tile marker (bytes keys)."""
    tiles = (b'S', b'F', b'H', b'G')
    shades = ('green', 'skyblue', 'black', 'gold')
    return dict(zip(tiles, shades))

def directions():
    """Return the arrow glyph for each action (0 left, 1 down, 2 right, 3 up)."""
    actions = (3, 2, 1, 0)
    arrows = ('⬆', '➡', '⬇', '⬅')
    return dict(zip(actions, arrows))

In [None]:
# Run the full experiment suite on the 4x4 map.
print('STARTING FROZEN LAKE 4X4')
run_Frozen_Lake("4x4")

In [None]:
# Run the full experiment suite on the 20x20 map (any option other than "4x4"
# selects MAPS["20x20"] inside run_Frozen_Lake).
print('STARTING FROZEN LAKE 20X20')
run_Frozen_Lake("20x20")
print('END OF RUN')

In [None]:
# Display every PNG found in /kaggle/working, one figure per image.
# glob.glob returns paths in arbitrary order, so sort them to get the same
# display order on every run.
for image_path in sorted(glob.glob("/kaggle/working/*.png")):
    img = mpimg.imread(image_path)
    plt.ion()
    plt.figure()
    plt.axis('off')
    plt.imshow(img)
    plt.show()
    plt.close()

Forest Management

In [None]:
# Build the forest-management MDP with 400 states; P and R are the transition
# and reward structures passed to every solver and to test_policy below.
P, R = forest(S=400, r1=5, r2= 2, p=0.01)

In [None]:
def running_mean(x, N):
    """Return the moving average of ``x`` over windows of ``N`` samples.

    The result has ``len(x) - N + 1`` entries; entry ``i`` is the mean of
    ``x[i:i + N]``, computed from differences of a zero-prefixed cumulative sum.
    """
    padded = np.insert(x, 0, 0)
    totals = np.cumsum(padded)
    window_sums = totals[N:] - totals[:-N]
    return window_sums / float(N)

In [None]:
def test_policy(P, R, policy, test_count=100, gamma=0.9):
    """Estimate the mean discounted return of ``policy`` by simulation.

    ``test_count`` episodes are started from every state. An episode follows
    ``policy``, sampling each transition from ``P``, and ends as soon as the
    sampled next state is state 0.

    Args:
        P: Transition probabilities indexed as ``P[action][state]`` (a
            probability vector over next states); must expose ``shape``.
        R: Rewards indexed as ``R[state][action]``.
        policy: Action to take in each state, indexed by state.
        test_count: Number of episodes simulated per start state.
        gamma: Discount factor applied per step.

    Returns:
        Total discounted reward divided by the number of episodes
        (``number of states * test_count``).

    Note:
        The episode never terminates if state 0 is unreachable under
        ``policy``.
    """
    num_state = P.shape[-1]
    total_episode = num_state * test_count
    total_reward = 0
    # start in each state
    for start_state in range(num_state):
        for _ in range(test_count):
            state = start_state
            episode_reward = 0
            disc_rate = 1
            while True:
                action = policy[state]
                # sample the next state from the row of P for (action, state)
                probs = P[action][state]
                next_state = choice(len(probs), p=probs)
                episode_reward += R[state][action] * disc_rate
                disc_rate *= gamma
                # the episode ends when the process returns to state 0
                if next_state == 0:
                    break
                # Advance the simulation; previously the state was never
                # updated, so every step replayed the start state.
                state = next_state
            total_reward += episode_reward
    return total_reward / total_episode


# The name starts with "test_", so tell pytest not to collect this helper as a
# test case if the module is ever imported by a test run.
test_policy.__test__ = False


In [None]:
def trainVI(P, R, discount=0.9, epsilon=(1e-9,)):
    """Run value iteration once per epsilon and tabulate the results.

    Args:
        P: Transition structure passed to ``ValueIteration``.
        R: Reward structure passed to ``ValueIteration``.
        discount: Discount factor used both for solving and for evaluating the
            resulting policy with ``test_policy``.
        epsilon: Iterable of epsilon values to try.

    Returns:
        DataFrame with one row per epsilon and the columns ``Epsilon``,
        ``Policy``, ``Iteration``, ``Time``, ``Reward`` and ``Value Function``.
    """
    vi_df = pd.DataFrame(columns=["Epsilon", "Policy", "Iteration",
                                  "Time", "Reward", "Value Function"])
    for eps in epsilon:
        vi = ValueIteration(P, R, gamma=discount, epsilon=eps, max_iter=int(1e15))
        vi.run()
        # Evaluate with the training discount; previously test_policy's default
        # gamma was used regardless of `discount`.
        reward = test_policy(P, R, vi.policy, gamma=discount)
        vi_df.loc[len(vi_df)] = [float(eps), vi.policy, vi.iter, vi.time,
                                 reward, vi.V]
    return vi_df

In [None]:
# Sweep value iteration over six epsilon values and display the result table.
vi_df = trainVI(P, R, epsilon=[1e-1, 1e-3, 1e-6, 1e-9, 1e-12, 1e-15])
vi_df

In [None]:
# Solve the forest MDP with policy iteration (gamma=0.9), evaluate the
# resulting policy with test_policy, and display (iterations, time, reward).
pi = PolicyIteration(P, R, gamma=0.9, max_iter=1e6)
pi.run()
pi_pol = pi.policy
pi_reward = test_policy(P, R, pi_pol)
pi_iter = pi.iter
pi_time = pi.time
pi_iter, pi_time, pi_reward

In [None]:
# Print the policy found by policy iteration.
print(pi_pol)

In [None]:
def trainQ(P, R, discount=0.9, alpha_dec=(0.99,), alpha_min=(0.001,),
            epsilon=(1.0,), epsilon_decay=(0.99,), n_iter=(1000000,)):
    """Grid-search Q-learning hyper-parameters and tabulate the results.

    One ``QLearning`` run is made for every combination of the given values,
    with ``n_iter`` varying slowest and ``alpha_min`` fastest. After each run
    the running count and the evaluated reward are printed.

    Args:
        P: Transition structure passed to ``QLearning``.
        R: Reward structure passed to ``QLearning``.
        discount: Discount factor used both for learning and for evaluating
            the learned policy with ``test_policy``.
        alpha_dec: Iterable of ``alpha_decay`` values.
        alpha_min: Iterable of ``alpha_min`` values.
        epsilon: Iterable of initial ``epsilon`` values.
        epsilon_decay: Iterable of ``epsilon_decay`` values.
        n_iter: Iterable of iteration budgets.

    Returns:
        DataFrame with one row per combination and the columns ``Iterations``,
        ``Alpha Decay``, ``Alpha Min``, ``Epsilon``, ``Epsilon Decay``,
        ``Reward``, ``Time``, ``Policy``, ``Value Function`` and
        ``Training Rewards``.
    """
    from itertools import product

    q_df = pd.DataFrame(columns=["Iterations", "Alpha Decay", "Alpha Min",
                                 "Epsilon", "Epsilon Decay", "Reward",
                                 "Time", "Policy", "Value Function",
                                 "Training Rewards"])

    grid = product(n_iter, epsilon, epsilon_decay, alpha_dec, alpha_min)
    for count, (i, eps, eps_dec, a_dec, a_min) in enumerate(grid, start=1):
        q = QLearning(P, R, discount, alpha_decay=a_dec,
                      alpha_min=a_min, epsilon=eps,
                      epsilon_decay=eps_dec, n_iter=i)
        q.run()
        # Evaluate with the training discount; previously test_policy's default
        # gamma was used regardless of `discount`.
        reward = test_policy(P, R, q.policy, gamma=discount)
        print("{}: {}".format(count, reward))
        rews = [s['Reward'] for s in q.run_stats]
        q_df.loc[len(q_df)] = [i, a_dec, a_min, eps, eps_dec, reward,
                               q.time, q.policy, q.V, rews]
    return q_df

In [None]:
# Hyper-parameter grid for Q-learning: two values per parameter, so trainQ
# performs 2**5 = 32 runs.
alpha_decs = [0.99, 0.999]
alpha_mins =[0.001, 0.0001]
# NOTE(review): 10.0 is passed as an initial epsilon; if QLearning treats
# epsilon as a probability this value exceeds 1 -- confirm it is intended.
eps = [10.0, 1.0]
eps_dec = [0.99, 0.999]
iters = [1000000, 10000000]
q_df = trainQ(P, R, discount=0.9, alpha_dec=alpha_decs, alpha_min=alpha_mins, 
            epsilon=eps, epsilon_decay=eps_dec, n_iter=iters)

In [None]:
# Compare the value-iteration policies stored in vi_df with the
# policy-iteration policy.
vi_df.Policy == pi_pol

In [None]:
# Re-evaluate the Q-learning policy stored under index label 18 of q_df
# (hard-coded row).
test_policy(P,R,q_df.Policy[18])

In [None]:
# Display the full Q-learning result table.
q_df

In [None]:
# Average the q_df columns for each distinct "Iterations" value.
q_df.groupby("Iterations").mean()

In [None]:
# Average the q_df columns for each distinct "Epsilon Decay" value.
q_df.groupby("Epsilon Decay").mean()