In [5]:
import warnings ; warnings.filterwarnings('ignore')

import gym
import numpy as np

import random
import warnings

# Explicitly ignore DeprecationWarning (a blanket 'ignore' filter was already
# installed on the first line of this cell, before the other imports).
# NOTE(review): presumably re-applied here because importing gym may register
# its own warning filters — confirm before removing.
warnings.filterwarnings('ignore', category=DeprecationWarning)
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# Seed Python's and NumPy's global random generators for repeatable runs.
random.seed(123); np.random.seed(123)

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [6]:
pip install git+https://github.com/mimoralea/gym-walk#egg=gym-walk

Collecting gym-walk
  Cloning https://github.com/mimoralea/gym-walk to /tmp/pip-install-7oa8cdmp/gym-walk_9ac33c6252a84831acd505e3e920607d
  Running command git clone --filter=blob:none --quiet https://github.com/mimoralea/gym-walk /tmp/pip-install-7oa8cdmp/gym-walk_9ac33c6252a84831acd505e3e920607d
  Resolved https://github.com/mimoralea/gym-walk to commit b915b94cf2ad16f8833a1ad92ea94e88159279f5
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gym-walk
  Building wheel for gym-walk (setup.py) ... [?25l[?25hdone
  Created wheel for gym-walk: filename=gym_walk-0.0.2-py3-none-any.whl size=5377 sha256=a2ce3e8b82782239d65d6e535a25567a735da2187cde6b3955d0f11459d114d7
  Stored in directory: /tmp/pip-ephem-wheel-cache-1ic5icoj/wheels/bf/23/e5/a94be4a90dd18f7ce958c21f192276cb01ef0daaf2bc66583b
Successfully built gym-walk
Installing collected packages: gym-walk
Successfully installed gym-walk-0.0.2


In [7]:
def print_policy(pi, P, action_symbols=('<', 'v', '>', '^'), n_cols=4, title='Policy:'):
    """Print the action `pi` selects in each state as a text grid.

    Args:
        pi: callable taking a state index and returning an action index.
        P: transition model indexed as ``P[state][action]``, each entry an
            iterable of ``(prob, next_state, reward, done)`` tuples.
        action_symbols: symbol displayed for action index 0, 1, 2, ...
        n_cols: number of cells per printed row.
        title: heading printed before the grid.

    A state whose transitions all carry a true ``done`` flag is shown as an
    empty cell; every other state shows its zero-padded index and the symbol
    of the chosen action.
    """
    print(title)
    symbol_for = dict(enumerate(action_symbols))
    for state in range(len(P)):
        # The policy is queried for every state, including blank ones.
        action = pi(state)
        done_flags = [done
                      for transitions in P[state].values()
                      for _, _, _, done in transitions]
        if np.all(done_flags):
            cell = "".rjust(9)
        else:
            cell = "{} {}".format(str(state).zfill(2), symbol_for[action].rjust(6))
        # Close the row with a bar and newline after every n_cols cells.
        closes_row = (state + 1) % n_cols == 0
        print("| " + cell + " ", end="|\n" if closes_row else "")

In [8]:
def print_state_value_function(V, P, n_cols=4, prec=3, title='State-value function:'):
    """Print the value of each state as a text grid.

    Args:
        V: sequence of state values indexed by state.
        P: transition model indexed as ``P[state][action]``, each entry an
            iterable of ``(prob, next_state, reward, done)`` tuples.
        n_cols: number of cells per printed row.
        prec: number of decimals the value is rounded to before printing.
        title: heading printed before the grid.

    A state whose transitions all carry a true ``done`` flag is shown as an
    empty cell; every other state shows its zero-padded index and its
    rounded value.
    """
    print(title)
    for state in range(len(P)):
        # The value is read for every state, including blank ones.
        value = V[state]
        done_flags = [done
                      for transitions in P[state].values()
                      for _, _, _, done in transitions]
        if np.all(done_flags):
            cell = "".rjust(9)
        else:
            rounded = '{}'.format(np.round(value, prec))
            cell = "{} {}".format(str(state).zfill(2), rounded.rjust(6))
        # Close the row with a bar and newline after every n_cols cells.
        closes_row = (state + 1) % n_cols == 0
        print("| " + cell + " ", end="|\n" if closes_row else "")

In [23]:
def probability_success(env, pi, goal_state, n_episodes=100, max_steps=200, seed=123):
    """Estimate how often following `pi` ends an episode in `goal_state`.

    Relies on the notebook-level ``random`` and ``np`` imports, like the other
    helpers in this notebook.

    Args:
        env: environment exposing ``seed(value)``, ``reset()`` returning the
            initial state, and ``step(action)`` returning a 4-tuple
            ``(state, reward, done, info)``.
        pi: callable mapping a state to the action to take.
        goal_state: state that counts as a success when an episode stops there.
        n_episodes: number of episodes to simulate.
        max_steps: cap on the number of steps per episode.
        seed: value used to seed ``random``, ``np.random`` and ``env`` before
            simulating; pass ``None`` to leave all three generators untouched.

    Returns:
        Fraction (0.0 to 1.0) of episodes whose last observed state equals
        ``goal_state``.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        env.seed(seed)

    successes = []
    for _ in range(n_episodes):
        state, done, steps = env.reset(), False, 0
        while not done and steps < max_steps:
            state, _, done, _ = env.step(pi(state))
            steps += 1
        # An episode cut off by max_steps counts as a success only if it
        # happens to be sitting on the goal state at that moment.
        successes.append(state == goal_state)

    return np.mean(np.array(successes, dtype=np.bool_))


In [10]:
def mean_return(env, pi, n_episodes=100, max_steps=200, seed=123):
    """Estimate the average undiscounted return obtained by following `pi`.

    Args:
        env: environment exposing ``seed(value)``, ``reset()`` returning the
            initial state, and ``step(action)`` returning a 4-tuple
            ``(state, reward, done, info)``.
        pi: callable mapping a state to the action to take.
        n_episodes: number of episodes to simulate.
        max_steps: cap on the number of steps per episode.
        seed: value used to seed ``random``, ``np.random`` and ``env`` before
            simulating; pass ``None`` to leave all three generators untouched.

    Returns:
        Mean over episodes of the plain (undiscounted) sum of rewards.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        env.seed(seed)

    returns = []
    for _ in range(n_episodes):
        state, done, steps = env.reset(), False, 0
        episode_return = 0.0
        while not done and steps < max_steps:
            state, reward, done, _ = env.step(pi(state))
            episode_return += reward
            steps += 1
        returns.append(episode_return)
    return np.mean(returns)

**CREATING THE FROZEN LAKE ENVIRONMENT**

In [18]:
# Custom 4x4 map, one string per row: S = start, F = frozen, H = hole, G = goal.
envdesc = ['HFFF', 'HSFF', 'FFGH', 'FHFH']
env = gym.make('FrozenLake-v1', desc=envdesc)
init_state = env.reset()
# States are numbered row by row, so the goal's state index is the position of
# 'G' in the concatenated map (10 for this layout); deriving it keeps the
# value correct if the map is edited.
goal_state = ''.join(envdesc).index('G')
# `unwrapped` returns the base environment no matter how many wrappers
# gym.make applied, whereas `env.env` strips exactly one layer.
P = env.unwrapped.P

In [19]:
def value_iteration(P, gamma=1.0, theta=1e-10):
    """Compute state values and a greedy policy by value iteration.

    Args:
        P: transition model indexed as ``P[state][action]``, each entry an
            iterable of ``(prob, next_state, reward, done)`` tuples. States
            are indexed ``0 .. len(P) - 1``; the width of the action-value
            table is taken from ``len(P[0])``.
        gamma: discount factor applied to the value of the next state.
        theta: convergence threshold on the largest change of any state value
            between two consecutive sweeps.

    Returns:
        ``(V, pi)`` where ``V`` is a float64 array of state values and ``pi``
        is a callable mapping a state index to its greedy action. ``pi``
        raises ``KeyError`` for a state outside ``0 .. len(P) - 1``.
    """
    n_states, n_actions = len(P), len(P[0])
    V = np.zeros(n_states, dtype=np.float64)
    while True:
        Q = np.zeros((n_states, n_actions), dtype=np.float64)
        for s in range(n_states):
            for a in range(len(P[s])):
                for prob, next_state, reward, done in P[s][a]:
                    # `not done` zeroes the bootstrap term on terminal transitions.
                    Q[s][a] += prob * (reward + gamma * V[next_state] * (not done))
        V_new = np.max(Q, axis=1)
        # On convergence the loop exits before V is overwritten, so the
        # returned V is the estimate the final Q was computed from (it differs
        # from the row-wise maximum of Q by less than theta).
        if np.max(np.abs(V - V_new)) < theta:
            break
        V = V_new
    # Build the greedy lookup table once instead of on every policy call.
    greedy_action = {s: a for s, a in enumerate(np.argmax(Q, axis=1))}
    pi = lambda s: greedy_action[s]
    return V, pi

In [20]:
# Finding the optimal policy: run value iteration on the transition model P
# with a discount factor of 0.99. The call returns the state-value array and
# a callable policy that maps a state to its greedy action.
V_best_v, pi_best_v = value_iteration(P, gamma=0.99)

In [21]:
# Printing the policy: author banner first, then the greedy action for every
# non-terminal state on print_policy's default 4-column grid.
# NOTE: only the policy is printed in this cell; the state-value function is
# printed by a separate cell further down.
print("Name: Krithick Vivekananda\nRegister Number: 212223240075")
print('Optimal policy and state-value function (VI):')
print_policy(pi_best_v, P)

Name: Krithick Vivekananda
Register Number: 212223240075
Optimal policy and state-value function (VI):
Policy:
|           | 01      > | 02      < | 03      < |
|           | 05      > | 06      < | 07      ^ |
| 08      v | 09      ^ |           |           |
| 12      < |           | 14      < |           |


In [24]:
# Printing the success rate and the mean return, both estimated by simulating
# the policy in the environment with the helpers' default settings.
# probability_success returns a fraction, hence the multiplication by 100.
print('Reaches goal {:.2f}%. Obtains an average undiscounted return of {:.4f}.'.format(
    probability_success(env, pi_best_v, goal_state=goal_state)*100,
    mean_return(env, pi_best_v)))

Reaches goal 100.00%. Obtains an average undiscounted return of 1.0000.


In [25]:
# Printing the state value function found by value iteration, rounded to
# 4 decimal places; terminal states are left blank.
print_state_value_function(V_best_v, P, prec=4)

State-value function:
|           | 01 0.8761 | 02 0.8848 | 03 0.8701 |
|           | 05 0.8939 | 06 0.9203 | 07 0.8819 |
| 08 0.8611 | 09 0.9125 |           |           |
| 12 0.8357 |           | 14 0.4975 |           |
