### Assignment: Week 1
## Modeling simple RL problems by making their MDPs in Python

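We will create the MDPs for some of the example problems from the *Grokking Deep Reinforcement Learning* textbook. Each MDP is a nested dictionary, `mdp[state][action]`, whose value is a list of `(probability, next_state, reward, done)` transitions.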

## Environment 0 - Bandit Walk

In [1]:
# Bandit Walk: three states in a row (0, 1, 2) and two actions.
# Every entry is a list of (probability, next_state, reward, done) tuples and
# all transitions here are deterministic (probability 1).
#   * States 0 and 2 only loop back onto themselves with done=True.
#   * From state 1, "Right" lands on state 2 and pays 1; "Left" lands on
#     state 0 and pays nothing.
bw_mdp = {
    0: dict(Right=[(1, 0, 0, True)], Left=[(1, 0, 0, True)]),
    1: dict(Right=[(1, 2, 1, True)], Left=[(1, 0, 0, True)]),
    2: dict(Right=[(1, 2, 0, True)], Left=[(1, 2, 0, True)]),
}

## Environment 1 - Slippery Walk Five (SWF)

In [2]:
# Slippery Walk: a corridor of 7 states (0..6) whose two end states are
# terminal. Each entry of swf_mdp[state][action] is a
# (probability, next_state, reward, done) tuple.
swf_mdp = {}

terminal_states = [0, 6]
swf_goal_state = 6  # the only state that pays a reward when reached

for s in range(7):
    if s in terminal_states:
        # Terminal states absorb: every action loops back with no reward.
        swf_mdp[s] = {
            "Right": [(1, s, 0, True)],
            "Left": [(1, s, 0, True)]
        }
        continue

    nxt_right = min(s + 1, 6)
    nxt_left = max(s - 1, 0)

    # For the chosen action: 1/2 move as intended, 1/3 stay put,
    # 1/6 slip one step in the opposite direction.
    outcomes = {
        "Right": [(1/2, nxt_right), (1/3, s), (1/6, nxt_left)],
        "Left": [(1/2, nxt_left), (1/3, s), (1/6, nxt_right)],
    }

    # The reward depends only on the state that is reached, never on the
    # action that was chosen: arriving at state 6 pays 1 (including the
    # backward slip from state 5 while going "Left"), and arriving at
    # state 0 pays 0 (including the intended "Left" move from state 1).
    swf_mdp[s] = {
        action: [
            (prob, nxt, 1 if nxt == swf_goal_state else 0, nxt in terminal_states)
            for prob, nxt in branches
        ]
        for action, branches in outcomes.items()
    }

## Environment 2 - Frozen Lake Environment

In [3]:
# Frozen Lake: states are numbered 0..15.
# Reaching goal_state is the only transition that pays a reward.
goal_state = 15
# Every state listed here ends the episode (the goal is one of them).
terminal_states = [5, 7, 11, 12, goal_state]

# Filled in below as fl_mdp[state][action] -> list of transitions.
fl_mdp = dict()

def to_pos(s):
    """Split a flat state index into its (row, column) pair on a 4-wide grid."""
    row = s // 4
    col = s % 4
    return (row, col)

def to_state(r, c):
    """Combine a (row, column) pair on a 4-wide grid into a flat state index."""
    row_offset = 4 * r
    return row_offset + c

# (row delta, column delta) applied by each action; "Down" increases the row.
moves = dict(Up=(-1, 0), Down=(1, 0), Left=(0, -1), Right=(0, 1))

# The action a quarter turn counter-clockwise from each action.
left_turn = dict(Up="Left", Left="Down", Down="Right", Right="Up")

# Inverse of left_turn: the action a quarter turn clockwise from each action.
right_turn = dict(zip(left_turn.values(), left_turn.keys()))

# Fill fl_mdp[state][action] with (probability, next_state, reward, done)
# transitions for all 16 states.
for state in range(16):
    fl_mdp[state] = {}

    if state in terminal_states:
        # Terminal states absorb: every action loops back with no reward.
        for a in moves:
            fl_mdp[state][a] = [(1, state, 0, True)]
        continue

    r, c = to_pos(state)

    for action in moves:
        # The direction actually taken is the chosen one or either of its
        # two perpendicular neighbours, each with probability 1/3.
        candidates = (left_turn[action], action, right_turn[action])
        transitions = []

        for act in candidates:
            dr, dc = moves[act]
            nr = r + dr
            nc = c + dc

            # A step that would leave the 4x4 grid keeps the agent in place.
            inside = 0 <= nr < 4 and 0 <= nc < 4
            next_state = to_state(nr, nc) if inside else state

            transitions.append((
                1/3,
                next_state,
                int(next_state == goal_state),   # 1 only when the goal is reached
                next_state in terminal_states,
            ))

        fl_mdp[state][action] = transitions

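You can compare this with the OpenAI Gym FrozenLake MDP. Note that Gym keys actions by integer (0 = Left, 1 = Down, 2 = Right, 3 = Up) rather than by name.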

In [4]:
import gym
import pprint

# Fetch Gym's own FrozenLake transition table so it can be inspected next to
# fl_mdp. `unwrapped` returns the base environment no matter how many wrappers
# gym.make() stacks on top of it, whereas a single `.env` hop removes only the
# outermost wrapper and relies on the remaining ones forwarding `P`.
P = gym.make('FrozenLake-v1').unwrapped.P
pprint.pprint(P)