# SARSA in windy gridworld
***
### The gridworld:
<img src="windy_gridworld.png" alt="The windy gridworld" title="Windy gridworld" width="400"/>

#### Choice of actions:
 - 0: up
 - 1: down
 - 2: right
 - 3: left

In [1]:
import numpy as np

In [2]:
# Environment layout: a rows x columns grid whose cells are numbered
# row by row, so state = row*columns + col.
rows = 7
columns = 10
num_states = rows*columns
num_actions = 4  # 0: up, 1: down, 2: right, 3: left
# Start and goal cells both lie in row 3, at columns 0 and 7 respectively.
START_STATE = 3*columns
END_STATE = 3*columns + 7

In [3]:
def windy(state):
    """Return the wind strength for the column that `state` lies in.

    The column is computed as ``state % 10``. Columns 6 and 7 have
    strength 2, columns 3-5 and 8 have strength 1, and every other
    column has strength 0.
    """
    column = state % 10
    if column == 6 or column == 7:
        return 2
    if 3 <= column <= 5 or column == 8:
        return 1
    return 0
    
def outside_limits(state):
    """Return True when `state` is below 0 or above 69, False otherwise."""
    return bool(state < 0 or state > 69)

In [4]:
def env_step(state, action):
    """Apply `action` in `state` and return ``[next_state, reward]``.

    The move is resolved in two stages. First the agent's own move is
    applied (0: up, 1: down, 2: right, 3: left); a move that would leave
    the grid through the bottom, left or right edge leaves the agent where
    it is. Then the wind of the *current* state's column pushes the agent
    up by ``windy(state)`` rows, and the result is pulled back down one row
    at a time until it is inside the grid again.

    Args:
        state: index of the current cell.
        action: one of 0, 1, 2, 3 (values comparing equal, such as 2.0,
            are accepted as well).

    Returns:
        A two-element list ``[next_state, reward]``; the reward is always -1.

    Raises:
        ValueError: if `action` is not one of the four known actions.
    """
    wind = windy(state)
    col = state % columns

    # Stage 1: the agent's own move, ignoring wind.
    if action == 0:
        # Moving up may leave the grid through the top; the clamp below
        # brings it back, exactly as it does for wind overshoot.
        moved = state - columns
    elif action == 1:
        moved = state + columns
        if outside_limits(moved):
            moved = state
    elif action == 2:
        moved = state if col == columns - 1 else state + 1
    elif action == 3:
        moved = state if col == 0 else state - 1
    else:
        raise ValueError(
            "unknown action %r; expected 0 (up), 1 (down), 2 (right) or 3 (left)"
            % (action,)
        )

    # Stage 2: wind pushes upward, then clamp back into the grid.
    next_state = moved - wind*columns
    while outside_limits(next_state):
        next_state = next_state + columns

    # reward at each time step is -1 until the goal is reached
    reward = -1
    return [next_state, reward]



In [5]:
def sarsa(alpha, epsilon, gamma, episodes):
    """Learn an action-value table with on-policy SARSA.

    Every episode starts in START_STATE and runs until END_STATE is
    reached. Actions are selected epsilon-greedily with respect to the
    table being learnt.

    Args:
        alpha: step size applied to each temporal-difference error.
        epsilon: probability of taking a uniformly random action instead
            of the greedy one.
        gamma: discount factor applied to the next state-action value.
        episodes: number of episodes to run.

    Returns:
        Array of shape (num_states, num_actions) holding the learnt
        action values.
    """
    Q = np.zeros((num_states, num_actions))

    def choose_action(state):
        # Epsilon-greedy policy over Q: exploit the greedy action unless
        # the random draw falls within the exploration probability.
        if np.random.random() > epsilon:
            return np.argmax(Q[state])
        return np.random.randint(0, num_actions)

    for _ in range(episodes):
        state = START_STATE
        action = choose_action(state)
        while state != END_STATE:
            next_state, reward = env_step(state, action)
            next_action = choose_action(next_state)
            # SARSA update: move Q(s, a) towards r + gamma * Q(s', a').
            td_error = reward + gamma*Q[next_state, next_action] - Q[state, action]
            Q[state, action] += alpha*td_error
            state, action = next_state, next_action
    return Q

In [6]:
def greedy_policy(Q):
    """Build the grid of greedy actions implied by the action-value table `Q`.

    Returns a (rows, columns) float array whose entry for each state is the
    index of the highest-valued action in ``Q[state]``. A state is mapped to
    its cell as row ``state // 10`` and column ``state % 10``.
    """
    pi = np.zeros((rows, columns))
    for state in range(num_states):
        row, col = divmod(state, 10)
        pi[row, col] = np.argmax(Q[state])
    return pi

In [7]:
# Hyper-parameters passed to sarsa().
episodes = 8000  # number of training episodes
gamma = 1        # undiscounted task
alpha = 0.3      # step size
epsilon = 0.1    # exploration probability of the epsilon-greedy policy

In [8]:
# Train with the configured hyper-parameters and display the learnt Q-table.
value_func = sarsa(alpha=alpha, epsilon=epsilon, gamma=gamma, episodes=episodes)
print("value function after sarsa:  \n\n", value_func)

value function after sarsa:  

 [[-18.13802806 -18.15913542 -18.26329373 -18.15013251]
 [-18.44747074 -17.76805532 -17.80635995 -18.14067031]
 [-17.24887441 -17.24570577 -15.23658762 -18.02098559]
 [-16.03090274 -15.65591872 -14.10268709 -16.83408045]
 [-14.39303914 -14.1655091  -12.54400734 -15.52514267]
 [-13.10629431 -13.24971426 -11.64874836 -14.03822091]
 [-12.02525146 -11.64807454 -10.47596022 -12.42620216]
 [-10.96375614 -10.56106766  -9.63656953 -12.3903449 ]
 [ -9.41566104  -9.54177514  -8.32120329 -11.59621292]
 [ -8.21475012  -6.6998387   -8.42849143 -10.06557301]
 [-18.46244541 -18.45334535 -17.76554431 -18.55161237]
 [-18.14590587 -17.70009684 -16.91124684 -17.87559568]
 [-17.38834901 -17.20046707 -15.35736386 -17.91573836]
 [-15.12100357 -15.09542622 -13.58949928 -16.51548317]
 [-14.06413059 -14.23711691 -12.45048744 -14.91787845]
 [-12.53544263 -12.65770321 -11.5210509  -13.7765586 ]
 [-12.02673323 -11.61093567 -10.25514993 -12.64564142]
 [-12.10001046 -11.95181507  -9.2

In [9]:
# Derive the greedy action for every state from the learnt action values
# and display it laid out as the rows x columns grid.
policy = greedy_policy(value_func)
print ("policy learnt: \n\n", policy)

policy learnt: 

 [[0. 1. 2. 2. 2. 2. 2. 2. 2. 1.]
 [2. 2. 2. 2. 2. 2. 2. 2. 2. 1.]
 [2. 2. 2. 2. 2. 2. 2. 2. 2. 1.]
 [2. 2. 2. 2. 2. 2. 2. 0. 2. 1.]
 [2. 2. 2. 2. 2. 2. 0. 1. 3. 3.]
 [2. 2. 0. 0. 2. 0. 0. 1. 2. 0.]
 [2. 2. 2. 2. 0. 0. 0. 0. 0. 3.]]


### Visual of the policy learnt:
<img src="windy_path.png" alt="The windy gridworld" title="Windy gridworld" width="400" />

In [10]:
def run(policy):
    """Follow `policy` from START_STATE until END_STATE and return the total reward.

    Args:
        policy: 2-D table of actions, read as ``policy[row][col]`` with
            ``row = int(state/10)`` and ``col = state % 10``.

    Returns:
        The sum of the rewards collected along the way.

    Raises:
        RuntimeError: if the walk returns to a state it has already
            visited. env_step is deterministic, so with a fixed policy
            table (assumed here) a repeated state means the goal would
            never be reached and the loop would otherwise run forever.
    """
    curr_state = START_STATE
    R = 0
    visited = set()
    while curr_state != END_STATE:
        if curr_state in visited:
            raise RuntimeError(
                "policy revisits state %r without reaching the goal" % (curr_state,)
            )
        visited.add(curr_state)
        action = policy[int(curr_state/10)][curr_state%10]
        curr_state, reward = env_step(curr_state, action)
        R += reward
    return R

In [11]:
# Roll out the learnt greedy policy once from the start state and report
# the summed reward.
print ("return from following optimal policy: ", run(policy))

return from following optimal policy:  -15
