# SARSA in windy gridworld
***
### The gridworld:
<img src="https://www.researchgate.net/profile/Markus_Dumke/publication/320890681/figure/fig1/AS:763210537922560@1558974980641/The-windy-gridworld-task-The-goal-is-to-move-from-the-start-state-S-to-the-goal-state-G.jpg" alt="The windy gridworld" title="Windy gridworld" />

#### Choice of actions:
 - 0: up
 - 1: down
 - 2: right
 - 3: left

In [1]:
import numpy as np  
    

In [2]:
# Environment constants.
# The grid has 7 rows and 10 columns; cells are numbered row by row, so a
# state index equals row * columns + column.
rows, columns = 7, 10
num_states = rows * columns
num_actions = 4  # 0: up, 1: down, 2: right, 3: left
# Every episode begins in START_STATE and finishes on reaching END_STATE.
START_STATE = 30
END_STATE = 37

In [3]:
def windy(state):
    """Return the wind strength of the column that `state` lies in.

    The column is computed as ``state % 10``. Columns 6 and 7 have
    strength 2, columns 3 to 5 and column 8 have strength 1, and every
    other column has strength 0.
    """
    column = state % 10
    if column == 6 or column == 7:
        return 2
    if 3 <= column <= 5 or column == 8:
        return 1
    return 0
    
def outside_limits(state):
    """Return True when `state` is below 0 or above 69, False otherwise."""
    return state < 0 or state > 69

In [4]:
def env_step(state, action):
    """Apply one action in the windy gridworld and return the outcome.

    The agent first moves one cell in the chosen direction (staying put if
    that would leave the grid), then the wind of the column it started in
    pushes it upward by `wind` rows. A position pushed above the top row is
    brought back down row by row until it is inside the grid again.

    Args:
        state: current state index.
        action: 0 (up), 1 (down), 2 (right) or 3 (left).

    Returns:
        A two-element list ``[next_state, reward]``; the reward is always -1.

    Raises:
        ValueError: if `action` is not one of 0, 1, 2, 3.
    """
    # Wind depends on the state being left, not the one being entered.
    wind = windy(state)
    col = state % columns

    if action == 0:  # up
        # May be negative here; the clamp loop below brings it back in.
        moved = state - columns
    elif action == 1:  # down
        moved = state + columns
        if outside_limits(moved):
            moved = state
    elif action == 2:  # right
        moved = state if col == columns - 1 else state + 1
    elif action == 3:  # left
        moved = state if col == 0 else state - 1
    else:
        # Previously an unknown action surfaced as an UnboundLocalError.
        raise ValueError("action must be 0, 1, 2 or 3, got %r" % (action,))

    # Wind pushes towards lower state indices (upward), one row per unit.
    next_state = moved - wind * columns
    while outside_limits(next_state):
        next_state += columns

    # reward at each time step is -1 until the goal is reached
    reward = -1
    return [next_state, reward]



In [5]:
def sarsa(alpha, epsilon, gamma, episodes):
    """Learn an action-value table with the SARSA update rule.

    Each episode starts in START_STATE and runs until END_STATE is reached.
    Actions are selected epsilon-greedily from the table being learnt.

    Args:
        alpha: step size applied to each temporal-difference error.
        epsilon: probability of taking a uniformly random action.
        gamma: discount factor applied to the next state-action value.
        episodes: number of episodes to run.

    Returns:
        Array of shape (num_states, num_actions) with the learnt values.
    """
    Q = np.zeros((num_states, num_actions))

    def pick_action(state):
        # Epsilon-greedy over Q: exploit the greedy action when the draw
        # exceeds epsilon, otherwise explore with a random action.
        if np.random.random() > epsilon:
            return np.argmax(Q[state])
        return np.random.randint(0, num_actions)

    for _ in range(episodes):
        state = START_STATE
        action = pick_action(state)
        while state != END_STATE:
            successor, reward = env_step(state, action)
            successor_action = pick_action(successor)
            # Move Q(s, a) towards reward + gamma * Q(s', a').
            td_target = reward + gamma * Q[successor, successor_action]
            Q[state, action] += alpha * (td_target - Q[state, action])
            state, action = successor, successor_action
    return Q

In [6]:
def greedy_policy(Q):
    """Pick the highest-valued action for every state and lay it out on the grid.

    Args:
        Q: action-value table indexed as ``Q[state]``, giving one value per
            action, for every state in ``range(num_states)``.

    Returns:
        Float array of shape (rows, columns); entry ``[r][c]`` is the index
        of the greedy action for state ``r * columns + c`` (the first such
        index when several actions tie, as returned by ``np.argmax``).
    """
    pi = np.zeros((rows, columns))
    for state in range(num_states):
        # Use the grid width constant rather than a hard-coded 10 so the
        # row/column split always agrees with the environment constants.
        row, col = divmod(state, columns)
        pi[row][col] = np.argmax(Q[state])
    return pi

In [25]:
# Settings passed to sarsa() below.
alpha, epsilon = 0.5, 0.1  # step size, exploration probability
gamma = 1  # undiscounted task
episodes = 800000

In [26]:
# Run SARSA with the settings above; the result holds one row per state and
# one column per action.
value_func = sarsa(alpha=alpha, epsilon=epsilon, gamma=gamma, episodes=episodes)
print("value function after sarsa:  \n\n", value_func)

value function after sarsa:  

 [[-20.30219889 -20.65936121 -18.55421062 -20.45589392]
 [-20.09156595 -19.67884434 -17.44371898 -19.7732199 ]
 [-17.71814819 -17.93670987 -15.58741699 -18.68159937]
 [-16.12701517 -15.56078134 -14.58244986 -17.73387109]
 [-15.81661437 -15.33672106 -12.7893856  -15.79473408]
 [-15.51905613 -13.44412248 -12.09659627 -15.83354408]
 [-13.68131517 -12.00870231 -10.9804696  -14.85360932]
 [-11.54290526 -11.53215966  -9.1697782  -13.5167977 ]
 [-11.13897925 -10.14236146  -7.93300637 -12.15447049]
 [ -9.35684336  -6.92148078 -10.98270021 -11.44387474]
 [-20.52600076 -20.13126318 -18.33505203 -20.93987213]
 [-19.35220852 -20.01466013 -17.75751441 -20.50560417]
 [-16.45987395 -17.65477595 -17.38256435 -18.66566274]
 [-15.37974363 -16.00717456 -15.59956937 -17.63946682]
 [-14.91969845 -14.80911947 -12.93887627 -15.05878569]
 [-13.9777817  -13.23460499 -12.33113867 -15.19465275]
 [-12.21914113 -12.50491482 -10.30209844 -12.23172579]
 [-12.11806246 -11.58631735  -9.2

In [27]:
# Turn the learnt values into a grid of greedy actions, one per cell.
policy = greedy_policy(Q=value_func)
print("policy learnt: \n\n", policy)

policy learnt: 

 [[2. 2. 2. 2. 2. 2. 2. 2. 2. 1.]
 [2. 2. 0. 0. 2. 2. 2. 2. 2. 1.]
 [2. 2. 2. 2. 2. 2. 2. 2. 2. 1.]
 [2. 2. 2. 2. 2. 2. 2. 0. 2. 1.]
 [2. 2. 2. 2. 2. 2. 0. 1. 3. 3.]
 [1. 2. 2. 2. 2. 0. 0. 2. 2. 0.]
 [2. 2. 2. 2. 0. 0. 0. 0. 0. 0.]]


### Visual of the policy learnt:
<img src="https://4.bp.blogspot.com/-nLpJR9NEvxA/W5FpxNVQu-I/AAAAAAAAAd8/NN0J3zNDXM0W0KLHvWIPpzGffBpYlEI3wCLcBGAs/s320/Capture.PNG" alt="Policy learnt in the windy gridworld" title="Windy gridworld" />

In [28]:
def run(policy, max_steps=None):
    """Follow `policy` from START_STATE to END_STATE and return the total reward.

    Args:
        policy: grid indexed as ``policy[row][column]`` giving the action to
            take in each cell.
        max_steps: optional cap on the number of steps. ``None`` (the
            default) means no cap, so a policy that never reaches END_STATE
            loops forever.

    Returns:
        Sum of the rewards collected along the way.

    Raises:
        RuntimeError: if `max_steps` is given and END_STATE is not reached
            within that many steps.
    """
    curr_state = START_STATE
    R = 0
    steps = 0
    while curr_state != END_STATE:
        if max_steps is not None and steps >= max_steps:
            raise RuntimeError(
                "policy did not reach END_STATE within %d steps" % max_steps
            )
        # Split the state index with the grid width constant, not a literal 10.
        row, col = divmod(curr_state, columns)
        # The policy grid may hold floats; hand env_step a plain integer.
        action = int(policy[row][col])
        curr_state, reward = env_step(curr_state, action)
        R += reward
        steps += 1
    return R

In [29]:
# Total reward collected when following the learnt greedy policy.
episode_return = run(policy)
print("return from following optimal policy: ", episode_return)

return from following optimal policy:  -15
