# Exercise 1 - Modifying the environment

We will make the problem slightly more complicated.

The floor is now covered in ice! 
When an agent makes a movement, it might slip and end up in another cell close to it.
##### The probability of arriving in the intended cell is 0.6, and the probability of slipping into each of the 4 cells adjacent to it is 0.1.

As in Lab 02, you should create a new Dungeon by inheriting from the original Dungeon environment.
##### Again, the step method will return the state instead of observations.
The step function should also incorporate these slippery dynamics.

##### When the agent slips, rewards accumulate! You could bang your head on the wall twice...


In [181]:
from dungeon.dungeon import Dungeon
import numpy as np
from collections import namedtuple, defaultdict

class IceDungeon(Dungeon):
    """Dungeon with a slippery floor whose states are arbitrary integer labels.

    After the intended move, an extra "slip" move is drawn: 'stay' with
    probability 0.6, or one of the four directions with probability 0.1 each.
    ``step`` and ``reset`` return a state index in addition to / instead of
    observations.
    """

    # Extra move drawn after every intended move; 'stay' means "no slip".
    _SLIP_ACTIONS = ('stay', 'up', 'down', 'left', 'right')
    _SLIP_PROBABILITIES = (0.6, 0.1, 0.1, 0.1, 0.1)

    def __init__(self, N):
        """Build an N x N dungeon and assign a random index to every cell."""
        super().__init__(N)

        # To show explicitly that the way states are represented doesn't
        # matter, every grid coordinate gets a random index in [0, N*N).
        index_states = np.arange(0, N * N)
        np.random.shuffle(index_states)
        # Reshape to (N, N) so the table can be indexed by (row, col).
        self.coord_to_index_state = index_states.reshape(N, N)
        print(self.coord_to_index_state)

    def _current_state(self):
        """Return the state index of the cell the agent currently occupies."""
        # position_agent is presumably maintained by the parent Dungeon.
        return self.coord_to_index_state[self.position_agent[0],
                                         self.position_agent[1]]

    def step(self, action):
        """Apply ``action``, then possibly slip one more cell.

        Returns:
            (obs, rew, done, bump, state) where ``rew`` is the SUM of the
            reward of the intended move and of the slip move (if any), and
            ``obs``, ``done`` and ``bump`` come from the last move performed.
        """
        # Assumes Dungeon.step returns (obs, rew, done, bump).
        obs, rew, done, bump = super().step(action)

        slip_action = np.random.choice(self._SLIP_ACTIONS,
                                       p=self._SLIP_PROBABILITIES)

        # NOTE(review): no slip happens after a bump or a terminal move; the
        # exercise text hints that bumping twice is possible - confirm whether
        # the bump guard is intended.
        if not done and not bump and slip_action != 'stay':
            obs, slip_rew, done, bump = super().step(slip_action)
            # Rewards accumulate over the intended move and the slip.
            rew = rew + slip_rew

        return obs, rew, done, bump, self._current_state()

    def reset(self):
        """Reset the parent environment and return the initial state index."""
        super().reset()
        return self._current_state()
        

In [182]:
# Build a 10 x 10 icy dungeon (the constructor prints the coordinate -> state
# index table), reset it, and show the resulting grid.
dungeon = IceDungeon(10)
dungeon.reset()
dungeon.display()

[[15 60 92  9 37 84 74 89 36  2]
 [77 22 46 28 72 93 75 86 23 79]
 [85 98  5 54  4 81 11  8 83 57]
 [16 73 58 13 56 20 99 68 32 41]
 [43 50 88 17 42 18 52 91 29 76]
 [27  3 55 95 10 64 62 35 61 80]
 [38 63 78 34 21 97  7 49 66  0]
 [59 71 30  1 12 51 82 26 69 39]
 [87 70 94 31 44 40 67 24  6 19]
 [33 96 53 45 48 14 47 90 25 65]]
X X X X X X X X X X 
X . . . . . . L . X 
X . . . . X . X . X 
X . . . A . . . . X 
X . . . . . . . . X 
X . . . . . L L . X 
X . . . . . . X X X 
X . . L . E . L . X 
X . . . . . X . . X 
X X X X X X X X X X 



In [183]:
# Take a single step; IceDungeon.step returns (obs, rew, done, bump, state).
a = dungeon.step('up')

# Exercise 2 - Monte-carlo learning

We will use a simple random policy function to evaluate the state values. 

First of all, create a random policy function that just picks random actions (check Lab 01).

Then, implement an MC learning class that learns the state values from full rollouts of the policy in the environment.

Finally, you can generate rollouts of your policy in an environment, and update the values using MC-learning.

In [249]:
def random_policy():
    """Return one of the four movement actions, drawn uniformly at random."""
    moves = ['up', 'down', 'left', 'right']
    chosen = np.random.choice(moves)
    return chosen
    
class MC_Learning():
    """First-visit Monte Carlo estimation of state values for a fixed policy.

    Attributes:
        N: side length of the square grid.
        environment: object exposing ``reset() -> state``,
            ``step(action) -> (obs, rew, done, bump, state)`` and an
            ``(N, N)`` array ``coord_to_index_state``.
        policy: zero-argument callable returning an action.
        gamma: discount factor.
        episodes: number of rollouts performed by ``update_values``.
        values: dict mapping state index -> estimated value.
    """

    def __init__(self, N, environment, policy, gamma, episodes):
        self.N = N
        self.environment = environment
        self.policy = policy
        self.gamma = gamma
        self.episodes = episodes
        self.values = {}

    def generate_episode(self):
        """Roll out one full episode and return ``[(state, return), ...]``.

        The list is in visiting order and contains every visited state except
        the terminal one (its value is 0 by definition). The return attached
        to a state is the discounted sum of the rewards received after
        leaving it.
        """
        state = self.environment.reset()

        # The reward paired with the start state is a placeholder: no move
        # led to it.
        states_and_rewards = [(state, 0)]
        done = False
        while not done:
            action = self.policy()
            _obs, rew, done, _bump, state = self.environment.step(action)
            states_and_rewards.append((state, rew))

        # Walk backwards, pairing each state with the reward obtained on the
        # transition out of it: G_t = r_{t+1} + gamma * G_{t+1}.
        G = 0
        states_and_returns = []
        for (state, _), (_, next_rew) in zip(
                reversed(states_and_rewards[:-1]),
                reversed(states_and_rewards[1:])):
            G = next_rew + self.gamma * G
            states_and_returns.append((state, G))
        states_and_returns.reverse()  # back to visiting order

        return states_and_returns

    def update_values(self):
        """Run ``self.episodes`` rollouts and average first-visit returns.

        The estimate of each state is the mean of the returns observed at its
        first visit in every episode of this call. The mean is maintained
        incrementally so the cost per update does not grow with the number of
        episodes. The resulting ``values`` dict is printed at the end.
        """
        visit_counts = {}

        for _ in range(self.episodes):
            seen_states = set()
            for state, G in self.generate_episode():
                if state in seen_states:
                    continue  # first-visit MC: only the first occurrence counts
                seen_states.add(state)

                count = visit_counts.get(state, 0) + 1
                visit_counts[state] = count
                # On the first sample of this call, discard any older estimate.
                previous = self.values[state] if count > 1 else 0.0
                self.values[state] = previous + (G - previous) / count
        print(self.values)

    def display_values(self):
        """Print the value estimates laid out like the grid.

        Cells whose state has no estimate yet are shown as ``nan``.
        """
        grid = np.full((self.N, self.N), np.nan)
        index_table = self.environment.coord_to_index_state
        for (row, col), state in np.ndenumerate(index_table):
            if state in self.values:
                grid[row, col] = self.values[state]
        with np.printoptions(precision=2, suppress=True):
            print(grid)

In [None]:
# Estimate state values on a fresh 10 x 10 icy dungeon with Monte Carlo
# learning under the random policy (gamma = 0.9, 9999 episodes).
dungeon = IceDungeon(10)
test = MC_Learning(10, dungeon, random_policy, 0.9, 9999)
test.update_values()

[[87 93 16 18 12 81 28 76 98 38]
 [99 36 35 95 68 90 82 92 10 86]
 [48 88 52 96 60 46  2 50 45 78]
 [21 33 94 27 64 77 97 54 19 30]
 [75 49 63 62 51 59 67  4 37 25]
 [39 43 61 66 91 70  9  1 71 34]
 [73 83 80 89 32 26  6 40 65 58]
 [31 22 14 24 42 74 57 55  3 85]
 [ 5 72 13 84 44 79 23 17  0 20]
 [29  8 69 47 11 56 53 41  7 15]]


# Exercise 3: TD-learning

We will now compare the values obtained by MC learning with values obtained with TD learning.

Create a class that updates the values every time a new state transition occurs.


In [None]:
class TD_Learning():
    """Tabular TD(0) state-value learning, updated after every transition.

    States are expected to be integer indices in ``[0, N*N)``.
    """

    def __init__(self, N, alpha=0.1, gamma=0.9):
        """Create a zero-initialised value table for an N x N grid.

        Args:
            N: side length of the square grid.
            alpha: learning rate (step size) of the TD update.
            gamma: discount factor.
        """
        self.N = N
        self.alpha = alpha
        self.gamma = gamma
        self.values = np.zeros(N * N)

    def update_values(self, s_current, reward_next, s_next):
        """Apply one TD(0) update for the transition s_current -> s_next.

        V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s))

        A state that is never passed as ``s_current`` (e.g. a terminal state)
        keeps its initial value of 0.
        """
        td_target = reward_next + self.gamma * self.values[s_next]
        td_error = td_target - self.values[s_current]
        self.values[s_current] += self.alpha * td_error

    def display_values(self, coord_to_index_state=None):
        """Print the value table as an N x N array.

        Args:
            coord_to_index_state: optional (N, N) integer array mapping grid
                coordinates to state indices. When given, values are laid out
                like the grid; otherwise they are shown in state-index order.
        """
        if coord_to_index_state is None:
            grid = self.values.reshape(self.N, self.N)
        else:
            grid = self.values[np.asarray(coord_to_index_state)]
        with np.printoptions(precision=2, suppress=True):
            print(grid)
        