# Assignment 3

## Dynamic Programming (DP)

### 4.1 Iterative Policy Evaluation


- Problem: evaluate a given policy $\pi$
- Solution: iterative application of Bellman expectation backup

$$v_{k+1}= R_\pi + \gamma  P_\pi v_k$$


In [None]:
def policy_evaluation(self, pol:Policy) -> np.array:
    '''Iteratively evaluate a policy with the Bellman expectation backup.

    Starting from the zero vector, repeatedly applies
    v_{k+1} = R_pi + gamma * P_pi v_k, where R_pi and P_pi are the reward
    vector and transition matrix of the MRP returned by self.find_mrp(pol),
    until two successive iterates agree component-wise.

    :param pol: the policy to evaluate
    :return: the value-function vector, with one entry per element of
        self.states
    '''
    mrp = self.find_mrp(pol)
    v0 = np.zeros(len(self.states))
    converge = False
    while not converge:
        v1 = mrp.reward_vector + self.gamma*mrp.transition_matrix.dot(v0)
        # Compare the iterates entry by entry. Comparing only their norms
        # would declare convergence for two different vectors that happen
        # to have the same magnitude.
        # NOTE(review): is_approx_eq is defined elsewhere; it is used here,
        # as before, to compare two scalars -- confirm its tolerance suits.
        converge = all(is_approx_eq(new, old) for new, old in zip(v1, v0))
        v0 = v1
    return v1

### 4.2 Policy Iteration

- Policy evaluation: estimate $v_\pi$ by some policy evaluation algorithm (e.g. iterative)
- Policy improvement: generate $\pi' \geq \pi$ by some policy improvement algorithm (e.g. greedy)

In [None]:
def find_improved_policy(self, pol: Policy) -> DetPolicy:
    '''Build the greedy deterministic policy for pol's action-value function.

    For every state in the dictionary returned by
    self.find_act_value_func_dict(pol), the action with the largest value
    is selected.

    :param pol: the policy whose action-value function is maximised
    :return: a DetPolicy built from the state -> best-action mapping
    '''
    greedy_actions = {}
    for state, action_values in self.find_act_value_func_dict(pol).items():
        # max() keeps the first maximal entry, so ties go to the action
        # that comes first in the dictionary's iteration order.
        best_action, _ = max(action_values.items(), key=lambda pair: pair[1])
        greedy_actions[state] = best_action
    return DetPolicy(greedy_actions)
            
def policy_iteration(self, tol=1e-4) -> DetPolicy:
    '''Search for the optimal policy by alternating evaluation and greedy improvement.

    Starts from the policy that spreads probability uniformly over the
    actions listed for each state in self.state_action_dict, then keeps
    replacing it with self.find_improved_policy(pol) until the largest
    change in any state's value between two rounds falls below tol.

    :param tol: stopping threshold on the largest per-state value change
    :return: the policy held when the loop stops
    '''
    uniform_probs = {}
    for state, actions in self.state_action_dict.items():
        uniform_probs[state] = {a: 1. / len(actions) for a in actions}
    pol = Policy(uniform_probs)
    vf = self.find_value_func_dict(pol)

    # Seed the residual well above tol so that the loop body is entered.
    epsilon = tol * 1e4
    while epsilon >= tol:
        pol = self.find_improved_policy(pol)
        new_vf = self.find_value_func_dict(pol)
        # Largest absolute change of any state's value in this round.
        epsilon = max(abs(new_vf[state] - vf[state]) for state in vf)
        vf = new_vf
    return pol

### 4.3 Value Iteration


- Problem: find optimal policy $\pi$.
- Only initialize $v_0$, no policy needed. 
- Solution: iterative application of the Bellman optimality backup. The optimal policy is extracted only once, at the end, after the value function has converged.

$$v_{k+1}=\max_{a \in A} \left( R^a + \gamma P^a v_k \right)$$


In [None]:
def value_iteration_1(self, tol = 1e-4) -> DetPolicy:
    '''Find the optimal policy by value iteration (Bellman optimality backup).

    Every joint choice of one action per state is enumerated. For each
    choice the backup R + gamma * P v_k is computed, with P taken from
    MP(...).transition_matrix. The new value function is the column-wise
    maximum over all choices. Iteration stops once the largest change in
    any state's value is at most tol, and the greedy action of the final
    backup is returned for each state.

    NOTE(review): assumes self.rewards maps state -> {action: reward} and
    self.transitions maps state -> {action: transition row}, and that both
    dictionaries and the matrix built by MP are ordered like
    self.all_states -- confirm against the class definition.
    NOTE(review): the number of enumerated choices is the product of the
    per-state action counts, so this only scales to small MDPs.

    :param tol: stopping threshold on the largest per-state value change
    :return: a DetPolicy built from the state -> greedy-action mapping
    '''
    num_states = len(self.all_states)
    # Initialize value-function
    vf = np.zeros(num_states)
    transition_states = list(self.transitions.keys())
    reward_list = list(self.rewards.values())
    prob_list = list(self.transitions.values())
    # All joint assignments of one action to every state
    action_permutations = list(itertools.product(*(list(v) for v in reward_list)))

    # Always perform at least one backup so a policy can be extracted even
    # when tol is large.
    while True:
        value_function_matrix = np.zeros((len(action_permutations), num_states))
        # Row i holds the one-step backup under joint action choice i
        for i, actions in enumerate(action_permutations):
            current_rewards = np.array([state_rewards[a] for state_rewards, a
                                        in zip(reward_list, actions)])
            current_transition_graph = {s: state_probs[a] for s, state_probs, a
                                        in zip(transition_states, prob_list, actions)}
            current_transition_matrix = MP(current_transition_graph).transition_matrix
            value_function_matrix[i, :] = current_rewards + \
                self.gamma * current_transition_matrix.dot(vf)

        # Column j of a row depends only on the action chosen for state j,
        # so the optimality backup is the column-wise maximum, and the row
        # attaining it identifies the greedy action for that state.
        best_rows = value_function_matrix.argmax(axis=0)
        new_vf = value_function_matrix.max(axis=0)
        # Update the residual
        epsilon = np.max(np.absolute(new_vf - vf))
        vf = new_vf
        if epsilon <= tol:
            break

    # Extract the optimal policy from the rows that achieved the maxima
    pol = {self.all_states[j]: action_permutations[best_rows[j]][j]
               for j in range(num_states)}
    print(pol)
    return DetPolicy(pol)