In [5]:
#################
# DO NOT REMOVE
# Versions
# numpy==1.18.0
################
import numpy as np
import mdptoolbox.example

class MDPAgent(object):
    def __init__(self):
        self.money_gain = 0
        pass
    
    def expectation(self, is_bad_side, money_gain):
        """Return the expected change in bankroll from one more roll.

        Args:
            is_bad_side: Sequence of flags, one per die face in face order;
                ``1`` marks a losing face and ``0`` a paying face.
            money_gain: Bankroll that would be forfeited on a losing face.

        Returns:
            The average payoff over all faces, where a paying face
            contributes its face number (its 1-based position) and a
            losing face contributes ``-money_gain``.
        """
        n_faces = len(is_bad_side)
        flags = np.array(is_bad_side)

        # Positions are 0-based while face numbers are 1-based.
        paying_faces = np.nonzero(flags == 0)[0] + 1
        losing_count = np.count_nonzero(flags == 1)

        total_win = sum(paying_faces)
        total_loss = losing_count * money_gain
        return (total_win - total_loss) / n_faces
    
    def rollresult(self, is_bad_side, money_gain):
        """Return the payoff of every possible face for a single roll.

        Args:
            is_bad_side: Sequence of flags, one per die face in face order;
                ``1`` marks a losing face and ``0`` a paying face.
            money_gain: Bankroll that is forfeited on a losing face.

        Returns:
            A float array with one entry per face: ``-money_gain`` for a
            losing face, the face number for a paying face, and ``0`` for
            any flag that is neither ``0`` nor ``1``.
        """
        outcomes = np.zeros(len(is_bad_side))
        for face, flag in enumerate(is_bad_side, start=1):
            if flag == 1:
                # Losing face: forfeit all money gained so far.
                outcomes[face - 1] = -money_gain
            elif flag == 0:
                # Paying face: win the number shown on the die.
                outcomes[face - 1] = face
        return outcomes
    
    def exp_threashold(self, is_bad_side, money_gain):
        """Calculate max_gain that have a positive expectation"""
        max_current = 0
        flag = 1
        while flag == 1:
            max_current = max_current + 1
            # print(self.expectation(is_bad_side, money_gain = max_current))
            if self.expectation(is_bad_side, max_current) < 0:
                flag = 0
            # print(max_current)
            
        return max_current - 1
    
    def decision(self, is_bad_side, money_gain):
        die_size = len(is_bad_side)
        quit_reward = money_gain
        roll_reward = money_gain + self.expectation(is_bad_side, money_gain)
     

    def solve(self, is_bad_side):
        """Implement the agent"""
        money_gain = self.money_gain 
        # for value > max_current, skip calculating expectation
        max_current = self.exp_threashold(is_bad_side, money_gain)
        print('max current = ', max_current)

        
        # step 1. determine maximum roll number
        # if expectations are all negative or zero, stop
        
        # initial roll must be able to run unless all sides are bad or all good
        is_bad_side = np.array(is_bad_side)
        
        if (is_bad_side == 1).sum() == len(is_bad_side):
            # all sides are bad
            exp = 0
        elif (is_bad_side == 0).sum() == len(is_bad_side):
            # all sides are good
            exp = 999999999
        else: 
            # search maximum depth 
            ini_exp = self.expectation(is_bad_side, money_gain)
            ini_result = self.rollresult(is_bad_side, money_gain)
            result = self.rollresult(is_bad_side, money_gain)
            
            
            ############################################################################
            
            run = 20
            length = len(is_bad_side)
            num_states = run * length + 2
            
            # find good sides
            is_bad_side = np.array(is_bad_side)
            is_good_side = -is_bad_side + 1
            
            ini_gain = self.rollresult(is_bad_side, money_gain)
            
            prob = np.zeros((2, num_states, num_states))

            np.fill_diagonal(prob[0], 1)
            # pylint: disable=invalid-name
            p = 1.0 / length
            zero = np.array([0]).repeat((run - 1) * length + 2)

            def summation(arr1, arr2):
                for _ in range(0, run * length + 2):
                    arr1 = np.concatenate((arr1, arr2), axis=0)
                return arr1

            # 1
            is_good_side_2 = np.concatenate((np.array([0]), is_good_side, zero), axis=0)
            is_good_side_n = np.concatenate((is_good_side_2, is_good_side_2), axis=0)
            is_good_side_n = summation(is_good_side_n, is_good_side_2)
            is_good_side_n = is_good_side_n[:(num_states ** 2)]
            is_good_side_n = is_good_side_n.reshape(num_states, num_states)
            prob[1] = np.triu(is_good_side_n)
            prob[1] = prob[1]*p
            prob_end = 1 - np.sum(prob[1, :num_states, :num_states-1], axis=1).reshape(-1, 1)

            prob[1] = np.concatenate((prob[1, :num_states, :num_states-1], prob_end), axis=1)
            np.sum(prob[0], axis=1)
            np.sum(prob[1], axis=1)
            rewards = np.zeros((2, num_states, num_states))
            rewards[0] = np.zeros((num_states, num_states))

            # 2

            dollar_2 = np.concatenate((np.array([0]), dollar, zero), axis=0)
            dollar_n = np.concatenate((dollar_2, dollar_2), axis=0)
            dollar_n = summation(dollar_n, dollar_2)
            dollar_n = dollar_n[:(num_states ** 2)]
            dollar_n = dollar_n.reshape(num_states, num_states)
            rewards[1] = np.triu(dollar_n)
            rewards_end = - np.array(range(0, num_states)).reshape(-1, 1)
            rewards[1] = np.concatenate((rewards[1, :num_states, :num_states-1], rewards_end), axis=1)

            val_it = mdptoolbox.mdp.ValueIteration(prob, rewards, 1)
            val_it.run()

            # optimal_policy = val_it.policy
            expected_values = val_it.V
            exp = (max(expected_values))
            
            ################################################################################
            
        return exp

    
    

[[0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5       ]
 [0.5 

In [3]:
import numpy as np

# Two separate but equal 2x2 nested lists used as scratch operands.
a = [[1, 2], [3, 4]]
b = [[1, 2], [3, 4]]



In [4]:
# Element-wise product of the two operands (not a matrix product).
np.multiply(a, b)

array([[ 1,  4],
       [ 9, 16]])