In [66]:
from numpy.random import choice
import numpy as np

In [220]:
## Model of the "student" Markov chain example given in lecture 2

# Display name of each state; position in this list is the state's integer index
# and matches the row/column order of p_matrix and the order of _rewards.
state_names = ["C1", "C2", "C3", "Pass", "Pub", "FB", "Sleep"]

# p_matrix[i][j] is the probability of moving from state i to state j.
# Column order: C1, C2, C3, Pass, Pub, FB, Sleep
p_matrix = [
    [0, 0.5, 0, 0, 0, 0.5, 0],    # C1
    [0, 0, 0.8, 0, 0, 0, 0.2],    # C2
    [0, 0, 0, 0.6, 0.4, 0, 0],    # C3
    [0, 0, 0, 0, 0, 0, 1],        # Pass
    [0.2, 0.4, 0.4, 0, 0, 0, 0],  # Pub
    [0.1, 0, 0, 0, 0, 0.9, 0],    # FB
    [0, 0, 0, 0, 0, 0, 0],        # Sleep: no outgoing transitions
]

# Reward attached to each state, in the same order as state_names
_rewards = [-2, -2, -2, 10, 1, -1, 0]

# Dedicated error type so callers can tell matrix-shape problems apart from other failures
class ProbabilityMatrixException(Exception):
    """Signals a problem with a transition-probability matrix.

    Attributes:
        message: human-readable description of what is wrong with the matrix.
    """

    def __init__(self, message):
        # Forward the message to Exception so that ``args`` and ``str(exc)`` are
        # populated however the exception was constructed (positionally or by
        # keyword); without this a keyword construction leaves ``args`` empty.
        super().__init__(message)
        self.message = message
    
class RewardChain:
    """A Markov chain with a reward attached to every state.

    Attributes:
        p_matrix: square transition matrix; row ``i`` is passed to
            ``numpy.random.choice`` as the probabilities of the next state
            when the chain is in state ``i``.
        rewards: one reward per state, indexed like the rows of ``p_matrix``.
        state_names: display name of each state, indexed by state number.
        terminal_state: integer index of the state at which sampling stops.
    """

    def __init__(self, p_matrix, rewards, state_names, terminal_state):
        """Validate the shapes of the inputs and store them.

        Args:
            p_matrix: square sequence of rows of transition probabilities.
            rewards: sequence with one reward per state.
            state_names: sequence with one display name per state.
            terminal_state: index (or name) of the state that ends a path.

        Raises:
            ProbabilityMatrixException: if ``p_matrix`` is empty, ragged or
                not square.
            Exception: if ``rewards`` does not have one entry per state.
        """
        # An empty matrix would otherwise surface as an IndexError on p_matrix[0]
        if len(p_matrix) == 0:
            raise ProbabilityMatrixException("p_matrix must not be empty!")

        distinct_row_lengths = {len(row) for row in p_matrix}
        if len(distinct_row_lengths) > 1:
            raise ProbabilityMatrixException("p_matrix rows must have equal lengths!")

        if len(p_matrix[0]) != len(p_matrix):
            raise ProbabilityMatrixException("p_matrix must be square!")

        if len(rewards) != len(p_matrix):
            raise Exception("rewards must be same length as p_matrix")

        self.p_matrix = p_matrix
        self.rewards = rewards
        self.state_names = state_names
        # Stored as an index so it can be compared directly with sampled states
        self.terminal_state = self._state_index(terminal_state)

    def _state_index(self, state):
        """Return the integer index of ``state``, given either an index or a name."""
        if isinstance(state, str):
            return self.state_names.index(state)
        return state

    # Generates a random path through the chain
    def sample(self, start_state):
        """Random-walk from ``start_state`` until the terminal state is reached.

        Args:
            start_state: index or name of the state to start from.

        Returns:
            List of state indices visited, beginning with the start state and
            always ending with ``self.terminal_state``.
        """
        state = self._state_index(start_state)
        n_states = len(self.p_matrix)
        path = []

        while state != self.terminal_state:
            path.append(state)
            # An integer first argument makes numpy.random.choice draw from
            # 0..n_states-1, weighted by the current state's transition row.
            state = choice(n_states, p=self.p_matrix[state])

        path.append(self.terminal_state)
        return path

    def pretty(self, path):
        """Translate a path of state indices into the matching state names."""
        return [self.state_names[i] for i in path]

    # Uses the state_names to generate pretty paths instead of raw state integers
    def sample_pretty(self, start_state):
        """Sample a random path and return it as a list of state names."""
        return self.pretty(self.sample(start_state))

    def G(self, raw_path, discount_factor):
        """Discounted return of a path.

        The reward of the state at position ``t`` in ``raw_path`` is weighted
        by ``discount_factor ** t`` and the weighted rewards are summed.

        Args:
            raw_path: sequence of state indices (as produced by ``sample``).
            discount_factor: per-step discount applied to later rewards.

        Returns:
            The summed discounted reward; ``0`` for an empty path.
        """
        return sum(
            self.rewards[state] * discount_factor ** step
            for step, state in enumerate(raw_path)
        )
        
        
        
        
            

In [221]:
# Build the lecture's chain; sampling stops at "Sleep" (index 6 in state_names)
chain = RewardChain(
    p_matrix=p_matrix,
    rewards=_rewards,
    state_names=state_names,
    terminal_state=state_names.index("Sleep"),
)

In [222]:
# A fixed path (C1 -> C2 -> C3 -> Pass -> Sleep) is used here so the displayed
# result is the same on every run; the commented-out alternative below draws a
# random path starting from state 0 instead.
# path = chain.sample(0)
path = [0, 1, 2, 3, 6]
# Last expression of the cell: shows the path with state names instead of indices
chain.pretty(path)

['C1', 'C2', 'C3', 'Pass', 'Sleep']

In [211]:
%%time
# Monte Carlo estimate of a state's value: average the return G over many
# independently sampled paths that all start in the same state.
N_EPISODES = 10000   # number of sampled paths
START_STATE = "FB"   # state index 5
DISCOUNT = 1         # undiscounted return

# No seed is set in this cell, so the printed mean varies slightly between runs.
rewards = [chain.G(chain.sample(START_STATE), DISCOUNT) for _ in range(N_EPISODES)]

print('{0:.2f}'.format(np.mean(rewards)))
    

-22.82
CPU times: user 15.1 s, sys: 62.5 ms, total: 15.2 s
Wall time: 15.2 s


In [242]:
# Closed-form state values: they satisfy v = R + gamma * P v, which rearranges
# to the linear system (I - gamma * P) v = R.
gamma = 1
R = np.array(_rewards)
# Plain ndarray instead of np.matrix, which NumPy no longer recommends
P = np.array(p_matrix)
I = np.identity(len(p_matrix))

# Solve the system directly rather than forming the inverse and multiplying.
# With gamma = 1 the system is solvable here because the terminal state's row
# of p_matrix is all zeros.
solution = np.linalg.solve(I - gamma * P, R).tolist()
for name, value in zip(state_names, solution):
    print(name, value)
    

C1 -12.543209876543214
C2 1.4567901234567908
C3 4.320987654320986
Pass 10.0
Pub 0.8024691358024683
FB -22.543209876543223
Sleep 0.0


matrix([[-12.54320988,   1.45679012,   4.32098765,  10.        ,
           0.80246914, -22.54320988,   0.        ]])