In [307]:
from numpy.random import choice
import numpy as np

In [337]:

# Human-readable label for each state; a state's index is its position here.
state_names = ["C1", "C2", "C3", "Pass", "Pub", "FB", "Sleep"]

# Transition probabilities: p_matrix[i][j] is the probability of moving from
# state i to state j (rows and columns follow the order of state_names).
# The last row (Sleep) has no outgoing transitions.
p_matrix = [
    [0, 0.5, 0, 0, 0, 0.5, 0],      # C1
    [0, 0, 0.8, 0, 0, 0, 0.2],      # C2
    [0, 0, 0, 0.6, 0.4, 0, 0],      # C3
    [0, 0, 0, 0, 0, 0, 1],          # Pass
    [0.2, 0.4, 0.4, 0, 0, 0, 0],    # Pub
    [0.1, 0, 0, 0, 0, 0.9, 0],      # FB
    [0, 0, 0, 0, 0, 0, 0],          # Sleep
]

# Reward attached to each state, in the same order as state_names.
_rewards = [-2, -2, -2, 10, 1, -1, 0]

# Dedicated exception type for problems with a transition-probability matrix.
class ProbabilityMatrixException(Exception):
    """Exception that carries a human-readable ``message`` attribute."""

    def __init__(self, message):
        super().__init__(message)
        self.message = message
    
class RewardChain:
    """A Markov reward process with a single terminal state.

    Attributes:
        p_matrix: square list of rows; ``p_matrix[i][j]`` is the probability
            of moving from state ``i`` to state ``j``.
        rewards: one reward per state, indexed like ``p_matrix``.
        state_names: one label per state, indexed like ``p_matrix``.
        terminal_state: the state at which sampling stops, given either as an
            index or as a name from ``state_names``.
    """

    def __init__(self, p_matrix, rewards, state_names, terminal_state):
        """Store the chain definition and sanity-check its dimensions.

        Raises:
            AssertionError: if the rows of ``p_matrix`` have unequal lengths,
                if ``p_matrix`` is not square, or if ``rewards`` does not have
                one entry per row of ``p_matrix``.
        """
        self.p_matrix = p_matrix
        self.state_names = state_names
        self.terminal_state = terminal_state
        self.rewards = rewards

        assert (len(set([len(row) for row in p_matrix])) == 1), "p_matrix rows must have equal lengths!"
        assert len(p_matrix[0]) == len(p_matrix), "p_matrix must be square!"
        assert (len(self.rewards) == len(p_matrix)), "rewards must be same length as p_matrix"

    def _index(self, state):
        """Return the index of ``state``, which may be a name or an index."""
        if isinstance(state, str):
            return self.state_names.index(state)
        return state

    # Generates a random path through the chain
    def sample(self, start_state):
        """Sample one random walk from ``start_state`` to the terminal state.

        Args:
            start_state: starting state, as an index or a name from
                ``state_names``.

        Returns:
            The visited states in order, ending with the terminal state.
            The path is a list of names when ``start_state`` is a string,
            otherwise a list of indices.
        """
        # Resolve names up front so that a terminal state given by name is
        # comparable with the integer state index used while walking.
        terminal = self._index(self.terminal_state)
        state = self._index(start_state)
        n_states = len(self.p_matrix)

        path = []
        while state != terminal:
            path.append(state)
            # numpy.random.choice with an int draws from 0..n_states-1 using
            # the current state's row as the probabilities. Convert to a
            # plain int so the path holds one consistent element type.
            state = int(choice(n_states, p=self.p_matrix[state]))

        path.append(terminal)

        if isinstance(start_state, str):
            return self.pretty(path)
        return path

    def pretty(self, path):
        """Translate a path of state indices into the matching state names."""
        return [self.state_names[i] for i in path]

    def G(self, path, gamma):
        """Return the discounted sum of rewards along ``path``.

        The reward of the state at position ``t`` in the path is weighted by
        ``gamma ** t``.

        Args:
            path: sequence of states, either all indices or all names.
            gamma: discount factor.

        Raises:
            AssertionError: if ``path`` mixes strings with non-strings.
        """
        is_name = [isinstance(x, str) for x in path]
        if any(is_name):
            assert all(is_name), "Path must be all int or all string"
            path = [self.state_names.index(x) for x in path]

        return sum(self.rewards[state] * (gamma ** step)
                   for step, state in enumerate(path))
        
        
        
        
            

In [338]:
# Build the chain; index 6 is "Sleep", the state at which sampling stops.
chain = RewardChain(
    p_matrix=p_matrix,
    rewards=_rewards,
    state_names=state_names,
    terminal_state=6,
)

In [339]:
# Draw one random episode starting at C1, then display its return with
# gamma = 1 (no discounting).
path = chain.sample(start_state="C1")
chain.G(path=path, gamma=1)

-6

In [361]:
%%time
# Monte Carlo estimate of the value of C3: average the return (gamma = 1)
# of 1000 independently sampled episodes that start there.
rewards = []
for i in range(1000):
    path = chain.sample(start_state="C3")
    reward = chain.G(path=path, gamma=1)
    rewards += [reward]

print('{0:.2f}'.format(np.mean(rewards)))
    

4.08
CPU times: user 324 ms, sys: 3.82 ms, total: 328 ms
Wall time: 326 ms


In [249]:
# Closed-form state values: solve (I - gamma * P) v = R for v, where row i of
# P holds the transition probabilities out of state i and R holds the
# per-state rewards.
gamma = 1
R = np.array(_rewards)
P = np.array(p_matrix)
I = np.identity(len(p_matrix))

# Solve the linear system directly rather than forming the explicit inverse;
# with plain arrays the result is already one-dimensional.
solution = np.linalg.solve(I - gamma * P, R).tolist()
for state in range(len(state_names)):
    print(state_names[state], solution[state])
    

C1 -12.543209876543214
C2 1.4567901234567908
C3 4.320987654320986
Pass 10.0
Pub 0.8024691358024683
FB -22.543209876543223
Sleep 0.0


In [347]:
# Monte Carlo estimate for every starting state: states[k] becomes the mean
# return (gamma = 1) over 10000 sampled episodes that start in state k.
states = np.zeros(len(p_matrix))
for state in range(states.size):
    rewards = []
    for i in range(10000):
        path = chain.sample(start_state=state)
        reward = chain.G(path=path, gamma=1)
        rewards += [reward]

    states[state] = np.mean(rewards)



Sampling 		Closed Form Solution
-------- 		--------------------
C1: -12.326			C1: -12.543
C2: 1.331			C2: 1.457
C3: 4.173			C3: 4.321
Pass: 10.000			Pass: 10.000
Pub: 0.713			Pub: 0.802
FB: -22.540			FB: -22.543
Sleep: 0.000			Sleep: 0.000


In [360]:
# Side-by-side table: sampled estimate vs. closed-form value for each state.
print("\tSampling\tSolution")
print("\t--------\t--------")
for j, sampled in enumerate(states):
    print("{state}:\t{0:0.3f}\t\t{1:0.3f}".format(sampled, solution[j], state=state_names[j]))

	Sampling	Solution
	--------	--------
C1:	-12.326		-12.543
C2:	1.331		1.457
C3:	4.173		4.321
Pass:	10.000		10.000
Pub:	0.713		0.802
FB:	-22.540		-22.543
Sleep:	0.000		0.000
