In [563]:
%pip install gym[toy_text]

zsh:1: no matches found: gym[toy_text]
Note: you may need to restart the kernel to use updated packages.


In [564]:
import gym
import numpy as np

# Build the Blackjack-v1 environment with render_mode="rgb_array".
# In the cells visible here, `env` is only consulted for `env.action_space.n`.
env = gym.make("Blackjack-v1", render_mode="rgb_array")

In [565]:
import random
# Draw from a locally seeded generator so that Restart & Run All reproduces the
# same random policy instead of depending on the global NumPy RNG state.
rng = np.random.default_rng(0)

# One random action index in [0, env.action_space.n) for each of the 21
# non-terminal hand totals (0..20).
policy = rng.integers(env.action_space.n, size=21)
policy

array([1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1])

In [566]:
# Transition tensor indexed as [state, action, next_state].
#   * states 0..20 are the current hand total; index 21 is the absorbing end state
#   * action 0 = stick, action 1 = hit
# A hit adds one of 11 card values (1..11), each treated as equally likely.
# NOTE(review): index 21 stands both for "hand total of exactly 21" and for
# "bust"; the state space would need an extra index to tell them apart.
transition = np.zeros((21 + 1, 2, 21 + 1))

for state in range(21):
    # Sticking always ends the episode.
    transition[state, 0, 21] = 1

    # Number of card values that keep the hand at or below 21.
    posible_states = min(21 - state, 11)

    # Every non-busting card leads to its own successor with probability 1/11.
    for next_state in range(state + 1, state + posible_states + 1):
        transition[state, 1, next_state] = 1 / 11

    # The remaining cards bust the hand. This mass is *added* to index 21
    # because, for totals above 10, the loop above has already written the
    # "reach exactly 21" probability to the same cell; assigning here (or
    # assigning before the loop) would silently drop one of the two terms.
    transition[state, 1, 21] += (11 - posible_states) / 11

# transition

In [567]:
# rewards = np.zeros(env.observation_space[0].n)

# for s in range(env.observation_space[0].n):
#     rewards[s] = s if s <= 21 else -1

# rewards

# Reward tensor indexed as [state, action, next_state]. The only non-zero
# entries are for action 0 moving from a hand total (0..20) to index 21,
# where the reward equals that hand total.
hand_totals = np.arange(21)
rewards = np.zeros((22, 2, 22))
rewards[hand_totals, 0, 21] = hand_totals

# rewards

In [568]:
import numpy as np
import matplotlib.pyplot as plt

class ValueIteration:
    """Tabular value iteration for a finite MDP.

    Both model arrays are indexed as ``[state, action, next_state]``.

    Attributes:
        num_states: Number of states (first axis of the transition model).
        num_actions: Number of actions (second axis of the transition model).
        reward_function: Reward array with NaN/inf replaced via ``np.nan_to_num``.
        transition_model: Transition probability array.
        gamma: Discount factor applied to successor values.
        values: Current state-value estimates, initialised to zero.
        policy: Greedy policy computed by ``train``; ``None`` before training.
    """

    def __init__(self, reward_function, transition_model, gamma):
        """Store the model and initialise all state values to zero.

        Args:
            reward_function: Array of shape ``(S, A, S)`` with rewards.
            transition_model: Array of shape ``(S, A, S)`` with probabilities.
            gamma: Discount factor.

        Raises:
            ValueError: If the transition model is not ``(S, A, S)`` or the
                reward array does not have the same shape.
        """
        transition_model = np.asarray(transition_model)
        if transition_model.ndim != 3 or transition_model.shape[0] != transition_model.shape[2]:
            raise ValueError(
                'transition_model must have shape (num_states, num_actions, num_states), '
                f'got {transition_model.shape}'
            )
        reward_function = np.nan_to_num(reward_function)
        if reward_function.shape != transition_model.shape:
            raise ValueError(
                f'reward_function shape {reward_function.shape} does not match '
                f'transition_model shape {transition_model.shape}'
            )

        self.num_states = transition_model.shape[0]
        self.num_actions = transition_model.shape[1]
        self.reward_function = reward_function
        self.transition_model = transition_model
        self.gamma = gamma
        self.values = np.zeros(self.num_states)
        self.policy = None

    def _action_values(self, s):
        """Return the one-step lookahead value of every action in state ``s``.

        For each action this is the sum over next states of
        ``P[s, a, s'] * (R[s, a, s'] + gamma * V[s'])`` using the current values.
        """
        backed_up = self.reward_function[s] + self.gamma * self.values
        return np.sum(self.transition_model[s] * backed_up, axis=1)

    def one_iteration(self):
        """Perform one in-place sweep over all states.

        Values are overwritten state by state, so later states in the sweep
        already see the updated values of earlier ones.

        Returns:
            The largest absolute change of any state value during the sweep.
        """
        delta = 0
        for s in range(self.num_states):
            previous = self.values[s]
            self.values[s] = self._action_values(s).max()
            delta = max(delta, abs(previous - self.values[s]))
        return delta

    def get_policy(self):
        """Return the greedy policy with respect to the current values.

        Ties between equally good actions are broken uniformly at random with
        the global NumPy RNG (``np.random.choice``).

        Returns:
            Integer array with one action index per state.
        """
        pi = np.empty(self.num_states, dtype=int)
        for s in range(self.num_states):
            q_values = self._action_values(s)
            best_actions = np.flatnonzero(q_values == q_values.max())
            pi[s] = np.random.choice(best_actions)
        return pi

    def train(self, tol=1e-3, plot=True, max_iter=None):
        """Sweep until the value change drops to ``tol``, then extract the policy.

        At least one sweep is always performed. The resulting policy is stored
        in ``self.policy``; the delta history and policy are printed.

        Args:
            tol: Stop once a sweep's largest value change is no longer above this.
            plot: If truthy, plot the per-sweep delta with matplotlib.
            max_iter: Optional upper bound on the number of sweeps; ``None``
                (the default) means no bound.

        Raises:
            ValueError: If ``max_iter`` is given and smaller than 1.
        """
        if max_iter is not None and max_iter < 1:
            raise ValueError('max_iter must be at least 1 when given')

        delta_history = []
        while max_iter is None or len(delta_history) < max_iter:
            delta = self.one_iteration()
            delta_history.append(delta)
            # Written as "not >" so a NaN delta also terminates the loop.
            if not delta > tol:
                break
        self.policy = self.get_policy()

        print(f'# iterations of policy improvement: {len(delta_history)}')
        print(f'delta = {delta_history}')
        print(self.policy)

        if plot:
            _, ax = plt.subplots(1, 1, figsize=(3, 2), dpi=200)
            ax.plot(np.arange(len(delta_history)) + 1, delta_history, marker='o', markersize=4,
                    alpha=0.7, color='#2ca02c', label=r'$\gamma= $' + f'{self.gamma}')
            ax.set_xlabel('Iteration')
            ax.set_ylabel('Delta')
            ax.legend()
            plt.tight_layout()
            plt.show()








In [None]:
import numpy as np

# Run value iteration on the hand-built model (the `rewards` and `transition`
# arrays from the earlier cells) with discount factor 0.9. train() prints the
# per-sweep delta history and the resulting policy, and plots the deltas by
# default.
solver = ValueIteration(rewards, transition, gamma=0.9)
solver.train()