In [None]:
import os  # noqa
import pickle  # noqa
import numpy as np  # noqa


class environment:
    """Transition helper that maps a (state, stake) pair to its two outcomes."""

    def steps(self, state, action):
        """Return the successor states for staking ``action`` from ``state``.

        Args:
            state: Current state; added to / subtracted from ``action``.
            action: Amount staked.

        Returns:
            A ``(state_win, state_lose)`` tuple, i.e. ``(state + action,
            state - action)``.

        The two results are also kept on the instance as ``state_win`` and
        ``state_lose``; each call overwrites the previous pair.
        """
        self.state_win, self.state_lose = state + action, state - action
        return self.state_win, self.state_lose


class agent:
    """Value-iteration solver over integer capital states ``0 .. states - 1``.

    The last state is the goal: it is the only state whose value is
    initialised to 1, and neither it nor state 0 is ever updated. From a state
    ``s`` the agent may stake any integer from 1 up to ``min(s, goal - s)``;
    a stake is won with probability ``ph`` and lost otherwise. This matches
    the usual "gambler's problem" setup.
    """

    def __init__(self, states=101, theta=0.001):
        """Create a solver with a zero value function and a goal value of 1.

        Args:
            states: Number of states, including state 0 and the goal state.
            theta: Convergence threshold; sweeps continue while the largest
                per-state change is at least this value.

        Raises:
            ValueError: If ``theta`` is not positive (the sweep loop in
                ``value_iteration`` would never terminate).
        """
        if theta <= 0:
            raise ValueError("theta must be positive")
        self.states = states
        self.v = np.zeros((self.states), dtype="float")
        self.v[-1] = 1
        # NOTE(review): ``stable`` is not read anywhere in the visible code.
        self.stable = False
        self.theta = theta

    def possible_actions(self, state):
        """Return the array of legal stakes ``1 .. min(state, goal - state)``."""
        # Derive the goal from ``self.states`` so both always agree.
        goal = self.states - 1
        actions = np.arange(1, min(state, goal - state) + 1, 1)
        return actions

    def _action_values(self, v, state, ph, env):
        """Return the expected value of every legal stake in ``state`` under ``v``."""
        values = []
        for a in self.possible_actions(state):
            state_win, state_lose = env.steps(state, a)
            values.append((ph * v[state_win]) + ((1 - ph) * v[state_lose]))
        return np.array(values)

    def value_iteration(self, ph, env):
        """Run in-place value-iteration sweeps until the change drops below ``theta``.

        Args:
            ph: Probability that a stake is won.
            env: Object whose ``steps(state, action)`` returns
                ``(state_win, state_lose)``.

        Returns:
            ``(v, sweeps)``: ``v`` is a copy of the final value function and
            ``sweeps`` is a list holding, for each sweep, a snapshot of the
            value function taken just before that sweep.

        The agent's own ``self.v`` is updated in place and is not reset, so a
        second call on the same instance starts from the previous result.
        """
        # Seed delta at the threshold so the loop body runs at least once.
        delta = self.theta
        sweeps = []
        sweep = 0
        while delta >= self.theta:
            old_values = self.v.copy()
            for state in range(1, self.states - 1):
                # Updates are in place: later states in this sweep already see
                # the new values of earlier ones.
                self.v[state] = np.amax(self._action_values(self.v, state, ph, env))
            sweeps.append(old_values)
            sweep += 1
            delta = np.max(np.abs(old_values - self.v))

            print(f"Probability of Heads: {ph}")
            print(f"End of sweep: {sweep}, Delta = {delta}")

        # Return a copy: handing out ``self.v`` itself would let a later call
        # silently overwrite results the caller has already stored.
        return self.v.copy(), sweeps

    def find_policy(self, v, env, ph):
        """Return the greedy stake for every non-terminal state under ``v``.

        Args:
            v: Value function indexed by state.
            env: Object whose ``steps(state, action)`` returns
                ``(state_win, state_lose)``.
            ph: Probability that a stake is won.

        Returns:
            A list with one stake per state ``1 .. states - 2``. Ties are
            resolved in favour of the smallest stake (``np.argmax`` returns
            the first maximum).
        """
        stakes = []
        for state in range(1, self.states - 1):
            a_arr = self._action_values(v, state, ph, env)
            # Stakes start at 1 with step 1, so index + 1 is the stake itself.
            stakes.append(np.argmax(a_arr) + 1)

        return stakes

In [None]:
    # Solve the problem once per win probability and persist the value
    # functions, per-sweep snapshots and greedy policies as pickles.
    env = environment()

    phs = [0.25, 0.55]

    final_vf = {}
    sweeps = {}
    policy = {}

    for ph in phs:
        # Build a fresh solver for every probability and keep it under its own
        # name. Rebinding ``agent`` to an instance would make this cell fail
        # when re-run, and reusing one solver would start the second run from
        # the first run's values and leave both results pointing at the same
        # array.
        solver = agent()
        v_func, sweep = solver.value_iteration(ph, env)
        stakes = solver.find_policy(v_func, env, ph)
        final_vf[str(ph)] = v_func
        sweeps[str(ph)] = sweep
        policy[str(ph)] = stakes

    # Make sure the output directory exists before writing into it.
    os.makedirs("data", exist_ok=True)

    with open("data/final_v_functions.pickle", "wb") as f:
        pickle.dump(final_vf, f)

    with open("data/sweeps.pickle", "wb") as f:
        pickle.dump(sweeps, f)

    with open("data/policy.pickle", "wb") as f:
        pickle.dump(policy, f)

In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np

def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as f:
        return pickle.load(f)


# Load the value functions, per-sweep snapshots and policies, each keyed by
# the probability rendered as a string.
v_functions = _read_pickle("data/final_v_functions.pickle")
sweeps = _read_pickle("data/sweeps.pickle")
policy = _read_pickle("data/policy.pickle")

ph = ["0.25", "0.55"]

# The x-axes are the same for every probability, so build them once:
# 101 points for the value snapshots, 99 for the policy.
x = np.arange(0, 101, 1)
x_pol = np.arange(1, 100, 1)

for p in ph:
    sweep_p = [snapshot.flatten() for snapshot in sweeps[p]]
    policy_p = list(policy[p])

    fig = plt.figure(figsize=(15, 10))
    fig.suptitle(f"$P_h({p})$", fontsize=16)

    # Top panel: one curve per recorded sweep, numbered from 1.
    ax1 = fig.add_subplot(211)
    ax1.title.set_text("Value Function Approximation per Sweep")
    for i, snapshot in enumerate(sweep_p, start=1):
        ax1.plot(x, snapshot, label="sweep: {}".format(i))
    ax1.legend()

    # Bottom panel: the stake chosen in each non-terminal state.
    ax2 = fig.add_subplot(212)
    ax2.title.set_text("Optimal Policy")
    ax2.plot(x_pol, policy_p)

    plt.savefig("images/p_{}.png".format(p), dpi=300)