In [1]:
import sys

sys.path.append("/Users/kanishkjain/opt/anaconda3/envs/gym/lib/python3.9/site-packages")

import random
import collections
from pprint import pprint

import gym
import gym_toytext
import numpy as np
import matplotlib.pyplot as plt

In [2]:
class Agent:
    def __init__(
        self, environment="Roulette-v0", gamma=0.1, theta=1e-6, epsilon=1.0, alpha=0.1
    ) -> None:

        self.env = gym.make(environment)
        self.env.reset()

        self.gamma = gamma
        self.theta = theta
        self.epsilon = epsilon

        self.alpha = alpha

        self.A_space = self.env.action_space
        self.S_space = self.env.observation_space
        self.R_range = self.env.reward_range

        self.Num_A = self.A_space.n
        self.Num_S = self.S_space.n
        
        if environment == 'CliffWalking-v0':
            for s in range(self.Num_S):
                for a in range(self.Num_A):
                    P, S_, R_, T = self.env.P[s][a][0]
                    if T:
                        self.env.P[s][a] = [(P, S_, 0, T)]

    def soft_policy(self):

        Pi = np.ones((self.Num_S, self.Num_A)) / self.Num_A
        return Pi

    def greedy_policy(self, Q):
        Pi = np.zeros((self.Num_S, self.Num_A))
        for s in range(self.Num_S):
            a_star = np.argmax([Q[(s, a)] for a in range(self.Num_A)])
            Pi[s, a_star] = 1.0
        return Pi

    def epsilon_greedy_policy(self, Q, s):

        p = random.random()
        if p < self.epsilon:
            return np.random.choice(self.Num_A)
        else:
            # A = np.argmax([Q[(s, a)] for a in range(self.Num_A)])
            # prob = np.array([np.exp(x) for x in Q[s]])
            # prob = prob/(sum(prob))
            # A = np.random.choice(self.Num_A, p=prob)
            A = np.argmax(Q[s])
            return A

    def on_policy_monte_carlo(self, num_iter=10):

        self.epsilon = 0.9

        NUM_ITER = num_iter

        Q = collections.defaultdict(lambda: np.ones(self.Num_A)/self.Num_A)
        returns = collections.defaultdict(float)

        Pi = self.soft_policy()
        print("Starting Policy:, ", Pi)

        S_A_count = collections.defaultdict(int)

        rewards_per_episode = []
        unique_states = []

        for it in range(NUM_ITER):
            episode = self.generate_episode(Pi)
            if it % 50 == 0:
                print(f"Generating Episode Number: {it}")

            # self.epsilon = max(self.epsilon * 0.99, 0.01)

            S_A = set([(S, A) for (S, A, _) in episode])

            for S, A in S_A:
                first_idx = [
                    i for i, (s, a, _) in enumerate(episode) if (s == S and a == A)
                ][0]
                G = sum(
                    [
                        r * (self.gamma ** i)
                        for i, (s, a, r) in enumerate(episode[first_idx:])
                    ]
                )

                returns[(S, A)] += G
                S_A_count[(S, A)] += 1
                Q[S][A] = returns[(S, A)] / S_A_count[(S, A)]

            distinct_states = set([s for s, a in S_A])

            for s in distinct_states:
                a_star = np.argmax(Q[s])
                for a in range(self.Num_A):
                    if a == a_star:
                        Pi[s][a] = 1 - self.epsilon + self.epsilon / self.Num_A
                    else:
                        Pi[s][a] = self.epsilon / self.Num_A

        return Pi

    def off_policy_monte_carlo(self, num_iter=10):

        self.epsilon = 0.3

        NUM_ITER = num_iter

        Q = collections.defaultdict(lambda: np.ones(self.Num_A)/self.Num_A)
        C = collections.defaultdict(lambda: np.zeros(self.Num_A))
        
        Mu = self.soft_policy()

        for it in range(NUM_ITER):
            episode = self.generate_episode(Mu)
            if it % 50 == 0:
                print(f"Generating Episode Number: {it}")

            G = 0
            W = 1
            
            state = set()
            for it_, info in enumerate(episode[::-1]):
                S, A, R = info
                
                state.add(S)
                
                G = self.gamma * G + R
                
                C[S][A] += W
                Q[S][A] = Q[S][A] + (W/C[S][A]) * (G - Q[S][A])

                Pi = self.epsilon_greedy_policy(Q, S)
                
                W = W * Pi / Mu[S, A]
                
                if W == 0:
                    print(S, A, R, it_)
                    break

            print(state)
        pprint(Q)
        return Q

    def q_learning(self, num_iter=301):

        self.epsilon = .9

        NUM_ITER = num_iter

        Q = collections.defaultdict(lambda: np.zeros(self.Num_A))

        for it in range(NUM_ITER):

            if it % 50 == 0:
                print(f"Generating Episode Number: {it}")

            S = self.env.reset()

            # self.epsilon = max(self.epsilon * 0.999, 0.1)

            while True:
                A = self.epsilon_greedy_policy(Q, S)
                S_, R, terminal, _ = self.env.step(A)

                Q[S][A] += self.alpha * (R + self.gamma * max(Q[S_]) - Q[S][A])

                S = S_

                if terminal:
                    break

        return Q

    def sarsa(self, num_iter=301):

        self.epsilon = .9

        NUM_ITER = num_iter

        Q = collections.defaultdict(lambda: np.zeros(self.Num_A))

        for it in range(NUM_ITER):
            if it % 50 == 0:
                print(f"Generating Episode Number: {it}")

            S = self.env.reset()
            A = self.epsilon_greedy_policy(Q, S)

            #self.epsilon = max(self.epsilon * 0.999, 0.1)

            while True:
                S_, R, terminal, _ = self.env.step(A)
                A_ = self.epsilon_greedy_policy(Q, S_)
                
                Q[S][A] += self.alpha*(R + (self.gamma * Q[S_][A_]) - Q[S][A])

                S = S_
                A = A_

                if terminal:
                    break
        return Q

    def generate_episode(self, Pi):

        episode = []

        S = self.env.reset()
        while True:
            A = np.random.choice(np.arange(self.Num_A), p=Pi[S])
            S_, R, terminal, _ = self.env.step(A)
            episode.append((S, A, R))
            S = S_
            if terminal:
                break
        return episode

    def show_policy(self, Q):

        MAX_STEPS = 500

        S = self.env.reset()
        print(f"Starting state: {S}")
        # self.env.render()

        step = 0
        while step < MAX_STEPS:
            # A = self.epsilon_greedy_policy(Q, S)
            A = np.argmax(Q[S])
            S_, R, done, _ = self.env.step(A)
            # self.env.render()
            if done:
                break
            print(
                f"Current State: {S}, action: {A}, reward: {R}, done: {done}, step: {step}"
            )
            S = S_
            step += 1
        print(
            f"Current State: {S}, action: {A}, reward: {R}, done: {done}, step: {step}"
        )
        # self.env.render()
        self.env.close()
        print("Finished", done)

In [3]:
agent = Agent(environment='CliffWalking-v0')

In [4]:
agent.env.P[36]

{0: [(1.0, 24, -1, False)],
 1: [(1.0, 36, -100, False)],
 2: [(1.0, 36, -1, False)],
 3: [(1.0, 36, -1, False)]}

In [5]:
S = agent.env.reset()
print(S)

36


In [6]:
on_policy = agent.on_policy_monte_carlo()

Starting Policy:,  [[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.2

In [7]:
agent.show_policy(on_policy)

Starting state: 36
Current State: 36, action: 0, reward: -1, done: False, step: 0
Current State: 24, action: 0, reward: -1, done: False, step: 1
Current State: 12, action: 0, reward: -1, done: False, step: 2
Current State: 0, action: 3, reward: -1, done: False, step: 3
Current State: 0, action: 3, reward: -1, done: False, step: 4
Current State: 0, action: 3, reward: -1, done: False, step: 5
Current State: 0, action: 3, reward: -1, done: False, step: 6
Current State: 0, action: 3, reward: -1, done: False, step: 7
Current State: 0, action: 3, reward: -1, done: False, step: 8
Current State: 0, action: 3, reward: -1, done: False, step: 9
Current State: 0, action: 3, reward: -1, done: False, step: 10
Current State: 0, action: 3, reward: -1, done: False, step: 11
Current State: 0, action: 3, reward: -1, done: False, step: 12
Current State: 0, action: 3, reward: -1, done: False, step: 13
Current State: 0, action: 3, reward: -1, done: False, step: 14
Current State: 0, action: 3, reward: -1, do

In [8]:
off_policy = agent.off_policy_monte_carlo()

Generating Episode Number: 0
35 2 0 0
{35}
35 2 0 0
{35}
22 2 -1 2
{34, 35, 22}
35 2 0 0
{35}
35 2 0 0
{35}
35 2 0 0
{35}
35 2 0 0
{35}
23 2 -1 1
{35, 23}
35 2 0 0
{35}
35 2 0 0
{35}
defaultdict(<function Agent.off_policy_monte_carlo.<locals>.<lambda> at 0x7f80aad33d30>,
            {22: array([ 0.25,  0.25, -1.1 ,  0.25]),
             23: array([ 0.25,  0.25, -1.  ,  0.25]),
             34: array([ 0.25, -1.  ,  0.25,  0.25]),
             35: array([0.25, 0.25, 0.  , 0.25])})


In [9]:
agent.show_policy(off_policy)

Starting state: 36
Current State: 36, action: 0, reward: -1, done: False, step: 0
Current State: 24, action: 0, reward: -1, done: False, step: 1
Current State: 12, action: 0, reward: -1, done: False, step: 2
Current State: 0, action: 0, reward: -1, done: False, step: 3
Current State: 0, action: 0, reward: -1, done: False, step: 4
Current State: 0, action: 0, reward: -1, done: False, step: 5
Current State: 0, action: 0, reward: -1, done: False, step: 6
Current State: 0, action: 0, reward: -1, done: False, step: 7
Current State: 0, action: 0, reward: -1, done: False, step: 8
Current State: 0, action: 0, reward: -1, done: False, step: 9
Current State: 0, action: 0, reward: -1, done: False, step: 10
Current State: 0, action: 0, reward: -1, done: False, step: 11
Current State: 0, action: 0, reward: -1, done: False, step: 12
Current State: 0, action: 0, reward: -1, done: False, step: 13
Current State: 0, action: 0, reward: -1, done: False, step: 14
Current State: 0, action: 0, reward: -1, do

In [10]:
q_policy = agent.q_learning()

Generating Episode Number: 0
Generating Episode Number: 50
Generating Episode Number: 100
Generating Episode Number: 150
Generating Episode Number: 200
Generating Episode Number: 250
Generating Episode Number: 300


In [11]:
agent.show_policy(q_policy)

Starting state: 36
Current State: 36, action: 0, reward: -1, done: False, step: 0
Current State: 24, action: 1, reward: -1, done: False, step: 1
Current State: 25, action: 1, reward: -1, done: False, step: 2
Current State: 26, action: 1, reward: -1, done: False, step: 3
Current State: 27, action: 1, reward: -1, done: False, step: 4
Current State: 28, action: 1, reward: -1, done: False, step: 5
Current State: 29, action: 1, reward: -1, done: False, step: 6
Current State: 30, action: 1, reward: -1, done: False, step: 7
Current State: 31, action: 1, reward: -1, done: False, step: 8
Current State: 32, action: 1, reward: -1, done: False, step: 9
Current State: 33, action: 1, reward: -1, done: False, step: 10
Current State: 34, action: 1, reward: -1, done: False, step: 11
Current State: 35, action: 2, reward: 0, done: True, step: 12
Finished True


In [12]:
sarsa_policy = agent.sarsa()

Generating Episode Number: 0
Generating Episode Number: 50
Generating Episode Number: 100
Generating Episode Number: 150
Generating Episode Number: 200
Generating Episode Number: 250
Generating Episode Number: 300


In [13]:
agent.show_policy(sarsa_policy)

Starting state: 36
Current State: 36, action: 0, reward: -1, done: False, step: 0
Current State: 24, action: 0, reward: -1, done: False, step: 1
Current State: 12, action: 0, reward: -1, done: False, step: 2
Current State: 0, action: 3, reward: -1, done: False, step: 3
Current State: 0, action: 3, reward: -1, done: False, step: 4
Current State: 0, action: 3, reward: -1, done: False, step: 5
Current State: 0, action: 3, reward: -1, done: False, step: 6
Current State: 0, action: 3, reward: -1, done: False, step: 7
Current State: 0, action: 3, reward: -1, done: False, step: 8
Current State: 0, action: 3, reward: -1, done: False, step: 9
Current State: 0, action: 3, reward: -1, done: False, step: 10
Current State: 0, action: 3, reward: -1, done: False, step: 11
Current State: 0, action: 3, reward: -1, done: False, step: 12
Current State: 0, action: 3, reward: -1, done: False, step: 13
Current State: 0, action: 3, reward: -1, done: False, step: 14
Current State: 0, action: 3, reward: -1, do

Current State: 0, action: 3, reward: -1, done: False, step: 364
Current State: 0, action: 3, reward: -1, done: False, step: 365
Current State: 0, action: 3, reward: -1, done: False, step: 366
Current State: 0, action: 3, reward: -1, done: False, step: 367
Current State: 0, action: 3, reward: -1, done: False, step: 368
Current State: 0, action: 3, reward: -1, done: False, step: 369
Current State: 0, action: 3, reward: -1, done: False, step: 370
Current State: 0, action: 3, reward: -1, done: False, step: 371
Current State: 0, action: 3, reward: -1, done: False, step: 372
Current State: 0, action: 3, reward: -1, done: False, step: 373
Current State: 0, action: 3, reward: -1, done: False, step: 374
Current State: 0, action: 3, reward: -1, done: False, step: 375
Current State: 0, action: 3, reward: -1, done: False, step: 376
Current State: 0, action: 3, reward: -1, done: False, step: 377
Current State: 0, action: 3, reward: -1, done: False, step: 378
Current State: 0, action: 3, reward: -1,

In [14]:
dict(sarsa_policy)

{36: array([  -1.20091309, -101.74962122,   -3.65018643,   -2.48144622]),
 24: array([-1.1146366 , -3.52126048, -2.29739163, -1.26445332]),
 25: array([  -1.14796998,   -4.41455618, -103.83050652,   -1.22181707]),
 26: array([  -1.14452302,   -1.41895577, -102.61450119,   -2.78873296]),
 13: array([-1.11161537, -1.18458809, -1.72359029, -1.11656789]),
 12: array([-1.11123377, -1.1366191 , -1.2648748 , -1.11401786]),
 0: array([-1.11126455, -1.11149955, -1.11619181, -1.11121115]),
 1: array([-1.11200075, -1.11162754, -1.12773244, -1.11123554]),
 2: array([-1.11228749, -1.1129993 , -1.17113046, -1.11180111]),
 14: array([-1.11258206, -1.1225903 , -5.15097387, -1.15964637]),
 3: array([-1.11229811, -1.11329454, -1.18232036, -1.11192564]),
 4: array([-1.11331542, -1.11239441, -1.1885532 , -1.11238761]),
 16: array([-1.11490012, -1.14141725, -3.85584926, -1.13550054]),
 28: array([  -1.13963841,   -3.99490059, -103.78844717,   -5.64267201]),
 29: array([  -1.20178896,   -2.72291147, -103.35

In [15]:
dict(q_policy)

{36: array([  -1.11111111, -100.11111111,   -1.11111111,   -1.11111111]),
 24: array([-1.11111111, -1.11111111, -1.11111111, -1.11111111]),
 25: array([  -1.11111111,   -1.11111111, -100.11111111,   -1.11111111]),
 26: array([  -1.11111111,   -1.11111111, -100.11111111,   -1.11111111]),
 14: array([-1.11111111, -1.11111111, -1.11111111, -1.11111111]),
 13: array([-1.11111111, -1.11111111, -1.11111111, -1.11111111]),
 12: array([-1.11111111, -1.11111111, -1.11111111, -1.11111111]),
 0: array([-1.11111111, -1.11111111, -1.11111111, -1.11111111]),
 1: array([-1.11111111, -1.11111111, -1.11111111, -1.11111111]),
 2: array([-1.11111111, -1.11111111, -1.11111111, -1.11111111]),
 3: array([-1.11111111, -1.11111111, -1.11111111, -1.11111111]),
 15: array([-1.11111111, -1.11111111, -1.11111111, -1.11111111]),
 27: array([  -1.11111111,   -1.1111111 , -100.11111111,   -1.11111111]),
 28: array([  -1.11111111,   -1.111111  , -100.11111111,   -1.11111111]),
 16: array([-1.11111111, -1.1111111 , -1

In [16]:
# sarsa_policy[0]

In [17]:
# prob = np.array([np.exp(x) for x in sarsa_policy[0]])
# prob = prob/(sum(prob))
# print(prob)

In [18]:
#Building the environment
env = gym.make('Roulette-v0')

In [19]:
def create_random_policy(nA):
    """
    Creates a random policy function.
    
    Args:
        nA: Number of actions in the environment.
    
    Returns:
        A function that takes an observation as input and returns a vector
        of action probabilities
    """
    A = np.ones(nA, dtype=float) / nA
    def policy_fn(observation):
        return A
    return policy_fn

In [20]:
def create_greedy_policy(Q):
    """
    Creates a greedy policy based on Q values.
    
    Args:
        Q: A dictionary that maps from state -> action values
        
    Returns:
        A function that takes an observation as input and returns a vector
        of action probabilities.
    """
    
    def policy_fn(state):
        A = np.zeros_like(Q[state], dtype=float)
        best_action = np.argmax(Q[state])
        A[best_action] = 1.0
        return A
    return policy_fn

In [21]:
def mc_control_importance_sampling(env, num_episodes, behavior_policy, discount_factor=1.0):
    """
    Monte Carlo Control Off-Policy Control using Weighted Importance Sampling.
    Finds an optimal greedy policy.
    
    Args:
        env: OpenAI gym environment.
        num_episodes: Number of episodes to sample.
        behavior_policy: The behavior to follow while generating episodes.
            A function that given an observation returns a vector of probabilities for each action.
        discount_factor: Gamma discount factor.
    
    Returns:
        A tuple (Q, policy).
        Q is a dictionary mapping state -> action values.
        policy is a function that takes an observation as an argument and returns
        action probabilities. This is the optimal greedy policy.
    """
    
    # The final action-value function.
    # A dictionary that maps state -> action values
    Q = collections.defaultdict(lambda: np.zeros(env.action_space.n))
    # The cumulative denominator of the weighted importance sampling formula
    # (across all episodes)
    C = collections.defaultdict(lambda: np.zeros(env.action_space.n))
    
    # Our greedily policy we want to learn
    target_policy = create_greedy_policy(Q)
        
    for i_episode in range(1, num_episodes + 1):
        # Print out which episode we're on, useful for debugging.
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        # Generate an episode.
        # An episode is an array of (state, action, reward) tuples
        episode = []
        state = env.reset()
        for t in range(100):
            # Sample an action from our policy
            probs = behavior_policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            next_state, reward, done, _ = env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state
        
        # Sum of discounted returns
        G = 0.0
        # The importance sampling ratio (the weights of the returns)
        W = 1.0
        # For each step in the episode, backwards
        for t in range(len(episode))[::-1]:
            state, action, reward = episode[t]
            # Update the total reward since step t
            G = discount_factor * G + reward
            # Update weighted importance sampling formula denominator
            C[state][action] += W
            # Update the action-value function using the incremental update formula (5.7)
            # This also improves our target policy which holds a reference to Q
            Q[state][action] += (W / C[state][action]) * (G - Q[state][action])
            # If the action taken by the behavior policy is not the action 
            # taken by the target policy the probability will be 0 and we can break
            if action !=  np.argmax(target_policy(state)):
                break
            W = W * 1./behavior_policy(state)[action]
        
    return Q, target_policy

In [22]:
random_policy = create_random_policy(env.action_space.n)
Q, policy = mc_control_importance_sampling(env, num_episodes=15000, behavior_policy=random_policy)

Episode 15000/15000.

In [23]:
Q

defaultdict(<function __main__.mc_control_importance_sampling.<locals>.<lambda>()>,
            {0: array([-2.93646592,  0.04      ,  0.03448276,  0.125     , -0.90627063,
                    -0.18032787, -1.26666667, -0.03125   ,  0.18518519, -0.1       ,
                     3.98123626, -0.98828125,  0.0625    , -0.30434783, -0.23529412,
                    -0.22580645,  0.15151515, -0.08571429, -0.02702703, -0.15151515,
                     1.32786885,  0.01886792,  2.99996739, -1.21212121, -0.2173913 ,
                    -0.03448276, -0.875     ,  2.86129458, -1.10958904, -0.14285714,
                     1.33928571, -0.07142857, -0.125     , -0.21428571, -0.90633245,
                     1.11940299, -1.25      ,  0.        ])})

In [24]:
agent.show_policy(Q)

Starting state: 36
Current State: 36, action: 0, reward: -1, done: False, step: 0
Current State: 24, action: 0, reward: -1, done: False, step: 1
Current State: 12, action: 0, reward: -1, done: False, step: 2


KeyError: 10