# In [2]:
import numpy as np
import random

# Define the environment (state transitions and rewards)
class SimpleEnvironment:
    """Three-state toy environment with states 'A', 'B' and terminal 'C'.

    Moving 'right' advances A -> B (reward 1) and B -> C (reward 2).
    Any other action taken in 'A' or 'B' leads to 'A' with reward 0.
    Once in 'C' the environment stays there and yields reward 0.
    """

    def __init__(self):
        self.states = ['A', 'B', 'C']  # 'C' is the terminal state
        self.actions = ['left', 'right']
        self.current_state = 'A'  # every episode begins in 'A'

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        ``done`` is True exactly when the resulting state is 'C'.
        """
        origin = self.current_state

        # Each branch yields a (destination, payoff) pair.
        if origin == 'A':
            destination, payoff = ('B', 1) if action == 'right' else ('A', 0)
        elif origin == 'B':
            destination, payoff = ('C', 2) if action == 'right' else ('A', 0)
        else:
            # Terminal (or any unrecognised) state: absorb into 'C'.
            destination, payoff = 'C', 0

        self.current_state = destination
        return destination, payoff, destination == 'C'

    def reset(self):
        """Put the environment back in state 'A' and return that state."""
        self.current_state = 'A'
        return self.current_state

# Monte Carlo Off-policy Control with Importance Sampling
class MonteCarloOffPolicy:
    """Off-policy Monte Carlo control with weighted importance sampling.

    Episodes are generated by a uniformly random behavior policy, while a
    deterministic target policy is kept greedy with respect to the
    action-value estimates ``Q``.

    The environment ``env`` must provide:
      * ``states`` and ``actions``: sequences of hashable items,
      * ``reset()``: returns the start state,
      * ``step(action)``: returns ``(next_state, reward, done)``.
    """

    def __init__(self, env, gamma=1.0):
        """Create zero-initialised estimates for every (state, action) pair.

        Args:
            env: Environment object (see the class docstring).
            gamma: Discount factor applied when accumulating returns.
        """
        self.env = env
        self.gamma = gamma  # Discount factor
        # Action-value estimates Q(s, a).
        self.Q = {s: {a: 0 for a in env.actions} for s in env.states}
        # Per-pair return lists; not used by the methods of this class but
        # kept so existing code that reads the attribute keeps working.
        self.returns = {s: {a: [] for a in env.actions} for s in env.states}
        # Cumulative sum of importance-sampling weights seen for each pair
        # (the denominator of the weighted-average update).
        self.C = {s: {a: 0 for a in env.actions} for s in env.states}
        # Target policy: starts arbitrary, made greedy w.r.t. Q in train().
        self.policy = {s: random.choice(env.actions) for s in env.states}

    def generate_episode(self, behavior_policy):
        """Roll out one episode by sampling actions from ``behavior_policy``.

        Args:
            behavior_policy: Mapping ``state -> list of weights``, one weight
                per entry of ``env.actions`` and in the same order.

        Returns:
            List of ``(state, action, reward)`` tuples in time order.
        """
        episode = []
        state = self.env.reset()
        done = False

        while not done:
            action = random.choices(self.env.actions, weights=behavior_policy[state])[0]
            next_state, reward, done = self.env.step(action)
            episode.append((state, action, reward))
            state = next_state

        return episode

    def train(self, num_episodes=1000):
        """Improve ``Q`` and the greedy target policy from sampled episodes.

        Each episode is processed backwards. Only the tail of the episode
        on which the sampled actions agree with the current greedy target
        policy contributes updates; processing stops at the first mismatch.

        Args:
            num_episodes: Number of episodes to sample and learn from.
        """
        actions = self.env.actions
        # Uniform behavior policy over however many actions the environment
        # defines (for two actions this is exactly [0.5, 0.5]).
        uniform_prob = 1.0 / len(actions)
        behavior_policy = {s: [uniform_prob] * len(actions) for s in self.env.states}
        # Hoisted lookup of each action's position in the weight lists.
        action_index = {a: i for i, a in enumerate(actions)}

        for _ in range(num_episodes):
            episode = self.generate_episode(behavior_policy)
            G = 0  # Return accumulated from the end of the episode
            W = 1  # Importance-sampling weight of the processed tail

            for state, action, reward in reversed(episode):
                G = self.gamma * G + reward
                self.C[state][action] += W
                # Incremental weighted average of the observed returns.
                self.Q[state][action] += (W / self.C[state][action]) * (G - self.Q[state][action])

                # Keep the target policy greedy (ties go to the first action).
                self.policy[state] = max(self.Q[state], key=self.Q[state].get)

                # The target policy is deterministic, so once the behavior
                # action differs from it the importance ratio of every
                # earlier step is zero and nothing more can be learned.
                if action != self.policy[state]:
                    break
                # Target probability is 1 here, so the ratio is 1 / b(a|s).
                W *= 1.0 / behavior_policy[state][action_index[action]]

    def get_optimal_policy(self):
        """Return the current greedy target policy as ``{state: action}``."""
        return self.policy

# Run the Monte Carlo off-policy algorithm
env = SimpleEnvironment()
mc_offpolicy = MonteCarloOffPolicy(env=env)

# Learn from 10,000 episodes sampled by the random behavior policy.
mc_offpolicy.train(10000)

# Show the greedy target policy reached after training.
print("Optimal Policy:", mc_offpolicy.get_optimal_policy())


# Example output from one run (no random seed is set, so results may differ):
# Optimal Policy: {'A': 'left', 'B': 'left', 'C': 'right'}
