The Iowa Gambling Task is an interesting line of research that points out unconscious human decision-making. In the experiment, each player is given \$2000. There are four stacks of cards: A, B, C, and D. Each of them has positive and negative rewards. After 100 turns, the game ends. Stacks A and B guarantee that the player will end up with a negative net worth. In contrast, if the player draws cards from C and D, they will end up with a positive net worth.

Review of SARSA (On-Policy):
$$Q(s, a) \leftarrow Q(s, a) + \alpha \left( r + \gamma Q(s', a') - Q(s, a) \right)
$$

- **$Q(s, a)$**: Current Q-value of choosing a particular deck $a$ when the agent has a certain amount of money $s$.
- **$r$**: Reward received after choosing the deck (could be positive or negative).
- **$Q(s', a')$**: Q-value of the next action $a'$ that will actually be taken in the new state $s'$ according to the current policy.
- **$\alpha$**: Learning rate.
- **$\gamma$**: Discount factor.
<div style="text-align:center;">
    <img src="ql2.svg" />
</div>

1. The next step is to define the environment, state, action, reward, and hyperparameters:

    **state**: current money  
    **action**: draw card from one of four decks  
    **reward**: monetary gain or loss  
    **Q-table**: calculate future reward for each state  

    **learning rate** $\alpha$: 0.1  
    **discount factor** $\gamma$: 0.9 (review: the discount factor reduces the weight given to future rewards)  
    **exploration rate** $\epsilon$: 0.1  


## First trial

In [39]:
import numpy as np
import random

# --- Learning-rule hyperparameters ---
# step size, discount on the bootstrapped value, probability of a random pick
alpha, gamma, epsilon = 0.1, 0.9, 0.1

# --- Task setup ---
initial_money = 2000  # bankroll at the start of a game
num_turns = 100       # cards drawn per game
num_decks = 4         # decks are addressed by index 0..3

# One action-value estimate per deck, all starting at zero.
Q_table = np.zeros(num_decks)
# NOTE: despite the label, this prints the table's contents, not its shape.
print(f"Q_table.shape: {Q_table}")

# Payout model for a single card draw
def get_reward(deck, money):
    """Return the net change in money for drawing one card from ``deck``.

    Every draw pays the deck's fixed gain; with probability 0.5 the deck's
    penalty is added on top. Deck values equal to 0, 1 and 2 select decks
    A, B and C; any other value is treated as deck D. If the draw would push
    ``money`` below zero, ``-money`` is returned instead, so that adding the
    result to ``money`` gives exactly 0.
    """
    # (gain, penalty) for decks A, B and C, in index order.
    schedule = ((100, -150), (200, -250), (50, -25))
    gain, penalty = next(
        (pair for index, pair in enumerate(schedule) if deck == index),
        (25, -75),  # deck D is the fallback
    )

    # Exactly one random draw per call decides whether the penalty applies.
    loss = penalty if random.random() < 0.5 else 0

    if money + gain + loss < 0:
        # Cap the loss at whatever the player still holds.
        return -money
    return gain + loss

# Single-game simulation
money = initial_money
choices = []

for turn in range(num_turns):
    # Epsilon-greedy selection: the exploration check is drawn first, and a
    # random deck index is drawn only when exploring; otherwise the deck with
    # the highest current Q-value is taken.
    chosen_deck = (
        random.randint(0, num_decks - 1)
        if random.random() < epsilon
        else np.argmax(Q_table)
    )

    # Draw the card and apply its payoff to the bankroll.
    reward = get_reward(chosen_deck, money)
    money += reward

    # Q-learning (off-policy) update: bootstrap from the greedy deck, looked
    # up before the chosen deck's estimate is overwritten.
    best_next_action = np.argmax(Q_table)
    target = reward + gamma * Q_table[best_next_action]
    Q_table[chosen_deck] = (1 - alpha) * Q_table[chosen_deck] + alpha * target

    # Log the deck picked on this turn.
    choices.append(chosen_deck)

# Results
final_money = money
optimal_choices = choices  # same list object as `choices`, not a copy

final_money, Q_table, optimal_choices


Q_table.shape: [0. 0. 0. 0.]


(4900,
 array([119.01536325,  83.33828231,  10.12354408,   0.        ]),
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0])

## Upper-Confidence-Bound Action
<img src="image.png" height="70" />

In [8]:
# Settings and accumulators for repeating the game over many episodes
num_episodes = 1000
final_money_list = []
optimal_choices_list = []

for episode in range(num_episodes):
    # Every episode starts from scratch: zeroed Q-table, full bankroll.
    Q_table = np.zeros(num_decks)
    money = initial_money
    choices = []

    for turn in range(num_turns):
        # Epsilon-greedy selection (exploration check first, random deck
        # index only when exploring, greedy deck otherwise).
        chosen_deck = (
            random.randint(0, num_decks - 1)
            if random.random() < epsilon
            else np.argmax(Q_table)
        )

        # Draw the card and apply its payoff to the bankroll.
        reward = get_reward(chosen_deck, money)
        money += reward

        # Bootstrap from the greedy deck, read before the chosen deck's
        # estimate is overwritten.
        best_next_action = np.argmax(Q_table)
        target = reward + gamma * Q_table[best_next_action]
        Q_table[chosen_deck] = (1 - alpha) * Q_table[chosen_deck] + alpha * target

        # Log the deck picked on this turn.
        choices.append(chosen_deck)

    # Keep this episode's final bankroll and its full sequence of picks.
    final_money_list.append(money)
    optimal_choices_list.append(choices)

# Mean final bankroll across all episodes
average_final_money = np.mean(final_money_list)

average_final_money


6568.05

## Comparing between on-policy and off-policy

In [28]:
# Importing the necessary libraries again and redefining the functions and parameters

import numpy as np
import random

# Learning-rule hyperparameters:
# step size, discount on the bootstrapped value, probability of a random pick
alpha, gamma, epsilon = 0.1, 0.9, 0.1

# Task dimensions for the Iowa Gambling Task runs
num_decks = 4        # decks are addressed by index 0..3
num_turns = 100      # cards drawn in each episode
num_episodes = 1000  # episodes played per algorithm

# Reward model: each deck pays one of two fixed amounts
def get_reward(deck, current_money):
    """Sample the payoff of one card drawn from ``deck``.

    Deck values equal to 0, 1 and 2 each map to their own pair of possible
    payoffs; any other value uses the last pair. One of the two amounts is
    picked with ``np.random.choice``. ``current_money`` is accepted but does
    not influence the result.
    """
    # Possible payoffs for decks 0, 1 and 2, in index order.
    outcomes_by_deck = ([100, -150], [200, -250], [50, -50])
    outcomes = next(
        (pair for index, pair in enumerate(outcomes_by_deck) if deck == index),
        [100, -100],  # fallback for every other deck value
    )
    return np.random.choice(outcomes)

# Helper: epsilon-greedy deck selection used for every SARSA action
def _epsilon_greedy_deck(Q_table):
    """Pick a deck index: random with probability ``epsilon``, else greedy."""
    if np.random.random() < epsilon:
        return np.random.randint(0, num_decks)
    return np.argmax(Q_table)


# Function to run a single episode using SARSA
def run_episode_sarsa(Q_table):
    """Play one episode of ``num_turns`` draws, updating ``Q_table`` with SARSA.

    SARSA is on-policy: the action a' that appears in the update target must
    be the action that is actually played on the following turn. The next
    deck is therefore sampled once per turn and carried over, rather than
    being sampled for the update and then sampled again for play.

    Args:
        Q_table: 1-D array holding one action value per deck. It is modified
            in place, so estimates persist across calls that share the array.

    Returns:
        The money held after the last turn, starting from 2000.
    """
    money = 2000  # Initial money

    # The first action has no predecessor, so it is sampled up front.
    chosen_deck = _epsilon_greedy_deck(Q_table)

    for _ in range(num_turns):
        # Get reward after playing the current deck
        reward = get_reward(chosen_deck, money)

        # Sample a' from the current policy before Q(a) is overwritten
        next_chosen_deck = _epsilon_greedy_deck(Q_table)

        # Update the Q-value using SARSA
        Q_table[chosen_deck] = (1 - alpha) * Q_table[chosen_deck] + \
                                alpha * (reward + gamma * Q_table[next_chosen_deck])

        # Update the money
        money += reward

        # Play exactly the action that was used in the update target
        chosen_deck = next_chosen_deck
    return money

# Function to run a single episode using Q-Learning
def run_episode_q_learning(Q_table):
    """Play one episode of ``num_turns`` draws, updating ``Q_table`` in place.

    Each turn picks a deck epsilon-greedily, draws its reward, and moves the
    chosen deck's estimate toward ``reward + gamma * max(Q_table)``.
    Returns the money held after the last turn, starting from 2000.
    """
    money = 2000  # starting bankroll
    for _ in range(num_turns):
        # Exploration check first; a random deck is drawn only when exploring.
        explore = np.random.random() < epsilon
        chosen_deck = np.random.randint(0, num_decks) if explore else np.argmax(Q_table)

        # Draw the card (the reward is sampled with the pre-draw bankroll).
        reward = get_reward(chosen_deck, money)
        money += reward

        # Off-policy target: bootstrap from the best current estimate.
        greedy_target = reward + gamma * np.max(Q_table)
        Q_table[chosen_deck] = (1 - alpha) * Q_table[chosen_deck] + \
                                alpha * greedy_target
    return money

# One persistent Q-table per algorithm, reused across all of its episodes
Q_table_sarsa, Q_table_q_learning = np.zeros(num_decks), np.zeros(num_decks)

# Final bankroll of every episode, per algorithm
final_money_sarsa, final_money_q_learning = [], []

# Alternate the two learners episode by episode (SARSA first each time)
for episode in range(num_episodes):
    sarsa_result = run_episode_sarsa(Q_table_sarsa)
    final_money_sarsa.append(sarsa_result)
    q_learning_result = run_episode_q_learning(Q_table_q_learning)
    final_money_q_learning.append(q_learning_result)

# Mean final bankroll per algorithm
average_final_money_sarsa, average_final_money_q_learning = (
    np.mean(final_money_sarsa),
    np.mean(final_money_q_learning),
)

average_final_money_sarsa, average_final_money_q_learning


(1730.95, 1701.75)