# CS 182 Project

*Hal Watts and Justin Gonzalez*

## Set up

In [None]:
import three_card_game
import five_card_game
import matplotlib
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pickle
from init_q import init_three_card_Q_table, init_five_card_Q_table
from random import uniform

## Initialize Q-tables

In [None]:
# Run the Q-table initialiser for each game variant before any training.
# NOTE(review): the training cells load "three_card_Q_table.p" and
# "five_card_Q_table.p"; presumably these calls write those files -- confirm
# against init_q.
for table_name, build_table in (
    ("three_card_Q_table", init_three_card_Q_table),
    ("five_card_Q_table", init_five_card_Q_table),
):
    build_table()
    print(f"{table_name} initialized")

## Define methods for Q-learning

In [None]:
def get_Q_state(state):
    """
    Build the Q-table key for a state: every element of `state` is rendered
    with str() and the pieces are joined by hyphens, e.g. [12, 5, 0] becomes
    "12-5-0"
    """
    return '-'.join(map(str, state))

def get_action(state_string):
    """
    Look up the greedy choice for a state in the global q_table.

    Returns a (value, action) pair: the largest Q value stored for
    state_string and the index of its first occurrence in that state's list
    of action values. A state with no entry in q_table yields the fallback
    pair (-1, 0).
    """
    if state_string in q_table:
        action_values = q_table[state_string]
        best_value = max(action_values)
        return best_value, action_values.index(best_value)
    return -1, 0

## Results using pure random strategy

In [None]:
# Baseline run: play the three_card_game environment choosing every action
# with game.random_action(), then report the win percentage the game tracked.
game = three_card_game.BlackJack()

episodes = 10000

for _ in range(episodes):
    game.reset()
    done = False

    # A random policy never looks at the observed state or the reward, so only
    # the termination flag returned by step() is kept.
    while not done:
        done = game.step(game.random_action())[2]

print(f"Results after {episodes} episodes:")
game.printWinPercentage()

## Train agent using Q-learning

In [None]:
# Load game environment
game = three_card_game.BlackJack()

# Load initialized Q-table; the context manager closes the file handle as soon
# as the table has been read.
with open("three_card_Q_table.p", "rb") as q_table_file:
    q_table = pickle.load(q_table_file)

# Hyperparameters
alpha = 0.0001    # step size of each Q-value update
gamma = 1         # weight of the next state's best Q value in the target
epsilon = 0.1     # probability of replacing the greedy action with a random one
episodes = 20000

# For plotting metrics: value of game.percentWin() recorded after every episode
win_rate = []

for i in range(episodes):
    state = game.reset()
    state_string = get_Q_state(state)
    total_reward = 0
    done = False

    while not done:
        # Find action with largest Q value for current state
        action = get_action(state_string)[1]

        # Epsilon-greedy: with probability epsilon explore with a random action
        if uniform(0, 1) < epsilon:
            action = game.random_action()

        # Take action
        next_state, reward, done = game.step(action)
        next_state_string = get_Q_state(next_state)

        # Update q_table by moving Q(s, a) towards reward + gamma * max Q(s', .)
        # NOTE(review): get_action returns -1 for a state that is missing from
        # q_table; if terminal states are not stored in the table this shifts
        # the target of every final transition by -gamma -- confirm intended.
        currentQ = q_table[state_string][action]
        nextAction_val = get_action(next_state_string)[0]
        td_target = reward + gamma * nextAction_val
        q_table[state_string][action] = currentQ + alpha * (td_target - currentQ)

        # Update state_string
        state_string = next_state_string
        total_reward += reward

    # Debug aid: uncomment to trace the reward collected in each episode
    # print(f"Episode: {i},\tReward: {total_reward}")

    win_rate.append(game.percentWin())

    # Optional checkpointing: adjust the interval to save q_table periodically
    # if i % 1000 == 0:
    #     with open("q_table.p", "wb") as q_table_file:
    #         pickle.dump(q_table, q_table_file)
print("Training finished.\n")

# Persist the trained table; the with-block guarantees the file is flushed and
# closed before a later cell reads it back.
with open("q_table.p", "wb") as q_table_file:
    pickle.dump(q_table, q_table_file)

# Plot the win rate recorded after each training episode
x = np.arange(episodes)
plt.ylim(0.2, 0.6)
plt.plot(x, win_rate, 'o', color='black', markersize=2)

# Plot average random strategy win rate (hard-coded reference level)
random_win_rate = [0.28] * episodes
plt.plot(x, random_win_rate, '-', color='red', markersize=0.1)

# Plot average human strategy win rate (hard-coded reference level)
human_win_rate = [0.425] * episodes
plt.plot(x, human_win_rate, '-', color='blue', markersize=0.1)

# Create legend
red_patch = mpatches.Patch(color='red', label='Random strategy')
blue_patch = mpatches.Patch(color='blue', label='Optimal human strategy')
black_patch = mpatches.Patch(color='black', label="Our agent's strategy")
plt.legend(handles=[black_patch, blue_patch, red_patch])

plt.show()

## Results using strategy from Q-learning

In [None]:
# Evaluate the Q-table saved by the training cell: play with purely greedy
# actions (no exploration, no Q updates) and record the win rate per episode.
game = three_card_game.BlackJack()
with open("q_table.p", "rb") as q_table_file:
    q_table = pickle.load(q_table_file)

episodes = 10000
win_rate = []

for _ in range(episodes):
    state = game.reset()
    done = False

    while not done:
        # Always take the action with the largest stored Q value
        action = get_action(get_Q_state(state))[1]
        step_result = game.step(action)
        state, done = step_result[0], step_result[2]

    win_rate.append(game.percentWin())

print(f"Results after {episodes} episodes:")
game.printWinPercentage()

# Plot the win rate recorded after each evaluation episode
x = np.arange(episodes)
plt.ylim(0.2, 0.6)
plt.plot(x, win_rate, 'o', color='black', markersize=2)

# Plot average random strategy win rate (hard-coded reference level)
random_win_rate = [0.28] * episodes
plt.plot(x, random_win_rate, '-', color='red', markersize=0.1)

# Plot average human strategy win rate (hard-coded reference level)
human_win_rate = [0.425] * episodes
plt.plot(x, human_win_rate, '-', color='blue', markersize=0.1)

# Create legend
red_patch = mpatches.Patch(color='red', label='Random strategy')
blue_patch = mpatches.Patch(color='blue', label='Optimal human strategy')
black_patch = mpatches.Patch(color='black', label="Our agent's strategy")
plt.legend(handles=[black_patch, blue_patch, red_patch])

plt.show()

## Retrain using decaying epsilon

In [None]:
# Load game environment
game = three_card_game.BlackJack()

# Load initialized Q-table; the context manager closes the file handle as soon
# as the table has been read.
with open("three_card_Q_table.p", "rb") as q_table_file:
    q_table = pickle.load(q_table_file)

# Hyperparameters
alpha = 0.0001    # step size of each Q-value update
gamma = 1         # weight of the next state's best Q value in the target
epsilon = 1       # initial exploration probability; decays after every episode
episodes = 20000

# For plotting metrics: value of game.percentWin() recorded after every episode
win_rate = []

for i in range(episodes):
    state = game.reset()
    state_string = get_Q_state(state)
    total_reward = 0
    done = False

    while not done:
        # Find action with largest Q value for current state
        action = get_action(state_string)[1]

        # Epsilon-greedy: with probability epsilon explore with a random action
        if uniform(0, 1) < epsilon:
            action = game.random_action()

        # Take action
        next_state, reward, done = game.step(action)
        next_state_string = get_Q_state(next_state)

        # Update q_table by moving Q(s, a) towards reward + gamma * max Q(s', .)
        # NOTE(review): get_action returns -1 for a state that is missing from
        # q_table; if terminal states are not stored in the table this shifts
        # the target of every final transition by -gamma -- confirm intended.
        currentQ = q_table[state_string][action]
        nextAction_val = get_action(next_state_string)[0]
        td_target = reward + gamma * nextAction_val
        q_table[state_string][action] = currentQ + alpha * (td_target - currentQ)

        # Update state_string
        state_string = next_state_string
        total_reward += reward

    # Geometric decay: start fully random (epsilon = 1) and explore a little
    # less after every finished episode.
    epsilon *= 0.999

    # Debug aid: uncomment to trace the reward collected in each episode
    # print(f"Episode: {i},\tReward: {total_reward}")

    win_rate.append(game.percentWin())

    # Optional checkpointing: adjust the interval to save q_table periodically
    # if i % 1000 == 0:
    #     with open("q_table.p", "wb") as q_table_file:
    #         pickle.dump(q_table, q_table_file)
print("Training finished.\n")

# Persist the trained table; the with-block guarantees the file is flushed and
# closed before a later cell reads it back.
with open("q_table.p", "wb") as q_table_file:
    pickle.dump(q_table, q_table_file)

# Plot the win rate recorded after each training episode
x = np.arange(episodes)
plt.ylim(0.2, 0.6)
plt.plot(x, win_rate, 'o', color='black', markersize=2)

# Plot average random strategy win rate (hard-coded reference level)
random_win_rate = [0.28] * episodes
plt.plot(x, random_win_rate, '-', color='red', markersize=0.1)

# Plot average human strategy win rate (hard-coded reference level)
human_win_rate = [0.425] * episodes
plt.plot(x, human_win_rate, '-', color='blue', markersize=0.1)

# Create legend
red_patch = mpatches.Patch(color='red', label='Random strategy')
blue_patch = mpatches.Patch(color='blue', label='Optimal human strategy')
black_patch = mpatches.Patch(color='black', label="Our agent's strategy")
plt.legend(handles=[black_patch, blue_patch, red_patch])

plt.show()

## Results using decaying epsilon

In [None]:
# Evaluate the Q-table saved by the decaying-epsilon training cell: play with
# purely greedy actions (no exploration, no Q updates) and record the win rate
# per episode.
game = three_card_game.BlackJack()
with open("q_table.p", "rb") as q_table_file:
    q_table = pickle.load(q_table_file)

episodes = 10000
win_rate = []

for _ in range(episodes):
    state = game.reset()
    done = False

    while not done:
        # Always take the action with the largest stored Q value
        action = get_action(get_Q_state(state))[1]
        step_result = game.step(action)
        state, done = step_result[0], step_result[2]

    win_rate.append(game.percentWin())

print(f"Results after {episodes} episodes:")
game.printWinPercentage()

# Plot the win rate recorded after each evaluation episode
x = np.arange(episodes)
plt.ylim(0.2, 0.6)
plt.plot(x, win_rate, 'o', color='black', markersize=2)

# Plot average random strategy win rate (hard-coded reference level)
random_win_rate = [0.28] * episodes
plt.plot(x, random_win_rate, '-', color='red', markersize=0.1)

# Plot average human strategy win rate (hard-coded reference level)
human_win_rate = [0.425] * episodes
plt.plot(x, human_win_rate, '-', color='blue', markersize=0.1)

# Create legend
red_patch = mpatches.Patch(color='red', label='Random strategy')
blue_patch = mpatches.Patch(color='blue', label='Optimal human strategy')
black_patch = mpatches.Patch(color='black', label="Our agent's strategy")
plt.legend(handles=[black_patch, blue_patch, red_patch])

plt.show()

## Retrain using larger state space

In [None]:
# Load game environment
game = five_card_game.BlackJack()

# Load initialized Q-table; the context manager closes the file handle as soon
# as the table has been read.
with open("five_card_Q_table.p", "rb") as q_table_file:
    q_table = pickle.load(q_table_file)

# Hyperparameters
alpha = 0.0001    # step size of each Q-value update
gamma = 1         # weight of the next state's best Q value in the target
epsilon = 0.1     # probability of replacing the greedy action with a random one
episodes = 20000

# For plotting metrics: value of game.percentWin() recorded after every episode
win_rate = []

for i in range(episodes):
    state = game.reset()
    state_string = get_Q_state(state)
    total_reward = 0
    done = False

    while not done:
        # Find action with largest Q value for current state
        action = get_action(state_string)[1]

        # Epsilon-greedy: with probability epsilon explore with a random action
        if uniform(0, 1) < epsilon:
            action = game.random_action()

        # Take action
        next_state, reward, done = game.step(action)
        next_state_string = get_Q_state(next_state)

        # Update q_table by moving Q(s, a) towards reward + gamma * max Q(s', .)
        # NOTE(review): get_action returns -1 for a state that is missing from
        # q_table; if terminal states are not stored in the table this shifts
        # the target of every final transition by -gamma -- confirm intended.
        currentQ = q_table[state_string][action]
        nextAction_val = get_action(next_state_string)[0]
        td_target = reward + gamma * nextAction_val
        q_table[state_string][action] = currentQ + alpha * (td_target - currentQ)

        # Update state_string
        state_string = next_state_string
        total_reward += reward

    # Debug aid: uncomment to trace the reward collected in each episode
    # print(f"Episode: {i},\tReward: {total_reward}")

    win_rate.append(game.percentWin())

    # Optional checkpointing: adjust the interval to save q_table periodically
    # if i % 1000 == 0:
    #     with open("q_table.p", "wb") as q_table_file:
    #         pickle.dump(q_table, q_table_file)
print("Training finished.\n")

# Persist the trained table; the with-block guarantees the file is flushed and
# closed before a later cell reads it back.
with open("q_table.p", "wb") as q_table_file:
    pickle.dump(q_table, q_table_file)

# Plot the win rate recorded after each training episode
x = np.arange(episodes)
plt.ylim(0.2, 0.6)
plt.plot(x, win_rate, 'o', color='black', markersize=2)

# Plot average random strategy win rate (hard-coded reference level)
random_win_rate = [0.28] * episodes
plt.plot(x, random_win_rate, '-', color='red', markersize=0.1)

# Plot average human strategy win rate (hard-coded reference level)
human_win_rate = [0.425] * episodes
plt.plot(x, human_win_rate, '-', color='blue', markersize=0.1)

# Create legend
red_patch = mpatches.Patch(color='red', label='Random strategy')
blue_patch = mpatches.Patch(color='blue', label='Optimal human strategy')
black_patch = mpatches.Patch(color='black', label="Our agent's strategy")
plt.legend(handles=[black_patch, blue_patch, red_patch])

plt.show()

## Results using larger state space

In [None]:
# Evaluate the Q-table saved by the larger-state-space training cell with
# purely greedy actions (no exploration, no Q updates).
# The table in "q_table.p" was trained against five_card_game, so it is
# evaluated on that same environment; the three-card game used here previously
# looks like a copy-paste slip from the earlier result cells.
game = five_card_game.BlackJack()
with open("q_table.p", "rb") as q_table_file:
    q_table = pickle.load(q_table_file)

episodes = 10000
win_rate = []

for _ in range(episodes):
    state = game.reset()
    done = False

    while not done:
        # Always take the action with the largest stored Q value
        action = get_action(get_Q_state(state))[1]
        step_result = game.step(action)
        state, done = step_result[0], step_result[2]

    win_rate.append(game.percentWin())

print(f"Results after {episodes} episodes:")
game.printWinPercentage()

# Plot the win rate recorded after each evaluation episode
x = np.arange(episodes)
plt.ylim(0.2, 0.6)
plt.plot(x, win_rate, 'o', color='black', markersize=2)

# Plot average random strategy win rate (hard-coded reference level)
random_win_rate = [0.28] * episodes
plt.plot(x, random_win_rate, '-', color='red', markersize=0.1)

# Plot average human strategy win rate (hard-coded reference level)
human_win_rate = [0.425] * episodes
plt.plot(x, human_win_rate, '-', color='blue', markersize=0.1)

# Create legend
red_patch = mpatches.Patch(color='red', label='Random strategy')
blue_patch = mpatches.Patch(color='blue', label='Optimal human strategy')
black_patch = mpatches.Patch(color='black', label="Our agent's strategy")
plt.legend(handles=[black_patch, blue_patch, red_patch])

plt.show()