In [None]:
import numpy as np
import pickle
from itertools import product

In [None]:
class Qlearning:
    """Tabular hit/stand learner for a simplified blackjack game.

    Cards are drawn with replacement from ``[1..10, 10, 10, 10]`` (see
    ``giveCard``).  A state is ``(player_val, up_card, ace)`` where ``ace``
    marks a soft hand, and the two actions are ``1`` (hit) and ``0`` (stand).

    Values are learned at the end of each round by walking the recorded
    (state, action) pairs backwards and moving each stored value a fraction
    ``learn_r`` towards the value propagated from the step after it.
    ``gamma`` is stored on the instance but is not used by that update.
    """

    # learn_r -> learning rate      exp_r -> experimenting rate (how often to randomly choose)
    def __init__(self, learn_r=0.1, exp_r=0.2, train=True, gamma=0.9):
        """Create the value table and store the hyper-parameters.

        Args:
            learn_r: step size used when updating a stored value.
            exp_r: probability of picking a random action while training.
            train: when True, ``chooseAction`` explores and ``reward``
                updates the table.
            gamma: discount factor; kept for callers, currently unused.
        """
        # key: [(player_val, up_card, ace)][action] = value
        # (12-21) x (1-10) x (True, False) x (1, 0): 400 values in total, one
        # per combination of player sum (hard and soft), dealer up card and
        # action.  Standing on 21 starts at 1, everything else at 0.
        self.player_Q_Values = {}
        for i in range(12, 22):
            for j in range(1, 11):
                for k in [True, False]:
                    self.player_Q_Values[(i, j, k)] = {
                        a: (1 if i == 21 and a == 0 else 0) for a in [1, 0]
                    }

        self.state = (0, 0, False)  # default state
        self.actions = [1, 0]  # 1 -> hit, 0 -> stand
        self.player_state_action = []  # [state, action] pairs of the current round
        self.end = False  # True once the player has stood or bust
        self.learn_r = learn_r
        self.exp_r = exp_r
        self.training = train
        self.gamma = gamma

    @staticmethod
    def giveCard():
        """Draw one card value; 10 appears four times to cover 10/J/Q/K."""
        cards = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10]
        return np.random.choice(cards)

    # Used to allow auto win if 21 is pulled off deal
    def deal2cards(self, show=False):
        """Deal a two-card hand.

        Returns ``(val, ace)``, or ``(val, ace, first_card)`` when ``show``
        is True.  If the hand holds an ace, one ace is counted as 11 and
        ``ace`` is True.
        """
        cards = [self.giveCard(), self.giveCard()]

        val = sum(cards)
        ace = 1 in cards
        if ace:
            val += 10

        if show:
            return val, ace, cards[0]
        return val, ace

    def dealerLogic(self, val, ace):
        """Advance the dealer's hand by at most one card.

        Returns ``(val, ace, done)``; ``done`` is True once the dealer has
        bust or stands.
        """
        if val > 21:
            if not ace:
                return val, ace, True
            # A soft hand over 21 is rescued by counting the ace as 1.
            val -= 10
            ace = False

        # Assuming dealer stands on hard, hits soft 17, can be changed later
        if val > 17 or (val == 17 and not ace):
            return val, ace, True

        card = self.giveCard()
        if card != 1:
            return val + card, ace, False
        if val + 11 > 21:
            # Card must be a hard ace (1)
            return val + 1, ace, False
        # Card can be a soft ace (11)
        return val + 11, True, False

    def chooseAction(self):
        """Pick hit (1) or stand (0) for the current state."""
        # Always hit if val <= 11. Can change for testing
        if self.state[0] <= 11:
            return 1

        # Balance experimentation vs exploitation: while training, exp_r
        # percent of the time hit or stand is chosen at random; otherwise
        # the action with the best stored value is used.
        if self.training and np.random.uniform(0, 1) <= self.exp_r:
            return np.random.choice(self.actions)

        # Greedy action; on a tie the first action stored for the state wins.
        q_values = self.player_Q_Values[self.state]
        return max(q_values, key=q_values.get)

    # Take in action, update to next state, and determine if game is over
    def playerNxtState(self, action):
        """Apply ``action`` to ``self.state`` and set ``self.end`` when the round is over."""
        current_val, up_card, ace = self.state  # unpack tuple

        if action:
            card = self.giveCard()
            if card != 1:
                current_val += card
            elif current_val <= 10:
                # An ace can be counted as 11 without busting.
                current_val += 11
                ace = True
            else:
                current_val += 1
        else:
            # Player stands
            self.end = True

        if current_val > 21:
            if ace:
                # Demote the soft ace from 11 to 1 instead of busting.
                current_val -= 10
                ace = False
            else:
                self.end = True
        self.state = (current_val, up_card, ace)

    def reward(self, player_val, dealer_val, end=True):
        """Score the round and, when training, update the value table.

        Returns the outcome for the player: 1 win, 0 draw, -1 loss (0 when
        ``end`` is False).  The outcome is returned in training mode as well,
        so callers such as ``playStrategy`` can always classify the round.
        """
        outcome = 0
        if end:
            if player_val > 21:
                outcome = -1
            elif dealer_val > 21 or player_val > dealer_val:
                outcome = 1
            elif player_val < dealer_val:
                outcome = -1

        if not self.training:
            return outcome

        # Walk the round backwards: each stored value moves a fraction
        # learn_r towards the value coming from the step after it, and the
        # (unrounded) result becomes the target for the step before it.
        target = outcome
        for state, action in reversed(self.player_state_action):
            current = self.player_Q_Values[state][action]
            target = self.learn_r * (target - current) + current
            self.player_Q_Values[state][action] = round(target, 3)
        return outcome

    def reset(self):
        """Clear the per-round bookkeeping."""
        self.player_state_action = []
        self.state = (0, 0, False)  # initial state
        self.end = False

    def _playerTurn(self, record=False):
        # Play the player's hand until stand/bust, optionally logging each decision.
        while not self.end:
            action = self.chooseAction()
            # Totals below 12 are always hit and have no table entry.
            if record and self.state[0] >= 12:
                self.player_state_action.append([self.state, action])
            self.playerNxtState(action)

    def _dealerTurn(self, dealer_val, d_ace):
        # Run dealerLogic until the dealer stands or busts; return the final total.
        done = False
        while not done:
            dealer_val, d_ace, done = self.dealerLogic(dealer_val, d_ace)
        return dealer_val

    def play(self, rounds=1000):
        """Play ``rounds`` training rounds, updating the table after each one.

        Rounds in which either side is dealt 21 are skipped without an update.
        """
        for _ in range(rounds):
            # Deal
            dealer_val, d_ace, up_card = self.deal2cards(show=True)
            player_val, p_ace = self.deal2cards(show=False)
            self.state = (player_val, up_card, p_ace)

            if player_val != 21 and dealer_val != 21:
                self._playerTurn(record=True)
                dealer_val = self._dealerTurn(dealer_val, d_ace)
                # Give reward and update Q value
                self.reward(self.state[0], dealer_val)

            self.reset()

    def saveStrategy(self, file="strategy"):
        """Pickle the value table to ``file``."""
        with open(file, 'wb') as f:
            pickle.dump(self.player_Q_Values, f)

    def loadStrategy(self, file="strategy"):
        """Replace the value table with the one pickled in ``file``.

        NOTE: unpickling can execute arbitrary code; only load files you trust.
        """
        with open(file, 'rb') as f:
            self.player_Q_Values = pickle.load(f)

    def playStrategy(self, rounds=1000, file="strategy"):
        """Play ``rounds`` rounds with the strategy in ``file``.

        Sets ``exp_r`` to 0 on this instance and returns ``[win, draw, lose]``
        counts.
        """
        self.loadStrategy(file)
        self.exp_r = 0

        result = [0, 0, 0]  # [win, draw, lose]
        for _ in range(rounds):
            dealer_val, d_ace, show_card = self.deal2cards(show=True)
            player_val, p_ace = self.deal2cards(show=False)
            self.state = (player_val, show_card, p_ace)

            # A 21 on the deal settles the round immediately.
            if player_val == 21:
                if dealer_val == 21:
                    result[1] += 1
                else:
                    result[0] += 1
            elif dealer_val == 21:
                result[2] += 1
            else:
                self._playerTurn()
                dealer_val = self._dealerTurn(dealer_val, d_ace)
                w = self.reward(self.state[0], dealer_val)
                if w == 1:
                    result[0] += 1
                elif w == 0:
                    result[1] += 1
                else:
                    result[2] += 1
            self.reset()
        return result

In [None]:
def displayStrategy(strat):
    """Print the hard-hand and soft-hand hit/stand tables side by side.

    ``strat`` maps ``(player_total, dealer_up_card, is_soft)`` to a mapping of
    action (1 = hit, 0 = stand) to value.  A cell shows a green ``H`` when the
    hit value is strictly greater than the stand value, otherwise a red ``S``.
    """
    RESET = '\033[0m'
    HIT_CELL = '\033[92m' + 'H'    # green
    STAND_CELL = '\033[91m' + 'S'  # red

    def render_row(total, soft):
        # One table line: the player total followed by a cell per up card 1..10.
        cells = []
        for up_card in range(1, 11):
            values = strat[(total, up_card, soft)]
            mark = HIT_CELL if values[1] > values[0] else STAND_CELL
            cells.append(f"{mark + RESET}  ")
        return f"{total:<2}  " + "".join(cells)

    column_titles = "    A  " + "  ".join(str(up) for up in range(2, 11))

    print("            Hard Hands                           Soft Hands")
    print(column_titles + "   " + column_titles)  # same header over both tables

    # Build every line before printing any of them.
    lines = [
        render_row(total, False) + "  " + render_row(total, True)
        for total in range(12, 22)
    ]
    for line in lines:
        print(line)


In [None]:
### TRAIN ###

# Hyper-parameters for a single training run.
learn_r, exp_r, gamma_r = 0.1, 0.2, 0.9
rounds = 100000

# Train one bot and write its table to the default "strategy" file.
bot = Qlearning(learn_r=learn_r, exp_r=exp_r, train=True, gamma=gamma_r)
bot.play(rounds)
print("Training Complete")
bot.saveStrategy()

In [51]:
### Compare Gameplay Between Book and Trained Strategies ###

games = 20000000

# Each bot loads its own strategy file inside playStrategy and plays `games`
# evaluation rounds with exploration disabled.
bookBot = Qlearning(train=False)
book_score = bookBot.playStrategy(games, "book_strategy")

trainedBot = Qlearning(train=False)
# Evaluate the trained strategy on trainedBot itself; bookBot was being reused
# here, which left trainedBot unused.
trained_score = trainedBot.playStrategy(games, "averaged_strategy")


print(f"BookBot:\n[W-T-L]: {book_score}\nWin rate: {round((book_score[0]/games)*100,3)}%\n")
print("Not Losing Rate:",round((book_score[0]+book_score[1])/games*100,3))
print(f"\nTrainedBot:\n[W-T-L]: {trained_score}\nWin rate: {round((trained_score[0]/games)*100,3)}%")
print("Not Losing Rate:",round((trained_score[0]+trained_score[1])/games*100,3))


BookBot:
[W-T-L]: [8641884, 1741956, 9616160]
Win rate: 43.209%

Not Losing Rate: 51.919

TrainedBot:
[W-T-L]: [8639117, 1740740, 9620143]
Win rate: 43.196%
Not Losing Rate: 51.899


In [52]:
### Compare Table of Trained Strategy to Book Strategy ###

# The Qlearning instances are only used as holders for the loaded tables.
bookStrat = Qlearning()
bookStrat.loadStrategy(file="book_strategy")

compareStrat = Qlearning()
compareStrat.loadStrategy(file="averaged_strategy")

# Book table first, then the trained one, each under its own title.
print("\t\t\t    Book Strategy\n")
displayStrategy(strat=bookStrat.player_Q_Values)

print("\n\t\t\t    Trained Strategy\n")
displayStrategy(strat=compareStrat.player_Q_Values)

			    Book Strategy

            Hard Hands                           Soft Hands
    A  2  3  4  5  6  7  8  9  10       A  2  3  4  5  6  7  8  9  10
12  [92mH[0m  [92mH[0m  [92mH[0m  [91mS[0m  [91mS[0m  [91mS[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m    12  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  
13  [92mH[0m  [91mS[0m  [91mS[0m  [91mS[0m  [91mS[0m  [91mS[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m    13  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  
14  [92mH[0m  [91mS[0m  [91mS[0m  [91mS[0m  [91mS[0m  [91mS[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m    14  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  
15  [92mH[0m  [91mS[0m  [91mS[0m  [91mS[0m  [91mS[0m  [91mS[0m  [92mH[0m  [92mH

In [48]:
# Train `cohorts` independent bots and accumulate their tables into one.
# NOTE(review): the tables are summed, never divided by `cohorts`; scaling by
# a positive constant would not change the hit/stand comparison shown by
# displayStrategy. Confirm whether a true average is wanted in the saved file.
cohorts, rounds = 1000, 50000
learn_r, exp_r = 0.09, 0.23
gamma_r = 0.8  # not being used right now

# 1000, 0.09, 0.23, 50000 resulted in 100% accuracy. 22 min to train.

# Accumulator table, seeded from the "empty_strategy" file.
mainstrat = Qlearning()
mainstrat.loadStrategy("empty_strategy")

for i in range(cohorts):
    if i % 500 == 0:
        print("Round:", i)

    # Fresh bot per cohort so the runs do not share learned values.
    bot = Qlearning(learn_r=learn_r, exp_r=exp_r, train=True, gamma=gamma_r)
    bot.play(rounds)

    for j, learned in bot.player_Q_Values.items():
        '''
        if bot.player_Q_Values[j][0] > bot.player_Q_Values[j][1]:
            mainstrat.player_Q_Values[j][0] += 1
        else:
            mainstrat.player_Q_Values[j][1] += 1
        '''

        # Add this cohort's stand (0) and hit (1) values to the running totals.
        for a in (0, 1):
            mainstrat.player_Q_Values[j][a] += learned[a]

displayStrategy(mainstrat.player_Q_Values)
mainstrat.saveStrategy("averaged_strategy1")


Round: 0
Round: 500
            Hard Hands                           Soft Hands
    A  2  3  4  5  6  7  8  9  10       A  2  3  4  5  6  7  8  9  10
12  [92mH[0m  [92mH[0m  [92mH[0m  [91mS[0m  [91mS[0m  [91mS[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m    12  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  
13  [92mH[0m  [91mS[0m  [91mS[0m  [91mS[0m  [91mS[0m  [91mS[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m    13  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  
14  [92mH[0m  [91mS[0m  [91mS[0m  [91mS[0m  [91mS[0m  [91mS[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m    14  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  [92mH[0m  
15  [92mH[0m  [91mS[0m  [91mS[0m  [91mS[0m  [91mS[0m  [91mS[0m  [92mH[0m  [92mH[0

### Top three performances
* Learn Rate: 0.09, Exp Rate: 0.19, Avg Win Rate: 43.39333333333334%
* Learn Rate: 0.01, Exp Rate: 0.23, Avg Win Rate: 43.339999999999996%
* Learn Rate: 0.06, Exp Rate: 0.9, Avg Win Rate: 43.333333333333336%

In [None]:
### Chat GPT generated code. Goal is to find the best learn/ exp rate combination ###

def test_learning_rates(learn_rates, exp_rates, trials=10, training_rounds=20000, test_games=5000):
    """Grid-search learning/exploration rates and report the three best.

    For every ``(learn_r, exp_r)`` pair, ``trials`` bots are trained for
    ``training_rounds`` rounds each, saved to the ``"temp_strategy"`` file and
    evaluated over ``test_games`` games.  The pair's score is the mean win
    percentage across its trials.

    Returns:
        Up to three ``(avg_win_rate, learn_r, exp_r)`` tuples, best first.
    """
    scoreboard = []

    for lr, er in product(learn_rates, exp_rates):
        print(f"Testing learn_r={lr}, exp_r={er}")

        win_rates = []
        for trial in range(trials):
            print(f" Trial {trial+1}/{trials}")

            # Train a fresh bot and persist its table for the evaluator.
            learner = Qlearning(learn_r=lr, exp_r=er)
            learner.play(training_rounds)
            learner.saveStrategy("temp_strategy")

            # Evaluate with a separate non-training bot that reloads the file.
            evaluator = Qlearning(train=False)
            wins = evaluator.playStrategy(test_games, "temp_strategy")[0]
            win_rates.append((wins / test_games) * 100)

        mean_rate = np.mean(win_rates)
        scoreboard.append((mean_rate, lr, er))

        print(f"Finished testing for learn_r={lr}, exp_r={er}. Trained Avg: {mean_rate}%")

    # Rank by average win rate only; ties keep their grid order.
    scoreboard.sort(key=lambda entry: entry[0], reverse=True)
    top_performances = scoreboard[:3]

    print("Top three performances:")
    for win_rate, lr, er in top_performances:
        print(f"Learn Rate: {lr}, Exp Rate: {er}, Avg Win Rate: {win_rate}%")

    return top_performances

# Example usage:
# learn_r grid: 0.01 .. 0.20 in steps of 0.01, plus one near-zero rate.
learn_rates = [step / 100 for step in range(1, 21)] + [0.00001]
# exp_r grid: 0.05 .. 0.25 in steps of 0.01, plus two high-exploration rates.
exp_rates = [step / 100 for step in range(5, 26)] + [0.5, 0.9]
#top_three = test_learning_rates(learn_rates, exp_rates, trials=3)
