
Reinforcement Learning and Dynamic Optimization,\
Poker Playing Agent Project : Second Assignment

Group 18:\
Leonidas Bakopoulos AM 2018030036 \
Alexandra Tsipouraki AM 2018030089


#Note:
In order for a human to play against the (pre-trained) DQN, please run the human evaluation.py in the .py scripts (not in Colab). Otherwise the necessary files (weights of the pretrained models) will not be available.

In [1]:
!pip3 install rlcard

Collecting rlcard
  Downloading rlcard-1.2.0.tar.gz (269 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/269.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m194.6/269.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m269.0/269.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rlcard
  Building wheel for rlcard (setup.py) ... [?25l[?25hdone
  Created wheel for rlcard: filename=rlcard-1.2.0-py3-none-any.whl size=325793 sha256=c9f7a170be3b97d877424530cfb499958e2011d502e4fb6c2cb201b0eeaf7015
  Stored in directory: /root/.cache/pip/wheels/a5/0a/39/26d73b035027276e526bec94b0217ed799109d7890c34a7d9b
Successfully built rlcard
Installing collected packages: rlcard
Successfully installed rlcard-1.2.0


In [2]:
import torch.nn as nn

# Define the neural network architecture
class DQN_Network(nn.Module):
    """Three-layer fully connected network that maps an observation to Q-values.

    Layout: Linear -> ReLU -> Dropout(0.2) -> Linear -> ReLU -> Linear.
    The output has one entry per action (``output_size``).
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super().__init__()
        # first hidden block (the only one followed by dropout)
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(p=0.2)  # active only in train() mode

        # second hidden block
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()

        # linear read-out: one Q-value per action
        self.fc3 = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        """Return the Q-values for the batch of observations ``x``."""
        hidden = self.dropout1(self.relu1(self.fc1(x)))
        hidden = self.relu2(self.fc2(hidden))
        return self.fc3(hidden)



In [3]:
from collections import deque
import random
import numpy as np

class ReplayBuffer():
    """Fixed-capacity FIFO replay memory with uniform random sampling.

    Every stored element is indexed as
    ``(state, action, reward, next_state, done)``.
    """

    def __init__(self, batch_size, buffer_size, device):
        self.device = device
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        # a bounded deque silently drops the oldest element once it is full
        self.memory = deque(maxlen=self.buffer_size)

    def add(self, tuple):
        """Append every transition contained in the iterable ``tuple``."""
        self.memory.extend(tuple)

    def sample(self):
        """Draw ``batch_size`` distinct transitions and return them column-wise.

        Returns:
            five lists: states, actions, rewards, next states and done flags.
        """
        batch = random.sample(self.memory, self.batch_size)
        # split the sampled rows into one list per field
        state, action, reward, next_state, done = (
            [transition[field] for transition in batch] for field in range(5)
        )
        return state, action, reward, next_state, done

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

class PriorizedExperienceReplay():
    """Prioritized experience replay stored in a fixed-size ring buffer.

    ``memory[i]`` and ``priorities[i]`` always describe the same transition:
    once the buffer is full, a new transition overwrites the slot at
    ``self.index`` in both containers. Every stored element is indexed as
    ``(state, action, reward, next_state, done)``.

    Args:
        batch_size: number of transitions returned by ``sample``.
        buffer_size: capacity of the buffer.
        device: kept for interface parity with ``ReplayBuffer``; not used here.
        alpha: exponent applied to the priorities (0 gives uniform sampling).
        beta: exponent of the importance-sampling correction.
    """

    def __init__(self, batch_size, buffer_size, device, alpha, beta):

        self.device = device
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        # A plain list used as a ring buffer. A bounded deque would shift its
        # elements on overflow and so lose the slot <-> priority alignment.
        self.memory = []
        self.priorities = np.zeros(buffer_size, dtype=np.float32)
        self.index = 0  # slot that the next transition is written to
        self.full = False
        self.alpha = alpha
        self.beta = beta

    def add(self, tuple):
        """Store every transition of the iterable ``tuple``.

        A new transition receives the current maximum priority (1 when no
        positive priority exists yet) so that it gets a chance to be replayed.
        """
        for t in tuple:
            if self.full:
                self.memory[self.index] = t  # overwrite the oldest slot
            else:
                self.memory.append(t)
            top = self.priorities.max()
            self.priorities[self.index] = top if top > 0 else 1
            self.index = (self.index + 1) % self.buffer_size
            self.full = len(self.memory) == self.buffer_size

    def sample(self):
        """Sample ``batch_size`` transitions with probability p_i^alpha / sum_k p_k^alpha.

        Returns:
            states, actions, rewards, next states and done flags (lists), the
            sampled slot indices (to be handed back to ``update_priorities``)
            and the importance-sampling weights normalised by their maximum
            (float32 array).

        Raises:
            ValueError: if the buffer is empty.
        """
        size = len(self.memory)
        if size == 0:
            raise ValueError("cannot sample from an empty replay buffer")

        # float64 keeps the probabilities summing to 1 within numpy's tolerance
        prios = self.priorities[:size].astype(np.float64)

        # calc P = p^a/sum(p^a)
        probs = prios ** self.alpha
        total = probs.sum()
        # fall back to uniform sampling if every priority has been set to zero
        P = probs / total if total > 0 else np.full(size, 1.0 / size)

        #gets the indices depending on the probability p
        indices = np.random.choice(size, self.batch_size, p=P)
        sampled_elements = [self.memory[idx] for idx in indices]

        #Compute importance-sampling weight
        weights = (size * P[indices]) ** (-self.beta)
        # normalize weights
        weights /= weights.max()
        weights = np.array(weights, dtype=np.float32)

        state = [s[0] for s in sampled_elements]
        action = [s[1] for s in sampled_elements]
        reward = [s[2] for s in sampled_elements]
        next_state = [s[3] for s in sampled_elements]
        done = [s[4] for s in sampled_elements]

        return state, action, reward, next_state, done, indices, weights

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def update_priorities(self, batch_indices, batch_priorities):
        """Overwrite the priority of each slot in ``batch_indices``."""
        for idx, prio in zip(batch_indices, batch_priorities):
            self.priorities[idx] = prio

    def set_alpha(self, alpha):
        """Set the prioritisation exponent."""
        self.alpha = alpha

    def set_beta(self, beta):
        """Set the importance-sampling exponent."""
        self.beta = beta

In [4]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim

class Agent():
    """DQN agent with a soft-updated target network for an rlcard environment.

    Exposes ``step`` / ``eval_step`` and ``use_raw`` for the environment, and
    ``agents_step`` for the training loop. Experience is kept in a uniform
    ``ReplayBuffer`` or, when ``per`` is true, in a ``PriorizedExperienceReplay``.

    Args:
        input_size: length of the observation vector (``state['obs']``).
        hidden_size1, hidden_size2: widths of the two hidden layers.
        num_actions: number of discrete actions (network output size).
        device: torch device the networks and batches live on.
        batch_size: minibatch size sampled from the replay buffer.
        buffer_size: capacity of the replay buffer.
        gamma: discount factor.
        horizon: stored on the instance; not used by this class.
        lr: Adam learning rate.
        decrease: number of points of the linear epsilon schedule.
            NOTE(review): this value is handed to ``np.linspace`` as the
            sample count, so it needs to be an integer; the float default
            presumably predates the linear schedule - confirm and pass an
            int explicitly.
        goal: epsilon value around which exploration stops.
        per: use prioritized experience replay when true.
        a, b: initial alpha / beta of the prioritized buffer (``per`` only).
    """

    def __init__(   self,
                    input_size,
                    hidden_size1,
                    hidden_size2,
                    num_actions,
                    device,
                    batch_size = 256,
                    buffer_size = 10_000,
                    gamma = .99,
                    horizon = 1_000_000,
                    lr = .001,
                    decrease = .99,
                    goal = .02,
                    per = False,
                    a = 0,
                    b = 0
                ):
        self.device = device

        #networks
        self.num_actions= num_actions
        self.model = DQN_Network(input_size, hidden_size1, hidden_size2, num_actions).to(self.device)
        self.target_model = DQN_Network(input_size, hidden_size1, hidden_size2, num_actions).to(self.device)
        # Start the target network from the online weights; otherwise the first
        # bootstrap targets come from an unrelated random network.
        self.target_model.load_state_dict(self.model.state_dict())

        #memory
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.per = per
        if not per:
            self.replay_buffer = ReplayBuffer(batch_size= self.batch_size,buffer_size= self.buffer_size,device= self.device)
        else:
            self.a = a
            self.b = b
            self.offset = .05  # constant added to |TD error| so no priority is zero
            self.replay_buffer = PriorizedExperienceReplay(batch_size= self.batch_size,buffer_size= self.buffer_size,device= self.device, alpha = a, beta = b)

        #miscellaneous
        self.gamma = gamma
        self.horizon = horizon
        self.lr = lr
        self.criterion = nn.MSELoss(reduction='mean')
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        self.use_raw = False #just for the env, False because the agent is not a human
        self.counter = 0
        self.TAU = .005  # interpolation factor of the soft target update
        #epsilon greedy part
        self.eps = 1.0
        self.decrease = decrease
        self.goal = goal
        self.dt = .001
        # linear schedule from 1.0 down to goal + dt, one entry per step() call
        self.epsilon_values = np.linspace(1.0, goal+self.dt, self.decrease)
        self.index = 0

    def push(self, tuples):
        """Store an iterable of transitions in the replay buffer."""
        self.replay_buffer.add(tuples)

    def no_grad_predict(self,state, network):
        ''' Predict the masked Q-values without tracking gradients.

        Args:
            state (dict | sequence of dict): one environment state or a batch
                of them; each state provides ``'obs'`` (observation vector)
                and ``'legal_actions'`` (mapping whose keys are the legal
                action ids).
            network (nn.Module): network used for the prediction (online or
                target). It is evaluated in eval mode and put back into the
                mode it was in before the call.

        Returns:
            numpy.ndarray: shape ``(num_actions,)`` for a single state or
            ``(len(state), num_actions)`` for a batch; illegal actions are
            set to ``-inf``.
        '''
        # A single state is a dict, a batch is a sequence of dicts. Testing the
        # type (not the length) keeps batches of any size working.
        single = isinstance(state, dict)
        states = [state] if single else list(state)

        obs = torch.Tensor(np.array([s['obs'] for s in states])).to(self.device)

        was_training = network.training
        network.eval()  # disable dropout for the prediction
        with torch.no_grad():
            q_values = network(obs).cpu().numpy()
        network.train(was_training)

        #mask the illegal actions
        masked_q_values = np.full((len(states), self.num_actions), -np.inf, dtype=float)
        for row, s in enumerate(states):
            legal = list(s['legal_actions'].keys())
            masked_q_values[row, legal] = q_values[row, legal]

        return masked_q_values[0] if single else masked_q_values

    def eval_step(self,state):
        """
        method required from the rl-card environment.
        This method is called in env.run(is_training = False)
        and returns the greedy (legal) action and None.
        """
        qs = self.no_grad_predict(state, network = self.model)
        action = np.argmax(qs)

        return action, None

    def step(self, state):
        """
        method required from the rl-card environment.
        This method is called in env.run(is_training = True)
        and returns an action, selected by eps-greedy.
        """
        self.update_eps()
        legal_actions = list(state['legal_actions'].keys())

        if random.random() < self.eps: #return random move
            return random.choice(legal_actions)

        q_values = self.no_grad_predict(state, network = self.model)

        return np.argmax(q_values)

    def agents_step(self, tuples):
        """
        method responsible for storing the new experience in replay buffer,
        and train the agent. Basically the method must be called in every timestep
        of the training loop
        """
        self.counter += 1
        #stores new experience in replay buffer
        self.push(tuples)
        # wait until the buffer holds at least two batches worth of experience
        if len(self.replay_buffer) < 2*self.batch_size: return

        #enough experience was stored, so I can sample a minibatch
        experience = self.replay_buffer.sample()
        self.set_per_values()
        #now it is time for training
        if not self.per:
            self.train(experience)
        else:
            self.train_per(experience)

    def update_eps(self):
        """Advance epsilon one step along the linear schedule.

        When the last schedule value (goal + dt) is reached, epsilon is fixed
        at goal - dt and a message is printed once.
        """
        if self.eps > self.goal+self.dt:
            self.index +=  1
            self.eps = self.epsilon_values[self.index]

        if (self.eps == self.goal + self.dt):
            print("\n----------exploration ended--------------")
            self.eps = self.goal - self.dt

    def _td_targets(self, reward, next_state, done):
        """Return y = r + gamma * max_a' Q_target(s', a') * (1 - done) as a float32 tensor."""
        # no_grad_predict already sets illegal actions to -inf, so the row
        # maximum is the maximum over the legal actions of s'
        next_qs = self.no_grad_predict(state = next_state, network = self.target_model)
        best_next = next_qs.max(axis=1)

        done = np.asarray(done, dtype=float)
        # np.where instead of a multiplication: a terminal state without any
        # legal action has best_next == -inf and -inf * 0 would be NaN
        bootstrap = np.where(done > 0, 0.0, best_next)
        y = np.asarray(reward, dtype=float) + self.gamma * bootstrap
        return torch.tensor(y, dtype = torch.float32).to(self.device)

    def _selected_q(self, state, action):
        """Return Q(s, a) of the online network for the actions that were taken."""
        obs = torch.Tensor(np.array([s['obs'] for s in state])).to(self.device)
        action = torch.tensor(action).to(self.device)
        qs = self.model(obs) #Q(s,a) for every a
        return torch.gather(qs, dim=-1, index=action.unsqueeze(-1)).squeeze(-1)

    def train_per(self,experience):
        """One optimisation step on a minibatch drawn from the prioritized buffer.

        Args:
            experience: ``(state, action, reward, next_state, done, indices,
                weights)`` as returned by ``PriorizedExperienceReplay.sample``.
        """
        self.model.train()
        self.optimizer.zero_grad()
        state, action, reward, next_state, done, idx, weights = experience

        y = self._td_targets(reward, next_state, done)
        Q = self._selected_q(state, action)

        # NOTE(review): the extra (1 - self.b) exponent is kept from the
        # original code; the buffer has already raised the weights to -beta,
        # so confirm that this second exponent is intended.
        w = torch.as_tensor(weights**(1-self.b), dtype=torch.float32).to(self.device)
        # weight every squared TD error individually; scaling an already
        # averaged loss would not correct the sampling bias per sample
        loss = (w * (Q - y) ** 2).mean()
        loss.backward()
        self.optimizer.step()
        self.soft_update()
        self.model.eval()

        # new priority = |TD error| + offset, strictly positive, passed as a
        # plain numpy array detached from the autograd graph
        priorities = (Q - y).detach().abs().cpu().numpy() + self.offset
        self.replay_buffer.update_priorities(idx, priorities)

        return

    def train(self,experience):
        """One optimisation step on a uniformly sampled minibatch.

        Args:
            experience: ``(state, action, reward, next_state, done)`` as
                returned by ``ReplayBuffer.sample``.
        """
        self.model.train()
        self.optimizer.zero_grad()

        state, action, reward, next_state, done = experience

        y = self._td_targets(reward, next_state, done)
        Q = self._selected_q(state, action)

        loss = self.criterion(Q,y)
        loss.backward()
        self.optimizer.step()
        self.soft_update()
        self.model.eval()

        return

    def soft_update(self):
        """Move the target weights towards the online weights: t <- TAU*w + (1-TAU)*t."""
        for target_param, local_param in zip(self.target_model.parameters(), self.model.parameters()):
            target_param.data.copy_(self.TAU*local_param.data + (1.0-self.TAU)*target_param.data)

    def set_per_values(self):
        """Tie alpha and beta of the prioritized buffer to 1 - epsilon (no-op without PER)."""
        if not self.per: return
        self.replay_buffer.set_alpha(1-self.eps)
        self.replay_buffer.set_beta(1-self.eps)

    def load_model(self, weights):
        """Load a state dict into both networks and switch them to eval mode."""
        self.model.load_state_dict(weights)
        self.target_model.load_state_dict(weights)
        self.model.eval()
        self.target_model.eval()

In [5]:
br = 0 #stands for bad reward
mr = 1 #stands for medium reward
gr = 2 #stands for good reward

#dictionary that represents the value of each card rank
#Ranks are looked up by the LAST character of a card string, so a ten can
#only ever arrive as a single character ("T"); the two-character key "10" is
#kept for backward compatibility but cannot match such a lookup.
cards = {
    "2": br,
    "3": br,
    "4": br,
    "5": br,
    "6": br,
    "7": mr,
    "8": mr,
    "9": mr,
    "10": mr,
    "T": mr,
    "J": gr,
    "Q": gr,
    "K": gr,
    "A": gr,
}

#action name -> action id; the insertion order is also the fallback order
#that get_action walks through (cyclically) when a move is not legal
action_hierarchy = {"raise":1, "call":0, "check":3, "fold":2}

def get_action(legal_moves, desired_move):
    """
        Translate a desired move (by name) into an action id.

        If the desired move is legal its id is returned. Otherwise the three
        moves that follow it in ``action_hierarchy`` (wrapping around) are
        tried in order and the id of the first legal one is returned.
        Returns 2 (fold) when none of them is legal.
    """
    ordered_moves = list(action_hierarchy)
    start = ordered_moves.index(desired_move)

    if desired_move in legal_moves:
        return action_hierarchy[desired_move]

    for offset in (1, 2, 3):
        candidate = ordered_moves[(start + offset) % len(ordered_moves)]
        if candidate in legal_moves:
            return action_hierarchy.get(candidate, 2)

    return 2

class Threshold_Agent():
    #aka the offensive/loose agent
    """Rule-based opponent that raises with a strong hand and calls otherwise.

    Decisions use only the ranks of the hole cards and of the community
    cards. ``self.pair`` is set by ``play_preflop`` and read again by
    ``play_post_flop``.
    """

    def __init__(self):
        self.use_raw = False #just for the env, False because the agent is not a human
        self.pair = False  # whether the hole cards seen in play_preflop form a pair

    def step(self, state):
        """Training-time action: same decision as eval_step, without the extra None."""
        return self.eval_step(state)[0]

    def eval_step(self, state):
        """
            method that decides the action of the player.
            Returns the tuple (action id, None).
        """
        hand = self.get_hand(state)
        table = self.get_table(state)
        if table == -1: return self.play_preflop(state, hand), None
        return self.play_post_flop(state, hand, table), None

    def get_hand(self, state):
        """Return the rank (last character) of every hole card."""
        return [card[-1] for card in state["raw_obs"]["hand"]]

    def get_table(self, state):
        """Return the ranks of the community cards, or -1 when there are none yet.

        The cards are read from ``raw_obs["table"]`` and, when that key is
        missing or empty, from ``raw_obs["public_cards"]``, which is the key
        rlcard's limit hold'em state uses for the community cards. Without
        the second lookup the post-flop logic is never reached with the stock
        environment. NOTE(review): "table" presumably comes from a customised
        environment - confirm.
        """
        raw_obs = state["raw_obs"]
        table = raw_obs.get("table") or raw_obs.get("public_cards")
        if not table: return -1
        return [card[-1] for card in table]

    def play_preflop(self, state,hand):
        """
        Raise with a pair or a combined card value >= 3, otherwise call.
        ARGUMENTS: state as the environment returns it, the ranks of the hand
        """
        pair = hand[0] == hand[1]
        self.pair = pair  # remembered for the post-flop decision
        value = cards.get(hand[0], 0) + cards.get(hand[1], 0)
        legal_actions = state["raw_legal_actions"]
        if pair or value >= 3:
            return get_action(legal_moves=legal_actions, desired_move = "raise")
        return get_action(legal_moves=legal_actions, desired_move = "call")

    def play_post_flop(self, state, hand, table):
        """
        Raise when a hole card matches a rank on the table (or the hole cards
        are a pair), otherwise call.
        Arguments: state, ranks of the hand, ranks of the table
        """
        legal_actions = state["raw_legal_actions"]
        match = any(rank in table for rank in hand)
        #in case of a match within the hand and the table or just a pair in hand
        if match or self.pair: return get_action(legal_moves = legal_actions, desired_move="raise")
        return get_action(legal_moves = legal_actions, desired_move="call")


class Tight_Threshold_Agent(Threshold_Agent):
    """Cautious variant of Threshold_Agent: it asks for "call" where the
    parent asks for "raise", and for "check" where the parent asks for "call"."""

    def play_preflop(self, state,hand):
        """
        Ask for a call with a pair or a combined card value >= 3,
        otherwise ask for a check.
        ARGUMENTS: state as the environment returns it, the ranks of the hand
        """
        self.pair = hand[0] == hand[1]
        value = cards.get(hand[0], 0) + cards.get(hand[1], 0)
        legal_actions = state["raw_legal_actions"]
        strong = self.pair or value >= 3
        wanted = "call" if strong else "check"
        return get_action(legal_moves=legal_actions, desired_move = wanted)

    def play_post_flop(self, state, hand, table):
        """
        Ask for a call when a hole card matches a rank on the table (or the
        hole cards are a pair), otherwise ask for a check.
        Arguments: state, ranks of the hand, ranks of the table
        """
        legal_actions = state["raw_legal_actions"]
        match = any(rank in table for rank in hand)
        wanted = "call" if (match or self.pair) else "check"
        return get_action(legal_moves = legal_actions, desired_move=wanted)



In [6]:
import rlcard
import torch
from rlcard.agents import RandomAgent
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import os

from rlcard.models.limitholdem_rule_models import LimitholdemRuleAgentV1

from rlcard.utils import (
    set_seed,
    tournament,
    reorganize
)


if __name__ == '__main__':
    # ---- experiment configuration ----
    seed = 42
    env = rlcard.make("limit-holdem", config={'seed': seed,})
    set_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"used device is {device}")
    horizon = 3_000_000
    num_eval_games = 2_000 #how many hands will be played in every tournament
    evaluate_every = 100_000
    index = 0

    # opponent / replay selection flags
    threshold = True
    per = False
    loose = False
    best_threshold = False

    threshold = True if best_threshold else threshold
    loose =False if not threshold else loose

    # ---- output locations ----
    # Every directory is created BEFORE training so that a multi-hour run
    # cannot fail at the very end because a folder is missing.
    file_path = "./data/final/"
    run_name = f"threshold_{threshold}_per_{per}_loose_{loose}_best_{best_threshold}"
    if not os.path.exists(file_path):
        os.makedirs(file_path)
        print(f"Directory '{file_path}' has been created.")
    else:
        print(f"Directory '{file_path}' already exists.")
    os.makedirs(file_path + "models/", exist_ok=True)
    os.makedirs("./images/", exist_ok=True)

    agent = Agent(
        input_size= env.state_shape[0][0],
        hidden_size1= 512,
        hidden_size2=256,
        num_actions=env.num_actions,
        device=device,
        batch_size=64,
        buffer_size=100_000,
        gamma = .99,
        lr = 10**(-5), #good lr is .00003
        decrease= int(2*1.7*0.4*horizon), #exploration in the 40% of the horizon, because the agent's step is called almost 1.7 times in every game
        goal = .1,
        per = per
    )

    # seat 0 is the learning agent, every other seat gets the chosen opponent
    agents=[agent]
    for _ in range(1, env.num_players):
        if threshold:
            opp = Threshold_Agent() if loose else Tight_Threshold_Agent()
            opp = LimitholdemRuleAgentV1() if best_threshold else opp
        else:
            opp = RandomAgent(num_actions=env.num_actions)
        agents.append(opp)
    print(f"the opponent of the agent is {type(opp)}")
    env.set_agents(agents)

    # ---- training loop ----
    rewards = np.zeros(int(horizon/evaluate_every))
    for episode in tqdm(range(horizon), desc="Processing items", unit="item"):

        # Generate data from the environment
        trajectories, payoffs = env.run(is_training=True)
        # Reorganize the data to be state, action, reward, next_state, done
        trajectories = reorganize(trajectories, payoffs)
        agent.agents_step(trajectories[0])

        #logistics/evaluation on clear data
        if episode%evaluate_every == 0 and index < len(rewards):
            rewards[index] = tournament(env,num_eval_games)[0]
            index+=1

    # ---- persist results ----
    print(f"the buffer size at the end is {len(agent.replay_buffer)}")
    torch.save(agent.model.state_dict(), file_path+f'models/{run_name}_model.pth')
    np.save(file_path+f"{run_name}.npy", rewards, allow_pickle=True)

    # ---- learning curve ----
    # evaluation k happens at episode k * evaluate_every (0, evaluate_every, ...)
    eval_episodes = np.arange(len(rewards)) * evaluate_every
    plt.figure(1)
    plt.title(f" Agent's Reward ")
    plt.xlabel("Round T")
    plt.ylabel("Average Score")
    plt.plot(eval_episodes, rewards, label="Average reward per episode")
    plt.grid()
    plt.legend()
    plt.savefig(f"./images/{run_name}")
    plt.show()


used device is cpu
the opponent of the agent is <class '__main__.Tight_Threshold_Agent'>


Processing items:   0%|          | 938/3000000 [00:14<12:31:37, 66.50item/s]


KeyboardInterrupt: ignored