In [1]:
# Definition of the Agent that plays Connect4
import json
import pickle
import random

import numpy as np
from importnb import Notebook

with Notebook():
    import Board

Columna inválida. Elija un número entre 1 y 7.
Columna inválida. Elija un número entre 1 y 7.
Ficha no valida
Columna llena. Elija otra columna.
All tests passed successfully!


In [2]:
class Agent:
    """Tabular Q-learning agent that plays Connect 4 through a board environment.

    The board is summarised as one ``(own, opponent)`` pair per column and the
    learned action values live in ``self.q_table``, a dictionary keyed by
    ``(state, action)``.
    """

    def __init__(self, env, chip, reward_scheme=(0.0, -1.0, 0.5, 1.0), epsilon=0.95, filename='', alpha=0.1, gamma=0.9):
        """Create an agent bound to ``env``.

        Args:
            env: board environment. The methods of this class call
                ``columns``, ``get_vectors_of_column``, ``verify_vector``,
                ``get_possible_actions``, ``place_chip`` and ``verify_winner``
                on it.
            chip: chip played by the agent; must be either "X" or "O".
            reward_scheme: tuple of four floats: (reward for a move that does
                not end the game, reward for losing, reward for a tied game,
                reward for winning). Stored in ``self.rewards``.
            epsilon: probability of picking a random action in
                ``choose_action``.
            filename: path of a pickled Q-table to load; ``''`` starts with an
                empty table.
            alpha: learning rate used by ``update_values``.
            gamma: discount factor used by ``update_values``.

        Raises:
            ValueError: if ``reward_scheme`` is not a tuple of four floats or
                ``chip`` is not "X"/"O".
        """
        self.env = env
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.state = []
        self.action = 0
        self.reward = 0

        # Check that reward_scheme is a tuple of 4 floats.
        if not (isinstance(reward_scheme, tuple) and len(reward_scheme) == 4 and all(isinstance(x, float) for x in reward_scheme)):
            raise ValueError("reward_scheme must be a tupla with 4 floats")
        self.rewards = reward_scheme

        if chip not in ["X", "O"]:
            raise ValueError("chip must be 'X' or 'O'")
        self.chip = chip

        # Always define both attributes so later code never hits AttributeError.
        self.filename = filename
        self.q_table = {}
        if filename != '':
            self.import_Qtable(filename)

    @staticmethod
    def _key(state, action):
        """Build a hashable Q-table key; list states are converted to tuples."""
        return (tuple(state) if isinstance(state, list) else state, action)

    @staticmethod
    def _lookup_reward(table, count):
        """Return ``table[count]`` with ``count`` clamped to the valid index range."""
        return table[min(max(count, 0), len(table) - 1)]

    def import_Qtable(self, filename):
        """Load the Q-table from ``filename`` into ``self.q_table`` and return it.

        If the file does not exist a message is printed and the table is reset
        to an empty dictionary.
        """
        try:
            with open(filename, 'rb') as file:
                # NOTE: pickle.load can execute arbitrary code; only load
                # Q-tables that come from a trusted source.
                self.q_table = pickle.load(file)
        except FileNotFoundError:
            print("File not found. Starting with an empty Q-Table.")
            self.q_table = {}
        return self.q_table

    def export_Qtable(self, filename):
        """Pickle ``self.q_table`` to ``filename``."""
        with open(filename, 'wb') as file:
            pickle.dump(self.q_table, file)

    def get_actual_state(self):
        """Describe the current board as a list with one pair per column.

        For each column the environment's vectors are scored with
        ``env.verify_vector`` and the largest first and second components are
        kept as ``(max_own, max_other)``. A column whose vectors are all empty
        is reported as ``(-1, -1)``.
        """
        state = []
        for column in range(self.env.columns):
            vectors = self.env.get_vectors_of_column(column)
            # Empty vectors carry no (index, cells) pair, so they are skipped.
            populated = [vector for vector in vectors if vector != []]
            if not populated:
                state.append((-1, -1))
                continue
            max_own = 0
            max_other = 0
            for vector in populated:
                # presumably result is (own count, opponent count) - confirm against Board.verify_vector
                result = self.env.verify_vector(vector[0], vector[1], self.chip)
                max_own = max(max_own, result[0])
                max_other = max(max_other, result[1])
            state.append((max_own, max_other))
        return state

    def get_value(self, state, action: int) -> float:
        """Return the stored Q-value for ``(state, action)``, or 0 if unknown."""
        value = self.q_table.get(self._key(state, action))
        return 0.0 if value is None else value

    def best_action(self, state) -> int:
        """Return the legal action with the highest Q-value in ``state``.

        Ties are broken uniformly at random. Returns -1 when the environment
        reports no possible actions.
        """
        actions = self.env.get_possible_actions()
        if len(actions) == 0:
            return -1

        q_values = {action: self.get_value(state, action) for action in actions}
        top_value = max(q_values.values())
        # Pick randomly among every action that shares the best Q-value.
        candidates = [action for action, value in q_values.items() if value == top_value]
        return random.choice(candidates)

    def choose_action(self, state) -> int:
        """Epsilon-greedy choice: random action with probability ``epsilon``.

        Returns -1 when the environment reports no possible actions.
        """
        actions = self.env.get_possible_actions()
        if len(actions) == 0:
            return -1
        if random.uniform(0, 1) <= self.epsilon:
            return random.choice(actions)
        return self.best_action(state)

    def update_values(self, state, action: int, next_state, reward: float) -> float:
        """Apply the Q-learning update and return the new Q-value.

        Q(s,a) = (1 - alpha) * Q(s,a) + alpha * [r + gamma * max_a' Q(s',a')]
        """
        actual_Q_value = self.get_value(state, action)
        next_action = self.best_action(next_state)
        # No legal follow-up action means there is no future value to bootstrap from.
        next_Q_value = 0.0 if next_action == -1 else self.get_value(next_state, next_action)
        new_Q_value = ((1 - self.alpha) * actual_Q_value) + self.alpha * (reward + (self.gamma * next_Q_value))
        self.q_table[self._key(state, action)] = new_Q_value
        return new_Q_value

    def step(self, state, action: int) -> tuple[int, bool, str]:
        """Play ``action`` on the environment and return ``(reward, status, info)``.

        The reward is the sum of two table look-ups indexed by the column's
        ``(own, opponent)`` pair taken from ``state``; ``status`` is whatever
        ``env.verify_winner`` returns for the agent's chip.
        """
        own_neighbors, opponent_neighbors = state[action]
        # The strategy (defensive vs. offensive) is defined by these tables.
        own_neighbors_rewards = [0, 20, 50, 200]  # rewards by number of own neighbours
        opponent_neighbors_rewards = [0, 5, 15, 70]  # rewards by number of opponent neighbours

        # Clamp so the (-1, -1) "empty column" marker maps to 0 instead of
        # wrapping around to the largest reward, and larger counts cannot overflow.
        reward = (self._lookup_reward(own_neighbors_rewards, own_neighbors)
                  + self._lookup_reward(opponent_neighbors_rewards, opponent_neighbors))
        self.env.place_chip(action)
        status = self.env.verify_winner(self.chip)
        info = 'El estado es terminal' if status else 'El juego continua'
        return (reward, status, info)

    def play_turn(self, episode=1):
        """Play one turn: observe, learn from the previous move, then act.

        On every turn after the first, the Q-value of the previous
        ``(state, action)`` is updated with the reward stored from that move.
        Epsilon is reduced by 10% whenever ``(episode + 1)`` is a multiple of
        100 and epsilon is still above 0.01.

        Returns:
            The status reported by ``step`` for the move just played, or True
            without moving when there is no possible action.
        """
        if not self.state:
            self.state = self.get_actual_state()
            self.action = self.choose_action(self.state)
        else:
            next_state = self.get_actual_state()
            next_action = self.choose_action(next_state)
            self.update_values(self.state, self.action, next_state, self.reward)
            self.state, self.action = next_state, next_action

        if self.action == -1:
            # Nothing can be played, so there is no move to make.
            return True

        self.reward, done, info = self.step(self.state, self.action)

        if ((episode + 1) % 100) == 0:
            if self.epsilon > 0.01:
                self.epsilon -= (self.epsilon * 0.1)
        return done

    def test_performance(self) -> tuple[dict, dict]:
        """Return the greedy action and its Q-value for every non-terminal ``(i, j)``.

        NOTE(review): this relies on ``env.nrows``, ``env.ncols`` and
        ``env.is_terminal``, which no other method of this class uses -
        confirm the board environment actually provides them.
        """
        actions = {}
        values = {}
        for i in range(self.env.nrows):
            for j in range(self.env.ncols):
                if not self.env.is_terminal((i, j)):
                    action = self.best_action((i, j))
                    actions[(i, j)] = action
                    values[(i, j)] = self.get_value((i, j), action)
        return actions, values



    

# TODO: decide which strategy the agent will follow in order to finalize its structure, methods, etc.


In [16]:
def test():
    """Check ``verify_vector`` against a table of hand-computed cases.

    Each case is ``(position, cells, expected)`` and is evaluated for chip 'X'.

    NOTE(review): the Agent class in this notebook does not define
    ``verify_vector`` (it calls ``self.env.verify_vector``), so these calls
    look stale - confirm which object this test should target.
    """
    board = [
        ['-', '-', '-', '-', '-', '-', '-'],
        ['-', '-', '-', '-', '-', '-', '-'],
        ['-', '-', 'O', '-', '-', '-', '-'],
        ['-', '-', 'O', '-', '-', '-', '-'],
        ['O', '-', 'X', 'X', 'X', 'X', 'O'],
        ['-', '-', 'O', 'O', 'X', 'O', 'X'],
    ]
    agent = Agent(board, 'X')
    failure_message = "La funcion no esta retornando los valores correctos"

    cases = (
        (3, ['-', 'X', 'X', '-', 'X', 'O'], (3, 0)),
        (3, ['-', 'O', 'O', '-', 'X', 'O'], (1, 2)),
        (0, ['-', 'O', 'O', '-', 'X', 'O'], (0, 2)),
        (5, ['-', 'O', 'O', 'X', 'X', '-'], (2, 0)),
        (2, ['-', '-', '-', '-'], (0, 0)),
    )
    for position, cells, expected in cases:
        assert agent.verify_vector(position, cells, 'X') == expected, failure_message

In [None]:
# import numpy as np
# import random
# import pickle

# class Connect4Agent:
#     def __init__(self, chip, reward_scheme=(0.0, -1.0, 0.5, 1.0), filename=None):
#         '''
#         chip <string>: Chip to be played by the agent. Must be either "X" or "O".
#         reward_scheme <tuple>: (reward for a non-terminal move, reward for losing, reward for a tied game, reward for winning).
#         filename <string>: Filename to load or save the Q-Table.
#         '''
#         if chip not in ["X", "O"]:
#             raise ValueError("chip must be 'X' or 'O'")
        
#         self.chip = chip
#         self.rewards = reward_scheme
#         self.filename = filename
#         self.q_table = {}  # Q-Table as a dictionary to store state-action values

#         # Load Q-Table if a filename is provided
#         if filename:
#             self.import_Qtable(filename)

#     def get_state_representation(self, board):
#         """
#         Converts the game board into a tuple using booleans and empty spaces.
#         Returns:
#             A tuple representing the board state where:
#             - None: The cell is empty.
#             - True: The cell contains the agent's chip.
#             - False: The cell contains the opponent's chip.
#         """
#         agent_chip = True
#         opponent_chip = False
#         state = []

#         for row in range(6):
#             for col in range(7):
#                 if board[row][col] == self.chip:
#                     state.append(agent_chip)
#                 elif board[row][col] != "-":  # "-" indicates an empty cell
#                     state.append(opponent_chip)
#                 else:
#                     state.append(None)
#         return tuple(state)  # Convert the state to a tuple to use as a key in the Q-Table

#     def get_available_actions(self, board):
#         """
#         Returns a list of available columns where a chip can be placed.
#         """
#         return [col for col in range(7) if board[0][col] == "-"]  # Check the top cell of each column

#     def choose_action(self, board, epsilon=0.1):
#         """
#         Chooses an action using the epsilon-greedy strategy.
#         """
#         state = self.get_state_representation(board)
#         available_actions = self.get_available_actions(board)

#         if random.random() < epsilon:
#             # Explore: choose a random action
#             return random.choice(available_actions)
#         else:
#             # Exploit: choose the action with the highest Q-value
#             q_values = [self.q_table.get((state, action), 0.0) for action in available_actions]
#             max_q_value = max(q_values)
#             best_actions = [action for action, q in zip(available_actions, q_values) if q == max_q_value]
#             return random.choice(best_actions)  # Choose randomly among the best actions

#     def update_q_table(self, board, action, reward, next_board, alpha=0.1, gamma=0.9):
#         """
#         Updates the Q-Table using the Q-Learning formula.
#         """
#         state = self.get_state_representation(board)
#         next_state = self.get_state_representation(next_board)
#         next_available_actions = self.get_available_actions(next_board)

#         # Current Q-value
#         current_q_value = self.q_table.get((state, action), 0.0)

#         # Max Q-value for the next state
#         if next_available_actions:
#             next_q_values = [self.q_table.get((next_state, next_action), 0.0) for next_action in next_available_actions]
#             max_next_q_value = max(next_q_values)
#         else:
#             max_next_q_value = 0.0  # No future actions if the game is over

#         # Q-Learning update
#         new_q_value = current_q_value + alpha * (reward + gamma * max_next_q_value - current_q_value)
#         self.q_table[(state, action)] = new_q_value

#     def import_Qtable(self, filename):
#         """
#         Imports the Q-Table from a file.
#         """
#         try:
#             with open(filename, 'rb') as file:
#                 self.q_table = pickle.load(file)
#         except FileNotFoundError:
#             print("File not found. Starting with an empty Q-Table.")

#     def export_Qtable(self, filename):
#         """
#         Exports the Q-Table to a file.
#         """
#         with open(filename, 'wb') as file:
#             pickle.dump(self.q_table, file)

In [None]:
# # Crear una instancia del agente
# agent = Connect4Agent(chip="X", reward_scheme=(0.0, -1.0, 0.5, 1.0))

# # Ejemplo de un tablero de juego
# board = [
#     ["-", "-", "-", "-", "-", "-", "-"],
#     ["-", "-", "-", "-", "-", "-", "-"],
#     ["-", "-", "-", "-", "X", "-", "-"],
#     ["-", "-", "-", "X", "O", "-", "-"],
#     ["X", "X", "X", "O", "X", "-", "-"],
#     ["O", "X", "O", "X", "O", "-", "-"]
# ]

# # Escoger una acción con un 10% de exploración y 90% de explotación
# epsilon = 0.1  # Tasa de exploración
# action = agent.choose_action(board, epsilon)
# print("Chosen action (column):", action)