In [22]:
# Definition of the Agent that plays Connect4
import json
import numpy as np
import random
import pickle
from importnb import Notebook
with Notebook():
    import Board

In [24]:
class Agent:
    """Tabular Q-learning agent that plays Connect 4 with a fixed chip.

    The agent observes the board through ``env`` as one ``(max_own, max_other)``
    pair per column, picks columns epsilon-greedily and stores its value
    estimates in ``q_table``, a dict keyed by ``(tuple(state), action)``.

    ``env`` is a project-defined board object. This class calls
    ``env.columns``, ``env.get_vectors_of_column``, ``env.verify_vector``,
    ``env.get_possible_actions``, ``env.place_chip`` and ``env.verify_winner``
    (and, in ``test_performance`` only, ``env.nrows``, ``env.ncols`` and
    ``env.is_terminal``).
    """

    # Reward contributed by the number of own neighbouring chips (index = count).
    OWN_NEIGHBORS_REWARDS = (0, 20, 50, 200, 500)
    # Reward contributed by the number of opponent neighbouring chips (index = count).
    OPPONENT_NEIGHBORS_REWARDS = (0, 5, 15, 70, -100)

    def __init__(self, env, chip, epsilon=0.95, alpha=0.5, gamma=1, filename=''):
        """Create an agent bound to ``env`` that plays ``chip``.

        Args:
            env: board/environment object (see the class docstring for the
                members this class uses).
            chip: chip played by the agent; must be either "X" or "O".
            epsilon: probability of choosing a random action in
                ``choose_action``.
            alpha: learning rate used in ``update_values``.
            gamma: discount factor applied to the next state's value.
            filename: path of a pickled Q-table to load; the empty string
                starts with an empty table.

        Raises:
            ValueError: if ``chip`` is not "X" or "O".
        """
        if chip not in ["X", "O"]:
            raise ValueError("chip must be 'X' or 'O'")

        self.env = env
        self.chip = chip
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        # Previous observation, action and reward; consumed by play_turn.
        self.state = []
        self.action = 0
        self.reward = 0

        # Always defined, so the attribute can be read even without a file.
        self.filename = filename
        if filename != '':
            self.q_table = self.import_Qtable(filename)
        else:
            self.q_table = {}

    def import_Qtable(self, filename):
        """Load a pickled Q-table from ``filename``.

        Every handled failure is reported on stdout and yields an empty
        table, so the returned value is always usable as ``q_table``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only load files
        produced by ``export_Qtable`` from a trusted source.
        """
        try:
            print(filename)

            with open(filename, 'rb') as file:
                q_table = pickle.load(file)
                print("Q-table load successfull new")
                print(q_table)
                return q_table
        except FileNotFoundError:
            print("File not found. Starting with an empty Q-Table.")
        except pickle.UnpicklingError:
            print("Error: The file content is not a valid pickle format.")
        except EOFError:
            print("Error: The file is incomplete or corrupted.")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
        # Fall back to an empty table instead of None, which would break
        # every later q_table lookup.
        return {}

    def export_Qtable(self, filename):
        """Pickle the current Q-table to ``filename``."""
        with open(filename, 'wb') as file:
            pickle.dump(self.q_table, file)

    def export_Qtable_JSON(self, filename):
        """Write the Q-table to ``filename`` as JSON; errors are printed, not raised."""
        try:
            # Keys are (state, action) tuples; JSON only allows string keys.
            raw_data = {str(key): value for key, value in self.q_table.items()}
            # Serialise before opening so a failure does not truncate the file.
            payload = json.dumps(raw_data, indent=4)
            with open(filename, 'w') as file:
                file.write(payload)
                print("Q-table saved successfully")
        except Exception as e:
            print(f"Error saving Q-table: {e}")

    def get_current_state(self):
        """Build the observation: one ``(max_own, max_other)`` pair per column.

        For each column the environment supplies vectors through
        ``get_vectors_of_column``; each non-empty vector is passed to
        ``verify_vector`` together with the agent's chip. The pair holds the
        maxima of the first and second result components (presumably own and
        opponent chip counts - confirm against Board.verify_vector).
        A column whose first four vectors are all empty is encoded as
        ``(-1, -1)``.
        """
        state = []
        for column in range(self.env.columns):
            vectors = self.env.get_vectors_of_column(column)
            if vectors[0] == [] and vectors[1] == [] and vectors[2] == [] and vectors[3] == []:
                state.append((-1, -1))
                continue

            max_own = 0
            max_other = 0
            for vector in vectors:
                if vector == []:
                    # Nothing to evaluate in this direction; indexing it would raise.
                    continue
                result = self.env.verify_vector(vector[0], vector[1], self.chip)
                max_own = max(max_own, result[0])
                # Second component is the opponent's count (was result[0]).
                max_other = max(max_other, result[1])
            state.append((max_own, max_other))
        return state

    def get_value(self, state, action: int) -> float:
        """Return the stored Q-value for ``(state, action)``, or 0 if unseen."""
        value = self.q_table.get((tuple(state), action))
        return 0 if value is None else value

    def best_action(self, state) -> int:
        """Return a highest-valued available action, or -1 when none exist.

        Ties on the best value are broken uniformly at random.
        """
        actions = self.env.get_possible_actions()
        if len(actions) == 0:
            return -1

        values = {action: self.get_value(state, action) for action in actions}
        best_value = max(values.values())
        tied_actions = [action for action, value in values.items() if value == best_value]
        return random.choice(tied_actions)

    def choose_action(self, state) -> int:
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a random available action is returned,
        otherwise ``best_action(state)``. When exploring with no available
        action, ``random.choice`` raises ``IndexError``.
        """
        actions = self.env.get_possible_actions()
        prob = random.uniform(0, 1)
        if prob <= self.epsilon:
            return random.choice(actions)
        return self.best_action(state)

    def update_values(self, state, action: int, next_state, reward: int) -> None:
        """Apply one Q-learning update to ``(state, action)``.

        new_Q = (1 - alpha) * Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a'))
        """
        actual_Q_value = self.get_value(state, action)
        next_action = self.best_action(next_state)
        next_Q_value = self.get_value(next_state, next_action)
        new_Q_value = ((1 - self.alpha) * actual_Q_value) + self.alpha * (reward + (self.gamma * next_Q_value))
        self.q_table[(tuple(state), action)] = new_Q_value

    def step(self, state, action: int) -> tuple[int, bool, str]:
        """Play ``action`` on the board and return ``(reward, status, info)``.

        The reward is the sum of the table entries selected by the own and
        opponent counts of the chosen column; ``status`` is the result of
        ``env.verify_winner`` for the agent's chip and ``info`` a short
        message describing it.
        """
        # NOTE(review): assumes actions are 1-based while `state` is 0-based -
        # confirm against Board.get_possible_actions / place_chip. If actions
        # are 0-based this reads the neighbouring column instead.
        # NOTE(review): a (-1, -1) entry would index the reward tables from
        # the end (500 and -100) - confirm such columns are never playable.
        own_neighbors, opponent_neighbors = state[action - 1]

        # These tables define how offensive or defensive the strategy is.
        reward = (
            self.OWN_NEIGHBORS_REWARDS[own_neighbors]
            + self.OPPONENT_NEIGHBORS_REWARDS[opponent_neighbors]
        )
        self.env.place_chip(action, self.chip)
        status = self.env.verify_winner(self.chip)
        info = 'El estado es terminal' if status else 'El juego continua'
        return (reward, status, info)

    def play_turn(self, episode=1):
        """Play one turn: observe, learn from the previous turn, then move.

        On the first call (no stored state) the agent only observes and
        chooses. On later calls it first updates the Q-value of the previous
        ``(state, action)`` using the reward stored from that move and the
        newly observed state. The chosen action is then played via ``step``.
        """
        if not self.state:
            self.state = self.get_current_state()
            self.action = self.choose_action(self.state)
        else:
            next_state = self.get_current_state()
            next_action = self.choose_action(next_state)
            self.update_values(self.state, self.action, next_state, self.reward)
            self.state, self.action = next_state, next_action

        self.reward, _done, _info = self.step(self.state, self.action)

        # Decay exploration by 10% while it is above 0.01.
        # NOTE(review): the condition depends only on `episode`, so this runs
        # on every call made with a qualifying episode number - confirm that
        # per-turn (rather than per-episode) decay is intended.
        if ((episode + 1) % 200) == 0:
            if self.epsilon > 0.01:
                self.epsilon -= (self.epsilon * 0.1)

    def test_performance(self) -> tuple[dict, dict]:
        """Collect the greedy action and its value for every non-terminal cell.

        NOTE(review): this relies on ``env.nrows``, ``env.ncols`` and
        ``env.is_terminal`` and uses ``(row, col)`` pairs as states, unlike
        the per-column states produced by ``get_current_state`` - confirm the
        environment provides these members before relying on it.
        """
        actions = {}
        values = {}
        for i in range(self.env.nrows):
            for j in range(self.env.ncols):
                state = (i, j)  # must be hashable, it is used as a dict key
                if not self.env.is_terminal(state):
                    action = self.best_action(state)
                    actions[(i, j)] = action
                    values[(tuple(state), action)] = self.get_value(state, action)
        return actions, values


In [26]:
def test():
    """Check ``verify_vector`` results for a handful of hand-written vectors.

    Each case is ``(position, vector, expected)`` and is evaluated for chip 'X'.

    NOTE(review): the Agent class in this notebook defines no
    ``verify_vector`` method (it calls ``self.env.verify_vector``), and
    ``env`` here is a plain list of rows rather than a board object, so this
    function looks stale - confirm where ``verify_vector`` lives and update
    the calls accordingly.
    """
    env = [
        ['-', '-', '-', '-', '-', '-', '-'],
        ['-', '-', '-', '-', '-', '-', '-'],
        ['-', '-', 'O', '-', '-', '-', '-'],
        ['-', '-', 'O', '-', '-', '-', '-'],
        ['O', '-', 'X', 'X', 'X', 'X', 'O'],
        ['-', '-', 'O', 'O', 'X', 'O', 'X'],
    ]
    agente = Agent(env, 'X')

    failure_message = "La funcion no esta retornando los valores correctos"
    cases = [
        (3, ['-', 'X', 'X', '-', 'X', 'O'], (3, 0)),
        (3, ['-', 'O', 'O', '-', 'X', 'O'], (1, 2)),
        (0, ['-', 'O', 'O', '-', 'X', 'O'], (0, 2)),
        (5, ['-', 'O', 'O', 'X', 'X', '-'], (2, 0)),
        (2, ['-', '-', '-', '-'], (0, 0)),
    ]
    for position, vector, expected in cases:
        assert agente.verify_vector(position, vector, 'X') == expected, failure_message

In [28]:
# NOTE: stale example kept for reference only. It targets a different API
# (`Connect4Agent`, `reward_scheme`, `choose_action(board, epsilon)`) that does
# not match the `Agent` class defined above, so it will not run as written.
# # Create an agent instance
# agent = Connect4Agent(chip="X", reward_scheme=(0.0, -1.0, 0.5, 1.0))

# # Example game board
# board = [
#     ["-", "-", "-", "-", "-", "-", "-"],
#     ["-", "-", "-", "-", "-", "-", "-"],
#     ["-", "-", "-", "-", "X", "-", "-"],
#     ["-", "-", "-", "X", "O", "-", "-"],
#     ["X", "X", "X", "O", "X", "-", "-"],
#     ["O", "X", "O", "X", "O", "-", "-"]
# ]

# # Choose an action with 10% exploration and 90% exploitation
# epsilon = 0.1  # Exploration rate
# action = agent.choose_action(board, epsilon)
# print("Chosen action (column):", action)