# QLearning - LightsOut (3x3, 4x4 o 5x5)

# Importación de Bibliotecas y Definición de Parámetros

Empezamos importando las bibliotecas necesarias y estableciendo algunos parámetros iniciales.

In [None]:
import numpy as np
from itertools import product
import random
from tqdm import tqdm

# Board geometry: number of rows and columns of the Lights Out grid.
rows = 4
columns = 4

# Total number of cells on the board.
states = rows * columns

# Integer codes used for a lit and an unlit cell.
on = 1
off = 0

# Discount factor applied to future rewards.
gamma = 0.9

# Funciones de Conversión de Estado y Visualización
Luego, se definen algunas funciones auxiliares para convertir entre las coordenadas (x, y) y el estado, así como para visualizar el tablero.

In [None]:
def state_from_xy(x, y):
    """Return the flat cell index of column ``x`` and row ``y``.

    Cells are numbered row by row, so the index is ``y * columns + x``.
    """
    return y * columns + x


def xy_from_state(state):
    """Return the ``(x, y)`` pair (column, row) of the flat cell index ``state``.

    This is the inverse of ``state_from_xy``:
    ``state_from_xy(*xy_from_state(s)) == s``.
    """
    # divmod yields (row, column); swap so the result really is (x, y).
    y, x = divmod(state, columns)
    return x, y

def all_possible_states():
    """Enumerate every board configuration.

    Returns a 2-D array with one row per board and one column per cell, i.e.
    shape ``(2 ** (rows * columns), rows * columns)``. Boards come out in the
    lexicographic order produced by ``itertools.product``, so row ``i`` holds
    the binary digits of ``i`` (first cell most significant) and row 0 is the
    all-zeros board.

    The number of boards doubles with every extra cell, so memory use grows
    exponentially with the board size.
    """
    # Use rows * columns (not rows ** 2) so the cell count follows both
    # module-level dimensions; the two agree on square boards.
    n_cells = rows * columns
    return np.array(list(product([0, 1], repeat=n_cells)))

def view3x3(state):
    """Format cells 0..8 of ``state`` as three lines of three space-separated values."""
    lines = []
    for start in (0, 3, 6):
        lines.append(" ".join(f"{state[i]}" for i in range(start, start + 3)))
    return "\n".join(lines)

def view4x4(state):
    """Format cells 0..15 of ``state`` as four lines of four space-separated values."""
    cells = [f"{state[i]}" for i in range(16)]
    return "\n".join(" ".join(cells[start:start + 4]) for start in range(0, 16, 4))

def view5x5(state):
    """Format cells 0..24 of ``state`` as five lines of five space-separated values."""
    side = 5
    rendered = []
    for row in range(side):
        offset = row * side
        rendered.append(" ".join(f"{state[offset + col]}" for col in range(side)))
    return "\n".join(rendered)

def view_board(state):
    """Render ``state`` as a ``rows`` x ``columns`` grid.

    Cells are read row by row from the flat sequence ``state``; values on a
    line are separated by a single space and lines by a newline. For 3x3, 4x4
    and 5x5 boards the output is the same text the fixed-size viewers produce.
    Other board sizes are rendered too, instead of returning ``None``.
    """
    return "\n".join(
        " ".join(f"{state[row * columns + col]}" for col in range(columns))
        for row in range(rows)
    )

Estas funciones permiten convertir entre coordenadas (x, y) e índices de celda, y visualizar el tablero como una cuadrícula de 3x3, 4x4 o 5x5, según el tamaño especificado.

# Funciones Auxiliares y Barras de Progreso

* La función loading_bar se utiliza para visualizar una barra de progreso durante el entrenamiento y la evaluación.
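* perform_action aplica una acción al estado actual del juego: pulsa una celda e invierte esa luz y sus vecinas ortogonales (arriba, abajo, izquierda y derecha), devolviendo un nuevo estado.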
* calculate_reward calcula la recompensa actual basada en cuántas luces están apagadas en el estado actual.

In [None]:
def loading_bar(n_epochs):
    """Redraw a 50-character progress bar in place on the current line.

    ``n_epochs`` is printed as a percentage and half of it is the number of
    ``#`` marks drawn. The line starts with a carriage return and has no
    trailing newline, so successive calls overwrite each other.
    """
    filled = n_epochs // 2
    bar = "#" * filled + " " * (50 - filled)
    print(f'\r[{bar}] {n_epochs}% ', end='')

def perform_action(state, n_state):
    """Press cell ``n_state`` and return the resulting board.

    Pressing a cell toggles that cell and its orthogonal neighbours (up,
    down, left, right) that lie on the board. ``state`` itself is left
    untouched; the toggles are applied to a copy obtained with
    ``state.copy()``.

    Parameters:
        state: flat board, row by row, ``rows * columns`` cells long.
        n_state: flat index of the cell to press.

    Returns:
        A new board of the same type as ``state``.

    Raises:
        IndexError: if ``n_state`` is not a valid cell index. This also
            rejects negative indices, which would otherwise wrap around.
    """
    if not 0 <= n_state < rows * columns:
        raise IndexError(f"cell index {n_state} is outside the board")

    new_state = state.copy()
    # The row width is the number of columns, so neighbours are found from
    # the (row, column) position rather than from modular tests on `rows`.
    row, col = divmod(n_state, columns)

    targets = [n_state]
    if row > 0:
        targets.append(n_state - columns)
    if row < rows - 1:
        targets.append(n_state + columns)
    if col > 0:
        targets.append(n_state - 1)
    if col < columns - 1:
        targets.append(n_state + 1)

    for cell in targets:
        new_state[cell] = on if new_state[cell] == off else off

    return new_state

def calculate_reward(state):
    """Return the number of cells in ``state`` that are equal to 0."""
    return sum(1 for cell in state if cell == 0)

# Algoritmo de Q-Learning

* La función Q_learning implementa el algoritmo de Q-Learning para aprender una política óptima.
* Se inicializa una matriz Q con ceros y se actualiza iterativamente con las recompensas obtenidas por las acciones tomadas.

In [None]:
def Q_learning(states, max_steps, episodes):
    """Train a tabular Q-function with epsilon-greedy exploration.

    Parameters:
        states: sequence of flat 0/1 boards from which each episode's start
            board is drawn uniformly at random.
        max_steps: number of actions taken in every episode.
        episodes: number of training episodes.

    Returns:
        Array ``Q`` of shape ``(2 ** n_cells, n_cells)`` where
        ``n_cells = rows * columns``. Row ``i`` belongs to the board whose
        cells, read as a binary number with the first cell most significant,
        equal ``i``; column ``j`` is the action "press cell ``j``".

    The reward for a transition is ``calculate_reward`` of the successor
    board. The learning rate decays as ``1 / sqrt(episode + 1)``.
    """
    n_cells = rows * columns
    Q = np.zeros((2 ** n_cells, n_cells))
    # Bit weights, most significant first: dot(board, weights) is the Q row
    # of a board. This matches the row order of an enumeration built with
    # itertools.product([0, 1], repeat=n_cells).
    weights = 1 << np.arange(n_cells - 1, -1, -1)
    epsilon = 0.5  # probability of taking a random, exploratory action

    for episode in range(episodes):
        # The learning rate depends only on the episode, so compute it once.
        alpha = 1 / np.sqrt(episode + 1)

        # Draw the start board from the whole collection and look up its own
        # Q row; the position inside `states` is not the Q row.
        state = states[random.randint(0, len(states) - 1)]
        state_index = int(np.dot(state, weights))

        for _ in range(max_steps):
            if random.uniform(0, 1) < epsilon:
                action_taken = random.randint(0, n_cells - 1)
            else:
                action_taken = np.argmax(Q[state_index])

            next_state = perform_action(state, action_taken)
            next_index = int(np.dot(next_state, weights))
            reward_next_state = calculate_reward(next_state)

            # Bootstrap from the successor's Q row. Indexing Q with the board
            # array itself would select rows 0 and 1 repeatedly instead.
            td_target = reward_next_state + gamma * np.max(Q[next_index])
            Q[state_index, action_taken] += alpha * (td_target - Q[state_index, action_taken])

            # Advance both the board and its row index.
            state, state_index = next_state, next_index
        loading_bar(episode * 100 // episodes)
    return Q

# Every board configuration except the first enumerated one, which is the
# all-zeros board.
states = all_possible_states()[1:]

# Train the Q-table: 500 episodes of 500 steps each.
Q = Q_learning(states, max_steps=500, episodes=500)

# Evaluación del Agente

* evaluate_agent evalúa el desempeño del agente entrenado utilizando el Q-table aprendido.
* Se realizan múltiples episodios para calcular el promedio de las recompensas obtenidas y su desviación estándar.

In [None]:
def evaluate_agent(states, Q, max_steps, episodes):
    """Run the greedy policy of ``Q`` and summarise the rewards it collects.

    Parameters:
        states: sequence of flat 0/1 boards from which each episode's start
            board is drawn uniformly at random.
        Q: 2-D Q-table with one column per cell. Its rows are assumed to be
            indexed by the board's cells read as a binary number, first cell
            most significant.
        max_steps: number of greedy actions taken per episode.
        episodes: number of evaluation episodes.

    Returns:
        ``(mean_reward, std_reward)``: mean and standard deviation, over the
        episodes, of the summed per-step ``calculate_reward`` values.
    """
    episode_rewards = []
    n_cells = np.shape(Q)[1]
    # Bit weights, most significant first: dot(board, weights) is the Q row.
    weights = 1 << np.arange(n_cells - 1, -1, -1)

    for episode in range(episodes):
        # Draw the start board from the whole collection.
        state = states[random.randint(0, len(states) - 1)]
        total_rewards_ep = 0

        for _ in range(max_steps):
            state_index = int(np.dot(state, weights))
            action_taken = np.argmax(Q[state_index])
            # Advance the board so the next step acts on the new
            # configuration instead of re-evaluating the start board.
            state = perform_action(state, action_taken)
            total_rewards_ep += calculate_reward(state)

        episode_rewards.append(total_rewards_ep)
        loading_bar(episode * 100 // episodes)

    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)
    return mean_reward, std_reward

# Evaluate the trained Q-table over 500 episodes of 500 steps each.
mean_reward, std_reward = evaluate_agent(states, Q, max_steps=500, episodes=500)

# The progress bar is printed without a trailing newline, so end that line
# before reporting the results.
print()
print(f"Mean Reward: {mean_reward}, Std Reward: {std_reward}")

# Simulación del Juego

* simulate_game simula un juego del problema de luces apagadas utilizando el Q-table aprendido.
* El juego comienza con un estado inicial específico y el agente selecciona acciones utilizando una política greedy basada en los valores de Q aprendidos.

In [None]:
def simulate_game(Q, *, initial_state=None, epsilon=0.5, max_moves=10,
                  max_attempts=10000):
    """Play Lights Out from a fixed board using the learned Q-table.

    Each attempt starts from ``initial_state`` and picks actions
    epsilon-greedily: with probability ``epsilon`` a random cell is pressed,
    otherwise the cell with the highest Q-value for the current board. An
    attempt is abandoned and restarted once more than ``max_moves`` actions
    have been taken without solving the board.

    Parameters:
        Q: 2-D Q-table with one column per cell. Its rows are assumed to be
            indexed by the board's cells read as a binary number, first cell
            most significant.
        initial_state: flat 0/1 board to start from. Defaults to the built-in
            4x4 example board.
        epsilon: probability of a random action. The default equals the
            exploration rate used during training.
        max_moves: an attempt may take at most ``max_moves + 1`` actions.
        max_attempts: number of attempts before giving up, so the function
            always terminates.

    Returns:
        The list of pressed cell indices of the successful attempt, or ``0``
        if no attempt turned every light off.
    """
    if initial_state is None:
        initial_state = [1, 1, 0, 0,
                         1, 1, 1, 0,
                         0, 0, 1, 1,
                         1, 1, 0, 1]
    initial_state = list(initial_state)
    n_cells = len(initial_state)
    # Bit weights, most significant first: dot(board, weights) is the Q row.
    weights = 1 << np.arange(n_cells - 1, -1, -1)

    print("Initial State:")
    print(view_board(initial_state))

    for _ in range(max_attempts):
        # Restart from a fresh copy so attempts never share a board object.
        current_state = initial_state.copy()
        actions = []

        while len(actions) <= max_moves:
            # Compare against epsilon: a bare `if random.uniform(0, 1):` is
            # almost always true and would make every move random.
            if random.uniform(0, 1) < epsilon:
                action_taken = random.randint(0, n_cells - 1)
            else:
                state_index = int(np.dot(current_state, weights))
                action_taken = int(np.argmax(Q[state_index]))

            # Perform the selected action
            current_state = perform_action(current_state, action_taken)
            actions.append(action_taken)

            # calculate_reward counts the lights that are off, so the board
            # is solved when that count equals the number of cells.
            if calculate_reward(current_state) == n_cells:
                print("Final State:")
                print(view_board(current_state))
                print("Congratulations! All lights are turned off.")
                return actions

    return 0

# Run one simulated game with the trained Q-table and print what it returns.
print(simulate_game(Q))