# Estudio comparativo de algoritmos en un problema de k-armed bandit

*Description:* El experimento compara el rendimiento de algoritmos epsilon-greedy en un problema de k-armed bandit.
Se generan gráficas de recompensas promedio y selecciones óptimas para cada algoritmo.

_Author: Luis Daniel Hernández Molinero_
_Email: ldaniel@um.es_
_Date: 2025/01/29_

This software is licensed under the GNU General Public License v3.0 (GPL-3.0),
with the additional restriction that it may not be used for commercial purposes.

For more details about GPL-3.0: https://www.gnu.org/licenses/gpl-3.0.html



In [None]:
#@title Copiar el repositorio
# Clone the k_bandits repository under /content and make it the working directory.
import os
from shutil import rmtree
import sys

# Switch to /content so the relative paths below resolve there
os.chdir('/content')

# Check whether the directory exists before deleting it (removes a previous checkout)
if os.path.exists("k_bandits"):  # Make sure the directory name matches the GitHub repository
    rmtree("k_bandits")

!git clone https://github.com/ldaniel-hm/k_bandits.git
# Move into the project directory
os.chdir('k_bandits')
!pwd
!ls

# Add the source directories to the Python path
# NOTE(review): the repository is cloned into /content/k_bandits, so these two
# paths look like they should be under /content/k_bandits — confirm.
sys.path.append('/content/algorithms')
sys.path.append('/content/arms')

# Print the path to check that the entries were added
print(sys.path)

In [None]:
#@title Importamos todas las clases y funciones

import numpy as np
from typing import List

from algorithms import Algorithm, EpsilonGreedy
from arms import ArmNormal, Bandit
from plotting import plot_average_rewards, plot_optimal_selections


In [None]:

def run_experiment(bandit: Bandit, algorithms: List[Algorithm], steps: int, runs: int):
    """Run every algorithm on the bandit and average the results over several runs.

    In each run a new ``Bandit`` is built from ``bandit.arms`` and every
    algorithm is reset. Then, at each step, every algorithm selects an arm,
    pulls it on that run's bandit and is updated with the observed reward.

    Args:
        bandit: Bandit whose arms are reused in every run and whose
            ``optimal_arm`` is the reference for the optimal-selection count.
        algorithms: Algorithms to compare; each must provide ``reset()``,
            ``select_arm()`` and ``update(arm, reward)``.
        steps: Number of steps per run.
        runs: Number of runs to average over; must be at least 1.

    Returns:
        A tuple ``(rewards, optimal_selections)`` of arrays with shape
        ``(len(algorithms), steps)``. ``rewards[i, t]`` is the reward obtained
        by algorithm ``i`` at step ``t`` averaged over the runs, and
        ``optimal_selections[i, t]`` is the percentage of runs in which that
        algorithm chose ``bandit.optimal_arm`` at step ``t``.

    Raises:
        ValueError: If ``runs`` is less than 1 (the averages would be 0/0).
    """
    if runs < 1:
        raise ValueError(f"runs must be at least 1, got {runs}")

    optimal_arm = bandit.optimal_arm
    n_algorithms = len(algorithms)

    # Accumulators summed over runs; normalised once all runs are done.
    rewards = np.zeros((n_algorithms, steps))
    optimal_selections = np.zeros((n_algorithms, steps))

    for _ in range(runs):
        # Every run uses its own Bandit instance built from the same arms.
        current_bandit = Bandit(arms=bandit.arms)

        for algo in algorithms:
            algo.reset()

        for step in range(steps):
            for idx, algo in enumerate(algorithms):
                chosen_arm = algo.select_arm()
                reward = current_bandit.pull_arm(chosen_arm)
                algo.update(chosen_arm, reward)

                rewards[idx, step] += reward
                if chosen_arm == optimal_arm:
                    optimal_selections[idx, step] += 1

    rewards /= runs
    # Convert the hit counts into a percentage of runs.
    optimal_selections = (optimal_selections / runs) * 100

    return rewards, optimal_selections


In [None]:

# Experiment configuration
k = 10        # number of arms
steps = 1000  # steps per run
runs = 500    # number of runs

# Seed NumPy's global random generator
seed = 42
np.random.seed(seed)

# Build the bandit and print it together with its optimal arm
bandit = Bandit(arms=ArmNormal.generate_arms(k))
print(bandit)

optimal_arm = bandit.optimal_arm
print(f"Optimal arm: {optimal_arm + 1} with expected reward={bandit.get_expected_value(optimal_arm)}")

# Algorithms to compare: one epsilon-greedy instance per epsilon value
algorithms = [EpsilonGreedy(k=k, epsilon=eps) for eps in (0, 0.01, 0.1)]

# Run the experiment
rewards, optimal_selections = run_experiment(bandit, algorithms, steps, runs)


In [None]:

# Plot the results: average reward per step for each algorithm
plot_average_rewards(steps, rewards, algorithms)
# The optimal-selection plot is currently disabled:
# plot_optimal_selections(steps, optimal_selections, algorithms)
