Multi-armed bandit에서 순수 탐험과 $\epsilon$-greedy 정책의 기대이득

In [63]:
import numpy as np
import matplotlib.pyplot as plt

In [64]:
def simulate_pull_bandit(handle, visit, win, probs=None):
    """Simulate a single pull of one arm and update the counters in place.

    Args:
        handle: Index of the arm being pulled.
        visit: Mutable per-arm pull counters; ``visit[handle]`` is
            incremented by one.
        win: Mutable per-arm win counters; ``win[handle]`` is incremented
            by one when the pull succeeds.
        probs: Optional per-arm success probabilities. When omitted, the
            module-level ``arm_prob`` is used, which keeps existing
            three-argument calls working unchanged.

    Returns:
        None. ``visit`` and ``win`` are modified in place.
    """
    if probs is None:
        # Resolved at call time, so the global may be defined after this
        # function (as it is when cells are run in notebook order).
        probs = arm_prob
    visit[handle] += 1
    # A uniform draw in [0, 1) falls below p with probability p.
    if np.random.rand() < probs[handle]:
        win[handle] += 1

In [65]:
def exploration():
    """Run the pure-exploration policy: every pull picks an arm uniformly.

    Reads the module-level ``num_arms`` and ``num_pulls`` and performs
    ``num_pulls`` simulated pulls through ``simulate_pull_bandit``.

    Returns:
        A ``(visit, win)`` pair of length-``num_arms`` float arrays holding
        the per-arm pull and win counters. Both counters start at one, so
        the returned values include that initial count.
    """
    pulls = np.ones(num_arms)
    wins = np.ones(num_arms)
    for _ in range(num_pulls):
        # Uniform choice over arm indices 0 .. num_arms - 1.
        arm = np.random.randint(num_arms)
        simulate_pull_bandit(arm, pulls, wins)
    return pulls, wins

def epsilon_greedy(epsilon):
    """Run the epsilon-greedy policy for ``num_pulls`` simulated pulls.

    On each pull, with probability ``epsilon`` an arm is chosen uniformly at
    random; otherwise the arm with the highest observed win/visit ratio is
    chosen, with ties broken uniformly at random.

    Args:
        epsilon: Probability of taking a random (exploratory) pull.

    Returns:
        A ``(visit, win)`` pair of length-``num_arms`` float arrays holding
        the per-arm pull and win counters. Both counters start at one, so
        every arm begins with a ratio of 1.0 and the returned values include
        that initial count.
    """
    pulls = np.ones(num_arms)
    wins = np.ones(num_arms)
    for _ in range(num_pulls):
        explore = np.random.rand() < epsilon
        if explore:
            arm = np.random.randint(num_arms)
        else:
            # Observed success ratio per arm; counters start at one, so the
            # denominator is never zero.
            ratio = wins / pulls
            best_arms = np.flatnonzero(ratio == ratio.max())
            # Random tie-break among all arms sharing the top ratio.
            arm = np.random.choice(best_arms)
        simulate_pull_bandit(arm, pulls, wins)
    return pulls, wins


In [66]:
# Experiment configuration, read as globals by the policy functions.
num_arms = 6
arm_prob = [0.4, 0.2, 0.1, 0.5, 0.3, 0.6]
num_pulls = 10000

# Run each policy in turn and print the same three-line summary for it.
# NOTE(review): the first label says "%" but the values are printed as
# fractions with four decimals (not multiplied by 100).
for title, run_policy in (
    ('순수 탐험 정책', exploration),
    ('epsilon 탐험 정책', lambda: epsilon_greedy(0.1)),
):
    visit, win = run_policy()
    win_rate = win / visit
    profit = 2 * win - visit  # 2 * number of wins - number of pulls
    print(title)
    print('- 승률(%):', [f'{rate:.4f}' for rate in win_rate])
    print('- 수익($):', [f'{gain:.0f}' for gain in profit])
    print('- 총수익($):', sum(profit))


순수 탐험 정책
- 승률(%): ['0.4038', '0.1941', '0.1084', '0.4985', '0.2845', '0.6052']
- 수익($): ['-320', '-1024', '-1265', '-5', '-709', '367']
- 총수익($): -2956.0
epsilon 탐험 정책
- 승률(%): ['0.4294', '0.2312', '0.0520', '0.5000', '0.3260', '0.5949']
- 수익($): ['-25', '-100', '-155', '0', '-63', '1733']
- 총수익($): 1390.0
