In [5]:
import numpy as np
import random

class MontyHallAgent:
    def __init__(self, epsilon, alpha, gamma, num_doors):
        self.epsilon = epsilon  # 탐험 확률
        self.alpha = alpha  # 학습 속도
        self.gamma = gamma  # 할인 계수
        self.num_doors = num_doors  # 문의 개수
        self.q_values = {}  # Q-Value 테이블 초기화

    def get_state_key(self, player_choice, revealed_door):
        return f"{player_choice}_{revealed_door}"

    def get_action(self, state):
        # 탐험 또는 이전에 학습한 Q-Value에 기반하여 행동 선택
        if random.uniform(0, 1) < self.epsilon:
            return random.choice([0, 1])  # 0: 변경하지 않음, 1: 변경
        else:
            return np.argmax(self.q_values.get(state, [0, 0]))

    def update_q_value(self, state, action, reward, next_state):
        current_q = self.q_values.get(state, [0, 0])
        max_future_q = np.max(self.q_values.get(next_state, [0, 0]))
        new_q = current_q[action] + self.alpha * (reward + self.gamma * max_future_q - current_q[action])
        current_q[action] = new_q
        self.q_values[state] = current_q

def play_monty_hall(agent, num_episodes):
    for episode in range(num_episodes):
        # 초기 상태 설정
        player_choice = random.randint(0, 2)
        car_door = random.randint(0, 2)
        revealed_door = [door for door in range(3) if door != player_choice and door != car_door][0]

        # 플레이어 선택 변경 여부 결정
        action = agent.get_action(agent.get_state_key(player_choice, revealed_door))

        # 보상 계산
        reward = 1 if (player_choice == car_door and action == 1) or (player_choice != car_door and action == 0) else 0

        # 다음 상태 키 생성
        next_state = agent.get_state_key(player_choice, revealed_door)

        # Q-Value 업데이트
        agent.update_q_value(agent.get_state_key(player_choice, revealed_door), action, reward, next_state)

# Monty Hall 에이전트 생성
agent = MontyHallAgent(epsilon=0.1, alpha=0.5, gamma=0.9, num_doors=3)

# 학습 실행
play_monty_hall(agent, num_episodes=10000)

# 학습된 Q-Value 테이블 출력
print("학습된 Q-Value 테이블:")
print(agent.q_values)

학습된 Q-Value 테이블:
{'0_1': [3.973052686718854, 4.051168521868455], '1_0': [4.820608253620243, 3.6043626048771307], '2_0': [5.611490289119962, 4.722033404701614], '0_2': [9.999999999999979, 8.999999999999979], '1_2': [9.999999999999979, 8.999999999999645], '2_1': [9.999999999999979, 8.999999999999979]}
