<a href="https://colab.research.google.com/github/juhumkwon/source_code/blob/main/Q_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

import numpy as np

# Environment: a one-dimensional grid world (5 cells by default).
# The goal is the right-most cell (state 4 with the default size).
class Environment:
    """One-dimensional grid world whose goal is the right-most cell.

    States are the integers ``0 .. state_space - 1``. Two actions are
    available: ``0`` moves one cell left and any other value moves one cell
    right. Moves that would leave the grid are clamped to the boundary.
    """

    def __init__(self, state_space=5):
        """Create the grid.

        Args:
            state_space: Number of cells in the grid (default 5).

        Raises:
            ValueError: If ``state_space`` is smaller than 1.
        """
        if state_space < 1:
            raise ValueError("state_space must be at least 1")
        self.state_space = state_space  # number of states
        self.action_space = 2  # number of actions (0: left, 1: right)

    def step(self, state, action):
        """Apply ``action`` in ``state`` and return the transition result.

        Args:
            state: Current cell index.
            action: ``0`` to move left; any other value moves right.

        Returns:
            A ``(next_state, reward, done)`` tuple. ``reward`` is 1 and
            ``done`` is True exactly when ``next_state`` is the goal cell;
            otherwise they are 0 and False.
        """
        # Derive the goal from the grid size rather than hard-coding 4, so
        # the reward stays correct for grids of any length.
        goal_state = self.state_space - 1

        if action == 0:  # move left, clamped at the left edge
            next_state = max(0, state - 1)
        else:  # move right, clamped at the right edge
            next_state = min(goal_state, state + 1)

        done = next_state == goal_state
        reward = 1 if done else 0
        return next_state, reward, done

# Q-learning agent
class QAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    The Q-table has shape ``(state_space, action_space)`` and starts at zero.
    ``epsilon`` is multiplied by ``epsilon_decay`` on every call to
    :meth:`learn` and never drops below ``epsilon_min``.
    """

    def __init__(self, state_space, action_space, learning_rate=0.1, discount_factor=0.9, epsilon=1.0):
        """Initialise the Q-table and hyper-parameters.

        Args:
            state_space: Number of states (rows of the Q-table).
            action_space: Number of actions (columns of the Q-table).
            learning_rate: Step size applied to each temporal-difference update.
            discount_factor: Weight of the best next-state value in the target.
            epsilon: Initial probability of picking a random action.
        """
        self.action_space = action_space
        self.q_table = np.zeros((state_space, action_space))
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.epsilon_decay = 0.99
        self.epsilon_min = 0.1

    def choose_action(self, state):
        """Return an action index for ``state`` using epsilon-greedy selection.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the largest Q-value for ``state``.
        """
        if np.random.rand() < self.epsilon:
            # Sample over the configured number of actions instead of a
            # hard-coded 2, so every column of the Q-table can be explored.
            return np.random.choice(self.action_space)
        return np.argmax(self.q_table[state])  # greedy action

    def learn(self, state, action, reward, next_state):
        """Apply one Q-learning update, then decay the exploration rate.

        Args:
            state: State in which ``action`` was taken.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: State reached after the transition.
        """
        best_future_q = np.max(self.q_table[next_state])
        target = reward + self.discount_factor * best_future_q
        self.q_table[state, action] += self.learning_rate * (target - self.q_table[state, action])
        # Decay epsilon once per update (i.e. per step), floored at epsilon_min.
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

# Run training
env = Environment()
# Size the agent from the environment so the two always agree.
agent = QAgent(state_space=env.state_space, action_space=env.action_space)

episodes = 100
for episode in range(episodes):
    # Every episode starts in state 0 with no accumulated reward.
    state, total_reward, done = 0, 0, False

    # Interact until the environment reports the episode is finished.
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done = env.step(state, action)
        agent.learn(state, action, reward, next_state)
        total_reward += reward
        state = next_state

    print(f"Episode: {episode+1}, Total Reward: {total_reward}, Epsilon: {agent.epsilon:.2f}")

# Print the Q-table after training
print("\nQ-table after training:")
print(agent.q_table)

Episode: 1, Total Reward: 1, Epsilon: 0.61
Episode: 2, Total Reward: 1, Epsilon: 0.22
Episode: 3, Total Reward: 1, Epsilon: 0.14
Episode: 4, Total Reward: 1, Epsilon: 0.10
Episode: 5, Total Reward: 1, Epsilon: 0.10
Episode: 6, Total Reward: 1, Epsilon: 0.10
Episode: 7, Total Reward: 1, Epsilon: 0.10
Episode: 8, Total Reward: 1, Epsilon: 0.10
Episode: 9, Total Reward: 1, Epsilon: 0.10
Episode: 10, Total Reward: 1, Epsilon: 0.10
Episode: 11, Total Reward: 1, Epsilon: 0.10
Episode: 12, Total Reward: 1, Epsilon: 0.10
Episode: 13, Total Reward: 1, Epsilon: 0.10
Episode: 14, Total Reward: 1, Epsilon: 0.10
Episode: 15, Total Reward: 1, Epsilon: 0.10
Episode: 16, Total Reward: 1, Epsilon: 0.10
Episode: 17, Total Reward: 1, Epsilon: 0.10
Episode: 18, Total Reward: 1, Epsilon: 0.10
Episode: 19, Total Reward: 1, Epsilon: 0.10
Episode: 20, Total Reward: 1, Epsilon: 0.10
Episode: 21, Total Reward: 1, Epsilon: 0.10
Episode: 22, Total Reward: 1, Epsilon: 0.10
Episode: 23, Total Reward: 1, Epsilon: 0.