<a href="https://colab.research.google.com/github/juhumkwon/Defense_Cloud/blob/main/CartPole_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gym pygame tensorflow

Collecting gym
  Downloading gym-0.26.2.tar.gz (721 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/721.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.7/721.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m716.8/721.7 kB[0m [31m14.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pygame
  Downloading pygame-2.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting

In [3]:
import gymnasium as gym # Modified: Changed import from gym to gymnasium
import numpy as np
import tensorflow as tf
from tensorflow.keras import models, layers, optimizers
import random
from collections import deque
import time

# Create the environment and select its render mode.
env = gym.make("CartPole-v1", render_mode="human")
# Read the problem dimensions off the environment's observation and action spaces.
num_states, num_actions = env.observation_space.shape[0], env.action_space.n

# 모델 정의
def build_model():
    """Build the Q-network: an MLP mapping a state vector to one Q-value per action.

    Reads the module-level ``num_states`` (input width) and ``num_actions``
    (output width).

    Returns:
        An uncompiled Keras ``Sequential`` model with two 24-unit ReLU hidden
        layers and a linear output layer of size ``num_actions``.
    """
    model = models.Sequential([
        # Declare the input with an explicit Input layer. Passing `input_shape`
        # to the first Dense layer is deprecated in Keras 3 and emits a UserWarning.
        layers.Input(shape=(num_states,)),
        layers.Dense(24, activation='relu'),
        layers.Dense(24, activation='relu'),
        layers.Dense(num_actions, activation='linear')
    ])
    return model

# Online Q-network plus a target network that starts as an exact copy of it.
model, target_model = build_model(), build_model()
target_model.set_weights(model.get_weights())

# Hyperparameters
gamma = 0.99                      # discount factor used in the TD target
epsilon, epsilon_min = 1.0, 0.01  # exploration rate and the floor it decays towards
epsilon_decay = 0.995             # multiplicative decay applied to epsilon per episode
batch_size = 64                   # number of replay samples per gradient step
memory = deque(maxlen=2000)       # replay buffer; the oldest entries drop out once full
optimizer = optimizers.Adam(learning_rate=0.001)
loss_fn = tf.keras.losses.MeanSquaredError()

# Training loop (DQN with a replay buffer and a periodically synced target network).
# The whole loop runs under try/finally so the environment is closed even if
# training is interrupted or raises.
try:
    for ep in range(300):
        state, info = env.reset()
        # Keras models take input shaped (batch_size, input_dim), so give the
        # single observation a leading batch axis.
        state = np.reshape(state, [1, num_states])
        total_reward = 0

        for t in range(500):
            # Visualization.
            # NOTE(review): with render_mode="human" Gymnasium is documented to
            # render inside step(); this explicit call may be redundant — confirm
            # before removing.
            env.render()
            time.sleep(0.02)  # slow playback down to roughly 50 FPS

            # Epsilon-greedy action selection.
            if np.random.rand() < epsilon:
                action = random.randint(0, num_actions - 1)
            else:
                q_vals = model(state, training=False).numpy()[0]
                action = int(np.argmax(q_vals))

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            next_state = np.reshape(next_state, [1, num_states])
            # Store `terminated`, not `done`: a time-limit truncation is not a
            # true terminal state, so its TD target must still bootstrap from
            # the next state's value.
            memory.append((state, action, reward, next_state, terminated))
            # The state returned by the environment becomes the current state
            # for the next time step.
            state = next_state

            total_reward += reward

            if done:
                break

            # Learning step: one gradient update on a random replay minibatch.
            if len(memory) >= batch_size:
                minibatch = random.sample(memory, batch_size)
                states = np.vstack([tr[0] for tr in minibatch])
                actions = np.array([tr[1] for tr in minibatch])
                rewards = np.array([tr[2] for tr in minibatch], dtype=np.float32)
                next_states = np.vstack([tr[3] for tr in minibatch])
                terminals = np.array([tr[4] for tr in minibatch], dtype=np.float32)

                # If the same network were used for both the current-state and
                # the next-state values, its parameters would keep changing
                # during training and the targets would keep shifting (the
                # "moving target" problem). Next-state values therefore come
                # from the separate target network.
                #
                # The models are called directly instead of via predict():
                # for a single small batch this avoids predict()'s per-call
                # overhead. np.array(...) yields a writable copy.
                q_targets = np.array(model(states, training=False))
                q_next = np.array(target_model(next_states, training=False))

                # TD target for the taken action only: r for terminal
                # transitions, r + gamma * max_a' Q_target(s', a') otherwise.
                q_targets[np.arange(batch_size), actions] = (
                    rewards + gamma * (1.0 - terminals) * q_next.max(axis=1)
                )

                with tf.GradientTape() as tape:
                    q_preds = model(states, training=True)
                    loss = loss_fn(q_targets, q_preds)
                grads = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Sync the target network with the online network every 20 episodes.
        if ep % 20 == 0:
            target_model.set_weights(model.get_weights())

        # Decay epsilon, clamped so it never drops below epsilon_min.
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        if (ep + 1) % 50 == 0:
            print(f"Episode {ep+1}, Reward: {total_reward}, Epsilon: {epsilon:.3f}")
finally:
    env.close()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Episode 50, Reward: 11.0, Epsilon: 0.778
Episode 100, Reward: 80.0, Epsilon: 0.606
Episode 150, Reward: 211.0, Epsilon: 0.471
Episode 200, Reward: 219.0, Epsilon: 0.367
Episode 250, Reward: 152.0, Epsilon: 0.286
Episode 300, Reward: 131.0, Epsilon: 0.222


In [2]:
# Install Gymnasium, the package the training cell imports under the alias `gym`.
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-1.1.1-py3-none-any.whl.metadata (9.4 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.1.1-py3-none-any.whl (965 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m965.4/965.4 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.1.1
