In [2]:
import gymnasium as gym
# Build a LunarLander environment with every physics option spelled out.
env = gym.make(
    "LunarLander-v2",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
)

In [3]:
import copy
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
import os
import argparse, pdb
import numpy as np
import tqdm
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader
from PIL import Image
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
from typing import Callable
import matplotlib.pyplot as plt
import torch.nn.functional as F
from collections import namedtuple, deque
import math

In [4]:
# One stored experience: (state, action, reward, next_state).
Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state'])


class ReplayMemory:
    """Bounded buffer of ``Transition`` records with uniform random sampling."""

    def __init__(self, maxlen):
        # A deque with ``maxlen`` drops its oldest entry when a new one is
        # appended to a full buffer.
        self.memory = deque([], maxlen=maxlen)

    def push(self, *args):
        """Pack ``args`` into a ``Transition`` and append it to the buffer."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return a list of ``batch_size`` stored transitions picked at random."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [5]:
class DQN(nn.Module):
    """Fully connected network with two 128-unit ReLU hidden layers.

    Maps an input of width ``state_size`` to ``action_size`` output values.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 128
        # Attribute names double as state_dict keys, so they must stay stable
        # for previously saved checkpoints to load.
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc_out = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Run ``x`` through both hidden layers and return the raw output layer values."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc_out(hidden)

class DQNAgent:
    """DQN agent: policy/target networks, replay memory and epsilon-greedy control.

    Args:
        state_size: Width of the state vector fed to the networks.
        action_size: Number of discrete actions (network output width).
        maxlen: Capacity of the replay memory.
    """

    def __init__(self, state_size, action_size, maxlen=50000):
        self.state_size = state_size
        self.action_size = action_size

        # Hyperparameters
        self.discount_factor = 0.99
        self.learning_rate = 5e-4
        self.epsilon_start = 0.9
        self.epsilon_min = 0.05
        self.epsilon_decay = 10000  # time constant of the exponential decay, in get_action calls
        self.batch_size = 64
        self.tau = 0.005            # mixing weight used by update_target_net
        self.steps_done = 0
        self.epsilon = 0
        self.loss_val = 0
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Separate policy and target models, initialised with identical parameters.
        self.policy_net = DQN(self.state_size, self.action_size).to(self.device)
        self.target_net = DQN(self.state_size, self.action_size).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)

        self.memory = ReplayMemory(maxlen=maxlen)

        # Huber loss
        self.lossfn = nn.SmoothL1Loss()

    def get_action(self, state):
        """Pick an action epsilon-greedily and advance the epsilon schedule.

        Epsilon decays exponentially from ``epsilon_start`` towards
        ``epsilon_min`` as ``steps_done`` grows.

        Args:
            state: ndarray of shape (1, state_size).

        Returns:
            int: index of the chosen action.
        """
        decay = math.exp(-1 * self.steps_done / self.epsilon_decay)
        self.epsilon = self.epsilon_min + (self.epsilon_start - self.epsilon_min) * decay
        self.steps_done += 1

        if np.random.rand() < self.epsilon:
            # Explore: uniformly random action.
            return random.randrange(self.action_size)

        # Exploit: greedy action under the policy network.
        state = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        with torch.no_grad():
            q_val = self.policy_net(state)
        return torch.argmax(q_val[0]).item()

    def append_memory(self, state, action, reward, next_state, done):
        """Store one transition in replay memory, adding a leading batch dimension.

        Args:
            state: Un-batched state vector (array-like of length state_size).
            action: Integer action index.
            reward: Scalar reward.
            next_state: Un-batched successor state vector.
            done: When true the successor is stored as ``None`` so that
                ``train_model`` gives it a next-state value of zero.
        """
        # torch.tensor(ndarray) always copies, so the stored tensor is
        # independent of the caller's buffer. Going through np.asarray avoids
        # the slow "tensor from a list of ndarrays" path.
        state = torch.tensor(np.asarray(state), dtype=torch.float32,
                             device=self.device).unsqueeze(0)
        action = torch.tensor([[action]], dtype=torch.long, device=self.device)
        reward = torch.tensor([reward], dtype=torch.float32, device=self.device)
        if not done:
            next_state = torch.tensor(np.asarray(next_state), dtype=torch.float32,
                                      device=self.device).unsqueeze(0)
        else:
            next_state = None
        self.memory.push(state, action, reward, next_state)

    def train_model(self):
        """Run one optimisation step on a random minibatch from replay memory.

        Does nothing until the memory holds at least ``batch_size`` transitions.
        The most recent loss is kept, detached, in ``self.loss_val``.
        """
        if len(self.memory) < self.batch_size:
            return

        # List of Transition objects -> one Transition whose fields are tuples
        # of per-sample tensors.
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                      device=self.device, dtype=torch.bool)
        non_final_next_states = [s for s in batch.next_state if s is not None]

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Q(s, a) of the actions that were actually taken.
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # Terminal successors keep a value of zero. The guard covers the case
        # where every sampled transition is terminal: torch.cat on an empty
        # list would raise.
        next_state_values = torch.zeros(self.batch_size, device=self.device)
        if non_final_next_states:
            with torch.no_grad():
                next_q = self.target_net(torch.cat(non_final_next_states))
                next_state_values[non_final_mask] = next_q.max(1).values

        target_values = (next_state_values * self.discount_factor) + reward_batch

        loss = self.lossfn(state_action_values, target_values.unsqueeze(1))
        # Keep only the value for logging; do not hold on to the autograd graph.
        self.loss_val = loss.detach()
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_value_(self.policy_net.parameters(), 50)
        self.optimizer.step()

    def update_target_net(self):
        """Soft-update the target network: target <- tau * policy + (1 - tau) * target."""
        target_net_state_dict = self.target_net.state_dict()
        policy_net_state_dict = self.policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = (policy_net_state_dict[key] * self.tau
                                          + target_net_state_dict[key] * (1 - self.tau))
        self.target_net.load_state_dict(target_net_state_dict)

In [6]:
# Directory that holds the DQN checkpoints, relative to the working directory.
# os.path.join picks the right separator for the host OS and keeps backslash
# escape sequences out of the string literal.
root = os.getcwd()
save_dir = os.path.join(root, 'saved_models', 'dqn')
print(save_dir)

c:\Users\Lee\PythonWorkspace\RL\saved_models\dqn


In [None]:
# Train a DQN agent on LunarLander, save a checkpoint every 100 episodes,
# then plot the per-episode score.
env = gym.make("LunarLander-v2", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

scores, episodes = [], []
EPISODES = 2000

try:
    for e in range(EPISODES):
        done = False
        score = 0
        # Reset the environment
        state, info = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            # Choose an action for the current state
            action = agent.get_action(state)

            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated

            next_state = np.reshape(next_state, [1, state_size])

            # Only a real termination makes next_state terminal. A time-limit
            # truncation ends the episode but the successor state still has
            # value, so the stored transition must keep it for bootstrapping.
            agent.append_memory(state[0], action, reward, next_state[0], terminated)

            # Train on a sampled minibatch, then soft-update the target network
            agent.train_model()
            agent.update_target_net()

            score += reward
            state = next_state

        loss = agent.loss_val
        print(f"episode: {e:4d}, score: {score:2f}, loss: {loss:2f}, epsilon: {agent.epsilon:.3f}")
        scores.append(score)
        episodes.append(e)

        # Save the model every 100 episodes
        if e > 0 and e % 100 == 0:
            # Make sure the checkpoint directory exists before writing to it.
            os.makedirs(save_dir, exist_ok=True)
            file_name = f"lunar_lander_dqn_ep{e}.pth"
            save_path = os.path.join(save_dir, file_name)
            torch.save(agent.policy_net.state_dict(), save_path)
            print(f"--- Model saved at {save_path} ---")
finally:
    # Release the environment even if training raises or is interrupted.
    env.close()

plt.figure(figsize=(10, 6))
plt.plot(episodes, scores, 'b')
plt.title("Lunar Lander - DQN Training")
plt.xlabel("Episode")
plt.ylabel("Score")
plt.grid(True)
plt.show()

ValueError: cannot reshape array of size 27648 into shape (1,96)

In [7]:
# Load a saved checkpoint and run a few greedy episodes with on-screen rendering.
# os.path.join avoids the invalid "\l" escape and the hard-coded Windows separator.
MODEL_PATH = os.path.join(save_dir, "lunar_lander_dqn_ep1800.pth")

episodes_n = 3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = gym.make("LunarLander-v2", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode="human")

try:
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    model = DQN(state_size, action_size).to(device)
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    # weights_only=True restricts unpickling to tensor data, which is all a
    # state_dict contains.
    model.load_state_dict(torch.load(MODEL_PATH, map_location=device, weights_only=True))
    model.eval()

    for e in range(episodes_n):
        state, info = env.reset()
        state = np.reshape(state, [1, state_size])

        done = False
        score = 0

        while not done:
            state_tensor = torch.FloatTensor(state).to(device)
            with torch.no_grad():
                q_values = model(state_tensor)

            # Greedy action: no exploration during evaluation.
            action = torch.argmax(q_values).item()

            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated

            score += reward
            state = np.reshape(next_state, [1, state_size])

        print(f"Episode {e+1}: Score = {score:.2f}")
finally:
    # Close the render window even if loading or stepping fails.
    env.close()

  model.load_state_dict(torch.load(MODEL_PATH))


Episode 1: Score = 275.76
Episode 2: Score = 305.04
Episode 3: Score = 271.00
