# Catan DQN Training Notebook

In [None]:
import sys
import os
import shutil
if not os.path.exists('catan-ai-cpsc474-574'):
    !git clone https://github.com/haroonmoh/catan-ai-cpsc474-574

if os.path.exists('catan-ai-cpsc474-574/code'):
    sys.path.append(os.path.abspath('catan-ai-cpsc474-574/code'))
elif os.path.exists('code'):
    sys.path.append(os.path.abspath('code'))
elif os.path.exists('../code'):
    sys.path.append(os.path.abspath('../code'))

import random
import numpy as np
import collections
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Flatten
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam

from catan_dqn_env import CatanEnv

# Optional: seed the NumPy, TensorFlow and Python random number generators
# with the same fixed value. Kept for now because it helps while debugging.
for _seed_rng in (np.random.seed, tf.random.set_seed, random.seed):
    _seed_rng(100)

## DQN Agent Implementation

In [None]:
class DQNAgent:
    """Deep Q-Network agent with experience replay, a target network and action masking.

    The agent keeps two Keras networks with identical architecture: ``model``
    (trained on every replay step) and ``target_model`` (a snapshot used to
    compute bootstrap targets, refreshed via ``update_target_model``).
    Invalid actions are excluded both when acting and when computing the
    bootstrap value of the next state.
    """

    def __init__(self, state_size, action_size):
        """Create the replay buffer, hyper-parameters and both networks.

        Args:
            state_size: Length of the flat observation vector fed to the network.
            action_size: Number of discrete actions (size of the network output).
        """
        self.state_size = state_size
        self.action_size = action_size
        # Bounded replay buffer: once full, the oldest transitions are dropped.
        self.memory = collections.deque(maxlen=100000)
        self.gamma = 0.95            # discount factor for future rewards
        self.epsilon = 1.0           # current exploration probability
        self.epsilon_min = 0.01      # lower bound intended for epsilon
        self.epsilon_decay = 0.999   # multiplicative decay applied by the training loop
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()

    def _build_model(self):
        """Build and compile the Q-network (256-128-64 ReLU MLP with a linear head)."""
        model = Sequential()
        model.add(Input(shape=(self.state_size,)))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(64, activation='relu'))
        # One linear output per action: the estimated Q-value.
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def record(self, state, action, reward, next_state, done, next_mask):
        """Append one transition to the replay buffer.

        ``next_mask`` marks the actions that are valid in ``next_state``; it is
        stored so that ``replay`` can mask the bootstrap value.
        """
        self.memory.append((state, action, reward, next_state, done, next_mask))

    def act(self, state, action_mask):
        """Choose an action epsilon-greedily among the valid actions.

        Args:
            state: Flat observation; reshaped to a batch of one for prediction.
            action_mask: Array-like of truthy/falsy flags, one per action.

        Returns:
            The chosen action index as ``int``. Returns 0 (END_TURN) when the
            mask marks no action as valid.
        """
        # Coerce to bool: `~` on an integer 0/1 mask is a bitwise NOT that
        # yields -1/-2 and would silently index the wrong entries.
        action_mask = np.asarray(action_mask, dtype=bool)
        valid_actions = np.flatnonzero(action_mask)
        if valid_actions.size == 0:    # if no valid actions, return 0 which is END_TURN
            return 0

        if np.random.rand() <= self.epsilon:    # explore with epsilon probability
            return int(np.random.choice(valid_actions))

        # Exploit: Q-values for every action in the current state.
        q = np.asarray(self.model.predict(np.asarray(state).reshape(1, -1), verbose=0)[0])

        # Invalid actions get -inf so argmax can never select them.
        q = np.where(action_mask, q, -np.inf)
        return int(np.argmax(q))

    def replay(self, batch_size):
        """Train the online network on one random minibatch from the buffer.

        For each sampled transition the target for the taken action is
        ``r`` if the transition is terminal, otherwise
        ``r + gamma * max_a' Q_target(s', a')`` over the valid next actions.
        Does nothing while the buffer holds fewer than ``batch_size`` items.
        """
        if len(self.memory) < batch_size:
            return

        # sample k transitions
        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones, next_masks = zip(*minibatch)

        states = np.array(states)
        actions = np.asarray(actions, dtype=int)
        rewards = np.asarray(rewards, dtype=float)
        next_states = np.array(next_states)
        dones = np.asarray(dones, dtype=bool)
        next_masks = np.asarray(next_masks, dtype=bool)

        # Current estimates (used as targets for the untouched actions) and
        # target-network estimates for the next states.
        target = self.model.predict(states, verbose=0)
        target_next = np.asarray(self.target_model.predict(next_states, verbose=0))

        # Best valid next-state value per sample.
        best_next = np.where(next_masks, target_next, -np.inf).max(axis=1)

        # A non-terminal next state without any valid action would otherwise
        # bootstrap from the mask filler and blow up the loss. act() falls
        # back to action 0 in that situation, so use that action's value.
        no_valid = ~next_masks.any(axis=1)
        best_next[no_valid] = target_next[no_valid, 0]

        # Terminal transitions do not bootstrap.
        best_next[dones] = 0.0

        # function from lecture r + gamma * max(q(s', a'))
        target[np.arange(batch_size), actions] = rewards + self.gamma * best_next

        # train q_train to match q_target
        self.model.fit(states, target, batch_size=batch_size, epochs=1, verbose=0)

    def load(self, name):
        """Load online-network weights from ``name`` and sync the target network.

        Without the sync the target network would keep its previous weights
        until the next periodic update.
        """
        self.model.load_weights(name)
        self.update_target_model()

    def save(self, name):
        """Save the online network's weights to ``name``."""
        self.model.save_weights(name)


## Training Loop

In [None]:
# Build the environment and an agent sized to its observation/action spaces.
env = CatanEnv()
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

EPISODES = 1000
BATCH_SIZE = 32
TRAIN_EVERY_N_ACTIONS = 4              # run one replay step every N environment actions
UPDATE_TARGET_EVERY_M_EPISODES = 10    # sync the target network every M episodes

scores = []          # total reward per episode, plotted in the next cell
total_actions = 0    # action counter across all episodes (drives the replay schedule)

for e in range(EPISODES):
    state = env.reset()
    score = 0
    done = False
    steps = 0

    while not done:   # while s is not terminal
        mask = env.action_mask()        # masking out invalid actions
        action = agent.act(state, mask)      # choose a with epsilon-greedy

        # record (s, a, r, s')
        next_state, reward, done, _ = env.step(action)
        # The mask of a terminal state is never used for bootstrapping, so an
        # all-valid placeholder keeps the stored masks a uniform shape.
        next_mask = env.action_mask() if not done else np.ones(action_size, dtype=bool)
        agent.record(state, action, reward, next_state, done, next_mask)

        # update s <- s'
        state = next_state
        score += reward
        steps += 1
        total_actions += 1

        # update every n actions pseudocode from lecture
        if total_actions % TRAIN_EVERY_N_ACTIONS == 0 and len(agent.memory) > BATCH_SIZE:
            agent.replay(BATCH_SIZE)

    if (e + 1) % UPDATE_TARGET_EVERY_M_EPISODES == 0:
        agent.update_target_model()
        print(f"Target network updated at episode {e+1}")

    # Decay epsilon every 2 episodes (instead of every episode), clamped so
    # the last multiplication cannot push it below epsilon_min.
    # NOTE(review): 1000 episodes give 500 decays; with a decay factor of
    # 0.999 epsilon only falls to about 0.999**500 ~= 0.61 - confirm intended.
    if (e + 1) % 2 == 0 and agent.epsilon > agent.epsilon_min:
        agent.epsilon = max(agent.epsilon_min, agent.epsilon * agent.epsilon_decay)

    print(f"episode: {e}/{EPISODES}, score: {score:.2f}, steps: {steps}, e: {agent.epsilon:.3}")
    scores.append(score)

    # Checkpoint every 10 episodes, and always after the final episode so the
    # fully trained weights are not lost.
    if e % 10 == 0 or e == EPISODES - 1:
        agent.save(f"catan-dqn-{e}.weights.h5")

In [None]:
import matplotlib.pyplot as plt

# Plot the total reward collected in each training episode on the current
# axes (created on demand if none exists yet).
score_axes = plt.gca()
score_axes.plot(scores)
score_axes.set_ylabel('Score')
score_axes.set_xlabel('Episode')
plt.show()