***Import libraries***

In [1]:
import gym
import gym_2048
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import numpy as np
import random
import time
import pandas as pd
from IPython import display

***Hyperparameters***

In [2]:
# --- Reproducibility ---
# Seed every RNG this notebook draws from directly (epsilon-greedy draws, replay
# sampling, network weight initialisation) so a fresh "Restart & Run All" starts
# from the same random state.
# NOTE(review): the environment's own RNG (used by env.reset()/env.step() and
# env.action_space.sample()) is not seeded here -- confirm how the installed
# gym / gym_2048 versions expose seeding if full determinism is required.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# --- Exploration (epsilon-greedy) ---
max_epsilon = 1.0 # exploration rate at the first episode
min_epsilon = 0.1 # exploration rate once the decay has finished

# --- Learning ---
gamma = 0.99 # discount factor applied to the bootstrapped next-state value
memory_size = 5000 # number of experiences being stored in the memory
batch_size = 64 # batch size to sample experiences
target_update_frequency = 100 # after this amount of actions, update the target-network with the q-network

***Q-Network Class***

In [16]:
class NeuralNetwork(nn.Module):
    """Fully connected Q-network: 16 input features -> 3 x 64 tanh units -> 4 Q-values.

    The input is flattened before the first linear layer, so a single board may
    be passed as any array with 16 elements per sample (e.g. 4x4 or flat 16).
    """

    def __init__(self, env):
        """Build the layers.

        Args:
            env: Accepted for interface compatibility; it is not read here.
                The layer sizes are fixed (16 inputs, 4 outputs).
        """
        super(NeuralNetwork, self).__init__()

        # The layer order is kept as-is so saved state_dict keys
        # ("network.1.weight", ...) stay loadable.
        self.network = nn.Sequential(
            # Hidden layers
            nn.Flatten(),
            nn.Linear(in_features = 16, out_features = 64), # First hidden layer
            nn.Tanh(),
            nn.Linear(in_features = 64, out_features = 64), # Second hidden layer
            nn.Tanh(),
            nn.Linear(in_features = 64, out_features = 64), # Third hidden layer
            nn.Tanh(),
            # Output layer
            nn.Linear(64, 4)
        )

    def forward(self, x):
        """Return the Q-values for a batch of states `x` (one row of 4 values per sample)."""
        return self.network(x)

    def _single_state_q_values(self, state):
        """Return the (1, 4) Q-value tensor for one state, without tracking gradients.

        Action selection never backpropagates, so the forward pass runs under
        `torch.no_grad()` to avoid building an autograd graph on every step.
        """
        state = np.ascontiguousarray(state, dtype=np.float32)
        # Put the input on whatever device the parameters live on.
        device = next(self.parameters()).device
        state = torch.tensor(state, dtype=torch.float32, device=device)
        with torch.no_grad():
            return self(state.unsqueeze(0)) # Pytorch require input in terms of batch

    def choose_action(self, state):
        """Return the greedy action (index of the largest Q-value) for `state` as an int."""
        q_values = self._single_state_q_values(state)
        return torch.argmax(q_values, dim=1)[0].item()

    def choose_action_and_print_q_values(self, state):
        """Same as `choose_action`, but also prints the Q-values and the chosen action."""
        q_values = self._single_state_q_values(state)
        print("q-values: ", q_values)
        best_action_index = torch.argmax(q_values, dim=1)[0].item()
        print(f"Best value: {torch.max(q_values)}, action: {best_action_index}")

        return best_action_index

    def choose_random_action(self, env):
        """Return a uniformly random action index in [0, env.action_space.n)."""
        return random.randrange(0, env.action_space.n)

***Create the game environment and the experience replay memory***

In [1]:
# Experience replay buffer: a bounded deque, so once `memory_size` entries are
# stored the oldest experience is discarded automatically on append.
memory = deque(maxlen=memory_size)

# The 2048 game environment (registered with gym by importing `gym_2048`).
env = gym.make('2048-v0')

***Create the two networks (Q-network and target network)***

In [5]:
# Online network: the one that is trained and used to pick actions.
q_net = NeuralNetwork(env)
optimizer = optim.Adam(params=q_net.parameters(), lr=5e-4)

# Target network: starts as an exact copy of the online network's weights.
target_net = NeuralNetwork(env)
target_net.load_state_dict(q_net.state_dict())

***Train the Q-network (the target network is synced periodically)***

In [2]:
# DQN training loop: epsilon-greedy interaction with the environment, experience
# replay, and a target network that is synced every `target_update_frequency`
# actions. Per-episode statistics are collected and written to disk at the end.

num_episode = 2000
epsilon_decay_intervals = num_episode # epsilon reaches min_epsilon after this many episodes
max_num_steps = 10000 # upper bound on the number of actions in one episode

reward_per_episode = 0.0
count_step = 0 # actions taken over all episodes; drives the target-network sync

# Wide dtypes on purpose: the accumulated reward is a float and narrow unsigned
# integer arrays would silently truncate or wrap large (or negative) values.
training_info = {
    "Episode": np.zeros(shape=num_episode, dtype=np.int64),
    "Total_reward": np.zeros(shape=num_episode, dtype=np.float64),
    "Max_value": np.zeros(shape=num_episode, dtype=np.int64),
    "Total_actions": np.zeros(shape=num_episode, dtype=np.int64),
}

# Note: process_time() measures CPU time of this process, not wall-clock time.
start_time = time.process_time()
for episode in range(num_episode):
    # Reset game
    state = env.reset()
    reward_per_episode = 0.0

    # Linear decay from max_epsilon (episode 0) to min_epsilon (episode epsilon_decay_intervals)
    epsilon = np.interp(episode, [0, epsilon_decay_intervals], [max_epsilon, min_epsilon])

    for step in range(max_num_steps):
        # Count number of action has been executed to update the target-network
        count_step += 1

        # Choose action: explore with probability epsilon, otherwise act greedily
        exploration = np.random.uniform(0, 1)
        if exploration <= epsilon:
            action = env.action_space.sample()
        else:
            action = q_net.choose_action(state)

        next_state, reward, done, info = env.step(action)
        reward_per_episode += reward

        # Get max grid's value
        max_grid_value = np.amax(next_state)

        # Store experience
        experience = (state, action, reward, done, next_state)
        memory.append(experience)

        # Change to next state
        state = next_state

        # Sample batch of experiences to learn
        if len(memory) >= batch_size:
            # Take batch_size experiences from the memory
            experiences = random.sample(memory, batch_size)
            states, actions, rewards, dones, next_states = zip(*experiences)

            # Change to tensor. The states are stacked into one ndarray first:
            # building a tensor straight from a list of ndarrays is very slow.
            states = torch.tensor(np.asarray(states, dtype=np.float32), dtype=torch.float32)
            actions = torch.tensor(actions, dtype=torch.int64).unsqueeze(-1) # (batch_size,) --> (batch_size, 1)
            rewards = torch.tensor(rewards, dtype=torch.float32).unsqueeze(-1)
            dones = torch.tensor(dones, dtype=torch.float32).unsqueeze(-1)
            next_states = torch.tensor(np.asarray(next_states, dtype=np.float32), dtype=torch.float32)

            # Compute target values using the formulation sample = r + gamma * max q(s', a').
            # The targets are constants for the loss, so no autograd graph is built
            # through the target network (the optimizer only updates q_net anyway).
            with torch.no_grad():
                target_q_values = target_net(next_states)
                max_target_q_values = target_q_values.max(dim=1, keepdim=True)[0] # index 0 to take the max values, index 1 to take the max values's index
                targets = rewards + gamma * (1 - dones) * max_target_q_values

            # Compute loss on the Q-value of the action that was actually taken
            q_values = q_net(states)
            action_q_values = torch.gather(input=q_values, dim=1, index=actions)
            loss = nn.functional.mse_loss(action_q_values, targets)

            # Gradient descent for q-network
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Update target-network
        if count_step % target_update_frequency == 0:
            target_net.load_state_dict(q_net.state_dict())

        if done:
            break

    # `step` is a 0-based index, so the number of actions taken is step + 1.
    total_actions = step + 1
    print(f"Episode {episode+1}   Reward: {reward_per_episode}   Max value: {max_grid_value}   Total actions: {total_actions}")

    # Store information about an episode. This also runs when the episode was
    # cut off at max_num_steps, so no row is left as all zeros.
    training_info["Episode"][episode] = episode+1
    training_info["Total_reward"][episode] = reward_per_episode
    training_info["Max_value"][episode] = max_grid_value
    training_info["Total_actions"][episode] = total_actions

end_time = time.process_time()
print('\nTime To train: ', end_time - start_time, " seconds")

# Saving training_info and the q-net
df = pd.DataFrame(training_info)
# Name of the model and the excel file to store results. It is derived from the
# actual settings so it cannot drift from them; with the current values this
# gives "3layers_2000eps_5000memorySize", the file name loaded by the play cell.
name = f"3layers_{num_episode}eps_{memory_size}memorySize"
df.to_excel(excel_writer="training_info_" + name + ".xlsx", index=False)
torch.save(q_net.state_dict(), "model_" + name + ".pth")

***Load the trained model and let it play***

In [3]:
# Rebuild the architecture, then restore the trained weights saved on disk.
model = NeuralNetwork(env)
trained_weights = torch.load("model_3layers_2000eps_5000memorySize.pth")
model.load_state_dict(trained_weights)

# NOTE(review): `play_multiple_times` is not defined in the visible part of this
# notebook -- confirm that the cell defining it runs before this one.
play_multiple_times(env, model, num_episodes=100)