<a href="https://colab.research.google.com/github/jphan345/minecraft-ai/blob/main/minerl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# Mount Google Drive at /content/drive. The model and KMeans paths configured
# later in this notebook point into ./drive/MyDrive/Colab Notebooks.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Show which pip3 this runtime provides before installing anything.
!pip3 --version

In [None]:
# !sudo apt-get update -y
# !sudo apt-get install python3.6
# !sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1
# !sudo update-alternatives --config python3
# !sudo apt install python3-pip

# import tarfile
# my_tar = tarfile.open('./drive/MyDrive/Colab Notebooks/minerl-0.4.4.tar.gz')
# my_tar.extractall('./') # specify which folder to extract to
# my_tar.close()

# wait(100)
# !python3 ./minerl-0.4.4/setup.py install

In [None]:
%%capture
!sudo add-apt-repository -y ppa:openjdk-r/ppa
!sudo apt-get purge openjdk-*
!sudo apt-get install openjdk-8-jdk
!sudo apt-get install xvfb xserver-xephyr vnc4server python-opengl ffmpeg

In [None]:
%%capture
# Pin imageio to 2.4.1 (the same pin is repeated in a later install cell).
!sudo pip3 install imageio==2.4.1

In [None]:
%%capture
!pip3 install --upgrade minerl==0.3.7
!pip3 install pyvirtualdisplay
!pip3 install pytorch
!pip3 install scikit-learn
!pip3 install -U colabgymrender

In [None]:
%%capture
!sudo apt install ffmpeg
!pip3 install imageio==2.4.1

In [None]:
# !pip3 install --upgrade numpy==1.21.6
# !pip3 uninstall minerl
# !pip3 install minerl==0.3.7

# Import Libraries

In [None]:
import random
import numpy as np
import torch as th
from torch import nn
import torch.nn.functional as F
import gym
import minerl
from tqdm.notebook import tqdm
from colabgymrender.recorder import Recorder
from pyvirtualdisplay import Display
from sklearn.cluster import KMeans
import logging
import math
logging.disable(logging.ERROR) # reduce clutter, remove if something doesn't work to see the error logs.

# Traditional CNN

In [None]:
class NatureCNN(nn.Module):
    """
    Convolutional encoder from the DQN Nature paper followed by a two-layer MLP head.

    Reference:
        Mnih, Volodymyr, et al.
        "Human-level control through deep reinforcement learning."
        Nature 518.7540 (2015): 529-533.

    :param input_shape: Image dimensions as a three-item tuple in (C, H, W) order
    :param output_dim: Size of the vector produced for each observation
    """

    def __init__(self, input_shape, output_dim):
        super().__init__()
        in_channels = input_shape[0]

        # Three conv + ReLU stages, then flatten to one feature vector per sample.
        conv_stack = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4, padding=0),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=0),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.Flatten(),
        ]
        self.cnn = nn.Sequential(*conv_stack)

        # Push a dummy batch of one through the conv stack to learn the
        # flattened feature width for the given input shape.
        with th.no_grad():
            dummy_batch = th.zeros(1, *input_shape)
            flat_features = self.cnn(dummy_batch).shape[1]

        head = [
            nn.Linear(flat_features, 512),
            nn.ReLU(),
            nn.Linear(512, output_dim),
        ]
        self.linear = nn.Sequential(*head)

    def forward(self, observations: th.Tensor) -> th.Tensor:
        """Encode channel-first observations and map them to ``output_dim`` values."""
        features = self.cnn(observations)
        return self.linear(features)

# ImpalaCNN

A larger network than NatureCNN. Adding residual blocks and similar techniques can improve the performance of deep RL agents; see the IMPALA paper: https://arxiv.org/abs/1802.01561

In [None]:
class ImpalaCNN(nn.Module):
    """
    IMPALA-style convolutional network built from FixupResidual blocks.

    Three stages of (3x3 conv -> stride-2 max pool -> two residual blocks),
    two further residual blocks without pooling, then a single linear layer.

    :param input_shape: Image dimensions as a three-item tuple in (C, H, W).
        Note that ``forward`` takes channel-last (B, H, W, C) batches and
        permutes them itself.
    :param output_dim: Dimensionality of the output vector
    """

    def __init__(self, input_shape, output_dim):
        super().__init__()

        layers = []
        depth_in = input_shape[0]
        filter_sizes = (32, 64, 64)
        # Scaler for FixUp mid-most convolutions.
        first_conv_weight_scale = 1 / (math.sqrt(len(filter_sizes) * 2))

        for depth_out in filter_sizes:
            layers.extend([
                nn.Conv2d(depth_in, depth_out, kernel_size=3, stride=1, padding=1),
                nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
                FixupResidual(depth_out, first_conv_weight_scale),
                FixupResidual(depth_out, first_conv_weight_scale),
            ])
            depth_in = depth_out
        # Extra residual layer without max pooling
        layers.extend([
            FixupResidual(depth_in, first_conv_weight_scale),
            FixupResidual(depth_in, first_conv_weight_scale),
        ])

        self.conv_layers = nn.Sequential(*layers)

        # Only the max pools change the spatial size (every conv is 3x3 with
        # stride 1 and padding 1), so the flattened width is channels * H' * W'
        # after one halving per stage. For 64x64 inputs this is 64 * 8 * 8,
        # the same value the previous sqrt-based formula produced.
        out_height = self._pooled_size(input_shape[1], len(filter_sizes))
        out_width = self._pooled_size(input_shape[2], len(filter_sizes))
        self.linear = nn.Linear(filter_sizes[-1] * out_height * out_width, output_dim)

    @staticmethod
    def _pooled_size(size, num_pools):
        """
        Return a spatial extent after ``num_pools`` max pools (kernel 3, stride 2, padding 1).

        Each such pool maps ``size`` to ``floor((size - 1) / 2) + 1``.
        """
        for _ in range(num_pools):
            size = (size - 1) // 2 + 1
        return size

    def forward(self, observations: th.Tensor) -> th.Tensor:
        """
        :param observations: Batch of images shaped (batch_size, height, width, channels)
            with values in the 0-255 range
        :return: Tensor of shape (batch_size, output_dim)
        """
        # Transpose observations to be channel-first (BCHW instead of BHWC)
        observations = observations.permute(0, 3, 1, 2).contiguous()
        # Normalize here so callers can keep images in a compact dtype.
        # Integer inputs (e.g. uint8) are cast first, and the division is
        # out-of-place so the caller's tensor is never modified.
        if not observations.is_floating_point():
            observations = observations.float()
        observations = observations / 255.0
        x = self.conv_layers(observations)
        x = F.relu(x)
        x = x.view(x.shape[0], -1)
        x = self.linear(x)
        # NOTE(review): this ReLU clamps the outputs to be non-negative. It is
        # kept unchanged so existing checkpoints behave identically — confirm
        # whether it is intended when the outputs are used as logits.
        x = F.relu(x)
        return x


class ImpalaResidual(nn.Module):
    """
    Pre-activation residual block for an IMPALA CNN.

    Two bias-free 3x3 convolutions, each preceded by a ReLU; the block's
    input is added back onto the result.

    :param depth: Number of input and output channels
    """

    def __init__(self, depth):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1, bias=False)
        self.conv1 = nn.Conv2d(depth, depth, **conv_kwargs)
        self.conv2 = nn.Conv2d(depth, depth, **conv_kwargs)

    def forward(self, x):
        residual = self.conv1(F.relu(x))
        residual = self.conv2(F.relu(residual))
        return residual + x

class FixupResidual(nn.Module):
    """
    Residual block with Fixup-style initialization and no normalization layers.

    :param depth: Number of input and output channels
    :param first_conv_weight_scale: Factor applied to the first convolution's
        initial weights
    """

    def __init__(self, depth, first_conv_weight_scale):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1, bias=False)
        self.conv1 = nn.Conv2d(depth, depth, **conv_kwargs)
        self.conv2 = nn.Conv2d(depth, depth, **conv_kwargs)

        # Learnable per-channel offsets (start at 0) and multiplier (starts at 1);
        # the trailing singleton dims let them broadcast over height and width.
        per_channel = [depth, 1, 1]
        self.bias1 = nn.Parameter(th.zeros(per_channel))
        self.bias2 = nn.Parameter(th.zeros(per_channel))
        self.bias3 = nn.Parameter(th.zeros(per_channel))
        self.bias4 = nn.Parameter(th.zeros(per_channel))
        self.scale = nn.Parameter(th.ones(per_channel))

        # The last conv of the residual branch starts at zero, so a fresh block
        # initially outputs just its skip path; the first conv is scaled down.
        with th.no_grad():
            self.conv2.weight.mul_(0)
            self.conv1.weight.mul_(first_conv_weight_scale)

    def forward(self, x):
        # The skip connection carries the rectified input, not the raw one.
        shortcut = F.relu(x)

        branch = self.conv1(shortcut + self.bias1)
        branch = F.relu(branch + self.bias2)
        branch = self.conv2(branch + self.bias3)
        branch = branch * self.scale + self.bias4

        return branch + shortcut

# Setup training

In [None]:
def train():
    """
    Train the ImpalaCNN by behavioural cloning on discretised MineRL actions.

    Steps:
      1. Load (pov image, action vector) pairs from the ObtainIronPickaxe and
         Treechop VectorObf datasets until roughly DATA_SAMPLES samples are held.
      2. Fit KMeans with NUM_ACTION_CENTROIDS clusters on the action vectors.
      3. Train the network to predict, for every frame, the index of the
         centroid closest to the demonstrated action (cross-entropy loss).
      4. Save the centroids to TRAIN_KMEANS_MODEL_NAME and the network weights
         to TRAIN_MODEL_NAME.

    Reads the module-level constants DATA_SAMPLES, NUM_ACTION_CENTROIDS,
    LEARNING_RATE, EPOCHS, BATCH_SIZE, TRAIN_MODEL_NAME and
    TRAIN_KMEANS_MODEL_NAME. The network is placed on a CUDA device.
    Returns nothing.
    """
    # Only the ObtainIronPickaxe and Treechop data are used. They are smaller
    # than ObtainDiamond but share its early steps.
    # "VectorObf" stands for vectorized (vector observation and action), where there is no
    # clear mapping between original actions and the vectors (i.e. AI needs to learn it)
    iron_pick_data = minerl.data.make("MineRLObtainIronPickaxeVectorObf-v0", data_dir='data', num_workers=1)
    treechop_data = minerl.data.make("MineRLTreechopVectorObf-v0", data_dir='data', num_workers=1)
    # diamond_data = minerl.data.make("MineRLObtainDiamondVectorObf-v0",  data_dir='data', num_workers=1)

    datasets = [iron_pick_data, treechop_data]

    # Go over the datasets once and collect all actions and observations (the
    # "pov" image). Holding them in memory lets every epoch sample uniformly
    # and avoids memory spikes from repeated loading.
    all_actions = []
    all_pov_obs = []

    print("Loading data")
    for dataset_index, data in enumerate(datasets):
        trajectory_names = data.get_trajectory_names()
        random.shuffle(trajectory_names)

        # Each dataset gets an equal share of DATA_SAMPLES; the quota is
        # cumulative, so it is checked against the total collected so far.
        sample_quota = (DATA_SAMPLES / len(datasets)) * (dataset_index + 1)

        # Whole trajectories are added until the quota is reached.
        for trajectory_name in trajectory_names:
            trajectory = data.load_data(trajectory_name, skip_interval=0, include_metadata=False)
            for dataset_observation, dataset_action, _, _, _ in trajectory:
                all_actions.append(dataset_action["vector"])
                all_pov_obs.append(dataset_observation["pov"])
            if len(all_actions) >= sample_quota:
                break

    all_actions = np.array(all_actions)
    all_pov_obs = np.array(all_pov_obs)

    # Run k-means clustering using scikit-learn to find a small set of action
    # vectors that represents most of the demonstrated actions.
    print("Running KMeans on the action vectors")
    kmeans = KMeans(n_clusters=NUM_ACTION_CENTROIDS)
    kmeans.fit(all_actions)
    action_centroids = kmeans.cluster_centers_
    print("KMeans done")

    # Do behavioural cloning on the discrete actions, where we turn the
    # original vectors into discrete choices by mapping them to the closest
    # centroid (based on Euclidean distance).

    network = ImpalaCNN((3, 64, 64), NUM_ACTION_CENTROIDS).cuda()
    optimizer = th.optim.Adam(network.parameters(), lr=LEARNING_RATE)
    loss_function = nn.CrossEntropyLoss()

    # Intermediate checkpoints guard against losing a long run to a crash:
    # after `checkpoint_start` updates, save once every `checkpoint_every`.
    checkpoint_start = 1000000
    checkpoint_every = 100000

    num_samples = all_actions.shape[0]
    update_count = 0
    losses = []
    # Training loop over the data already held in all_actions / all_pov_obs.
    print("Training")
    for _ in range(EPOCHS):
        # Randomize the order in which we go over the samples
        epoch_indices = np.arange(num_samples)
        np.random.shuffle(epoch_indices)
        for batch_i in range(0, num_samples, BATCH_SIZE):
            # NOTE: the last batch of an epoch may hold fewer than BATCH_SIZE samples.
            batch_indices = epoch_indices[batch_i:batch_i + BATCH_SIZE]

            # Load the inputs. Transposing to channel-first and scaling to
            # [0, 1] happen inside the network's forward pass.
            obs = all_pov_obs[batch_indices].astype(np.float32)

            # Map actions to their closest centroids
            action_vectors = all_actions[batch_indices]
            # Use numpy broadcasting to compute the distance between all
            # actions and centroids at once.
            # "None" in indexing adds a new dimension that allows the broadcasting
            distances = np.sum((action_vectors - action_centroids[:, None]) ** 2, axis=2)
            # Get the index of the closest centroid to each action.
            # This is an array of (batch_size,)
            actions = np.argmin(distances, axis=0)

            # Obtain logits of each action
            logits = network(th.from_numpy(obs).float().cuda())

            # Minimize cross-entropy with target labels.
            # We could also compute the probability of demonstration actions and
            # maximize them.
            loss = loss_function(logits, th.from_numpy(actions).long().cuda())

            # Standard PyTorch update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            update_count += 1
            losses.append(loss.item())
            if (update_count % 1000) == 0:
                mean_loss = sum(losses) / len(losses)
                tqdm.write("Iteration {}. Loss {:<10.3f}".format(update_count, mean_loss))
                losses.clear()

            # The modulus must be compared with 0: a bare `update_count % N`
            # is truthy on every step that is NOT a multiple of N, which
            # would write a checkpoint on almost every update.
            if update_count > checkpoint_start and update_count % checkpoint_every == 0:
                np.save(TRAIN_KMEANS_MODEL_NAME, action_centroids)
                th.save(network.state_dict(), TRAIN_MODEL_NAME)
    print("Training done")

    # Save network and the centroids into separate files
    np.save(TRAIN_KMEANS_MODEL_NAME, action_centroids)
    th.save(network.state_dict(), TRAIN_MODEL_NAME)


# Parameters

In [None]:
# Parameters:
# (values noted alongside the originals: epochs = 256, batch_size = 32, lr = 0.0000625)

# --- Optimisation ---
BATCH_SIZE = 32
EPOCHS = 32  # number of passes over the loaded samples.
LEARNING_RATE = 6.25e-5  # step size handed to the optimizer.

# --- Data / action discretisation ---
DATA_SAMPLES = 1_000_000  # how many samples to load from the datasets. Impacts RAM usage
NUM_ACTION_CENTROIDS = 60  # number of KMeans centroids the action vectors are clustered into.

# --- Files on the mounted drive ---
# Training writes the TRAIN_* files and testing reads the TEST_* files;
# both currently point at the same locations.
TRAIN_MODEL_NAME = './drive/MyDrive/Colab Notebooks/trained_agent.pth'
TRAIN_KMEANS_MODEL_NAME = './drive/MyDrive/Colab Notebooks/kmeans_model.npy'
TEST_MODEL_NAME = './drive/MyDrive/Colab Notebooks/trained_agent.pth'
TEST_KMEANS_MODEL_NAME = './drive/MyDrive/Colab Notebooks/kmeans_model.npy'

# --- Evaluation ---
MAX_TEST_EPISODE_LEN = 6000  # 18k is the default for MineRLObtainDiamondVectorObf.
TEST_EPISODES = 25  # number of episodes to test the agent for.

# Download the data

In [None]:
import os

data_paths = [
    './data/MineRLObtainIronPickaxeVectorObf-v0',
    './data/MineRLTreechopVectorObf-v0',
    './data/MineRLObtainDiamondVectorObf-v0',
]

# The last entry (MineRLObtainDiamondVectorObf-v0) is skipped to save time,
# because it is not used for training.
for path in data_paths[:-1]:
    # Drop the leading './data/' (7 characters) to recover the environment name.
    environment_name = path[7:]
    if os.path.exists(path):
        print(f"{environment_name} already exists!")
        continue
    minerl.data.download(directory='data', environment=environment_name)

# Train

In [None]:
%%capture
# Refresh the package index, then install Xvfb without prompting (-y).
!apt-get update
!apt-get install -y xvfb

In [None]:
# Start a 400x300 pyvirtualdisplay Display with visible=0 (not shown on screen).
display = Display(visible=0, size=(400, 300))
display.start();

In [None]:
# Runs the full pipeline defined in train(): load data, fit KMeans, train the
# network, and write the files named by TRAIN_MODEL_NAME and TRAIN_KMEANS_MODEL_NAME.
train()  # only need to run this once.

# Start Minecraft

In [None]:
# Create the evaluation environment. Note it is the dense-reward ObtainDiamond
# task, while train() loads the ObtainIronPickaxe and Treechop datasets.
env = gym.make('MineRLObtainDiamondDenseVectorObf-v0')
# Wrap it in colabgymrender's Recorder, given the './video' directory and fps=60.
env = Recorder(env, './video', fps=60)

# Run your agent
As the code below runs, episode videos and rewards should appear. You can run the cell below multiple times to watch further episodes.

In [None]:
# Evaluate the trained agent: load the KMeans centroids and network weights,
# play TEST_EPISODES episodes, and report per-episode and average reward.
action_centroids = np.load(TEST_KMEANS_MODEL_NAME)
network = ImpalaCNN((3, 64, 64), NUM_ACTION_CENTROIDS).cuda()
network.load_state_dict(th.load(TEST_MODEL_NAME))


num_actions = action_centroids.shape[0]
action_list = np.arange(num_actions)
total_rewards = []

for episode in range(TEST_EPISODES):
    rewards = []  # the positive rewards received during this episode, in order
    env.seed(95)  # https://drive.google.com/file/d/1JawPwdfOyxTaYeF7imhhXp2VJR2wMnzk/view
    obs = env.reset()
    done = False
    total_reward = 0
    steps = 0

    while not done:
        # Prepare the observation:
        #   - Add a batch dimension and convert to float32
        #   - Transposing and normalizing are done in the network
        pov = th.from_numpy(obs['pov'][None].astype(np.float32)).cuda()
        # Inference only: skip autograd bookkeeping so no graph is built per step.
        with th.no_grad():
            # Turn logits into probabilities
            probabilities = th.softmax(network(pov), dim=1)[0]
        # Into numpy
        probabilities = probabilities.cpu().numpy()
        # Sample action according to the probabilities
        discrete_action = np.random.choice(action_list, p=probabilities)

        # Map the discrete action to the corresponding action centroid (vector)
        action = action_centroids[discrete_action]
        minerl_action = {"vector": action}

        obs, reward, done, info = env.step(minerl_action)
        total_reward += reward
        if reward > 0:
            rewards.append(reward)
        steps += 1
        # Cap the episode length even if the environment has not finished.
        if steps >= MAX_TEST_EPISODE_LEN:
            break

    env.release()
    env.play()
    total_rewards.append(total_reward)
    print(f'Episode #{episode + 1} reward: {total_reward}\t\t episode length: {steps}\t\t rewards: {rewards}\n')

# Guard against TEST_EPISODES == 0, which would otherwise divide by zero.
if total_rewards:
    avg_total_reward = sum(total_rewards) / len(total_rewards)
    print(f'Average dense reward: {avg_total_reward}.')