In [1]:
import numpy as np
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from torchsummary import summary
from sklearn.metrics import explained_variance_score
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = np.load('valid_100K_disc1_cleaned.npz')

In [3]:
x_train = data['observations']
p_train = data['actions']
v_train = data['rewards']
x_train.shape, p_train.shape, v_train.shape

((95623, 11, 11, 18), (95623,), (95623,))

In [52]:
class LinearModel(nn.Module):
    def __init__(self, input_size=11*11*18, hidden_size=128):
        super().__init__()
        self.fc0  = nn.Linear(input_size, hidden_size)
        self.fc_p = nn.Linear(hidden_size, 6)
        self.fc_v = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        x = x.view(x.size(0), -1).float()
        x = F.relu(self.fc0(x))
        p = F.softmax(self.fc_p(x), dim=-1)
        v = torch.tanh(self.fc_v(x))
        return p, v

In [62]:
model = LinearModel(hidden_size=512)
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [65]:
batch_size = 100
epochs = 20

for epoch in range(epochs):
    batch_idx = np.random.randint(0, len(p_train), size=batch_size)
    x = torch.tensor(x_train[batch_idx])
    p = torch.tensor(p_train[batch_idx])
    v = torch.tensor(v_train[batch_idx])

    p_pred, v_pred = model(x)
    loss_p = F.cross_entropy(p_pred, p)
    loss_v = torch.mean((v - v_pred)**2)
    loss = loss_p + loss_v
    
    print(f"Epoch {epoch:02d} - Loss = {loss.item():-5.3f} - Loss_p = {loss_p.item():05.3f} - Loss_v = {loss_v.item():05.3f}")
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch 00 - Loss = 2.964 - Loss_p = 1.964 - Loss_v = 1.000
Epoch 01 - Loss = 2.554 - Loss_p = 1.874 - Loss_v = 0.680
Epoch 02 - Loss = 2.804 - Loss_p = 1.884 - Loss_v = 0.920
Epoch 03 - Loss = 2.924 - Loss_p = 1.804 - Loss_v = 1.120
Epoch 04 - Loss = 2.974 - Loss_p = 1.894 - Loss_v = 1.080
Epoch 05 - Loss = 2.794 - Loss_p = 1.874 - Loss_v = 0.920
Epoch 06 - Loss = 2.884 - Loss_p = 1.924 - Loss_v = 0.960
Epoch 07 - Loss = 3.124 - Loss_p = 1.924 - Loss_v = 1.200
Epoch 08 - Loss = 2.984 - Loss_p = 1.824 - Loss_v = 1.160
Epoch 09 - Loss = 3.034 - Loss_p = 1.914 - Loss_v = 1.120
Epoch 10 - Loss = 2.594 - Loss_p = 1.914 - Loss_v = 0.680
Epoch 11 - Loss = 3.014 - Loss_p = 1.894 - Loss_v = 1.120
Epoch 12 - Loss = 2.944 - Loss_p = 1.864 - Loss_v = 1.080
Epoch 13 - Loss = 3.094 - Loss_p = 1.934 - Loss_v = 1.160
Epoch 14 - Loss = 2.934 - Loss_p = 1.854 - Loss_v = 1.080
Epoch 15 - Loss = 2.794 - Loss_p = 1.914 - Loss_v = 0.880
Epoch 16 - Loss = 2.774 - Loss_p = 1.934 - Loss_v = 0.840
Epoch 17 - Los

In [64]:
torch.mean((predictions[1].squeeze() - torch.tensor(v_train[batch_idx]))**2)

RuntimeError: The size of tensor a (2) must match the size of tensor b (100) at non-singleton dimension 0

In [132]:
F.mse_loss(predictions[1].squeeze(), torch.tensor(v_train[batch_idx]))

tensor(0.6862, grad_fn=<MseLossBackward>)

In [20]:
torch.tensor(v_train[batch_idx], dtype=torch.float).dtype

torch.float32

In [21]:
predictions[1].squeeze().dtype

torch.float32

In [24]:
loss.dtype

torch.float32