# Load data

In [1]:
import numpy as np
from sklearn.utils import shuffle

In [2]:
base_dir = '/home/vitchyr/git/rllab-rail/railrl/data/replay_buffer'

all_actions = np.loadtxt(base_dir + "/actions.csv", delimiter=',')
all_obs = np.loadtxt(base_dir + "/obs.csv", delimiter=',')
all_rewards = np.loadtxt(base_dir + "/rewards.csv", delimiter=',')
all_terminals = np.loadtxt(base_dir + "/terminals.csv", delimiter=',')

In [3]:
nonzero = [i for i, e in enumerate(all_terminals) if e != 0]
last_full_episode_idx = nonzero[-1] + 1

In [4]:
terminals = all_terminals[:last_full_episode_idx]
obs = all_obs[:last_full_episode_idx]
next_obs = all_obs[1:last_full_episode_idx+1]
actions = all_actions[:last_full_episode_idx]
rewards = all_rewards[:last_full_episode_idx]

In [5]:
positive_idxs = np.array([i for i, reward in enumerate(rewards) if reward == 1.0])
negative_idxs = np.array([i for i, reward in enumerate(rewards) if reward == -1.0])

In [7]:
# -5 to give the first observation in the sequence
Xpos = np.hstack((
    actions[positive_idxs][:, :],
    obs[positive_idxs-5][:, :],
    next_obs[positive_idxs]
))
Xneg = np.hstack((
    actions[negative_idxs][:, :],
    obs[negative_idxs-5][:, :],
    next_obs[negative_idxs]
))
num_pos = Xpos.shape[0]
num_neg = Xneg.shape[0]
num_total = num_pos + num_neg


raw_X_posneg = np.vstack((Xpos, Xneg))
raw_y_posneg = np.hstack((np.ones(num_pos), np.zeros(num_neg)))

In [8]:
X_posneg, y_posneg = shuffle(raw_X_posneg, raw_y_posneg, random_state=0)

# TensorFlow model to train

# Shuffle and build data set

In [9]:
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [10]:
scores = cross_val_score(LogisticRegression(), X_posneg, y_posneg, scoring='accuracy', cv=10)
print(scores)
print(scores.mean())

[ 0.81944444  0.75694444  0.79861111  0.8041958   0.72027972  0.78873239
  0.78169014  0.71830986  0.74647887  0.77464789]
0.770933467941


In [11]:
rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
for train_index, test_index in rs.split(X_posneg):
    X = X_posneg[train_index]
    y = y_posneg[train_index]
    model = LogisticRegression()
    model = model.fit(X, y)
    print(model.score(X, y))

0.771241830065
0.759103641457
0.77497665733


# PyTorch

In [12]:
from itertools import count

import torch
import torch.autograd
import torch.nn.functional as F
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim

In [13]:
def get_torch_iterator(X, y, batch_size=32):
    i = 0
    num_elements = len(X)
    while True:
        yield Variable(X[i:i+batch_size]), Variable(y[i:i+batch_size])
        i = (i + batch_size) % num_elements
def label(y):
    return np.round(y)

In [14]:
X_torch_pn = torch.from_numpy(X_posneg).float()
y_torch_pn = torch.from_numpy(y_posneg).float()

##  Regression

In [13]:
class RegressionNet(nn.Module):
    def __init__(self, feature_dim, hidden_sizes):
        super().__init__()
        # an affine operation: y = Wx + b
        self.fcs = []
        last_size = feature_dim
        for size in hidden_sizes:
            self.fcs.append(nn.Linear(last_size, size))
            last_size = size
        self.last_fc = nn.Linear(last_size, 1)

    def forward(self, x):
        x = x.view(-1, self.num_flat_features(x))
        for fc in self.fcs:
            x = F.relu(fc(x))
        x = self.last_fc(x)
        x = F.sigmoid(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In [14]:
feature_dim = X_posneg.shape[1]
hidden_sizes = [100, 64, 32]
regression_net = RegressionNet(feature_dim, hidden_sizes)
batch_iterator = get_torch_iterator(X_torch_pn, y_torch_pn)
criterion = nn.MSELoss()
optimizer = optim.Adam(regression_net.parameters(), lr=0.001)
for _ in range(10000):
    # Get data
    batch_x, batch_y = next(batch_iterator)

    # Reset gradients
    optimizer.zero_grad()

    # Forward pass
    output = regression_net(batch_x)
    loss = criterion(output, batch_y)
    
    # Backward pass
    loss.backward()

    # Apply gradients
    optimizer.step()

In [15]:
yhats_var_pn = regression_net(Variable(X_torch_pn))
loss = criterion(yhats_var_pn, Variable(y_torch_pn)).data.numpy()[0]

yhats_numpy_pn = yhats_var_pn.data.numpy().flatten()

print("MSE Loss", loss)
print("Accuracy", np.mean(label(y_posneg) == label(yhats_numpy_pn)))

MSE Loss 0.238116
Accuracy 0.579831932773


## One-hot

In [11]:
class OneHotNet(nn.Module):

    def __init__(self, feature_dim, hidden_sizes, num_classes):
        super().__init__()
        # an affine operation: y = Wx + b
        self.fcs = []
        last_size = feature_dim
        for size in hidden_sizes:
            self.fcs.append(nn.Linear(last_size, size))
            last_size = size
        self.last_fc = nn.Linear(last_size, num_classes)
        self.softmax = nn.Softmax()

    def forward(self, x):
        x = x.view(-1, self.num_flat_features(x))
        for fc in self.fcs:
            x = F.relu(fc(x))
        x = self.last_fc(x)
        x = self.softmax(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In [12]:
def to_onehot_n(inds, dim):
    ret = np.zeros((len(inds), dim))
    ret[np.arange(len(inds)), inds] = 1
    return ret
all_y_onehot_3 = to_onehot_n((rewards+1).astype(int), 3)

## two-way one-hot vector for rward of +/- 1

In [13]:
feature_dim = X_posneg.shape[1]
num_classes = 2
hidden_sizes = [100, 64, 64]
net2 = OneHotNet(feature_dim, hidden_sizes, num_classes)
batch_iterator2 = get_torch_iterator(X_torch_pn, y_torch_pn.long())
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net2.parameters(), lr=0.001)
for _ in range(10000):
    # Get data
    batch_x, batch_y = next(batch_iterator2)

    # Reset gradients
    optimizer.zero_grad()

    # Forward pass
    output = net2(batch_x)
    loss = criterion(output, batch_y)
    
    # Backward pass
    loss.backward()

    # Apply gradients
    optimizer.step()

In [14]:
yhat2_torch = net2(Variable(X_torch_pn))
yhat2_numpy = yhat2_torch.data.numpy()
yhat2_label = np.argmax(yhat2_numpy, axis=1)
y_onehot_pn = to_onehot_n(y_posneg.astype(int), 2)


loss = criterion(yhat2_torch, Variable(y_torch_pn.long()))
print("Accuracy", np.mean(y_posneg == yhat2_label))
print("Cross Entropy", loss.data.numpy()[0])
print("MSE Onehots", np.mean((yhat2_numpy-y_onehot_pn)**2))

Accuracy 0.59243697479
Cross Entropy 0.665516
MSE Onehots 0.237172399611


In [20]:
from sklearn.metrics import confusion_matrix
cnf_matrix = confusion_matrix(y_posneg, yhat2_label)
print(cnf_matrix)

[[723   0]
 [705   0]]


## Three-way one-hot vector for reward of +1, 0, or -1

In [15]:
def to_onehot_n(inds, dim):
    ret = np.zeros((len(inds), dim))
    ret[np.arange(len(inds)), inds] = 1
    return ret
all_y_onehot = to_onehot_n((rewards+1).astype(int), 3)

In [16]:
all_X = np.hstack((
    actions,
    obs,
    next_obs
))
all_X_torch = torch.from_numpy(all_X).float()
all_y_torch = torch.from_numpy(rewards + 1).long()
all_Xv = Variable(all_X_torch)
all_Yv = Variable(all_y_torch)

In [20]:
feature_dim = all_X_torch.size()[1]
num_classes = 3
hidden_sizes = [100, 3]
net = OneHotNet(feature_dim, hidden_sizes, num_classes)
batch_iterator = get_torch_iterator(all_X_torch, all_y_torch)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)
for _ in range(1000):
    # Get data
    batch_x, batch_y = next(batch_iterator)
    
    # Reset gradients
    optimizer.zero_grad()

    # Forward pass
    output = net(batch_x)
    loss = criterion(output, batch_y)
    
    # Backward pass
    loss.backward()

    # Apply gradients
    optimizer.step()

# Analyze model

In [21]:
loss = criterion(net(all_Xv), all_Yv).data.numpy()
yhat_soft = net(all_Xv).data.numpy()
y = all_Yv.data.numpy().astype(int)
yhat = np.argmax(yhat_soft, axis=1)
print("Accuracy", np.mean(y == yhat))
print("Cross Entropy", np.mean(loss))
print("MSE Onehots", np.mean((yhat_soft-all_y_onehot)**2))

Accuracy 0.857128564282
Cross Entropy 0.772818
MSE Onehots 0.0924693607724


In [22]:
from sklearn.metrics import confusion_matrix
cnf_matrix = confusion_matrix(y, yhat)
print(cnf_matrix)

[[   0  723    0]
 [   0 8567    0]
 [   0  705    0]]
