In [None]:
import os
# The notebook is run from the project's `code` directory; the relative
# `data/...` and `./logs` paths used further down (and presumably the `src.*`
# imports) are resolved against it. Set ENSEMBLE_UNCERTAINTY_ROOT to point at
# your own checkout; the original author's location is kept as the fallback so
# existing setups keep working unchanged.
os.chdir(os.environ.get("ENSEMBLE_UNCERTAINTY_ROOT",
                        "/home/jakob/doktor/projects/EnsembleUncertainty/code"))
"""Learning "logit" distribution in regression example"""
import logging
import zipfile
from copy import copy, deepcopy
import urllib.request
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim import Optimizer
from torch.optim.sgd import SGD
from torchvision import datasets, transforms
from torchvision.utils import make_grid
from sklearn.model_selection import KFold
from src.dataloaders.uci import uci_base, wine, bost
from src import metrics
from src import utils
from src.ensemble import simple_regressor, ensemble
from src import loss as custom_loss



LOGGER = logging.getLogger(__name__)
EXPERIMENT_NAME = "red_regression_logits"


# Settings
class Args():
    """Plain attribute container for the experiment settings of this notebook."""


args = Args()
args.seed = 1
args.gpu = False
args.log_dir = Path("./logs")
args.log_level = logging.INFO
args.retrain = True

args.num_ensemble_members = 1
args.num_epochs = 1
args.lr = 0.01

# Apply the configured seed. Without this `args.seed` is never used and the
# random draws made through NumPy and torch differ on every kernel restart.
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# General constructs
mse = metrics.Metric(name="MSE", function=metrics.mean_squared_error)
rmse = metrics.Metric(name="RMSE", function=metrics.root_mean_squared_error)
# Metrics handed to the training/evaluation code. Only RMSE is tracked; `mse`
# is constructed but not registered (add it to this list to track it as well).
test_metrics = [rmse]


BATCH_SIZE = 32
# Use the GPU when one is present and fall back to the CPU otherwise, so this
# cell no longer raises on machines without CUDA. The previous
# `torch.cuda.device(0)` only built a context manager that was never entered,
# and the device-name lookup result was discarded, so dropping both does not
# change which device is selected. Note that `args.gpu` is not consulted here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# IPython magics (not plain Python): with autoreload mode 2, imported modules
# are reloaded before each cell executes, so edits to project code are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Ask the inline matplotlib backend to render figures as SVG.
%config InlineBackend.figure_format = 'svg'

In [None]:
def to_variable(var=(), cuda=True, volatile=False):
    """Convert a sequence of arrays/tensors into a list of torch tensors.

    Args:
        var: Iterable of ``numpy.ndarray`` or ``torch.Tensor`` items. NumPy
            arrays are converted to ``torch.FloatTensor``; tensors are passed
            through with their dtype unchanged.
        cuda: If True, items that are not already on a CUDA device are moved
            there, provided CUDA is available. On a machine without CUDA the
            items stay on the CPU instead of raising.
        volatile: Ignored; accepted only so existing calls keep working. The
            ``Variable(..., volatile=...)`` wrapper is deprecated since
            PyTorch 0.4, where tensors and Variables were merged (this file
            already needs that version for ``torch.tensor``/``torch.device``).

    Returns:
        list: The converted items, in the order they were given.
    """
    use_cuda = cuda and torch.cuda.is_available()
    out = []
    for v in var:
        if isinstance(v, np.ndarray):
            v = torch.from_numpy(v).type(torch.FloatTensor)

        if not v.is_cuda and use_cuda:
            v = v.cuda()

        out.append(v)
    return out

def get_loss_and_rmse(network,
                      loss_function,
                      x, y,
                      mean_shift=None, std_scale=None):
    """Evaluate ``network`` on ``(x, y)`` and return its loss and RMSE.

    Args:
        network: Model exposing ``forward(x)`` and ``transform_logits(logits)``;
            the latter must return a ``(mean, std)`` pair.
        loss_function: Callable invoked as ``loss_function((mean, std), y)``.
        x: Inputs (NumPy array or tensor), converted with ``to_variable``.
        y: Targets (NumPy array or tensor), converted with ``to_variable``.
        mean_shift: Optional offset. Only used when ``std_scale`` is also
            given; predictions and targets are then mapped through
            ``value * std_scale + mean_shift`` before the loss is computed.
        std_scale: Optional scale, see ``mean_shift``. Also multiplies ``std``.

    Returns:
        tuple: ``(loss, rmse)`` as detached CPU tensors.
    """
    x, y = to_variable(var=(x, y), cuda=True)

    # Pure evaluation: both results are detached before returning, so there is
    # no reason to record an autograd graph for the whole data set.
    with torch.no_grad():
        logits = network.forward(x)
        mean, std = network.transform_logits(logits)

        if mean_shift is not None and std_scale is not None:
            # Put the rescaling constants on whatever device the predictions
            # live on rather than forcing CUDA.
            mean_shift = torch.as_tensor(mean_shift, dtype=torch.float32,
                                         device=mean.device)
            std_scale = torch.as_tensor(std_scale, dtype=torch.float32,
                                        device=mean.device)
            mean = mean * std_scale + mean_shift
            y = y * std_scale + mean_shift
            # Out-of-place on purpose: do not modify the tensor returned by
            # the network.
            std = std * std_scale

        loss = loss_function((mean, std), y)

        rmse = ((mean - y)**2).mean()**0.5

    return loss.detach().cpu(), rmse.detach().cpu()

def create_ensemble(num_ensemble_members,
                    model,
                    ensemble_output_size,
                    deep_copy=False):
    """Build an ``ensemble.Ensemble`` populated with copies of ``model``.

    Args:
        num_ensemble_members: Number of copies of ``model`` to add.
        model: Prototype member.
        ensemble_output_size: Passed to the ``ensemble.Ensemble`` constructor.
        deep_copy: If False (default, the historical behaviour) members are
            shallow copies made with ``copy.copy``: every object referenced by
            the model's attributes is shared between all members and the
            prototype, so the members are not independent. Pass True to give
            each member its own ``copy.deepcopy`` of the model.
            NOTE(review): ``train_mc_dropout`` evaluates the prototype after
            training the ensemble, which presumably relies on the shallow
            sharing; hence the default is left unchanged.

    Returns:
        The populated ensemble.
    """
    clone = deepcopy if deep_copy else copy
    prob_ensemble = ensemble.Ensemble(ensemble_output_size)
    for _ in range(num_ensemble_members):
        prob_ensemble.add_member(clone(model))
    return prob_ensemble

# UCI datasets

In [None]:
def train_mc_dropout(data,
                     drop_prob,
                     n_splits,
                     num_epochs,
                     num_units,
                     learn_rate,
                     weight_decay,
                     metrics,
                     batch_size):
    """Cross-validate a Gaussian-output regressor with K folds and print NLL/RMSE.

    For each fold the features and targets are standardised with the training
    split's mean and standard deviation, a newly constructed model is trained
    through an ensemble built by ``create_ensemble``, and train/test loss and
    RMSE are collected. Per-fold and summary figures are printed; nothing is
    returned.

    Reads the notebook globals ``device`` and ``args.num_ensemble_members``.

    Args:
        data: 2-D NumPy array whose last column is the regression target and
            whose remaining columns are the features.
        drop_prob: Unused; kept so existing calls keep working.
        n_splits: Number of folds given to ``KFold``.
        num_epochs: Epochs passed to the ensemble's ``train`` for every fold.
        num_units: Middle entry of the model's ``layer_sizes``
            (``[in_dim, num_units, 2]``).
        learn_rate: Learning rate of the SGD optimizer.
        weight_decay: Weight decay of the SGD optimizer.
        metrics: Indexable collection of metric objects. They are reset up
            front, registered on the ensemble and handed to ``network.test``;
            the first one is reported as "RMSE" per fold and as "My RMSE".
        batch_size: Batch size of the training data loader.
    """
    kf = KFold(n_splits=n_splits)
    in_dim = data.shape[1] - 1
    train_nll, test_nll = [], []
    train_rmses, test_rmses = [], []
    for metric in metrics:
        metric.reset()

    for fold_count, (train_index, test_index) in enumerate(kf.split(data)):
        print("Fold: {}".format(fold_count))

        # Build the model, optimizer and ensemble anew for every fold. Reusing
        # one network across folds would let it keep weights fitted on
        # samples that are part of a later fold's test split, which makes the
        # reported test numbers optimistic.
        network = simple_regressor.Model(layer_sizes=[in_dim, num_units, 2],
                                         device=device,
                                         variance_transform=utils.variance_linear_asymptote(),
                                         loss_function=custom_loss.gaussian_neg_log_likelihood_1d)

        network.optimizer = torch.optim.SGD(network.parameters(),
                                            lr=learn_rate,
                                            weight_decay=weight_decay)
        prob_ensemble = create_ensemble(num_ensemble_members=args.num_ensemble_members,
                                        model=network,
                                        ensemble_output_size=1)

        # Register the metrics that were passed in, not a notebook global.
        prob_ensemble.add_metrics(metrics)

        x_train, y_train = data[train_index, :in_dim], data[train_index, in_dim:]
        x_test, y_test = data[test_index, :in_dim], data[test_index, in_dim:]

        # Standardise with training-split statistics only.
        x_means, x_stds = x_train.mean(axis=0), x_train.var(axis=0)**0.5
        y_means, y_stds = y_train.mean(axis=0), y_train.var(axis=0)**0.5

        x_train = (x_train - x_means) / x_stds
        y_train = (y_train - y_means) / y_stds

        x_test = (x_test - x_means) / x_stds
        y_test = (y_test - y_means) / y_stds

        trainloader = uci_base.uci_dataloader(x_train, y_train, batch_size)

        prob_ensemble.train(train_loader=trainloader,
                            num_epochs=num_epochs)

        # Evaluate in standardised space; the results are mapped back to the
        # original target scale below via `y_stds`.
        mean_shift = None
        std_scale = None
        #mean_shift = y_means
        #std_scale = y_stds

        # NOTE(review): `network` itself is evaluated although training went
        # through `prob_ensemble`; this presumably works because the ensemble
        # members are shallow copies that share state with `network`.
        train_loss, train_rmse = get_loss_and_rmse(network,
                                                   network.loss,
                                                   x_train,
                                                   y_train,
                                                   mean_shift=mean_shift,
                                                   std_scale=std_scale)

        # Only the RMSE is kept from this call; the test loss that is
        # reported comes from `network.test` below.
        _, test_rmse = get_loss_and_rmse(network,
                                         network.loss,
                                         x_test,
                                         y_test,
                                         mean_shift=mean_shift,
                                         std_scale=std_scale)

        testloader = uci_base.uci_dataloader(x_test, y_test, len(y_test))

        metrics, test_loss = network.test(testloader, metrics, network.loss)

        # Per-sample loss, shifted by log(std) to undo the target scaling.
        train_nll.append((train_loss.cpu().data.numpy()/len(x_train) + np.log(y_stds)[0]))
        test_nll.append((test_loss.cpu().data.numpy()/len(x_test) + np.log(y_stds)[0]))

        train_rmses.append(y_stds[0]*train_rmse.cpu().data.numpy())
        test_rmses.append(y_stds[0]*test_rmse.cpu().data.numpy())

        fold_metric = metrics[0].memory[-1]
        print("Train loss: {:.3f} Test loss: {:.3f} RMSE: {:.3f}".format(
            train_loss/len(x_train),
            test_loss/len(x_test),
            y_stds[0]* fold_metric))

    # NOTE(review): the metric summary is rescaled with the last fold's target
    # standard deviation only.
    my_rmse_mean, my_rmse_std = y_stds[0] * metrics[0].mean(), y_stds[0] * metrics[0].std()
    summaries = (("Train NLL\t", train_nll),
                 ("Test  NLL\t", test_nll),
                 ("Train RMSE\t", train_rmses),
                 ("Test RMSE\t", test_rmses))
    for label, values in summaries:
        values = np.array(values)
        print("{} = {:.3f} +/- {:.3f}".format(label, values.mean(), values.var()**0.5))
    print("My RMSE\t = {:.3f} +/- {:.3f}".format(my_rmse_mean, my_rmse_std))

# Red wine dataset

In [None]:
# Load the red-wine table (path is relative to the working directory) and run
# the 10-fold evaluation on it.
wine_data = wine.WineData("data/uci/wine/winequality-red.csv").data
train_mc_dropout(
    data=wine_data,
    drop_prob=0.0,
    n_splits=10,
    num_epochs=40,
    num_units=50,
    learn_rate=1e-4,
    weight_decay=0.0,  # alternative left by the author: 1e-1/len(data)**0.5
    metrics=test_metrics,
    batch_size=1000,
)

# Housing dataset

In [None]:
# Load the housing table; the path is relative to the working directory.
bost_data = bost.BostonData("data/uci/bost/housing.data").data

In [None]:
# Same evaluation as for the wine data. `train_mc_dropout` takes no
# `num_samples`/`log_every` arguments and requires `metrics`, so the call is
# aligned with its signature. The function prints its results and returns
# nothing, hence no value is bound here.
train_mc_dropout(
    data=bost_data,
    drop_prob=0.0,
    n_splits=10,
    num_epochs=40,
    num_units=50,
    learn_rate=1e-4,
    weight_decay=0.0,  # alternative left by the author: 1e-1/len(data)**0.5
    metrics=test_metrics,
    batch_size=1411,
)