In [21]:
# import
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from itertools import repeat

import numpy as np
import pandas as pd
from Utils import custom_data_loader, preprocess_data, preprocess_activeL_data
from Utils.SummaryWriter import LogSummary
from Models.simpleFFBNN import SimpleFFBNN
from Models.denseRegression import DenseRegressor
from Models.paperModel import SimpleFFBNNPaper

from torch.utils.data import DataLoader
from torch.utils.data import SubsetRandomSampler

from sklearn.metrics import r2_score, mean_squared_error
import os
from scipy.stats import entropy

In [2]:
def get_device():
    """Select the torch device used for training and inference.

    Prints whether CUDA is available and which device was chosen.

    Returns:
        torch.device: ``cuda`` when ``torch.cuda.is_available()`` is true,
        otherwise ``cpu``.
    """
    cuda = torch.cuda.is_available()
    print("CUDA Available: ", cuda)

    if cuda:
        # The previous implementation called GPUtil.getFirstAvailable(), but
        # GPUtil is never imported in this notebook, so this branch raised
        # NameError on every CUDA machine. torch already tracks the current
        # device index, so use that instead of an extra dependency.
        gpu = torch.cuda.current_device()
        print("GPU Available: ", gpu)
        torch.cuda.set_device(gpu)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print("Device: ", device)
    return device


# Module-level device handle read by the training / evaluation code below.
device = get_device()


CUDA Available:  False
Device:  cpu


In [3]:
# load data
#dataloader_train, dataloader_test, dataloader_val = preprocess_data(pd.read_csv('/Users/kristian/Documents/Skole/9. Semester/Thesis Preparation/Code/BNNs/Data/quality_of_food.csv'), batch_size = 128)


In [4]:
# Instantiate the network used throughout the notebook.
# NOTE(review): the recorded output further down shows inputs with 4 columns and a
# single target column, so (4, 1) is presumably (input_dim, output_dim) — confirm
# against Models/paperModel.py.
model = SimpleFFBNNPaper(4, 1)

In [5]:
#checkpoint = torch.load('/Users/kristian/Documents/Skole/9. Semester/Thesis Preparation/Code/BNNs/trainedModels/simple_model.pth', map_location=torch.device('cpu'))
#print(checkpoint)
#model.load_state_dict(checkpoint['model'])
#model.load_state_dict(checkpoint)
    

In [6]:
class SaveOutput():
    """Forward hook that logs a layer's output for the first element of a batch.

    The hooked output is viewed as ``(instances, batch_size, -1)`` and the slice
    at batch position 0 is handed to ``per_round_layer_output`` of a summary
    writer. Logging stops after ``max_calls`` invocations until ``counter`` is
    reset by the caller.

    Args:
        instances: size of the leading axis used when reshaping the layer output.
        batch_size: size of the second axis; callers may overwrite
            ``self.batch_size`` before each forward pass (e.g. for a short last
            batch).
        rounds: value forwarded unchanged to the writer.
        writer: object exposing ``per_round_layer_output(layer_size, outs, rounds)``.
            When ``None`` the module-level ``write_summary`` is looked up at call
            time, which is the previous behaviour.
        max_calls: number of hook invocations that are logged (previously the
            hard-coded constant 3).
    """

    def __init__(self, instances, batch_size, rounds, writer=None, max_calls=3):
        self.T = instances
        self.batch_size = batch_size
        self.outputs = []
        self.rounds = rounds
        self.counter = 0
        self.writer = writer
        self.max_calls = max_calls

    def __call__(self, module, module_in, module_out):
        """Hook entry point with the signature expected by ``register_forward_hook``."""
        if self.counter >= self.max_calls:
            return

        # The old code drew `np.random.randint(self.batch_size)` here but never
        # used the result; it only advanced the global NumPy RNG, so it is gone.
        outs = module_out.view(self.T, self.batch_size, -1)[:, 0, :]
        layer_size = outs.shape[1]

        # Fall back to the notebook-level writer only when none was injected.
        writer = self.writer if self.writer is not None else write_summary
        writer.per_round_layer_output(layer_size, outs, self.rounds)

        self.counter += 1

    def clear(self):
        """Drop any stored outputs (the call counter is left untouched)."""
        self.outputs = []
        


        

In [7]:
# Location of the raw CSV, kept in one place so it is easy to repoint on another machine.
DATA_CSV_PATH = '/Users/kristian/Documents/Skole/9. Semester/Thesis Preparation/Code/BNNs/Data/quality_of_food.csv'

# Read the raw table, then let the project helper split it into the three datasets
# used below (train, test, and the pool scored during active learning).
food_quality_raw = pd.read_csv(DATA_CSV_PATH)
dataset_train, dataset_test, dataset_activeL = preprocess_activeL_data(food_quality_raw)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.X['savings'] = np.where(self.X['savings'] == 'low', 0, np.where(self.X['savings'] == 'medium', 1, 2))


In [8]:
# load data using the old function
#dataloader_train, dataloader_test, dataloader_val, dataset_train, dataset_test, dataset_val = preprocess_data(pd.read_csv('/Users/kristian/Documents/Skole/9. Semester/Thesis Preparation/Code/BNNs/Data/quality_of_food.csv'), batch_size = 64)

In [42]:
class runActiveLearning():
    def __init__(self, model_name, model, top_unc, dataloader_train, dataloader_test, dataset_active_l, epochs, rounds, learning_rate, 
    batch_size, instances, seed_sample, retrain, resume_round, optimizer):
        self.model_name = model_name
        self.model = model
        self.top_unc = top_unc
        self.dataloader_train = dataloader_train
        self.dataloader_test = dataloader_test
        self.dataset_active_l = dataset_active_l
        self.epochs = epochs
        self.rounds = rounds
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.instances = instances
        self.seed_sample = seed_sample
        self.optimizer = optimizer

        # a set of lists to store the selected indices with highest uncertainty
        self.selected_data = set([])
        # unexplored data
        self.unexplored_data = set(range(len(dataloader_train)))

        # make sure sklearn.metrics.r2_score is imported
        #self.r2_score = r2_score


    
    def objective(self, output, target, kl, beta):
        '''Objective function to calculate the loss function / KL divergence'''
        loss_fun = nn.MSELoss()
        discrimination_error = loss_fun(output.view(-1), target)
        variational_bound = discrimination_error + beta * kl
        return variational_bound, discrimination_error, kl

    def get_entropy(self, y):
        '''Function to calculate the entropy of the ensemble outputs'''
        
        # get the distribution of the ensemble outputs in the first dimension (shape: 30, 64, 1)
        y = y.squeeze()

        y = y.view(self.instances, -1)

        y = y.permute(1, 0)

        y = F.softmax(y, dim=1)
    
        _entropy = -torch.sum(y * torch.log(y + 1e-8), dim=1)
        # H = entropy(y, axis=1) # calculate the entropy of the ensemble outputs using scipy.stats.entropy
        ''' H and _entropy are very similar, but the _entropy is calculated using pytorch 
        and the H is calculated using scipy.stats.entropy. The _entropy is used in the code,
        but the H is kept in the comments for reference.'''
        return _entropy

        




        
        



    def get_validation_data(self, is_validation):
        if not is_validation:
            # train sampler randomly samples data from the selected data set
            train_sampler = SubsetRandomSampler(list(self.selected_data))
            # train loader will load the data from the train sampler
            self.train_loader = DataLoader(self.dataloader_train, batch_size=self.batch_size, sampler=train_sampler, num_workers=1)

        indices = list(self.unexplored_data)
        np.random.shuffle(indices)
        split = int(np.floor(0.1 * len(indices)))  # this line is to split the training_data into 90% training and 10% validation
        validation_idx = np.random.choice(indices, size = split) # this line is to randomly select 10% of the data for validation
        train_sampler = SubsetRandomSampler(list(self.selected_data))
        validation_sampler = SubsetRandomSampler(validation_idx)
        self.train_loader = DataLoader(self.dataloader_train, batch_size=self.batch_size, sampler=train_sampler, num_workers=1)
        self.validation_loader = DataLoader(self.dataloader_train, batch_size=self.batch_size, sampler=validation_sampler, num_workers=1)

    def random_data(self, rounds):
        if rounds == 0:    
            # randomly select data
            self.selected_data = set(range(self.dataloader_train))  # seed sample in Rakeesh & Jain paper
            print(f'selected data points first round: {self.selected_data}')
            #self.unexplored_data = self.unexplored_data.difference(self.selected_data) # all 

        else:
            minimum_index = np.random.choice(list(self.unexplored_data), self.top_unc)
            print(f'minimum index: {minimum_index}')
            self.selected_data = self.selected_data.union(minimum_index)
            print(f'selected data points second round and onwards: {self.selected_data}')
            self.unexplored_data = self.unexplored_data.difference(self.selected_data)
            print(f'unexplored data points second round and onwards: {self.unexplored_data}')


    def activeDataSelection(self, rounds):
        '''Score the active-learning data and select the most uncertain samples.

        Only ``rounds == 1`` does any work: every batch of
        ``self.dataset_active_l`` is repeated ``self.instances`` times, pushed
        through the model, and scored with ``get_entropy``. The ``self.top_unc``
        highest-entropy positions become ``self.selected_data`` and are removed
        from ``self.unexplored_data``. Other round numbers are a no-op.

        While scoring, a ``SaveOutput`` hook is attached to every layer in
        ``self.model.kl_layers``; the hooks are always removed again, even when
        the forward pass raises.

        NOTE(review): the selected positions index ``dataset_active_l`` whereas
        ``unexplored_data`` was built from the length of the training data —
        confirm both refer to the same index space.

        Args:
            rounds: current round number.
        '''
        if rounds != 1:
            return

        # Use the dataset handed to the constructor. This used to read the
        # notebook-level global `dataset_activeL`, which silently ignored the
        # `dataset_active_l` argument.
        self.active_data = self.dataset_active_l
        all_data = DataLoader(self.dataset_active_l, batch_size=self.batch_size, num_workers=1)
        metrics = []
        hook_handles = []
        save_output = SaveOutput(self.instances, self.batch_size, self.rounds)
        self.model.eval()
        for layer in self.model.kl_layers:
            hook_handles.append(layer.register_forward_hook(save_output))

        try:
            with torch.no_grad():
                for batch_index, (X, y) in enumerate(all_data):
                    # The last batch may be short, so tell the hook its real size.
                    batch_size = X.shape[0]
                    save_output.batch_size = batch_size
                    print(f'x: {X.shape}, y: {y.shape}')
                    print(f'x: {X[0:5]}, y: {y[0:5]}')

                    # Stack `instances` copies of the batch so a single forward
                    # pass yields `instances` predictions per sample.
                    X = X.repeat(self.instances, 1).to(device)
                    y_pred = self.model(X)

                    ensemble_outputs = y_pred.reshape(self.instances, batch_size, 1)
                    metrics.append(self.get_entropy(ensemble_outputs))
        finally:
            # Detach the hooks no matter what, so they do not leak into later
            # forward passes.
            save_output.clear()
            save_output.counter = 0
            for handle in hook_handles:
                handle.remove()

        metrics = torch.cat(metrics)

        # Highest entropy first.
        new_indices = torch.argsort(metrics, descending=True).tolist()
        print(f'new indices: {new_indices}')

        self.selected_data = set(new_indices[:self.top_unc])
        print(f'selected data points: {self.selected_data}')
        self.unexplored_data = self.unexplored_data.difference(self.selected_data)
        print(f'unexplored data points: {self.unexplored_data}')
        print(f'length of unexplored data: {len(self.unexplored_data)}')

    def annotateSelectedData(self, rounds):
        if rounds == 1:
            # do nothing
            pass
        #else:
            # get the indices of the selected data
            #indices = list(self.selected_data)
            # get the data from the dataloader
            
            

    def TrainModel(self, rounds, epochs, is_validation):
        '''Run one pass over the training loader and optionally a validation pass.

        Args:
            rounds: accepted for interface compatibility; not used here.
            epochs: epoch counter. When it equals 1 the data loaders are rebuilt
                through ``get_validation_data`` before training.
            is_validation: when true, also evaluate on ``self.validation_loader``
                and print train / validation summaries.

        Returns:
            ``avg_v_loss`` when ``is_validation`` is true, otherwise the tuple
            ``(avg_t_loss, avg_t_r2)``. Averages over an empty loader are ``nan``
            (e.g. when ``self.selected_data`` is empty).
        '''
        if epochs == 1:
            self.get_validation_data(is_validation)

        optimizer = self.optimizer
        t_loss, t_likelihood, t_kl, t_r2_scores = [], [], [], []

        self.model.train()
        m = len(self.train_loader)
        for inputs, targets in self.train_loader:
            X = inputs.to(device)
            # Flatten targets to match objective(), which compares 1-D tensors.
            # The earlier `targets.repeat(1)` raised for 2-D targets.
            Y = targets.reshape(-1).to(device)

            # Clear gradients from the previous batch. Without this they
            # accumulated across batches and every step used a stale sum.
            optimizer.zero_grad()
            outputs = self.model(X)
            loss, log_likelihood, kl = self.objective(outputs, Y, self.model.kl_divergence(), 1 / m)
            loss.backward()
            optimizer.step()
            for layer in self.model.kl_layers:
                layer.clip_variances()

            t_loss.append(loss.item())
            t_likelihood.append(log_likelihood.item())
            t_kl.append(kl.item())

            # Manual R^2 on flattened arrays, so (B, 1) predictions do not
            # broadcast against (B,) targets.
            y_hat = outputs.detach().cpu().numpy().reshape(-1)
            y_true = Y.detach().cpu().numpy().reshape(-1)
            ss_res = np.sum((y_hat - y_true) ** 2)
            ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
            # R^2 is undefined for a constant-target batch; record nan rather
            # than dividing by zero.
            t_r2_scores.append(1 - ss_res / ss_tot if ss_tot != 0 else float('nan'))

        if is_validation:
            v_loss, v_likelihood, v_kl = [], [], []
            m_val = len(self.validation_loader)
            self.model.eval()
            # No gradients are needed for evaluation.
            with torch.no_grad():
                for inputs, targets in self.validation_loader:
                    inputs = inputs.to(device)
                    targets = targets.reshape(-1).to(device)
                    outputs = self.model(inputs)
                    loss_val, log_likelihood_val, kl_val = self.objective(
                        outputs, targets, self.model.kl_divergence(), 1 / m_val)
                    v_loss.append(loss_val.item())
                    v_likelihood.append(log_likelihood_val.item())
                    v_kl.append(kl_val.item())

            avg_v_loss = np.average(v_loss)
            avg_t_loss = np.average(t_loss)
            avg_v_likelihood = np.average(v_likelihood)
            avg_t_likelihood = np.average(t_likelihood)
            avg_v_kl = np.average(v_kl)
            avg_t_kl = np.average(t_kl)

            print(
                'epochs: {}, train loss: {}, train likelihood: {}, train kl: {}'.format(
                    epochs, avg_t_loss, avg_t_likelihood, avg_t_kl))

            print(
                'epochs: {}, validation loss: {}, validation likelihood: {}, validation kl: {}'.format(
                    epochs, avg_v_loss, avg_v_likelihood, avg_v_kl))

            return avg_v_loss

        avg_t_loss = np.average(t_loss)
        avg_t_r2 = np.average(t_r2_scores)
        return avg_t_loss, avg_t_r2

    
    def TestModel(self, rounds):
        '''Load the saved weights and evaluate the model on the test data.

        Reads the checkpoint at ``self.train_weight_path`` (set by
        ``getTrainedModel``), restores ``state['weights']`` into the model and
        runs it over ``self.dataloader_test`` without gradients.

        Args:
            rounds: accepted for interface compatibility; not used here.

        Returns:
            pandas.DataFrame with one row per test sample (columns
            ``Predictions`` and ``Actual``) followed by two summary rows labelled
            ``'R2'`` and ``'MSE'``. Earlier versions returned ``None``.
        '''
        # map_location keeps a GPU-saved checkpoint loadable on a CPU-only host.
        state = torch.load(self.train_weight_path, map_location=device)

        self.model.load_state_dict(state['weights'])
        print(f'Model loaded: {self.model}')

        self.model.eval()
        predictions = []
        actual = []
        with torch.no_grad():
            for inputs, targets in self.dataloader_test:
                X = inputs.to(device)
                Y = targets.reshape(-1).to(device)
                # Feed the device-moved tensor; the old code passed `inputs`,
                # which stays on the CPU and fails on a CUDA model.
                outputs = self.model(X).reshape(-1)

                # Flattened 1-D arrays are required by the DataFrame below.
                predictions.append(outputs.cpu().numpy())
                actual.append(Y.cpu().numpy())

        predictions = np.concatenate(predictions)
        actual = np.concatenate(actual)

        # Compute both metrics from the raw arrays BEFORE appending summary
        # rows. Previously MSE was computed after the 'R2' row had been added,
        # so that row was included in the error.
        r2 = 1 - np.sum((actual - predictions) ** 2) / np.sum((actual - np.mean(actual)) ** 2)
        mse = mean_squared_error(actual, predictions)

        df = pd.DataFrame(data={'Predictions': predictions, 'Actual': actual})
        df.loc['R2'] = r2
        df.loc['MSE'] = mse
        return df
        
        #print('Non-Ensemble Test MSE:{:.3f}, TestR2:{:.3f}'.format(df.loc["MSE"][0], df.loc["R2"][0]))
                


    def getTrainedModel(self, rounds):
        # path to save the trained model
        self.train_weight_path = 'trainedModels/trained_weights/' + self.model_name + '_' + 'e' + str(self.epochs) + '_' + '-r' + str(rounds) + '-b' + str(self.batch_size) + '.pkl'
        return (self.model, self.train_weight_path)


    def saveModel(self, model, optimizer, path_to_save):
        state = {
            'rounds': self.rounds,
            'weights': model.state_dict(),
            'selected_data': self.selected_data,
            'optimizer': self.optimizer.state_dict()
            }

        path_to_save = 'trainedModels/trained_weights/' + self.model_name + '_' + 'e' + str(self.epochs) + '_' + '-r' + str(self.rounds) + '-b' + str(self.batch_size) + '.pkl'

        torch.save(state, path_to_save)
        
        

In [43]:
# Make sure the checkpoint directory exists before anything tries to write to it.
os.makedirs('trainedModels/trained_weights', exist_ok=True)


# Configure the active-learning run.
active_learning = runActiveLearning(
    model_name='simple',
    model=model,
    dataloader_train=dataset_train,
    top_unc=5,
    dataloader_test=dataset_test,
    dataset_active_l=dataset_activeL,
    epochs=10,
    rounds=2,
    learning_rate=0.001,
    batch_size=64,
    instances=30,
    seed_sample=4,
    retrain=False,
    resume_round=False,
    optimizer=torch.optim.Adam(model.parameters(), lr=0.001),
)

# Summary writer looked up by the SaveOutput hook at call time.
write_summary = LogSummary('active_learning')

# Build the train / validation loaders.
# NOTE(review): no samples have been selected at this point, so the training
# loader is empty — this matches the `nan` training metrics in the recorded output.
active_learning.get_validation_data(is_validation=True)

# Train just the seed model.
active_learning.TrainModel(1, 5, True)

# Run the selection step for every round from 1 up to (but excluding) `rounds`.
for r in range(1, active_learning.rounds):
    print(f'Round: {r}')
    active = active_learning.activeDataSelection(r)
    


    


epochs: 5, train loss: nan, train likelihood: nan, train kl: nan
epochs: 5, validation loss: 136.4751449584961, validation likelihood: 1.1777017891407013, validation kl: 1352.974365234375
Round: 1
x: torch.Size([64, 4]), y: torch.Size([64, 1])
x: tensor([[ 0.8127, -0.8946, -1.2525, -0.3387],
        [ 1.6392, -0.5597, -1.2525, -0.3387],
        [-1.5953, -1.4526, -0.0195, -0.3387],
        [-0.5488, -1.0062, -0.0195, -0.3387],
        [-0.7723,  0.2216, -1.2525, -0.3387]]), y: tensor([[-0.8543],
        [-0.6537],
        [ 0.2214],
        [ 1.5523],
        [-1.8935]])
H: (64,), [3.4006028 3.4003687 3.400602  3.4008713 3.4009676]
_entropy: torch.Size([64]), tensor([3.4006, 3.4004, 3.4006, 3.4009, 3.4010])
x: torch.Size([64, 4]), y: torch.Size([64, 1])
x: tensor([[ 1.4592, -1.2294, -1.2525, -0.3387],
        [-0.7338,  0.1099, -0.0195, -0.3387],
        [-0.9604, -1.0062, -0.0195,  2.9527],
        [ 1.0703,  1.3377, -0.0195, -0.3387],
        [ 0.1623,  0.2216, -1.2525, -0.3387]]), y