In [1]:
import pandas
import torch
import numpy

from torch.utils.data import DataLoader, SubsetRandomSampler
from sklearn import model_selection

# Import mil containing BagModel and MilDataset from mil_pytorch
import mil_pytorch.mil as mil

## Load data

In [2]:
# Load data from files
data = pandas.read_csv('data_musk1/data.csv', header = None).values
ids = pandas.read_csv('data_musk1/ids.csv', squeeze = True, header = None).values
labels = pandas.read_csv('data_musk1/labels.csv', squeeze = True, header = None).values

# Create tensors containing data
data = torch.tensor(data)
ids = torch.tensor(ids)
labels = torch.tensor(labels)

# Create instance of MilDataset
dataset = mil.MilDataset(data, ids, labels, normalize = True)

# Create train and test data loaders (instances of DataLoader)
batch_size = 10

indices = numpy.arange(len(dataset))
train_indices, test_indices = model_selection.train_test_split(indices, shuffle = True,test_size = 0.2)

train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)

train_dl = DataLoader(dataset, sampler = train_sampler, batch_size = batch_size, collate_fn=mil.collate) # Using custom collate_fn mil.collate
test_dl = DataLoader(dataset, sampler = test_sampler, batch_size = len(test_indices), collate_fn=mil.collate)

## Define model, criterion and optimizer

In [3]:
# Loss function for criterion
class MyHingeLoss(torch.nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, output, target):
        target = target.double()
        hinge_loss = 1 - torch.mul(output, target)
        hinge_loss[hinge_loss<0] = 0
        
        return (torch.sum(hinge_loss, dim = 0, keepdim = True) / hinge_loss.size(0))

# Model parameters
n_neurons = 10
input_len = len(dataset.data[0])

# Defining neural networks for proccesing inputs before and after aggregation function
prepNN = torch.nn.Sequential(
    torch.nn.Linear(input_len, n_neurons, bias = True),
    torch.nn.ReLU(),
)

afterNN = torch.nn.Sequential(
    torch.nn.Linear(n_neurons, 1),
    torch.nn.Tanh()
)

# Create model, using custom created prepNN, afterNN and aggregation function
model = mil.BagModel(prepNN, afterNN, aggregation_func = torch.mean).double()

# Loss function
criterion = MyHingeLoss()

# Optimizer parameters
learning_rate = 1e-4
weight_decay = 1e-6

optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate, weight_decay = weight_decay)

## Train model

In [4]:
import time

# Training parameters
epochs = 1000

start = time.time()
print('TRAINING:')

# Tensor for collecting losses over batches
train_losses = torch.empty(0)

for epoch in range(epochs): 
    for data, ids, labels in train_dl:
        pred = model((data, ids))
        loss = criterion(pred[:,0], labels)
        
        # Update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Save loss on this batch
        train_losses = torch.cat((train_losses, loss.float()))
    
    # Compute avarega loss on this epoch
    train_loss = torch.mean(train_losses, dim = 0, keepdim = True)
    
    # Clear tensor for saving losses over batches
    train_losses = torch.empty(0)

    # Print info about learning every 100 epochs
    if (epoch+1)%100 == 0:
        print('[{}/{}] | train_loss: {}'.format(epoch+1, epochs, train_loss.item()))

print('Finished training - elapsed time: {}'.format(time.time() - start))

TRAINING:
[100/1000] | train_loss: 0.5095319151878357
[200/1000] | train_loss: 0.3462904691696167
[300/1000] | train_loss: 0.3434804081916809
[400/1000] | train_loss: 0.19068318605422974
[500/1000] | train_loss: 0.13699862360954285
[600/1000] | train_loss: 0.11907389760017395
[700/1000] | train_loss: 0.07415377348661423
[800/1000] | train_loss: 0.06440272182226181
[900/1000] | train_loss: 0.04708762839436531
[1000/1000] | train_loss: 0.038948893547058105
Finished training - elapsed time: 20.27362608909607


## Evaluation

In [5]:
from sklearn import metrics

def eer(pred, labels):
    fpr, tpr, threshold = metrics.roc_curve(labels.detach(), pred.detach(), pos_label=1)
    fnr = 1 - tpr
    EER_fpr = fpr[numpy.nanargmin(numpy.absolute((fnr - fpr)))]
    EER_fnr = fnr[numpy.nanargmin(numpy.absolute((fnr - fpr)))]
    return EER_fpr, EER_fnr

def accuracy(pred, target, threshold = 0):
    pred = pred.detach().numpy()
    target = target.detach().numpy()

    pred[pred >= threshold] = 1
    pred[pred < threshold] = -1

    return numpy.sum(target == pred)/target.shape[0]

print('EVALUATION:')

# Train dataloader for evaluation
train_dl = DataLoader(dataset, sampler = train_sampler, batch_size = len(train_indices), collate_fn=mil.collate)

for data, ids, labels in train_dl:
    pred = model((data, ids))
    loss = criterion(pred[:,0], labels)
    acc = accuracy(pred[:,0], labels)
    eer_fpr, eer_fnr = eer(pred[:,0], labels)

print('TRAIN DATA')
print('Loss: {:6}'.format(loss.item()))
print('Accuracy: {:.2%}'.format(acc))
print('Equal error rate approximation using false positive rate: {:.3}'.format(eer_fpr))
print('Equal error rate approximation using false negative rate: {:.3}'.format(eer_fnr))


for data, ids, labels in test_dl:
    pred = model((data, ids))
    loss = criterion(pred[:,0], labels)
    acc = accuracy(pred[:,0], labels)
    eer_fpr, eer_fnr = eer(pred[:,0], labels)

print('TEST DATA')
print('Loss: {:6}'.format(loss.item()))
print('Accuracy: {:.2%}'.format(acc))
print('Equal error rate approximation using false positive rate: {:.3}'.format(eer_fpr))
print('Equal error rate approximation using false negative rate: {:.3}'.format(eer_fnr))

EVALUATION:
TRAIN DATA
Loss: 0.04240001035528072
Accuracy: 98.63%
Equal error rate approximation using false positive rate: 0.0286
Equal error rate approximation using false negative rate: 0.0
TEST DATA
Loss: 0.37781251377836034
Accuracy: 84.21%
Equal error rate approximation using false positive rate: 0.3
Equal error rate approximation using false negative rate: 0.0
