## Load data

In [7]:
import sys
sys.path.append('/users/kuba/code/aic/mil')

import pandas
import torch
import numpy

from torch.utils.data import DataLoader, SubsetRandomSampler
import mil_pytorch.mil as mil
from sklearn import model_selection

# Load data from files
data = pandas.read_csv('musk2/data.csv', header = None).values
ids = pandas.read_csv('musk2/ids.csv', squeeze = True, header = None).values
labels = pandas.read_csv('musk2/labels.csv', squeeze = True, header = None).values

# Load data to torch Tensors
data = torch.tensor(data)
ids = torch.tensor(ids)
labels = torch.tensor(labels)

# Create dataset
dataset = mil.MilDataset(data, ids, labels, normalize = True)

# Create train and test data loaders
batch_size = 10

indices = numpy.arange(len(dataset))
train_indices, test_indices = model_selection.train_test_split(indices, shuffle = True, test_size = 0.2)

train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)

train_dl = DataLoader(dataset, sampler = train_sampler, batch_size = batch_size, collate_fn=mil.collate)
test_dl = DataLoader(dataset, sampler = test_sampler, batch_size = len(test_indices), collate_fn=mil.collate)

## Define model, criterion and optimizer

In [8]:
import mil_pytorch.mil as mil

# Parameters
n_neurons1 = 10
n_neurons2 = 10
input_len = len(dataset.data[0])

# Defining neural networks for proccesing inputs before and after aggregation function
prepNN = torch.nn.Sequential(
    torch.nn.Linear(input_len, n_neurons1, bias = True),
    torch.nn.ReLU(),
)

afterNN = torch.nn.Sequential(
    torch.nn.Linear(n_neurons2, 1),
    torch.nn.Tanh()
)

# Create model, using custom created prepNN, afterNN and aggregation function
model = mil.BagModel(prepNN, afterNN, aggregation_func = torch.mean).double()

criterion = mil.MyHingeLoss()

# Optimizer parameters
learning_rate = 1e-4
weight_decay = 1e-6

optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate, weight_decay = weight_decay)

## Train model

In [9]:
import time

epochs = 3000

start = time.time()
print('TRAINING:')

# Empty tensor for collecting losses over batches
train_losses = torch.empty(0)

for epoch in range(epochs):
    
    # Optimization
    for data, ids, labels in train_dl:
        pred = model((data, ids))
        loss = criterion(pred[:,0], labels)

        # Optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_losses = torch.cat((train_losses, loss.float()))

    train_loss = torch.mean(train_losses, dim = 0, keepdim = True)
    train_losses = torch.empty(0)

    # Print message
    if (epoch+1)%100 == 0:
        print('[{}/{}] | train_loss: {}'.format(epoch+1, epochs, train_loss.item()))

print('Finished training - elapsed time: {}'.format(time.time() - start))

TRAINING:
[100/3000] | train_loss: 0.6262992024421692
[200/3000] | train_loss: 0.40315574407577515
[300/3000] | train_loss: 0.2873789966106415
[400/3000] | train_loss: 0.2934229075908661
[500/3000] | train_loss: 0.21647173166275024
[600/3000] | train_loss: 0.17064376175403595
[700/3000] | train_loss: 0.14397527277469635
[800/3000] | train_loss: 0.12611819803714752
[900/3000] | train_loss: 0.10890789330005646
[1000/3000] | train_loss: 0.09039434790611267
[1100/3000] | train_loss: 0.08774199336767197
[1200/3000] | train_loss: 0.06867848336696625
[1300/3000] | train_loss: 0.05312468484044075
[1400/3000] | train_loss: 0.04415442794561386
[1500/3000] | train_loss: 0.03857382759451866
[1600/3000] | train_loss: 0.0364614836871624
[1700/3000] | train_loss: 0.02362518571317196
[1800/3000] | train_loss: 0.04299159348011017
[1900/3000] | train_loss: 0.015069548971951008
[2000/3000] | train_loss: 0.014552582055330276
[2100/3000] | train_loss: 0.01944815181195736
[2200/3000] | train_loss: 0.0077649

## Evaluation

In [10]:
from sklearn import metrics

def eer(pred, labels):
    fpr, tpr, threshold = metrics.roc_curve(labels.detach(), pred.detach(), pos_label=1)
    fnr = 1 - tpr
    EER_fpr = fpr[numpy.nanargmin(numpy.absolute((fnr - fpr)))]
    EER_fnr = fnr[numpy.nanargmin(numpy.absolute((fnr - fpr)))]
    return EER_fpr, EER_fnr

def accuracy(pred, target, threshold = 0):
    pred = pred.detach().numpy()
    target = target.detach().numpy()

    pred[pred >= threshold] = 1
    pred[pred < threshold] = -1

    return numpy.sum(target == pred)/target.shape[0]

print('EVALUATION:')

# Train dataloader for evaluation (batch size = size of dataset)
train_dl = DataLoader(dataset, sampler = train_sampler, batch_size = len(train_indices), collate_fn=mil.collate)

for data, ids, labels in train_dl:
    pred = model((data, ids))
    loss = criterion(pred[:,0], labels)
    acc = accuracy(pred[:,0], labels)
    eer_fpr, eer_fnr = eer(pred[:,0], labels)

print('Train data - ')
print(' Loss: {:6}'.format(loss.item()))
print(' Accuracy: {:.2%}'.format(acc))
print(' Equal error rate approximation using false positive rate: {:.3}'.format(eer_fpr))
print(' Equal error rate approximation using false negative rate: {:.3}'.format(eer_fnr))


for data, ids, labels in test_dl:
    pred = model((data, ids))
    loss = criterion(pred[:,0], labels)
    acc = accuracy(pred[:,0], labels)
    eer_fpr, eer_fnr = eer(pred[:,0], labels)

print('Test data - ')
print(' Loss: {:6}'.format(loss.item()))
print(' Accuracy: {:.2%}'.format(acc))
print(' Equal error rate approximation using false positive rate: {:.3}'.format(eer_fpr))
print(' Equal error rate approximation using false negative rate: {:.3}'.format(eer_fnr))

EVALUATION:
Train data - 
 Loss: 0.0010248994824667349
 Accuracy: 100.00%
 Equal error rate approximation using false positive rate: 0.0
 Equal error rate approximation using false negative rate: 0.0
Test data - 
 Loss: 0.26328833648962274
 Accuracy: 85.71%
 Equal error rate approximation using false positive rate: 0.25
 Equal error rate approximation using false negative rate: 0.0
