# Test PyTorch implementation

In [2]:
# Load dependencies
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import os
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn 
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

from ontram_pytorch import OntramModel, InterceptNeuralNetwork, LinearShiftNeuralNetwork
from ontram_pytorch import fit_ontram, predict_ontram, classification_metrics

In [3]:
# Base directory and checkpoint folder for the wine experiments.
DIR = '/home/hezo/'
OUTPUT_DIR = DIR + 'ontram_pytorch/checkpoints_wine/'

if not os.path.exists(OUTPUT_DIR):
    # makedirs (instead of mkdir) also creates missing parent folders, so the
    # cell works on a fresh checkout where 'ontram_pytorch/' does not exist yet.
    os.makedirs(OUTPUT_DIR)
    print(f"Created folder {OUTPUT_DIR}")
else:
    print(f"Folder {OUTPUT_DIR} already exists.")

Folder /home/hezo/ontram_pytorch/checkpoints_wine/ already exists.


# Data

In [4]:
# Load the dataset
# NOTE(review): the bare `wine` on the last line echoes the whole dataset object
# (all arrays) into the cell output; showing only shapes/keys would be more readable.
wine = load_wine()
wine

{'data': array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
         1.065e+03],
        [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
         1.050e+03],
        [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
         1.185e+03],
        ...,
        [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
         8.350e+02],
        [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
         8.400e+02],
        [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
         5.600e+02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

## General implementation

In [5]:
# Hold out 20% of the samples for testing (fixed seed for a reproducible split)
x_train, x_test, y_train, y_test = train_test_split(
    wine['data'], wine['target'], test_size=0.2, random_state=42
)

# Standardize features: the scaler is fitted on the training split only and
# then applied unchanged to the test split
scaler = StandardScaler()
x_train = torch.tensor(scaler.fit_transform(x_train), dtype=torch.float32)
x_test = torch.tensor(scaler.transform(x_test), dtype=torch.float32)

# Labels stay integer class indices (int64)
y_train = torch.tensor(y_train, dtype=torch.int64)
y_test = torch.tensor(y_test, dtype=torch.int64)

print("train: ", x_train.shape, y_train.shape)
print("test: ", x_test.shape, y_test.shape)

train:  torch.Size([142, 13]) torch.Size([142])
test:  torch.Size([36, 13]) torch.Size([36])


In [6]:
class WineNN(nn.Module):
    """Fully connected classifier with one hidden ReLU layer.

    Args:
        input_size: Number of input features.
        hidden_size: Number of units in the hidden layer.
        output_size: Number of output units (one raw score per class).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Attribute names and their order are kept: they define the
        # state_dict keys and the printed architecture.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the raw (un-normalized) output scores for ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Network dimensions are read off the data
input_size = x_train.shape[1]  # 13 features
hidden_size = 16
output_size = len(wine.target_names)  # 3 classes

model = WineNN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)

print(model)  # Display model architecture

WineNN(
  (fc1): Linear(in_features=13, out_features=16, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=16, out_features=3, bias=True)
)


In [7]:
# Cross-entropy on the raw scores, optimized with Adam
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Full-batch training: every epoch is a single step on the whole training set
epochs = 100
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(x_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    # Report only every 10th epoch
    if (epoch + 1) % 10 != 0:
        continue
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

Epoch [10/100], Loss: 0.5689
Epoch [20/100], Loss: 0.2008
Epoch [30/100], Loss: 0.0795
Epoch [40/100], Loss: 0.0435
Epoch [50/100], Loss: 0.0284
Epoch [60/100], Loss: 0.0186
Epoch [70/100], Loss: 0.0130
Epoch [80/100], Loss: 0.0096
Epoch [90/100], Loss: 0.0074
Epoch [100/100], Loss: 0.0059


In [10]:
# Evaluate on the held-out split; no gradients are needed for inference
with torch.no_grad():
    test_outputs = model(x_test)
    # Predicted class = index of the largest output score
    predicted = test_outputs.argmax(dim=1)
    accuracy = predicted.eq(y_test).float().mean()

print(f"Test Accuracy: {accuracy.item() * 100:.2f}%")

Test Accuracy: 100.00%


## ONTRAM implementation

In [11]:
# Re-create the split from the raw data (same test size and seed as above)
x_train, x_test, y_train, y_test = train_test_split(
    wine['data'], wine['target'], test_size=0.2, random_state=42
)

# Standardize features: fit on the training split, reuse for the test split
scaler = StandardScaler()
x_train = torch.tensor(scaler.fit_transform(x_train), dtype=torch.float32)
x_test = torch.tensor(scaler.transform(x_test), dtype=torch.float32)

# Targets are one-hot encoded here (three classes), unlike the integer
# labels used in the section above
y_train = F.one_hot(torch.tensor(y_train, dtype=torch.int64), num_classes=3)
y_test = F.one_hot(torch.tensor(y_test, dtype=torch.int64), num_classes=3)

print("train: ", x_train.shape, y_train.shape)
print("test: ", x_test.shape, y_test.shape)

train:  torch.Size([142, 13]) torch.Size([142, 3])
test:  torch.Size([36, 13]) torch.Size([36, 3])


In [12]:
# Hyperparameters for the ONTRAM run: `bs` is the batch size handed to the
# DataLoaders, `epochs` the number of epochs handed to fit_ontram.
# Note that `epochs` replaces the value set for the plain network above.
bs = 64
epochs = 1000

In [13]:
# Each model variant gets its own sub-folder below OUTPUT_DIR
MODEL_NAME = 'si_lsx'
MODEL_DIR = OUTPUT_DIR + MODEL_NAME

if os.path.exists(MODEL_DIR):
    print("Folder already exists: ", MODEL_DIR)
else:
    os.makedirs(MODEL_DIR)
    print("Created folder: ", MODEL_DIR)

Folder already exists:  /home/hezo/ontram_pytorch/checkpoints_wine/si_lsx


In [14]:
# Directories ------------------------------------------------------------
# MODEL_DIR carries no trailing separator, so file paths are built with
# os.path.join. Plain string concatenation would place the files next to the
# model folder (".../si_lsxcheckpoint_final_ontram.pth") instead of inside it.
CHECKPOINT_PATH = MODEL_DIR
CHECKPOINT_MODEL = os.path.join(CHECKPOINT_PATH, 'checkpoint_best_ontram.pth')

# Define datasets as data loader ------------------------------------------
train_dataset = TensorDataset(x_train, y_train)
test_dataset = TensorDataset(x_test, y_test)

# Only the training batches are shuffled; the test order stays fixed
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=bs, shuffle=False)

# Model --------------------------------------------------------------------
# Intercept network sized by the number of one-hot columns, plus one linear
# shift network over all input features
nn_int = InterceptNeuralNetwork(C=y_train.shape[1])
nn_shift = LinearShiftNeuralNetwork(n_features=x_train.shape[1])
model = OntramModel(nn_int, [nn_shift])

# Training -----------------------------------------------------------------
# si=True presumably selects the simple-intercept variant (cf. MODEL_NAME
# 'si_lsx') -- TODO confirm against ontram_pytorch
history = fit_ontram(model, train_loader, epochs=epochs, si=True, 
                     optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9))
torch.save({'model_state_dict': model.state_dict()},
           os.path.join(CHECKPOINT_PATH, 'checkpoint_final_ontram.pth'))
# Keep only the training-loss entry of the fit history and persist it
history = pd.DataFrame(history['train_loss'])
history.to_csv(os.path.join(CHECKPOINT_PATH, 'history.csv'), index=False)

# Testing ------------------------------------------------------------------
results = predict_ontram(model, test_loader, si=True)
# classification_metrics(results, y_test)

Train with GPU support.
Epoch 1/1000, Train Loss: 1.1601, Validation Loss: 0.0000
Epoch 2/1000, Train Loss: 1.1374, Validation Loss: 0.0000
Epoch 3/1000, Train Loss: 1.0082, Validation Loss: 0.0000
Epoch 4/1000, Train Loss: 0.8949, Validation Loss: 0.0000
Epoch 5/1000, Train Loss: 0.7759, Validation Loss: 0.0000
Epoch 6/1000, Train Loss: 0.7297, Validation Loss: 0.0000
Epoch 7/1000, Train Loss: 0.6717, Validation Loss: 0.0000
Epoch 8/1000, Train Loss: 0.6384, Validation Loss: 0.0000
Epoch 9/1000, Train Loss: 0.6569, Validation Loss: 0.0000
Epoch 10/1000, Train Loss: 0.6607, Validation Loss: 0.0000
Epoch 11/1000, Train Loss: 0.5889, Validation Loss: 0.0000
Epoch 12/1000, Train Loss: 0.6025, Validation Loss: 0.0000
Epoch 13/1000, Train Loss: 0.5285, Validation Loss: 0.0000
Epoch 14/1000, Train Loss: 0.5001, Validation Loss: 0.0000
Epoch 15/1000, Train Loss: 0.5190, Validation Loss: 0.0000
Epoch 16/1000, Train Loss: 0.4684, Validation Loss: 0.0000
Epoch 17/1000, Train Loss: 0.4402, Valida

In [15]:
# Sanity check: batch size the training loader was configured with
train_loader.batch_size

64

In [16]:
# Manual forward pass on the first five training samples; the cells below
# re-derive the loss from this output step by step.
# Follow the device the model parameters live on instead of hard-coding
# 'cuda', so the cell also runs on a CPU-only machine.
model_device = next(model.parameters()).device
# Constant column of ones as input for the intercept network (one row per sample)
int_tr = torch.ones((len(y_train[:5]), 1), dtype=torch.float32, device=model_device)
# NOTE: this rebinds x_train to a tensor on the model's device; later cells
# see the moved tensor.
x_train = x_train.to(model_device)
pred = model(int_tr, [x_train[:5]])
pred

{'int_out': tensor([[-4.9288,  2.5063],
         [-4.9288,  2.5063],
         [-4.9288,  2.5063],
         [-4.9288,  2.5063],
         [-4.9288,  2.5063]], device='cuda:0', grad_fn=<MmBackward0>),
 'shift_out': [tensor([[13.8116],
          [13.6634],
          [-4.2145],
          [13.8751],
          [-8.8776]], device='cuda:0', grad_fn=<MmBackward0>)]}

In [17]:
def transform_intercepts(int_in):
    """Turn raw intercept outputs into ordered cut points padded with -inf/+inf.

    The first column is used as is. The remaining columns are exponentiated
    and cumulatively summed on top of it, so the cut points are non-decreasing
    from left to right (exp is never negative).

    Args:
        int_in: 2-D tensor of raw intercepts, one row per sample.

    Returns:
        Tensor with two more columns than ``int_in``: a leading ``-inf``
        column, the transformed intercepts, and a trailing ``+inf`` column.
        It lives on the same device as ``int_in``.
    """
    # get batch size
    bs = int_in.shape[0]

    # Build the constant border columns in the input's floating dtype, so the
    # concatenation below neither relies on implicit type promotion nor
    # changes the dtype of half/double precision inputs. Non-floating inputs
    # keep the previous behavior (default floating dtype).
    pad_dtype = int_in.dtype if int_in.is_floating_point() else torch.get_default_dtype()
    int0 = torch.full((bs, 1), -float('inf'), dtype=pad_dtype, device=int_in.device)
    intK = torch.full((bs, 1), float('inf'), dtype=pad_dtype, device=int_in.device)

    # First intercept as a column vector
    int1 = int_in[:, 0].reshape(bs, 1)

    # Non-negative increments accumulated on top of the first intercept
    intk = torch.cumsum(torch.exp(int_in[:, 1:]), dim=1)

    # Concatenate intercepts along the second axis (columns)
    return torch.cat([int0, int1, int1 + intk, intK], dim=1)

In [18]:
# Apply the intercept transformation to the raw intercepts of the forward pass.
# NOTE(review): the name `int` shadows the Python builtin for the rest of the
# session; later cells index this variable, so a rename has to be done in all
# of them at once.
int = transform_intercepts(pred['int_out'])
int

tensor([[   -inf, -4.9288,  7.3308,     inf],
        [   -inf, -4.9288,  7.3308,     inf],
        [   -inf, -4.9288,  7.3308,     inf],
        [   -inf, -4.9288,  7.3308,     inf],
        [   -inf, -4.9288,  7.3308,     inf]], device='cuda:0',
       grad_fn=<CatBackward0>)

In [19]:
# Recover class indices from the one-hot targets of the same five samples.
# The class index itself addresses the lower cut point; index + 1 addresses the
# upper one, because the transformed intercepts start with a -inf column.
targets = y_train[:5]
target_class_low = torch.argmax(targets, dim=1)
target_class_up = target_class_low+1
target_class_up

tensor([3, 3, 2, 3, 1])

In [20]:
# Stack the list of shift outputs, sum them per sample and flatten to 1-D
shift_in = pred['shift_out']
shift = torch.stack(shift_in, dim=1).sum(dim=1).view(-1)
shift

tensor([13.8116, 13.6634, -4.2145, 13.8751, -8.8776], device='cuda:0',
       grad_fn=<ViewBackward0>)

In [21]:
# Upper cut point of each sample's target class (row-wise lookup)
int[torch.arange(int.size(0)), target_class_up]

tensor([    inf,     inf,  7.3308,     inf, -4.9288], device='cuda:0',
       grad_fn=<IndexBackward0>)

In [22]:
# Lower cut point of each sample's target class (row-wise lookup)
int[torch.arange(int.size(0)), target_class_low]

tensor([ 7.3308,  7.3308, -4.9288,  7.3308,    -inf], device='cuda:0',
       grad_fn=<IndexBackward0>)

In [23]:
# Upper cut point minus the summed shift term
int[torch.arange(int.size(0)), target_class_up]-shift

tensor([    inf,     inf, 11.5453,     inf,  3.9488], device='cuda:0',
       grad_fn=<SubBackward0>)

In [24]:
# Sigmoid of the shifted lower cut point
torch.sigmoid(int[torch.arange(int.size(0)), target_class_low]-shift)

tensor([0.0015, 0.0018, 0.3286, 0.0014, 0.0000], device='cuda:0',
       grad_fn=<SigmoidBackward0>)

In [25]:
# Per-sample likelihood contribution: sigmoid at the shifted upper cut point
# minus sigmoid at the shifted lower cut point of the target class
lli = torch.sigmoid(int[torch.arange(int.size(0)), target_class_up]-shift) - torch.sigmoid(int[torch.arange(int.size(0)), target_class_low]-shift)
lli

tensor([0.9985, 0.9982, 0.6713, 0.9986, 0.9811], device='cuda:0',
       grad_fn=<SubBackward0>)

In [26]:
# Log of the per-sample likelihood contributions
lnll = torch.log(lli)
lnll

tensor([-0.0015, -0.0018, -0.3985, -0.0014, -0.0191], device='cuda:0',
       grad_fn=<LogBackward0>)

In [27]:
# Negative mean log-likelihood over the five samples
-torch.mean(lnll)

tensor(0.0845, device='cuda:0', grad_fn=<NegBackward0>)

In [28]:
def ontram_nll(outputs, targets):
    """Mean negative log-likelihood of the targets under the model outputs.

    Args:
        outputs: Mapping with the raw intercepts under ``'int_out'`` and either
            a list of shift tensors or ``None`` under ``'shift_out'``.
        targets: 2-D tensor whose row-wise argmax is the class index
            (e.g. one-hot encoded labels).

    Returns:
        Scalar tensor holding the negative mean log-likelihood. Per-sample
        likelihoods are clamped to at least 1e-8 before taking the log.
    """
    raw_intercepts = outputs['int_out']
    shift_terms = outputs['shift_out']

    # The class index addresses the lower cut point and index + 1 the upper
    # one, because the transformed intercepts start with a -inf column.
    lower_idx = torch.argmax(targets, dim=1)
    upper_idx = lower_idx + 1

    cutpoints = transform_intercepts(raw_intercepts)
    rows = torch.arange(cutpoints.size(0))
    upper = cutpoints[rows, upper_idx]
    lower = cutpoints[rows, lower_idx]

    if shift_terms is not None:
        # Sum up all shift terms per sample and flatten to 1-D
        total_shift = torch.stack(shift_terms, dim=1).sum(dim=1).view(-1)
        upper = upper - total_shift
        lower = lower - total_shift

    # Likelihood contribution of each sample in the batch
    likelihood = torch.sigmoid(upper) - torch.sigmoid(lower)
    return -torch.mean(torch.log(torch.clamp(likelihood, min=1e-8)))