In [1]:
import torch
from torch.utils.data import Dataset
import pickle
import numpy as np


In [195]:
# CONSTANTS
NUM_CLASSES = 100

In [196]:
def create_index(pickle_file, index_file):
    offsets = []
    with open(pickle_file, 'rb') as file:
        while True:
            offset = file.tell()
            try:
                pickle.load(file)
                offsets.append(offset)
            except EOFError:
                break
    with open(index_file, 'wb') as file:
        pickle.dump(offsets, file)

# Usage
create_index('training.pkl', 'training.idx')


In [197]:
import importlib
from torch.utils.data import DataLoader, random_split
import dataset
importlib.reload(dataset)


lazy_chem_dataset = dataset.LazyChemDataset(pickle_file='training.pkl', index_file='training.idx', n_mixture=2, num_classes=NUM_CLASSES)
train_size = int(0.8 * len(lazy_chem_dataset))
val_size = len(lazy_chem_dataset) - train_size
train_dataset, val_dataset = random_split(lazy_chem_dataset, [train_size, val_size])


# Create DataLoader for training and validation
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)


In [199]:
len(lazy_chem_dataset)

1000000

In [200]:
import torch
import torch.nn as nn

class SimpleNet(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 100)
        self.dropout = nn.Dropout(0.5)  # Dropout layer
        self.fc2 = nn.Linear(100, num_classes)  # Output logits, not probabilities

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)  # No sigmoid here
        return x


In [201]:
import torch.optim as optim

# Initialize model, criterion, and example inputs and targets
model = SimpleNet(input_size=1781, num_classes=NUM_CLASSES)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)

# Loss Function
loss_function = nn.BCEWithLogitsLoss()  # Combines sigmoid and BCE loss



# # Example input and target
# input_tensor = torch.randn((1, 300))
# target = torch.FloatTensor([[1, 0, 1, 1, 0]])  # Example multilabel target

# # Forward pass
# output = model(input_tensor)

In [102]:
def train_model(model, loss_function, optimizer, num_epochs):
    # Training Loop
    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        train_loss = 0.0
        correct_preds = 0
        total_preds = 0

        for data, target in train_loader:
            data, target = data.to(device), target.to(device)  # Move data to the appropriate device

            # Forward pass
            outputs = model(data).squeeze(1)
            loss = loss_function(outputs, target)

            # Convert outputs to predicted labels
            predicted_probs = torch.sigmoid(outputs)  # Sigmoid to convert logits to probabilities
            predicted_labels = (predicted_probs > 0.5).float()  # Threshold probabilities
            # print(predicted_labels[0], target[0])

            # Calculate accuracy
            correct_preds += (predicted_labels == target).float().sum().item()
            total_preds += target.numel()

            # Backward and optimize
            optimizer.zero_grad()  # Clear gradients w.r.t. parameters
            loss.backward()  # Backpropagation
            optimizer.step()  # Update parameters

            train_loss += loss.item() * data.size(0)

        # Calculate average loss and accuracy
        train_loss /= len(train_loader.dataset)
        train_accuracy = correct_preds / total_preds * 100

        # Validation phase
        model.eval()  # Set model to evaluation mode
        val_loss = 0.0
        correct_preds = 0
        total_preds = 0

        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                outputs = model(data).squeeze(1)
                loss = loss_function(outputs, target)

                # Convert outputs to predicted labels
                predicted_probs = torch.sigmoid(outputs)
                predicted_labels = (predicted_probs > 0.5).float()

                # Calculate accuracy
                correct_preds += (predicted_labels == target).float().sum().item()
                total_preds += target.numel()

                val_loss += loss.item() * data.size(0)

        val_loss /= len(val_loader.dataset)
        val_accuracy = correct_preds / total_preds * 100

        # Print training statistics
        print(f'Epoch: {epoch+1}/{num_epochs} \t'
          f'Training Loss: {train_loss:.4f} \tTraining Accuracy: {train_accuracy:.2f}% \t'
          f'Validation Loss: {val_loss:.4f} \tValidation Accuracy: {val_accuracy:.2f}%')

In [202]:
train_model(model, loss_function, optimizer, 5)

Epoch: 1/5 	Training Loss: 0.0556 	Training Accuracy: 98.45% 	Validation Loss: 0.0410 	Validation Accuracy: 98.61%
Epoch: 2/5 	Training Loss: 0.0510 	Training Accuracy: 98.57% 	Validation Loss: 0.0413 	Validation Accuracy: 98.67%
Epoch: 3/5 	Training Loss: 0.0509 	Training Accuracy: 98.57% 	Validation Loss: 0.0419 	Validation Accuracy: 98.62%
Epoch: 4/5 	Training Loss: 0.0509 	Training Accuracy: 98.57% 	Validation Loss: 0.0420 	Validation Accuracy: 98.51%
Epoch: 5/5 	Training Loss: 0.0509 	Training Accuracy: 98.58% 	Validation Loss: 0.0414 	Validation Accuracy: 98.66%


In [37]:
import pickle as pkl
with open('distributions.pkl','rb') as readfile:
  # ** x is a dictionary, with idx: [chems]
  distributions = pkl.load(readfile)

In [None]:
create_index('testing.pkl', 'testing.idx')
test_set = dataset.LazyChemDataset('testing.pkl', 'testing.idx', 3, NUM_CLASSES)

In [219]:
# TESTING
x = 0
running_correct = 0
abs_correct = 0
total = 0
for (mix, label) in test_set:
    if not mix.shape: continue
    outputs = model(mix).squeeze(1)
    predicted_probs = torch.sigmoid(outputs)  # Sigmoid to convert logits to probabilities
    predicted_labels = (predicted_probs > 0.5).float()
    running_correct += (predicted_labels == label).float().sum().item()
    total += label.numel()
    print(predicted_labels.reshape(100).shape, label.shape)
    abs_correct += torch.equal(predicted_labels.reshape(100), label)
    x += 1
    if x >= 100: break

print(running_correct/total)
print(abs_correct/x)
# idx1, idx2 = 7, 6
# odor1 = np.random.multivariate_normal(distributions[idx1]["mean"], distributions[idx1]["covariances"], size=1)
# odor2 = np.random.multivariate_normal(distributions[idx2]["mean"], distributions[idx2]["covariances"], size=1)

# # # With actual data from dataset
# # data = train_dataset[0][0]
# # label = train_dataset[0][1]
# # print(label)

# data = torch.tensor(odor1 + odor2, dtype=torch.float32)

# outputs = model(data).squeeze(1)
# predicted_probs = torch.sigmoid(outputs)  # Sigmoid to convert logits to probabilities
# predicted_labels = (predicted_probs > 0.5).float()

torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size([100])
torch.Size([100]) torch.Size

Accuracy 0.9481004148148148 for mixture of 3 on num_classes=30 with 20 samples per mixture

In [220]:
torch.save(model.state_dict(), 'N100_sigmoids.pt')


100 odors, 10 per train: 10.5 min; 0.9905 on test (100 test samples)
Epoch: 1/5 	Training Loss: 0.0556 	Training Accuracy: 98.45% 	Validation Loss: 0.0410 	Validation Accuracy: 98.61%
Epoch: 2/5 	Training Loss: 0.0510 	Training Accuracy: 98.57% 	Validation Loss: 0.0413 	Validation Accuracy: 98.67%
Epoch: 3/5 	Training Loss: 0.0509 	Training Accuracy: 98.57% 	Validation Loss: 0.0419 	Validation Accuracy: 98.62%
Epoch: 4/5 	Training Loss: 0.0509 	Training Accuracy: 98.57% 	Validation Loss: 0.0420 	Validation Accuracy: 98.51%
Epoch: 5/5 	Training Loss: 0.0509 	Training Accuracy: 98.58% 	Validation Loss: 0.0414 