# Train and test new model

Using the saved vector representations as input, I coded a simple MLP to predict the affinity scores. After testing, I will apply the same evaluation metrics as for the TankBind model and then compare the two. Hopefully we were able to reduce the error.

In [None]:
# imports
import torch
import torchmetrics
import pandas as pd

## Load data (vector representations)

In [None]:
# Training table of saved vector representations (one row per sample).
train_csv = "vector_representations/kiba_vector_representations.csv"
data = pd.read_csv(train_csv)

## Model

In [None]:
# TODO: change model (this is just a placeholder for now)
class SimpleMLP(torch.nn.Module):
    """Fully connected network with two ReLU-activated hidden layers.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of values predicted per sample.
        hidden_dims: Widths of the first and second hidden layer. Defaults to
            ``(128, 64)``, the sizes that were previously hard-coded.
    """

    def __init__(self, input_dim: int, output_dim: int, hidden_dims=(128, 64)) -> None:
        super().__init__()
        first_width, second_width = hidden_dims
        # Layer names are kept as fc1/fc2/fc3 so state_dict keys stay unchanged.
        self.fc1 = torch.nn.Linear(input_dim, first_width)
        self.fc2 = torch.nn.Linear(first_width, second_width)
        self.fc3 = torch.nn.Linear(second_width, output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through both hidden layers and the linear output layer.

        No activation is applied to the output layer, so the result is an
        unbounded value per output unit.
        """
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [None]:
# Every column except the last is treated as a feature (assuming last column is target).
num_features = data.shape[1] - 1
model = SimpleMLP(input_dim=num_features, output_dim=1)

## Training loop

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
backend = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(backend)
model.to(device)
# Put the model into training mode before the training loop runs.
model.train()

In [None]:
# Build the training DataLoader from the loaded table.
from torch.utils.data import DataLoader, TensorDataset

feature_values = data.iloc[:, :-1].values  # all columns but the last
target_values = data.iloc[:, -1].values  # last column
X = torch.tensor(feature_values, dtype=torch.float32).to(device)  # Features
y = torch.tensor(target_values, dtype=torch.float32).to(device)  # Target
dataset = TensorDataset(X, y)
# Shuffled mini-batches of 32 samples.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# TODO: adjust the dataset size so that not the entire dataset is used for training (and split it into a training and a validation set)

In [None]:
# training loop - TODO: adjust hyperparameters & validation and adjust training loop
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Accumulates the squared error over all batches of one epoch.
metric = torchmetrics.MeanSquaredError().to(device)
num_epochs = 10

for epoch in range(num_epochs):
    # Re-assert training mode so this cell can be re-run after the model
    # has been switched to eval mode elsewhere.
    model.train()
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        # squeeze(-1) removes only the trailing output dimension. A bare
        # squeeze() would also collapse a final batch of size 1 to a 0-d
        # tensor, which no longer matches targets of shape (1,).
        preds = model(inputs).squeeze(-1)
        loss = criterion(preds, targets)
        loss.backward()
        optimizer.step()

        # The running metric only needs values, not the autograd graph.
        metric.update(preds.detach(), targets)

    # Mean squared error over every sample seen in this epoch.
    epoch_loss = metric.compute()
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss.item():.4f}")
    metric.reset()

## Inference

In [None]:
# Load the held-out table and wrap it in a DataLoader.
test_data = pd.read_csv("vector_representations/kiba_vector_representations_test.csv")
test_feature_values = test_data.iloc[:, :-1].values  # all columns but the last
test_target_values = test_data.iloc[:, -1].values  # last column
X_test = torch.tensor(test_feature_values, dtype=torch.float32).to(device)  # Features
y_test = torch.tensor(test_target_values, dtype=torch.float32).to(device)  # Target
test_dataset = TensorDataset(X_test, y_test)
# No shuffling, so predictions keep the row order of y_test.
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# TODO: adjust the test set size so that not the entire dataset is used for testing, since some of it is used for training

In [None]:
# inference
model.eval()  # evaluation mode for prediction
predictions = []
with torch.no_grad():  # no gradients are needed for prediction
    for inputs, _ in test_dataloader:
        outputs = model(inputs.to(device))
        # Collect CPU tensors (not numpy arrays) so torch.cat can join them,
        # and drop the trailing output dimension so the result is 1-D like
        # y_test.
        predictions.append(outputs.squeeze(-1).cpu())

# Join all batches, then convert to numpy once.
predictions = torch.cat(predictions).numpy()

## TODO: Evaluation

In [None]:
def eval_metrics(predictions, targets):
    """Compute MSE, MAE and R2 of ``predictions`` against ``targets``.

    Args:
        predictions: Predicted values (array-like or tensor).
        targets: Ground-truth values with the same number of elements.

    Returns:
        Tuple ``(mse, mae, r2)`` of Python floats.
    """
    # Convert each input once; as_tensor also accepts tensors without
    # re-wrapping them.
    preds = torch.as_tensor(predictions)
    target = torch.as_tensor(targets)
    # Model outputs may arrive as (N, 1) while targets are (N,). The
    # torchmetrics functions require identical shapes, so align them when
    # the element counts agree.
    if preds.shape != target.shape and preds.numel() == target.numel():
        preds = preds.reshape(target.shape)

    mse = torchmetrics.functional.mean_squared_error(preds, target)
    mae = torchmetrics.functional.mean_absolute_error(preds, target)
    r2 = torchmetrics.functional.r2_score(preds, target)
    return mse.item(), mae.item(), r2.item()

In [None]:
# Bring the targets back to the CPU as a numpy array before scoring.
test_targets = y_test.cpu().numpy()
mse, mae, r2 = eval_metrics(predictions, test_targets)
print(f"Test MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")