### Data Loader

Before we can estimate any model, we should load in the data that we created in `linear.Rmd`. We'll reshape it so that we can sample random subjects in each batch.

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

class LinearData(Dataset):
  """Dataset with one item per distinct value of the "subject" column.

  The wrapped table is expected to have "subject", "time", "taxon", "Freq"
  and "class" columns; these are the columns read by the methods below.
  """

  def __init__(self, data):
    """Keep the table and record its distinct subjects.

    Args:
      data: pandas DataFrame holding the columns listed on the class.
    """
    self.data = data
    # unique() keeps order of first appearance, so item i is the i-th
    # subject encountered in the table.
    self.subjects = data["subject"].unique()

  def __len__(self):
    """Return the number of distinct subjects."""
    return len(self.subjects)

  def __getitem__(self, index):
    """Return the (profile, label) pair for the subject at `index`.

    Returns:
      x: float32 tensor with one row per "time" value and one column per
        "taxon" value, filled from "Freq".
      y: float32 tensor of shape (1,), 1.0 when the subject's first "class"
        entry is "healthy" and 0.0 otherwise.
    """
    samples = self.data[self.data["subject"] == self.subjects[index]]
    x = samples.pivot(index="time", columns="taxon", values="Freq")
    # pandas yields int64/float64 values and default collation keeps the
    # NumPy dtype, whereas torch modules default to float32 parameters;
    # convert here so batches can be fed to a model directly.
    x = torch.tensor(x.to_numpy(), dtype=torch.float32)
    is_healthy = samples["class"].values[0] == "healthy"
    y = torch.tensor([float(is_healthy)], dtype=torch.float32)
    return x, y

# Read the table written by ../generate, wrap it in the per-subject dataset,
# and batch 16 subjects at a time.
samples_df = pd.read_csv("../data/linear.csv")
dataset = LinearData(samples_df)
# No shuffle argument is passed, so batches follow the dataset's subject order.
loader = DataLoader(dataset, batch_size = 16)
# Pull a single batch as a quick check that loading and collation work.
next(iter(loader))


Next, let's write a model with a forward function that lets us get predicted probabilities for the two classes given the historical microbiome profile so far.

In [None]:
from torch import nn
from transformers import GPT2Config, GPT2Model

class Transformer(nn.Module):
  """GPT-2 backbone followed by a linear head on the flattened hidden states.

  The head takes `n_embd * n_positions` inputs, so `forward` needs inputs
  whose flattened hidden states have exactly that many features per example.
  """

  def __init__(self, n_embd=300, n_positions=25, n_layer=5, n_class=2):
    """Build the backbone and a head with `n_class - 1` outputs."""
    super().__init__()
    gpt2_config = GPT2Config(
      n_embd=n_embd,
      n_positions=n_positions,
      n_layer=n_layer,
    )
    self.backbone = GPT2Model(gpt2_config)
    flat_features = n_embd * n_positions
    self.logits = nn.Linear(flat_features, n_class - 1)

  def forward(self, x):
    """Return the backbone output and the head's output for `x`.

    `x` is passed to the backbone as `inputs_embeds`. No sigmoid or softmax
    is applied to the head's output here.
    """
    z = self.backbone(inputs_embeds=x)
    batch_size = x.shape[0]
    # Flatten everything after the batch dimension before the linear head.
    flattened = z.last_hidden_state.view(batch_size, -1)
    return z, self.logits(flattened)

# Smoke test: build the model with its default sizes and push through a random
# batch of shape (16, 25, 300), which matches the default n_positions=25 and
# n_embd=300.
# NOTE: the second return value is the raw output of the linear head (forward
# applies no sigmoid), despite the name `probs`.
model = Transformer()
z, probs = model(torch.randn((16, 25, 300)))

We can now train the model based on the input data loader, using a lightning trainer.

In [None]:
import lightning as L
from torch import optim

# define the LightningModule
class LitTransformer(L.LightningModule):
    """Lightning wrapper that trains a model with a binary cross-entropy loss."""

    def __init__(self, model):
        """Store the model whose second return value is used as the logits."""
        super().__init__()
        self.model = model

    def training_step(self, batch, index):
        """Compute, log and return the loss for one (inputs, targets) batch."""
        inputs, targets = batch
        # The model returns a pair; only the second element feeds the loss.
        _, logits = self.model(inputs)
        bce = nn.functional.binary_cross_entropy_with_logits(logits, targets)
        self.log("train_loss", bce)
        return bce

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=1e-3."""
        return optim.Adam(self.parameters(), lr=1e-3)


# Wrap the transformer classifier in the Lightning module and fit it for a
# single epoch over the loader.
lit_model = LitTransformer(model)
trainer = L.Trainer(max_epochs=1)
trainer.fit(model=lit_model, train_dataloaders=loader)

In [None]:
# Switch the wrapped model to evaluation mode before predicting.
lit_model.model.eval()

# Collect the model's second output (the linear head's raw output) for every
# batch; gradients are not needed for prediction.
with torch.no_grad():
  p_hat = [lit_model.model(x)[1] for x, _ in loader]

# Convert to a NumPy array before building the DataFrame: pandas does not
# accept torch tensors as array data, and passing one directly can leave
# tensor objects in the cells that to_csv writes as "tensor(...)" strings.
# NOTE: the values written are the head's raw outputs; no sigmoid is applied.
pd.DataFrame(torch.cat(p_hat).cpu().numpy()).to_csv("../data/p_hat.csv")

For future reference, here are the packages we installed for this notebook.

```
conda install conda-forge::pytorch-lightning
conda install conda-forge::pandas
conda install conda-forge::tensorboard
```